21 """Class to perform translation memory matching from a store of translation units"""
22
23 import heapq
24 import re
25
26 from translate.search import lshtein
27 from translate.search import terminology
28 from translate.storage import base
29 from translate.storage import po
30 from translate.misc.multistring import multistring
31
32
34 """Returns the length of the source string"""
35 return len(unit.source)
36
37
39 def _matches_cmp(x, y):
40
41
42 c = cmp(match_info[x.source]['pos'], match_info[y.source]['pos'])
43 return c and c or cmp(len(y.source), len(x.source))
44 matches.sort(_matches_cmp)
45
46
48 """A class that will do matching and store configuration for the matching process"""
49
50 sort_reverse = False
51
    def __init__(self, store, max_candidates=10, min_similarity=75, max_length=70, comparer=None, usefuzzy=False):
        """max_candidates is the maximum number of candidates that should be assembled,
        min_similarity is the minimum similarity that must be attained to be included in
        the result, comparer is an optional Comparer with a similarity() function"""
        if comparer is None:
            comparer = lshtein.LevenshteinComparer(max_length)
        self.comparer = comparer
        self.setparameters(max_candidates, min_similarity, max_length)
        self.usefuzzy = usefuzzy
        self.inittm(store)
        self.addpercentage = True
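    # A minimal usage sketch (the file name and sample text below are
    # hypothetical):
    #
    #   from translate.storage import factory
    #   tmstore = factory.getobject("tm.po")
    #   tmmatcher = matcher(tmstore, max_candidates=5, min_similarity=80)
    #   for unit in tmmatcher.matches("Some source text"):
    #       print unit.target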

    def usable(self, unit):
        """Returns whether this translation unit is usable for TM"""
        source = unit.source
        target = unit.target
        if source and target and (self.usefuzzy or not unit.isfuzzy()):
            if source in self.existingunits and self.existingunits[source] == target:
                # Duplicate of a unit we already have
                return False
            else:
                self.existingunits[source] = target
                return True
        return False

    def inittm(self, stores, reverse=False):
        """Initialises the memory for later use. We use simple base units for
        speedup."""
        self.existingunits = {}
        self.candidates = base.TranslationStore()

        if isinstance(stores, base.TranslationStore):
            stores = [stores]
        for store in stores:
            self.extendtm(store.units, store=store, sort=False)
        self.candidates.units.sort(key=sourcelen, reverse=self.sort_reverse)

    def extendtm(self, units, store=None, sort=True):
        """Extends the memory with extra unit(s).

        @param units: The units to add to the TM.
        @param store: Optional store from where some metadata can be retrieved
        and associated with each unit.
        @param sort: Optional parameter that can be set to False to suppress
        sorting of the candidates list. This should probably only be used in
        inittm().
        """
        if isinstance(units, base.TranslationUnit):
            units = [units]
        candidates = filter(self.usable, units)
        for candidate in candidates:
            simpleunit = base.TranslationUnit("")
            # We need to ensure that we don't pass multistrings further,
            # since some modules (like the native Levenshtein) can't use them
            if isinstance(candidate.source, multistring):
                if len(candidate.source.strings) > 1:
                    simpleunit.orig_source = candidate.source
                    simpleunit.orig_target = candidate.target
                simpleunit.source = unicode(candidate.source)
                simpleunit.target = unicode(candidate.target)
            else:
                simpleunit.source = candidate.source
                simpleunit.target = candidate.target
            # Only carry over translator notes, so that programmer comments
            # from the original unit don't end up in TM suggestions
            simpleunit.addnote(candidate.getnotes(origin="translator"))
            simpleunit.fuzzy = candidate.isfuzzy()
            self.candidates.units.append(simpleunit)
        if sort:
            self.candidates.units.sort(key=sourcelen, reverse=self.sort_reverse)

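    # Continuing the sketch above, extendtm() also accepts a single unit
    # (example values are hypothetical):
    #
    #   extra = po.pounit("file")
    #   extra.target = "lêer"
    #   tmmatcher.extendtm(extra)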
    def setparameters(self, max_candidates=10, min_similarity=75, max_length=70):
        """Sets the parameters without reinitialising the tm. If a parameter
        is not specified, it is set to the default, not ignored"""
        self.MAX_CANDIDATES = max_candidates
        self.MIN_SIMILARITY = min_similarity
        self.MAX_LENGTH = max_length

138 """Calculates a length beyond which we are not interested.
139 The extra fat is because we don't use plain character distance only."""
140 return min(len(text) / (min_similarity/100.0), self.MAX_LENGTH)
141
143 """Calculates the minimum length we are interested in.
144 The extra fat is because we don't use plain character distance only."""
145 return max(len(text) * (min_similarity/100.0), 1)
146
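    # Worked example (values hypothetical): for a text of length 10 and
    # min_similarity 75, getstoplength gives min(10 / 0.75, MAX_LENGTH)
    # = 13.3 and getstartlength gives max(10 * 0.75, 1) = 7.5, so only
    # candidates with source lengths roughly in [7.5, 13.3] are compared.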
148 """Returns a list of possible matches for given source text.
149
150 @type text: String
151 @param text: The text that will be search for in the translation memory
152 @rtype: list
153 @return: a list of units with the source and target strings from the
154 translation memory. If self.addpercentage is true (default) the match
155 quality is given as a percentage in the notes.
156 """
157 bestcandidates = [(0.0, None)]*self.MAX_CANDIDATES
158
159
160 min_similarity = self.MIN_SIMILARITY
161
162
163
164
165
166
167
168 startlength = self.getstartlength(min_similarity, text)
169 startindex = 0
170 endindex = len(self.candidates.units)
171 while startindex < endindex:
172 mid = (startindex + endindex) // 2
173 if sourcelen(self.candidates.units[mid]) < startlength:
174 startindex = mid + 1
175 else:
176 endindex = mid
177
178
179 stoplength = self.getstoplength(min_similarity, text)
180 lowestscore = 0
181
182 for candidate in self.candidates.units[startindex:]:
183 cmpstring = candidate.source
184 if len(cmpstring) > stoplength:
185 break
186 similarity = self.comparer.similarity(text, cmpstring, min_similarity)
187 if similarity < min_similarity:
188 continue
189 if similarity > lowestscore:
190 heapq.heapreplace(bestcandidates, (similarity, candidate))
191 lowestscore = bestcandidates[0][0]
192 if lowestscore >= 100:
193 break
194 if min_similarity < lowestscore:
195 min_similarity = lowestscore
196 stoplength = self.getstoplength(min_similarity, text)
197
198
199 def notzero(item):
200 score = item[0]
201 return score != 0
202 bestcandidates = filter(notzero, bestcandidates)
203
204 bestcandidates.sort(reverse=True)
205 return self.buildunits(bestcandidates)

    def buildunits(self, candidates):
        """Builds a list of units conforming to base API, with the score in the comment"""
        units = []
        for score, candidate in candidates:
            if hasattr(candidate, "orig_source"):
                candidate.source = candidate.orig_source
                candidate.target = candidate.orig_target
            newunit = po.pounit(candidate.source)
            newunit.target = candidate.target
            newunit.markfuzzy(candidate.fuzzy)
            candidatenotes = candidate.getnotes().strip()
            if candidatenotes:
                newunit.addnote(candidatenotes)
            if self.addpercentage:
                newunit.addnote("%d%%" % score)
            units.append(newunit)
        return units

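    # Sketch of the result (values hypothetical): an 80% match for "file"
    # comes back as a po unit with source "file", the stored target, and,
    # when self.addpercentage is true, a note reading "80%".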


# To catch words that change a little at the end, or that differ only in
# hyphenation and spacing, a few munging patterns are applied to each
# terminology source string. These are tied to English and should really
# be language dependent.
ignorepatterns = [
    (r"y\s*$", "ie"),
    (r"[\s-]+", ""),
    ("-", " "),
    (" ", "-"),
]
ignorepatterns_re = [(re.compile(a), b) for (a, b) in ignorepatterns]

context_re = re.compile(r"\s+\(.*\)\s*$")

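# For example (hypothetical terms), the pattern (r"y\s*$", "ie") turns the
# entry "category" into the extra candidate "categorie", which lets the text
# "categories" match it, and (r"[\s-]+", "") makes "down-time" and "downtime"
# interchangeable.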
244 """A matcher with settings specifically for terminology matching"""
245
246 sort_reverse = True
247
248 - def __init__(self, store, max_candidates=10, min_similarity=75, max_length=500, comparer=None):
249 if comparer is None:
250 comparer = terminology.TerminologyComparer(max_length)
251 matcher.__init__(self, store, max_candidates, min_similarity=10, max_length=max_length, comparer=comparer)
252 self.addpercentage = False
253 self.match_info = {}
254
256 """Normal initialisation, but convert all source strings to lower case"""
257 matcher.inittm(self, store)
258 extras = []
259 for unit in self.candidates.units:
260 source = unit.source = context_re.sub("", unit.source).lower()
261 for ignorepattern_re, replacement in ignorepatterns_re:
262 (newterm, occurrences) = ignorepattern_re.subn(replacement, source)
263 if occurrences:
264 new_unit = type(unit).buildfromunit(unit)
265 new_unit.source = newterm
266
267 unit.markfuzzy()
268 extras.append(new_unit)
269 self.candidates.units.sort(key=sourcelen, reverse=self.sort_reverse)
270 if extras:
271
272
273 self.extendtm(extras, sort=False)
274
    def getstartlength(self, min_similarity, text):
        # Let's not match terms of two characters or less; they give too
        # many false positives
        return 3

    def getstoplength(self, min_similarity, text):
        # Let's ignore terms with more than 50 characters; those are
        # probably full sentences rather than terms
        return 50

286 """Returns whether this translation unit is usable for terminology."""
287 if not unit.istranslated():
288 return False
289 l = len(context_re.sub("", unit.source))
290 return l <= self.MAX_LENGTH and l >= self.getstartlength(None, None)
291
293 """Normal matching after converting text to lower case. Then replace
294 with the original unit to retain comments, etc."""
295 text = text.lower()
296 comparer = self.comparer
297 comparer.match_info = {}
298 match_info = {}
299 matches = []
300 known = set()
301 for cand in self.candidates.units:
302 source = cand.source
303 if (source, cand.target) in known:
304 continue
305 if comparer.similarity(text, source, self.MIN_SIMILARITY):
306 match_info[source] = {'pos': comparer.match_info[source]['pos']}
307 matches.append(cand)
308 known.add((source, cand.target))
309
310 final_matches = []
311 lastend = 0
312 _sort_matches(matches, match_info)
313 for match in matches:
314 start_pos = match_info[match.source]['pos']
315 if start_pos < lastend:
316 continue
317 end = start_pos + len(match.source)
318
319 final_matches.append(match)
320
321
322 for m in matches:
323 if m is match:
324 continue
325 m_info = match_info[m.source]
326 m_end = m_info['pos']
327 if m_end > start_pos:
328
329 break
330 m_end += len(m.source)
331 if start_pos == m_info['pos'] and end == m_end:
332
333 final_matches.append(m)
334
335 lastend = end
336 if final_matches:
337 self.match_info = match_info
338 return final_matches
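    # Usage sketch (store name and text are hypothetical):
    #
    #   termmatcher = terminologymatcher(glossarystore)
    #   hits = termmatcher.matches("Open the file dialog")
    #
    # Each returned unit is a glossary entry found in the text; its starting
    # offset is available as termmatcher.match_info[unit.source]['pos'].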


348 """extracts match quality from po comments"""
349 quality = re.search('([0-9]+)%', comment)
350 if quality:
351 return quality.group(1)
352
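# Sketch: _parse_quality("75%") returns the string "75", while a comment
# without a percentage yields None.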