1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20 """reads a set of .po or .pot files to produce a pootle-terminology.pot
21
22 See: http://translate.sourceforge.net/wiki/toolkit/poterminology for examples and
23 usage instructions
24 """
25
26 from translate.lang import factory as lang_factory
27 from translate.misc import optrecurse
28 from translate.storage import po
29 from translate.storage import factory
30 from translate.misc import file_discovery
31 import os
32 import re
33 import sys
34
36 """a specialized Option Parser for the terminology tool..."""
37
38
# Matches C/Python-style format specifiers (e.g. %s, %(name)s, %1$d) so they
# can be stripped out of message text before term matching.
formatpat = re.compile(r"%(?:\([^)]+\)|[0-9]+\$)?[-+#0]*[0-9.*]*(?:[hlLzjt][hl])?[EFGXc-ginoprsux]")

# Matches XML/SGML markup: elements, end tags, comments/CDATA (<![...), and
# processing instructions (<?...).
xmlelpat = re.compile(r"<(?:![[-]|[/?]?[A-Za-z_:])[^>]*>")

# Matches XML character and entity references such as &amp; or &#160;.
# BUGFIX: the '-' must be the last character in the class to be literal;
# the previous ".-:" formed a character range ('.' through ':') that matched
# '/' but not '-' itself, so hyphenated entity names were never matched.
xmlentpat = re.compile(r"&(?:#(?:[0-9]+|x[0-9a-f]+)|[a-z_:][\w.:-]*);",
                       flags=re.UNICODE | re.IGNORECASE)

# Recognised --sort orders, listed in default priority order.
sortorders = ["frequency", "dictionary", "length"]

# Counters accumulated while processing input files.
files = 0
units = 0
50
52 """parses the command line options, handling implicit input/output args"""
53 (options, args) = optrecurse.optparse.OptionParser.parse_args(self, args, values)
54
55 if args and not options.input:
56 if not options.output and not options.update and len(args) > 1:
57 options.input = args[:-1]
58 args = args[-1:]
59 else:
60 options.input = args
61 args = []
62
63
64 if args and not options.output and not options.update:
65 if os.path.lexists(args[-1]) and not os.path.isdir(args[-1]):
66 self.error("To overwrite %s, specify it with -o/--output or -u/--update" % (args[-1]))
67 options.output = args[-1]
68 args = args[:-1]
69 if options.output and options.update:
70 self.error("You cannot use both -u/--update and -o/--output")
71 if args:
72 self.error("You have used an invalid combination of -i/--input, -o/--output, -u/--update and freestanding args")
73 if not options.input:
74 self.error("No input file or directory was specified")
75 if isinstance(options.input, list) and len(options.input) == 1:
76 options.input = options.input[0]
77 if options.inputmin == None:
78 options.inputmin = 1
79 elif not isinstance(options.input, list) and not os.path.isdir(options.input):
80 if options.inputmin == None:
81 options.inputmin = 1
82 elif options.inputmin == None:
83 options.inputmin = 2
84 if options.update:
85 options.output = options.update
86 if isinstance(options.input, list):
87 options.input.append(options.update)
88 elif options.input:
89 options.input = [options.input, options.update]
90 else:
91 options.input = options.update
92 if not options.output:
93 options.output = "pootle-terminology.pot"
94 return (options, args)
95
97 """sets the usage string - if usage not given, uses getusagestring for each option"""
98 if usage is None:
99 self.usage = "%prog " + " ".join([self.getusagestring(option) for option in self.option_list]) + \
100 "\n input directory is searched for PO files, terminology PO file is output file"
101 else:
102 super(TerminologyOptionParser, self).set_usage(usage)
103
111
113 """recurse through directories and process files"""
114 if self.isrecursive(options.input, 'input') and getattr(options, "allowrecursiveinput", True):
115 if isinstance(options.input, list):
116 inputfiles = self.recurseinputfilelist(options)
117 else:
118 inputfiles = self.recurseinputfiles(options)
119 else:
120 if options.input:
121 inputfiles = [os.path.basename(options.input)]
122 options.input = os.path.dirname(options.input)
123 else:
124 inputfiles = [options.input]
125 if os.path.isdir(options.output):
126 options.output = os.path.join(options.output,"pootle-terminology.pot")
127
128 if self.defaultstopfile:
129 parse_stopword_file(None, "-S", self.defaultstopfile, self)
130 self.glossary = {}
131 self.initprogressbar(inputfiles, options)
132 for inputpath in inputfiles:
133 self.files += 1
134 fullinputpath = self.getfullinputpath(options, inputpath)
135 success = True
136 try:
137 self.processfile(None, options, fullinputpath)
138 except Exception, error:
139 if isinstance(error, KeyboardInterrupt):
140 raise
141 self.warning("Error processing: input %s" % (fullinputpath), options, sys.exc_info())
142 success = False
143 self.reportprogress(inputpath, success)
144 del self.progressbar
145 self.outputterminology(options)
146
def clean(self, string, options):
    """Return *string* with accelerators, format specifiers and XML markup removed.

    Accelerator characters (from --accelerator) are deleted outright;
    format specifiers, XML elements and entity references are replaced
    by spaces; the result is stripped of surrounding whitespace.
    """
    for accelerator in options.accelchars:
        string = string.replace(accelerator, "")
    string = self.formatpat.sub(" ", string)
    string = self.xmlelpat.sub(" ", string)
    string = self.xmlentpat.sub(" ", string)
    return string.strip()
156
158 """return case-mapped stopword for input word"""
159 if self.stopignorecase or (self.stopfoldtitle and word.istitle()):
160 word = word.lower()
161 return word
162
def stopword(self, word, defaultset=frozenset()):
    """Return the stoplist action frozenset for *word* (``defaultset`` if absent)."""
    return self.stopwords.get(self.stopmap(word), defaultset)
166
def addphrases(self, words, skips, translation, partials=True):
    """Add (sub)phrases with non-skipwords and more than one word to the glossary.

    A phrase is recorded only when it contains more than one non-skip word
    (``len(words) > skips + 1``) and neither its first nor its last word is
    a skip word.  When ``partials`` is true, progressively shorter leading
    subphrases (down to two words) are recorded under the same rule.
    """
    if (len(words) > skips + 1 and
        'skip' not in self.stopword(words[0]) and
        'skip' not in self.stopword(words[-1])):
        self.glossary.setdefault(' '.join(words), []).append(translation)
    if partials:
        part = list(words)
        while len(part) > 2:
            # Dropping a trailing skip word reduces the skip count for the
            # remaining subphrase.
            if 'skip' in self.stopword(part.pop()):
                skips -= 1
            if (len(part) > skips + 1 and
                'skip' not in self.stopword(part[0]) and
                'skip' not in self.stopword(part[-1])):
                self.glossary.setdefault(' '.join(part), []).append(translation)
182
def processfile(self, fileprocessor, options, fullinputpath):
    """Process an individual file, accumulating term candidates in self.glossary."""
    inputfile = self.openinputfile(options, fullinputpath)
    inputfile = factory.getobject(inputfile)
    sourcelang = lang_factory.getlanguage(options.sourcelanguage)
    rematchignore = frozenset(('word', 'phrase'))
    defaultignore = frozenset()
    for unit in inputfile.units:
        self.units += 1
        if unit.isheader():
            continue
        if unit.hasplural():
            continue
        # --invert swaps which side is treated as the term source.
        if not options.invert:
            source = self.clean(unit.source, options)
            target = self.clean(unit.target, options)
        else:
            target = self.clean(unit.source, options)
            source = self.clean(unit.target, options)
        if len(source) <= 1:
            continue
        for sentence in sourcelang.sentences(source):
            words = []
            skips = 0
            for word in sourcelang.words(sentence):
                stword = self.stopmap(word)
                if options.ignorecase or (options.foldtitle and word.istitle()):
                    word = word.lower()
                # Determine which actions (word/phrase/skip) to suppress.
                ignore = defaultignore
                if stword in self.stopwords:
                    ignore = self.stopwords[stword]
                else:
                    for stopre in self.stoprelist:
                        if stopre.match(stword) is not None:
                            ignore = rematchignore
                            break
                translation = (source, target, unit, fullinputpath)
                if 'word' not in ignore:
                    # Reduce plural forms (trailing "s") to an already-seen root.
                    root = word
                    if len(word) > 3 and word[-1] == 's' and word[0:-1] in self.glossary:
                        root = word[0:-1]
                    elif len(root) > 2 and root + 's' in self.glossary:
                        self.glossary[root] = self.glossary.pop(root + 's')
                    self.glossary.setdefault(root, []).append(translation)
                if options.termlength > 1:
                    if 'phrase' in ignore:
                        # A phrase-breaking word: flush accumulated subphrases.
                        while len(words) > 2:
                            if 'skip' in self.stopword(words.pop(0)):
                                skips -= 1
                            self.addphrases(words, skips, translation)
                        words = []
                        skips = 0
                    else:
                        words.append(word)
                        if 'skip' in ignore:
                            skips += 1
                        if len(words) > options.termlength + skips:
                            # Window exceeded --term-words: trim from the front.
                            while len(words) > options.termlength + skips:
                                if 'skip' in self.stopword(words.pop(0)):
                                    skips -= 1
                            self.addphrases(words, skips, translation)
                        else:
                            self.addphrases(words, skips, translation, partials=False)
            if options.termlength > 1:
                # Flush phrases remaining at the end of the sentence.
                while options.termlength > 1 and len(words) > 2:
                    if 'skip' in self.stopword(words.pop(0)):
                        skips -= 1
                    self.addphrases(words, skips, translation)
255
257 """saves the generated terminology glossary"""
258 termfile = po.pofile()
259 terms = {}
260 locre = re.compile(r":[0-9]+$")
261 print >> sys.stderr, ("%d terms from %d units in %d files" %
262 (len(self.glossary), self.units, self.files))
263 for term, translations in self.glossary.iteritems():
264 if len(translations) <= 1:
265 continue
266 filecounts = {}
267 sources = {}
268 termunit = po.pounit(term)
269 locations = {}
270 sourcenotes = {}
271 transnotes = {}
272 targets = {}
273 fullmsg = False
274 for source, target, unit, filename in translations:
275 sources[source] = 1
276 filecounts[filename] = filecounts.setdefault(filename, 0) + 1
277 if term.lower() == self.clean(unit.source, options).lower():
278 fullmsg = True
279 target = self.clean(unit.target, options)
280 if options.ignorecase or (options.foldtitle and target.istitle()):
281 target = target.lower()
282 unit.settarget(target)
283 if target != "":
284 targets.setdefault(target, []).append(filename)
285 if term.lower() == unit.source.strip().lower():
286 sourcenotes[unit.getnotes("source code")] = None
287 transnotes[unit.getnotes("translator")] = None
288 else:
289 unit.settarget("")
290 unit.setsource(term)
291 termunit.merge(unit, overwrite=False, comments=False)
292 for loc in unit.getlocations():
293 locations.setdefault(locre.sub("", loc))
294 numsources = len(sources)
295 numfiles = len(filecounts)
296 numlocs = len(locations)
297 if numfiles < options.inputmin or numlocs < options.locmin:
298 continue
299 if fullmsg:
300 if numsources < options.fullmsgmin:
301 continue
302 elif numsources < options.substrmin:
303 continue
304 if len(targets.keys()) > 1:
305 txt = '; '.join(["%s {%s}" % (target, ', '.join(files))
306 for target, files in targets.iteritems()])
307 if termunit.gettarget().find('};') < 0:
308 termunit.settarget(txt)
309 termunit.markfuzzy()
310 else:
311
312 termunit.addnote(txt, "translator")
313 locmax = 2 * options.locmin
314 if numlocs > locmax:
315 for location in locations.keys()[0:locmax]:
316 termunit.addlocation(location)
317 termunit.addlocation("(poterminology) %d more locations"
318 % (numlocs - locmax))
319 else:
320 for location in locations.keys():
321 termunit.addlocation(location)
322 for sourcenote in sourcenotes.keys():
323 termunit.addnote(sourcenote, "source code")
324 for transnote in transnotes.keys():
325 termunit.addnote(transnote, "translator")
326 for filename, count in filecounts.iteritems():
327 termunit.othercomments.append("# (poterminology) %s (%d)\n" % (filename, count))
328 terms[term] = (((10 * numfiles) + numsources, termunit))
329
330 termlist = terms.keys()
331 print >> sys.stderr, "%d terms after thresholding" % len(termlist)
332 termlist.sort(lambda x, y: cmp(len(x), len(y)))
333 for term in termlist:
334 words = term.split()
335 if len(words) <= 2:
336 continue
337 while len(words) > 2:
338 words.pop()
339 if terms[term][0] == terms.get(' '.join(words), [0])[0]:
340 del terms[' '.join(words)]
341 words = term.split()
342 while len(words) > 2:
343 words.pop(0)
344 if terms[term][0] == terms.get(' '.join(words), [0])[0]:
345 del terms[' '.join(words)]
346 print >> sys.stderr, "%d terms after subphrase reduction" % len(terms.keys())
347 termitems = terms.values()
348 if options.sortorders == None:
349 options.sortorders = self.sortorders
350 while len(options.sortorders) > 0:
351 order = options.sortorders.pop()
352 if order == "frequency":
353 termitems.sort(lambda x, y: cmp(y[0], x[0]))
354 elif order == "dictionary":
355 termitems.sort(lambda x, y: cmp(x[1].source.lower(), y[1].source.lower()))
356 elif order == "length":
357 termitems.sort(lambda x, y: cmp(len(x[1].source), len(y[1].source)))
358 else:
359 self.warning("unknown sort order %s" % order, options)
360 for count, unit in termitems:
361 termfile.units.append(unit)
362 open(options.output, "w").write(str(termfile))
363
def fold_case_option(option, opt, value, parser, *args, **kwargs):
    """optparse callback for -F: fold "Title Case" words to lowercase."""
    parser.values.ignorecase = False
    parser.values.foldtitle = True
367
369 parser.values.ignorecase = parser.values.foldtitle = False
370
372
def parse_stopword_file(option, opt, value, parser, *args, **kwargs):
    """optparse callback for -S: read a stopword (term exclusion) list.

    Each line's first character selects the action set stored for the word:
    '+' none, ':' skip, '<' phrase, '=' word, '>' word+skip, '@' word+phrase.
    Lines starting with '!' set case-mapping modes, '/' adds an anchored
    regular expression, '#' and blank lines are ignored.  Any other leading
    character aborts the parse with a warning (remaining lines are ignored).
    """
    actions = {'+': frozenset(), ':': frozenset(['skip']),
               '<': frozenset(['phrase']), '=': frozenset(['word']),
               '>': frozenset(['word', 'skip']),
               '@': frozenset(['word', 'phrase'])}

    stopfile = open(value, "r")
    line = 0
    try:
        for stopline in stopfile:
            line += 1
            stoptype = stopline[0]
            if stoptype == '#' or stoptype == "\n":
                continue
            elif stoptype == '!':
                # Case-mapping directives: !C preserve, !F fold title, !I ignore.
                if stopline[1] == 'C':
                    parser.stopfoldtitle = False
                    parser.stopignorecase = False
                elif stopline[1] == 'F':
                    parser.stopfoldtitle = True
                    parser.stopignorecase = False
                elif stopline[1] == 'I':
                    parser.stopignorecase = True
                else:
                    parser.warning("%s line %d - bad case mapping directive" % (value, line), parser.values, ("", stopline[:2]))
            elif stoptype == '/':
                # Regular-expression stopword, anchored at the end.
                parser.stoprelist.append(re.compile(stopline[1:-1] + '$'))
            else:
                parser.stopwords[stopline[1:-1]] = actions[stoptype]
    except KeyError:
        parser.warning("%s line %d - bad stopword entry starts with" % (value, line), parser.values, sys.exc_info())
        parser.warning("%s line %d" % (value, line + 1), parser.values, ("", "all lines after error ignored"))
    finally:
        # Close the file even if a warning handler raises.
        stopfile.close()
    # A user-supplied list replaces the default stoplist.
    parser.defaultstopfile = None
406
408 formats = {"po":("po", None), "pot": ("pot", None), None:("po", None)}
409 parser = TerminologyOptionParser(formats)
410
411 parser.add_option("-u", "--update", type="string", dest="update",
412 metavar="UPDATEFILE", help="update terminology in UPDATEFILE")
413
414 parser.stopwords = {}
415 parser.stoprelist = []
416 parser.stopfoldtitle = True
417 parser.stopignorecase = False
418 parser.defaultstopfile = file_discovery.get_abs_data_filename('stoplist-en')
419 parser.add_option("-S", "--stopword-list", type="string", metavar="STOPFILE",
420 action="callback", callback=parse_stopword_file,
421 help="read stopword (term exclusion) list from STOPFILE (default %s)" % parser.defaultstopfile,
422 default=parser.defaultstopfile)
423
424 parser.set_defaults(foldtitle = True, ignorecase = False)
425 parser.add_option("-F", "--fold-titlecase", callback=fold_case_option,
426 action="callback", help="fold \"Title Case\" to lowercase (default)")
427 parser.add_option("-C", "--preserve-case", callback=preserve_case_option,
428 action="callback", help="preserve all uppercase/lowercase")
429 parser.add_option("-I", "--ignore-case", dest="ignorecase",
430 action="store_true", help="make all terms lowercase")
431
432 parser.add_option("", "--accelerator", dest="accelchars", default="",
433 metavar="ACCELERATORS", help="ignores the given accelerator characters when matching")
434
435 parser.add_option("-t", "--term-words", type="int", dest="termlength", default="3",
436 help="generate terms of up to LENGTH words (default 3)", metavar="LENGTH")
437 parser.add_option("", "--inputs-needed", type="int", dest="inputmin",
438 help="omit terms appearing in less than MIN input files (default 2, or 1 if only one input file)", metavar="MIN")
439 parser.add_option("", "--fullmsg-needed", type="int", dest="fullmsgmin", default="1",
440 help="omit full message terms appearing in less than MIN different messages (default 1)", metavar="MIN")
441 parser.add_option("", "--substr-needed", type="int", dest="substrmin", default="2",
442 help="omit substring-only terms appearing in less than MIN different messages (default 2)", metavar="MIN")
443 parser.add_option("", "--locs-needed", type="int", dest="locmin", default="2",
444 help="omit terms appearing in less than MIN different original source files (default 2)", metavar="MIN")
445
446 parser.add_option("", "--sort", dest="sortorders", action="append",
447 type="choice", choices=parser.sortorders, metavar="ORDER",
448 help="output sort order(s): %s (default is all orders in the above priority)" % ', '.join(parser.sortorders))
449
450 parser.add_option("", "--source-language", dest="sourcelanguage", default="en",
451 help="the source language code (default 'en')", metavar="LANG")
452 parser.add_option("-v", "--invert", dest="invert",
453 action="store_true", default=False, help="invert the source and target languages for terminology")
454 parser.set_usage()
455 parser.description = __doc__
456 parser.run()
457
458
# Command-line entry point when the module is run directly.
if __name__ == '__main__':
    main()
461