Package translate :: Package tools :: Module poterminology
[hide private]
[frames] | [no frames]

Source Code for Module translate.tools.poterminology

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  #  
  4  # This file is part of translate. 
  5  # 
  6  # translate is free software; you can redistribute it and/or modify 
  7  # it under the terms of the GNU General Public License as published by 
  8  # the Free Software Foundation; either version 2 of the License, or 
  9  # (at your option) any later version. 
 10  #  
 11  # translate is distributed in the hope that it will be useful, 
 12  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 13  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 14  # GNU General Public License for more details. 
 15  # 
 16  # You should have received a copy of the GNU General Public License 
 17  # along with translate; if not, write to the Free Software 
 18  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 19   
 20  """reads a set of .po or .pot files to produce a pootle-terminology.pot 
 21   
 22  See: http://translate.sourceforge.net/wiki/toolkit/poterminology for examples and 
 23  usage instructions 
 24  """ 
 25   
 26  from translate.lang import factory as lang_factory 
 27  from translate.misc import optrecurse 
 28  from translate.storage import po 
 29  from translate.storage import factory 
 30  from translate.misc import file_discovery 
 31  import os 
 32  import re 
 33  import sys 
 34   
class TerminologyOptionParser(optrecurse.RecursiveOptionParser):
    """a specialized Option Parser for the terminology tool..."""

    # handles c-format and python-format
    formatpat = re.compile(r"%(?:\([^)]+\)|[0-9]+\$)?[-+#0]*[0-9.*]*(?:[hlLzjt][hl])?[EFGXc-ginoprsux]")
    # handles XML/HTML elements (<foo>text</foo> => text)
    xmlelpat = re.compile(r"<(?:![[-]|[/?]?[A-Za-z_:])[^>]*>")
    # handles XML/HTML entities (&#32; &#x20; &amp; &my_entity;)
    # FIX: the name part previously used the class [\w.-:], where ".-:" is a
    # character *range* (codepoints 46-58, i.e. "./0123456789:"), so it
    # accidentally matched "/" and never matched a literal hyphen.  The
    # hyphen is now placed last in the class so it is literal, allowing
    # entity names such as &my-entity; and no longer matching "/".
    xmlentpat = re.compile(r"&(?:#(?:[0-9]+|x[0-9a-f]+)|[a-z_:][\w.:-]*);",
                           flags=re.UNICODE|re.IGNORECASE)

    # recognized values for the --sort option, in default priority order
    sortorders = [ "frequency", "dictionary", "length" ]

    # running totals, incremented in recursiveprocess()/processfile()
    files = 0
    units = 0
51 - def parse_args(self, args=None, values=None):
52 """parses the command line options, handling implicit input/output args""" 53 (options, args) = optrecurse.optparse.OptionParser.parse_args(self, args, values) 54 # some intelligence as to what reasonable people might give on the command line 55 if args and not options.input: 56 if not options.output and not options.update and len(args) > 1: 57 options.input = args[:-1] 58 args = args[-1:] 59 else: 60 options.input = args 61 args = [] 62 # don't overwrite last freestanding argument file, to avoid accidents 63 # due to shell wildcard expansion 64 if args and not options.output and not options.update: 65 if os.path.lexists(args[-1]) and not os.path.isdir(args[-1]): 66 self.error("To overwrite %s, specify it with -o/--output or -u/--update" % (args[-1])) 67 options.output = args[-1] 68 args = args[:-1] 69 if options.output and options.update: 70 self.error("You cannot use both -u/--update and -o/--output") 71 if args: 72 self.error("You have used an invalid combination of -i/--input, -o/--output, -u/--update and freestanding args") 73 if not options.input: 74 self.error("No input file or directory was specified") 75 if isinstance(options.input, list) and len(options.input) == 1: 76 options.input = options.input[0] 77 if options.inputmin == None: 78 options.inputmin = 1 79 elif not isinstance(options.input, list) and not os.path.isdir(options.input): 80 if options.inputmin == None: 81 options.inputmin = 1 82 elif options.inputmin == None: 83 options.inputmin = 2 84 if options.update: 85 options.output = options.update 86 if isinstance(options.input, list): 87 options.input.append(options.update) 88 elif options.input: 89 options.input = [options.input, options.update] 90 else: 91 options.input = options.update 92 if not options.output: 93 options.output = "pootle-terminology.pot" 94 return (options, args)
95
96 - def set_usage(self, usage=None):
97 """sets the usage string - if usage not given, uses getusagestring for each option""" 98 if usage is None: 99 self.usage = "%prog " + " ".join([self.getusagestring(option) for option in self.option_list]) + \ 100 "\n input directory is searched for PO files, terminology PO file is output file" 101 else: 102 super(TerminologyOptionParser, self).set_usage(usage)
103
104 - def run(self):
105 """parses the arguments, and runs recursiveprocess with the resulting options""" 106 (options, args) = self.parse_args() 107 options.inputformats = self.inputformats 108 options.outputoptions = self.outputoptions 109 self.usepsyco(options) 110 self.recursiveprocess(options)
111
    def recursiveprocess(self, options):
        """recurse through directories and process files"""
        # build the list of input files: recurse into directories, otherwise
        # split a single file path into (dirname, basename)
        if self.isrecursive(options.input, 'input') and getattr(options, "allowrecursiveinput", True):
            if isinstance(options.input, list):
                inputfiles = self.recurseinputfilelist(options)
            else:
                inputfiles = self.recurseinputfiles(options)
        else:
            if options.input:
                inputfiles = [os.path.basename(options.input)]
                options.input = os.path.dirname(options.input)
            else:
                inputfiles = [options.input]
        # if output names a directory, write the default file name inside it
        if os.path.isdir(options.output):
            options.output = os.path.join(options.output,"pootle-terminology.pot")
        # load default stopfile if no -S options were given
        # (parse_stopword_file clears self.defaultstopfile once any list is read)
        if self.defaultstopfile:
            parse_stopword_file(None, "-S", self.defaultstopfile, self)
        # glossary maps term text -> list of (source, target, unit, filename)
        self.glossary = {}
        self.initprogressbar(inputfiles, options)
        for inputpath in inputfiles:
            self.files += 1
            fullinputpath = self.getfullinputpath(options, inputpath)
            success = True
            try:
                self.processfile(None, options, fullinputpath)
            except Exception, error:
                # KeyboardInterrupt must abort the whole run, not be logged
                if isinstance(error, KeyboardInterrupt):
                    raise
                self.warning("Error processing: input %s" % (fullinputpath), options, sys.exc_info())
                success = False
            self.reportprogress(inputpath, success)
        del self.progressbar
        # all inputs processed: emit the aggregated terminology file
        self.outputterminology(options)
146
147 - def clean(self, string, options):
148 """returns the cleaned string that contains the text to be matched""" 149 for accelerator in options.accelchars: 150 string = string.replace(accelerator, "") 151 string = self.formatpat.sub(" ", string) 152 string = self.xmlelpat.sub(" ", string) 153 string = self.xmlentpat.sub(" ", string) 154 string = string.strip() 155 return string
156
157 - def stopmap(self, word):
158 """return case-mapped stopword for input word""" 159 if self.stopignorecase or (self.stopfoldtitle and word.istitle()): 160 word = word.lower() 161 return word
162
163 - def stopword(self, word, defaultset=frozenset()):
164 """return stoplist frozenset for input word""" 165 return self.stopwords.get(self.stopmap(word),defaultset)
166
167 - def addphrases(self, words, skips, translation, partials=True):
168 """adds (sub)phrases with non-skipwords and more than one word""" 169 if (len(words) > skips + 1 and 170 'skip' not in self.stopword(words[0]) and 171 'skip' not in self.stopword(words[-1])): 172 self.glossary.setdefault(' '.join(words), []).append(translation) 173 if partials: 174 part = list(words) 175 while len(part) > 2: 176 if 'skip' in self.stopword(part.pop()): 177 skips -= 1 178 if (len(part) > skips + 1 and 179 'skip' not in self.stopword(part[0]) and 180 'skip' not in self.stopword(part[-1])): 181 self.glossary.setdefault(' '.join(part), []).append(translation)
182
    def processfile(self, fileprocessor, options, fullinputpath):
        """process an individual file

        Extracts candidate terms (single words with plural reduction, and
        multi-word phrases up to options.termlength words) from every
        translatable unit and accumulates them into self.glossary.
        fileprocessor is unused (kept for the recursive-parser call shape).
        """
        inputfile = self.openinputfile(options, fullinputpath)
        inputfile = factory.getobject(inputfile)
        sourcelang = lang_factory.getlanguage(options.sourcelanguage)
        # stop-regex matches disallow both single-word and phrase use
        rematchignore = frozenset(('word','phrase'))
        defaultignore = frozenset()
        for unit in inputfile.units:
            self.units += 1
            if unit.isheader():
                continue
            if unit.hasplural():
                continue
            # -v/--invert swaps which side terms are extracted from
            if not options.invert:
                source = self.clean(unit.source, options)
                target = self.clean(unit.target, options)
            else:
                target = self.clean(unit.source, options)
                source = self.clean(unit.target, options)
            if len(source) <= 1:
                continue
            for sentence in sourcelang.sentences(source):
                # sliding window of candidate phrase words and the number of
                # skip-words currently inside it
                words = []
                skips = 0
                for word in sourcelang.words(sentence):
                    # stopword lookup uses the stoplist's own case mapping,
                    # independent of the output case options below
                    stword = self.stopmap(word)
                    if options.ignorecase or (options.foldtitle and word.istitle()):
                        word = word.lower()
                    ignore = defaultignore
                    if stword in self.stopwords:
                        ignore = self.stopwords[stword]
                    else:
                        for stopre in self.stoprelist:
                            if stopre.match(stword) != None:
                                ignore = rematchignore
                                break
                    translation = (source, target, unit, fullinputpath)
                    if 'word' not in ignore:
                        # reduce plurals
                        root = word
                        if len(word) > 3 and word[-1] == 's' and word[0:-1] in self.glossary:
                            root = word[0:-1]
                        elif len(root) > 2 and root + 's' in self.glossary:
                            # singular seen after plural: fold plural entries in
                            self.glossary[root] = self.glossary.pop(root + 's')
                        self.glossary.setdefault(root, []).append(translation)
                    if options.termlength > 1:
                        if 'phrase' in ignore:
                            # add trailing phrases in previous words
                            while len(words) > 2:
                                if 'skip' in self.stopword(words.pop(0)):
                                    skips -= 1
                                self.addphrases(words, skips, translation)
                            words = []
                            skips = 0
                        else:
                            words.append(word)
                            if 'skip' in ignore:
                                skips += 1
                            # keep the window at no more than termlength
                            # non-skip words, emitting phrases as it slides
                            if len(words) > options.termlength + skips:
                                while len(words) > options.termlength + skips:
                                    if 'skip' in self.stopword(words.pop(0)):
                                        skips -= 1
                                    self.addphrases(words, skips, translation)
                            else:
                                self.addphrases(words, skips, translation, partials=False)
                if options.termlength > 1:
                    # add trailing phrases in sentence after reaching end
                    # (the termlength check here repeats the guard above)
                    while options.termlength > 1 and len(words) > 2:
                        if 'skip' in self.stopword(words.pop(0)):
                            skips -= 1
                        self.addphrases(words, skips, translation)
255
    def outputterminology(self, options):
        """saves the generated terminology glossary

        Applies the frequency thresholds (--inputs-needed, --locs-needed,
        --fullmsg-needed, --substr-needed), removes redundant subphrases,
        sorts per --sort and writes the resulting PO file to options.output.
        """
        termfile = po.pofile()
        terms = {}
        # strips trailing ":linenumber" so locations count source files
        locre = re.compile(r":[0-9]+$")
        print >> sys.stderr, ("%d terms from %d units in %d files" %
                              (len(self.glossary), self.units, self.files))
        for term, translations in self.glossary.iteritems():
            # a term seen only once can never pass the thresholds below
            if len(translations) <= 1:
                continue
            filecounts = {}
            sources = {}
            termunit = po.pounit(term)
            # dicts used as ordered-ish sets (values are None / placeholders)
            locations = {}
            sourcenotes = {}
            transnotes = {}
            targets = {}
            fullmsg = False
            for source, target, unit, filename in translations:
                sources[source] = 1
                filecounts[filename] = filecounts.setdefault(filename, 0) + 1
                # full-message match: the term is the entire (cleaned) source
                if term.lower() == self.clean(unit.source, options).lower():
                    fullmsg = True
                    target = self.clean(unit.target, options)
                    if options.ignorecase or (options.foldtitle and target.istitle()):
                        target = target.lower()
                    unit.settarget(target)
                    if target != "":
                        targets.setdefault(target, []).append(filename)
                    # only keep notes when the raw source matches too
                    if term.lower() == unit.source.strip().lower():
                        sourcenotes[unit.getnotes("source code")] = None
                        transnotes[unit.getnotes("translator")] = None
                else:
                    # substring match: contribute locations but no target
                    unit.settarget("")
                unit.setsource(term)
                termunit.merge(unit, overwrite=False, comments=False)
                for loc in unit.getlocations():
                    locations.setdefault(locre.sub("", loc))
            numsources = len(sources)
            numfiles = len(filecounts)
            numlocs = len(locations)
            if numfiles < options.inputmin or numlocs < options.locmin:
                continue
            if fullmsg:
                if numsources < options.fullmsgmin:
                    continue
            elif numsources < options.substrmin:
                continue
            if len(targets.keys()) > 1:
                # several distinct translations: annotate each with the
                # files it came from and mark the unit fuzzy
                txt = '; '.join(["%s {%s}" % (target, ', '.join(files))
                                 for target, files in targets.iteritems()])
                if termunit.gettarget().find('};') < 0:
                    termunit.settarget(txt)
                    termunit.markfuzzy()
                else:
                    # if annotated multiple terms already present, keep as-is
                    termunit.addnote(txt, "translator")
            # cap emitted locations at twice the --locs-needed threshold
            locmax = 2 * options.locmin
            if numlocs > locmax:
                for location in locations.keys()[0:locmax]:
                    termunit.addlocation(location)
                termunit.addlocation("(poterminology) %d more locations"
                                     % (numlocs - locmax))
            else:
                for location in locations.keys():
                    termunit.addlocation(location)
            for sourcenote in sourcenotes.keys():
                termunit.addnote(sourcenote, "source code")
            for transnote in transnotes.keys():
                termunit.addnote(transnote, "translator")
            for filename, count in filecounts.iteritems():
                termunit.othercomments.append("# (poterminology) %s (%d)\n" % (filename, count))
            # score = 10 * input-file count + distinct-source count
            terms[term] = (((10 * numfiles) + numsources, termunit))
        # reduce subphrase
        termlist = terms.keys()
        print >> sys.stderr, "%d terms after thresholding" % len(termlist)
        # shortest first, so deleted subphrases were already processed
        termlist.sort(lambda x, y: cmp(len(x), len(y)))
        for term in termlist:
            words = term.split()
            if len(words) <= 2:
                continue
            # drop trailing-subphrase entries with the same score ...
            while len(words) > 2:
                words.pop()
                if terms[term][0] == terms.get(' '.join(words), [0])[0]:
                    del terms[' '.join(words)]
            # ... and likewise leading-subphrase entries
            words = term.split()
            while len(words) > 2:
                words.pop(0)
                if terms[term][0] == terms.get(' '.join(words), [0])[0]:
                    del terms[' '.join(words)]
        print >> sys.stderr, "%d terms after subphrase reduction" % len(terms.keys())
        termitems = terms.values()
        if options.sortorders == None:
            options.sortorders = self.sortorders
        # apply sort orders lowest-priority first (pop from the end), so the
        # first listed order ends up as the primary (stable sorts)
        while len(options.sortorders) > 0:
            order = options.sortorders.pop()
            if order == "frequency":
                termitems.sort(lambda x, y: cmp(y[0], x[0]))
            elif order == "dictionary":
                termitems.sort(lambda x, y: cmp(x[1].source.lower(), y[1].source.lower()))
            elif order == "length":
                termitems.sort(lambda x, y: cmp(len(x[1].source), len(y[1].source)))
            else:
                self.warning("unknown sort order %s" % order, options)
        for count, unit in termitems:
            termfile.units.append(unit)
        open(options.output, "w").write(str(termfile))
363
def fold_case_option(option, opt_str, value, parser):
    """-F/--fold-titlecase callback: lowercase Title Case terms only"""
    values = parser.values
    values.ignorecase = False
    values.foldtitle = True
367
def preserve_case_option(option, opt_str, value, parser):
    """-C/--preserve-case callback: disable all case folding"""
    parser.values.foldtitle = False
    parser.values.ignorecase = False
370
def parse_stopword_file(option, opt_str, value, parser):
    """-S/--stopword-list callback: load a stopword list into the parser

    Each line's first character selects the action: '#' comment, '!' case
    directive (!C preserve / !F fold titlecase / !I ignore case), '/' a
    regular expression (anchored at the end), otherwise one of the action
    keys below followed by the stopword itself.  State is stored on the
    parser object (stopwords, stoprelist, stopfoldtitle, stopignorecase).
    """

    # first-character -> set of uses to suppress for that word
    actions = { '+': frozenset(), ':': frozenset(['skip']),
                '<': frozenset(['phrase']), '=': frozenset(['word']),
                '>': frozenset(['word','skip']),
                '@': frozenset(['word','phrase']) }

    stopfile = open(value, "r")
    line = 0
    try:
        for stopline in stopfile:
            line += 1
            stoptype = stopline[0]
            if stoptype == '#' or stoptype == "\n":
                continue
            elif stoptype == '!':
                if stopline[1] == 'C':
                    parser.stopfoldtitle = False
                    parser.stopignorecase = False
                elif stopline[1] == 'F':
                    parser.stopfoldtitle = True
                    parser.stopignorecase = False
                elif stopline[1] == 'I':
                    parser.stopignorecase = True
                else:
                    parser.warning("%s line %d - bad case mapping directive" % (value, line), parser.values, ("", stopline[:2]))
            elif stoptype == '/':
                # NOTE: [1:-1] assumes the line ends with a newline; a final
                # line without one would lose its last character
                parser.stoprelist.append(re.compile(stopline[1:-1]+'$'))
            else:
                parser.stopwords[stopline[1:-1]] = actions[stoptype]
    except KeyError, character:
        # unknown action character: warn and ignore the rest of the file
        parser.warning("%s line %d - bad stopword entry starts with" % (value, line), parser.values, sys.exc_info())
        parser.warning("%s line %d" % (value, line + 1), parser.values, ("", "all lines after error ignored" ))
    stopfile.close()
    # mark the default stoplist as superseded (see recursiveprocess)
    parser.defaultstopfile = None
406
def main():
    """build the option parser, register all options and run the tool"""
    # recognized input/output format pairs (extension -> (format, processor))
    formats = {"po":("po", None), "pot": ("pot", None), None:("po", None)}
    parser = TerminologyOptionParser(formats)

    # update mode: terminology is merged back into an existing file
    parser.add_option("-u", "--update", type="string", dest="update",
                      metavar="UPDATEFILE", help="update terminology in UPDATEFILE")

    # stopword state lives on the parser itself; -S is a callback option
    parser.stopwords = {}
    parser.stoprelist = []
    parser.stopfoldtitle = True
    parser.stopignorecase = False
    parser.defaultstopfile = file_discovery.get_abs_data_filename('stoplist-en')
    parser.add_option("-S", "--stopword-list", type="string", metavar="STOPFILE",
                      action="callback", callback=parse_stopword_file,
                      help="read stopword (term exclusion) list from STOPFILE (default %s)" % parser.defaultstopfile,
                      default=parser.defaultstopfile)

    # case handling: -F (default), -C and -I are mutually exclusive styles
    parser.set_defaults(foldtitle = True, ignorecase = False)
    parser.add_option("-F", "--fold-titlecase", callback=fold_case_option,
                      action="callback", help="fold \"Title Case\" to lowercase (default)")
    parser.add_option("-C", "--preserve-case", callback=preserve_case_option,
                      action="callback", help="preserve all uppercase/lowercase")
    parser.add_option("-I", "--ignore-case", dest="ignorecase",
                      action="store_true", help="make all terms lowercase")

    parser.add_option("", "--accelerator", dest="accelchars", default="",
                      metavar="ACCELERATORS", help="ignores the given accelerator characters when matching")

    # thresholds (string defaults are converted by optparse's type machinery)
    parser.add_option("-t", "--term-words", type="int", dest="termlength", default="3",
                      help="generate terms of up to LENGTH words (default 3)", metavar="LENGTH")
    parser.add_option("", "--inputs-needed", type="int", dest="inputmin",
                      help="omit terms appearing in less than MIN input files (default 2, or 1 if only one input file)", metavar="MIN")
    parser.add_option("", "--fullmsg-needed", type="int", dest="fullmsgmin", default="1",
                      help="omit full message terms appearing in less than MIN different messages (default 1)", metavar="MIN")
    parser.add_option("", "--substr-needed", type="int", dest="substrmin", default="2",
                      help="omit substring-only terms appearing in less than MIN different messages (default 2)", metavar="MIN")
    parser.add_option("", "--locs-needed", type="int", dest="locmin", default="2",
                      help="omit terms appearing in less than MIN different original source files (default 2)", metavar="MIN")

    parser.add_option("", "--sort", dest="sortorders", action="append",
                      type="choice", choices=parser.sortorders, metavar="ORDER",
                      help="output sort order(s): %s (default is all orders in the above priority)" % ', '.join(parser.sortorders))

    parser.add_option("", "--source-language", dest="sourcelanguage", default="en",
                      help="the source language code (default 'en')", metavar="LANG")
    parser.add_option("-v", "--invert", dest="invert",
                      action="store_true", default=False, help="invert the source and target languages for terminology")
    parser.set_usage()
    parser.description = __doc__
    parser.run()


if __name__ == '__main__':
    main()