
Source Code for Module translate.tools.pogrep

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright 2002-2008 Zuza Software Foundation
#
# This file is part of translate.
#
# translate is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# translate is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with translate; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

"""Grep XLIFF, Gettext PO and TMX localization files.

Matches are output to snippet files of the same type, which can then be
reviewed and later merged back using pomerge.

See http://translate.sourceforge.net/wiki/toolkit/pogrep for examples and
usage instructions.
"""
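
# Example invocation (an illustrative sketch based on the options defined in
# cmdlineparser() below; see the wiki page above for authoritative usage):
#
#     pogrep --search=source --ignore-case "file chooser" input.po matches.po
#
# The matches.po snippet file contains only the units that matched and can
# later be merged back with pomerge.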

from translate.storage import factory
from translate.misc import optrecurse
from translate.misc.multistring import multistring
from translate.lang import data
import re
import locale

class GrepMatch(object):
    """Just a small data structure that represents a search match."""

    # INITIALIZERS #
    def __init__(self, unit, part='target', part_n=0, start=0, end=0):
        self.unit = unit
        self.part = part
        self.part_n = part_n
        self.start = start
        self.end = end

    # ACCESSORS #
    def get_getter(self):
        if self.part == 'target':
            if self.unit.hasplural():
                getter = lambda: self.unit.target.strings[self.part_n]
            else:
                getter = lambda: self.unit.target
            return getter
        elif self.part == 'source':
            if self.unit.hasplural():
                getter = lambda: self.unit.source.strings[self.part_n]
            else:
                getter = lambda: self.unit.source
            return getter
        elif self.part == 'notes':
            def getter():
                return self.unit.getnotes()[self.part_n]
            return getter
        elif self.part == 'locations':
            def getter():
                return self.unit.getlocations()[self.part_n]
            return getter

    def get_setter(self):
        if self.part == 'target':
            if self.unit.hasplural():
                def setter(value):
                    strings = self.unit.target.strings
                    strings[self.part_n] = value
                    self.unit.target = strings
            else:
                def setter(value):
                    self.unit.target = value
            return setter

    # SPECIAL METHODS #
    def __str__(self):
        start, end = self.start, self.end
        if start < 3:
            start = 3
        if end > len(self.get_getter()()) - 3:
            end = len(self.get_getter()()) - 3
        matchpart = self.get_getter()()[start-2:end+2]
        return '<GrepMatch "%s" part=%s[%d] start=%d end=%d>' % (matchpart, self.part, self.part_n, self.start, self.end)

    def __repr__(self):
        return str(self)

def real_index(string, nfc_index):
    """Calculate the real index in the unnormalized string that corresponds to
    the index nfc_index in the normalized string."""
    length = nfc_index
    max_length = len(string)
    while len(data.normalize(string[:length])) <= nfc_index:
        if length == max_length:
            return length
        length += 1
    return length - 1
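
# Worked example (illustrative; assumes data.normalize applies NFC
# normalization): in u"a\u0301bc" the first two code points compose to
# u"\u00e1", so the normalized form is u"\u00e1bc".  Index 1 in the normalized
# string points at "b", and real_index(u"a\u0301bc", 1) returns 2, the index
# of "b" in the unnormalized string.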


def find_matches(unit, part, strings, re_search):
    """Return a list of GrepMatch objects for the places where re_search
    matches in strings."""
    matches = []
    part_n = 0
    for string in strings:
        normalized = data.normalize(string)
        for matchobj in re_search.finditer(normalized):
            start = real_index(string, matchobj.start())
            end = real_index(string, matchobj.end())
            matches.append(GrepMatch(unit, part=part, part_n=part_n, start=start, end=end))
        part_n += 1
    return matches

class GrepFilter:

    def __init__(self, searchstring, searchparts, ignorecase=False, useregexp=False,
                 invertmatch=False, accelchar=None, encoding='utf-8', includeheader=False,
                 max_matches=0):
        """builds a grepping filter with the given search options"""
        if isinstance(searchstring, unicode):
            self.searchstring = searchstring
        else:
            self.searchstring = searchstring.decode(encoding)
        self.searchstring = data.normalize(self.searchstring)
        if searchparts:
            # For now we still support the old terminology, except for the old
            # 'source' which has a new meaning now.
            self.search_source = ('source' in searchparts) or ('msgid' in searchparts)
            self.search_target = ('target' in searchparts) or ('msgstr' in searchparts)
            self.search_notes = ('notes' in searchparts) or ('comment' in searchparts)
            self.search_locations = 'locations' in searchparts
        else:
            self.search_source = True
            self.search_target = True
            self.search_notes = False
            self.search_locations = False
        self.ignorecase = ignorecase
        if self.ignorecase:
            self.searchstring = self.searchstring.lower()
        self.useregexp = useregexp
        if self.useregexp:
            self.searchpattern = re.compile(self.searchstring)
        self.invertmatch = invertmatch
        self.accelchar = accelchar
        self.includeheader = includeheader
        self.max_matches = max_matches

    def matches(self, teststr):
        if teststr is None:
            return False
        teststr = data.normalize(teststr)
        if self.ignorecase:
            teststr = teststr.lower()
        if self.accelchar:
            teststr = re.sub(self.accelchar + self.accelchar, "#", teststr)
            teststr = re.sub(self.accelchar, "", teststr)
        if self.useregexp:
            found = self.searchpattern.search(teststr)
        else:
            found = teststr.find(self.searchstring) != -1
        if self.invertmatch:
            found = not found
        return found

    def filterunit(self, unit):
        """runs filters on an element"""
        if unit.isheader():
            return []

        if self.search_source:
            if isinstance(unit.source, multistring):
                strings = unit.source.strings
            else:
                strings = [unit.source]
            for string in strings:
                if self.matches(string):
                    return True

        if self.search_target:
            if isinstance(unit.target, multistring):
                strings = unit.target.strings
            else:
                strings = [unit.target]
            for string in strings:
                if self.matches(string):
                    return True

        if self.search_notes:
            if self.matches(unit.getnotes()):
                return True
        if self.search_locations:
            if self.matches(u" ".join(unit.getlocations())):
                return True
        return False

    def filterfile(self, thefile):
        """runs filters on a translation file object"""
        thenewfile = type(thefile)()
        thenewfile.setsourcelanguage(thefile.sourcelanguage)
        thenewfile.settargetlanguage(thefile.targetlanguage)
        for unit in thefile.units:
            if self.filterunit(unit):
                thenewfile.addunit(unit)
        if self.includeheader and len(thenewfile.units) > 0:
            if thefile.units[0].isheader():
                thenewfile.units.insert(0, thefile.units[0])
            else:
                thenewfile.units.insert(0, thenewfile.makeheader())
        return thenewfile

    def getmatches(self, units):
        if not self.searchstring:
            return [], []

        searchstring = self.searchstring
        flags = re.LOCALE | re.MULTILINE | re.UNICODE

        if self.ignorecase:
            flags |= re.IGNORECASE
        if not self.useregexp:
            searchstring = re.escape(searchstring)
        self.re_search = re.compile(u'(%s)' % (searchstring), flags)

        matches = []
        indexes = []

        for index, unit in enumerate(units):
            old_length = len(matches)

            if self.search_target:
                if unit.hasplural():
                    targets = unit.target.strings
                else:
                    targets = [unit.target]
                matches.extend(find_matches(unit, 'target', targets, self.re_search))
            if self.search_source:
                if unit.hasplural():
                    sources = unit.source.strings
                else:
                    sources = [unit.source]
                matches.extend(find_matches(unit, 'source', sources, self.re_search))
            if self.search_notes:
                matches.extend(find_matches(unit, 'notes', unit.getnotes(), self.re_search))
            if self.search_locations:
                matches.extend(find_matches(unit, 'locations', unit.getlocations(), self.re_search))

            # A search for a single letter or an all-inclusive regular
            # expression could give enough results to cause performance
            # problems. The answer is probably not very useful at this scale.
            if self.max_matches and len(matches) > self.max_matches:
                raise Exception("Too many matches found")

            if len(matches) > old_length:
                old_length = len(matches)
                indexes.append(index)

        return matches, indexes

class GrepOptionParser(optrecurse.RecursiveOptionParser):
    """a specialized Option Parser for the grep tool..."""

    def parse_args(self, args=None, values=None):
        """parses the command line options, handling implicit input/output args"""
        (options, args) = optrecurse.optparse.OptionParser.parse_args(self, args, values)
        # some intelligence as to what reasonable people might give on the command line
        if args:
            options.searchstring = args[0]
            args = args[1:]
        else:
            self.error("At least one argument must be given for the search string")
        if args and not options.input:
            if not options.output:
                options.input = args[:-1]
                args = args[-1:]
            else:
                options.input = args
                args = []
        if args and not options.output:
            options.output = args[-1]
            args = args[:-1]
        if args:
            self.error("You have used an invalid combination of --input, --output and freestanding args")
        if isinstance(options.input, list) and len(options.input) == 1:
            options.input = options.input[0]
        return (options, args)

    def set_usage(self, usage=None):
        """sets the usage string - if usage not given, uses getusagestring for each option"""
        if usage is None:
            self.usage = "%prog searchstring " + " ".join([self.getusagestring(option) for option in self.option_list])
        else:
            super(GrepOptionParser, self).set_usage(usage)

    def run(self):
        """parses the arguments, and runs recursiveprocess with the resulting options"""
        (options, args) = self.parse_args()
        options.inputformats = self.inputformats
        options.outputoptions = self.outputoptions
        options.checkfilter = GrepFilter(options.searchstring, options.searchparts,
                                         options.ignorecase, options.useregexp,
                                         options.invertmatch, options.accelchar,
                                         locale.getpreferredencoding(), options.includeheader)
        self.usepsyco(options)
        self.recursiveprocess(options)

def rungrep(inputfile, outputfile, templatefile, checkfilter):
    """reads in inputfile, filters using checkfilter, writes to outputfile"""
    fromfile = factory.getobject(inputfile)
    tofile = checkfilter.filterfile(fromfile)
    if tofile.isempty():
        return False
    outputfile.write(str(tofile))
    return True

def cmdlineparser():
    formats = {"po": ("po", rungrep), "pot": ("pot", rungrep),
               "mo": ("mo", rungrep), "gmo": ("gmo", rungrep),
               "tmx": ("tmx", rungrep),
               "xliff": ("xliff", rungrep), "xlf": ("xlf", rungrep), "xlff": ("xlff", rungrep),
               None: ("po", rungrep)}
    parser = GrepOptionParser(formats)
    parser.add_option("", "--search", dest="searchparts",
                      action="append", type="choice",
                      choices=["source", "target", "notes", "locations", "msgid", "msgstr", "comment"],
                      metavar="SEARCHPARTS", help="searches the given parts (source, target, notes and locations)")
    parser.add_option("-I", "--ignore-case", dest="ignorecase",
                      action="store_true", default=False, help="ignore case distinctions")
    parser.add_option("-e", "--regexp", dest="useregexp",
                      action="store_true", default=False, help="use regular expression matching")
    parser.add_option("-v", "--invert-match", dest="invertmatch",
                      action="store_true", default=False, help="select non-matching lines")
    parser.add_option("", "--accelerator", dest="accelchar",
                      action="store", type="choice", choices=["&", "_", "~"],
                      metavar="ACCELERATOR", help="ignores the given accelerator when matching")
    parser.add_option("", "--header", dest="includeheader",
                      action="store_true", default=False,
                      help="include a PO header in the output")
    parser.set_usage()
    parser.passthrough.append('checkfilter')
    parser.description = __doc__
    return parser

def main():
    parser = cmdlineparser()
    parser.run()


if __name__ == '__main__':
    main()
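
The filter can also be driven without the command-line front end. The sketch below is hypothetical driver code, not part of this module; it mirrors what rungrep() does for a single file, using only GrepFilter and the storage factory:

    from translate.storage import factory
    from translate.tools.pogrep import GrepFilter

    def grep_single_file(input_path, output_path, searchstring):
        """Write the units of input_path that match searchstring to output_path."""
        # searchparts=None selects the default parts (source and target);
        # all other GrepFilter options keep their defaults except ignorecase.
        checkfilter = GrepFilter(searchstring, None, ignorecase=True)
        fromfile = factory.getobject(open(input_path))
        tofile = checkfilter.filterfile(fromfile)
        if tofile.isempty():
            return False
        out = open(output_path, "w")
        out.write(str(tofile))
        out.close()
        return True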