Package translate :: Package storage :: Module oo
[hide private]
[frames] | no frames]

Source Code for Module translate.storage.oo

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  # 
  4  # Copyright 2002-2008 Zuza Software Foundation 
  5  # 
  6  # This file is part of translate. 
  7  # 
  8  # translate is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  # 
 13  # translate is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with translate; if not, write to the Free Software 
 20  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 21   
 22  """ 
 23  Classes that hold units of .oo files (oounit) or entire files (oofile). 
 24   
 25  These are specific .oo files for localisation exported by OpenOffice.org - SDF 
 26  format (previously known as GSI files). For an overview of the format, see 
 27  U{http://l10n.openoffice.org/L10N_Framework/Intermediate_file_format.html} 
 28   
 29  The behaviour in terms of escaping is explained in detail in the programming 
 30  comments. 
 31  """ 
 32  # FIXME: add simple test which reads in a file and writes it out again 
 33   
 34  import os 
 35  import re 
 36  import warnings 
 37   
 38  from translate.misc import quote 
 39  from translate.misc import wStringIO 
 40   
 41  # File normalisation 
 42   
 43  normalfilenamechars = "/#.0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" 
 44  normalizetable = "" 
 45  for i in map(chr, range(256)): 
 46      if i in normalfilenamechars: 
 47          normalizetable += i 
 48      else: 
 49          normalizetable += "_" 
 50   
 51   
class unormalizechar(dict):
    """Translation mapping that keeps known characters and maps the rest to "_".

    Only C{__getitem__} is overridden; the accepted characters live in the
    C{normalchars} attribute (code point -> character), the dict itself
    stays empty.
    """

    def __init__(self, normalchars):
        """Remember every character of C{normalchars}, keyed by code point.

        @param normalchars: the characters that must be left untouched
        """
        self.normalchars = dict((ord(char), char) for char in normalchars)

    def __getitem__(self, key):
        """Return the character stored for code point C{key}, else u"_"."""
        try:
            return self.normalchars[key]
        except KeyError:
            return u"_"
# Unicode counterpart of normalizetable, used when translating unicode strings.
unormalizetable = unormalizechar(normalfilenamechars.decode("ascii"))


def normalizefilename(filename):
    """Replace every character outside C{normalfilenamechars} with "_".

    Byte strings (C{str}) are translated with C{normalizetable}; anything
    else is translated with C{unormalizetable}.

    @param filename: the name to normalise
    @return: the normalised name
    """
    if not isinstance(filename, str):
        return filename.translate(unormalizetable)
    return filename.translate(normalizetable)
71 72
def makekey(ookey, long_keys):
    """Build a unique identifier out of an oo key tuple.

    The result has the form C{<source>#<id>}, passed through
    L{normalizefilename}. C{<source>} is the project joined with the source
    file when C{long_keys} is set, otherwise only the last path component of
    the source file. C{<id>} is made of the group id, local id and resource
    type, separated by dots, leaving out the empty ones.

    @param ookey: an oo key (project, sourcefile, resourcetype, groupid,
        localid, platform)
    @type ookey: tuple
    @param long_keys: Use long keys
    @type long_keys: Boolean
    @rtype: str
    @return: unique ascii identifier
    """
    # platform is unpacked only so that a tuple of the wrong size is rejected
    project, sourcefile, resourcetype, groupid, localid, platform = ookey
    sourcefile = sourcefile.replace('\\', '/')
    if not long_keys:
        sourcebase = sourcefile.split('/')[-1]
    else:
        sourcebase = os.path.join(project, sourcefile)
    if groupid and localid:
        fullid = groupid + "." + localid
    else:
        fullid = groupid + localid
    if resourcetype:
        fullid = "%s.%s" % (fullid, resourcetype)
    return normalizefilename("%s#%s" % (sourcebase, fullid))


# These are functions that deal with escaping and unescaping of the text fields
# of the SDF file. These should only be applied to the text column.
# The fields quickhelptext and title are assumed to carry no escaping.
#
# The escaping of all strings except those coming from .xhp (helpcontent2)
# sourcefiles works as follows:
#   (newline)         ->  \n
#   (carriage return) ->  \r
#   (tab)             ->  \t
# Backslash characters (\) and single quotes (') are not consistently escaped,
# and are therefore left as they are.
#
# For strings coming from .xhp (helpcontent2) sourcefiles the following
# characters are escaped inside XML tags only:
#   <  ->  \<  when used with lowercase tagnames (with some exceptions)
#   >  ->  \>  when used with lowercase tagnames (with some exceptions)
#   "  ->  \"  around XML properties
# The following is consistently escaped in .xhp strings (not only in XML tags):
#   \  ->  \\


def escape_text(text):
    """Escape unit text so that it can be stored in the SDF text column.

    Newline, tab and carriage return characters are replaced by the
    two-character sequences C{\\n}, C{\\t} and C{\\r}.
    """
    for raw, escaped in (("\n", "\\n"), ("\t", "\\t"), ("\r", "\\r")):
        text = text.replace(raw, escaped)
    return text
123 124
def unescape_text(text):
    """Unescape SDF text to be suitable for unit consumption.

    The sequences C{\\n}, C{\\t} and C{\\r} become the real control
    characters. Doubled backslashes are left exactly as they are.
    """
    # Park doubled backslashes behind a BEL placeholder so that the "n" in
    # "\\\\n" is not mistaken for an escaped newline.
    text = text.replace("\\\\", "\a")
    for escaped, raw in (("\\n", "\n"), ("\\t", "\t"), ("\\r", "\r")):
        text = text.replace(escaped, raw)
    return text.replace("\a", "\\\\")
# Matches an XML-like tag with a lowercase name and at most one attribute.
helptagre = re.compile(r'''<[/]??[a-z_\-]+?(?:| +[a-z]+?=".*?") *[/]??>''')

# Tag names whose opening (by prefix) and closing tags get escaped.
_ESCAPED_HELP_TAG_NAMES = (
    "ahelp", "link", "item", "emph", "defaultinline", "switchinline",
    "caseinline", "variable", "bookmark_value", "image", "embedvar", "alt",
)
# Complete tags that are escaped as well.
_ESCAPED_HELP_TAGS = ("<br/>", "<help-id-missing/>")


def escape_help_text(text):
    """Escapes the help text as it would be in an SDF file.

    Every backslash is doubled first. Then <, > and " are escaped, but only
    inside the tags found by C{helptagre} that are on the escape lists; all
    other tags are left untouched.
    """
    text = text.replace("\\", "\\\\")
    for tag in helptagre.findall(text):
        must_escape = tag in _ESCAPED_HELP_TAGS
        if not must_escape:
            for name in _ESCAPED_HELP_TAG_NAMES:
                if tag.startswith("<%s" % name) or tag == "</%s>" % name:
                    must_escape = True
                    break
        if not must_escape:
            continue
        # Swap the angle brackets for their escaped forms, then the quotes.
        replacement = "".join(["\\<", tag[1:-1], "\\>"]).replace('"', '\\"')
        text = text.replace(tag, replacement)
    return text
152 153
def unescape_help_text(text):
    """Unescape help text from an SDF file to be suitable for unit consumption.

    The backslash is removed from escaped <, > and " characters, after which
    doubled backslashes are collapsed into single ones.
    """
    for escaped, plain in ((r"\<", "<"), (r"\>", ">"), (r'\"', '"'), (r"\\", "\\")):
        text = text.replace(escaped, plain)
    return text
157 158
def encode_if_needed_utf8(text):
    """Encode a Unicode string as UTF-8; return anything else unchanged."""
    if not isinstance(text, unicode):
        return text
    return text.encode('UTF-8')
164 165
class ooline(object):
    """One line (one translation of one resource) in an .oo file.

    The raw text column is kept in C{_text}; the C{text} property escapes and
    unescapes it, using the help-text rules when C{sourcefile} ends in
    ".xhp" and the plain-text rules otherwise.
    """

    # Attribute names of the 15 tab-delimited columns, in file order.
    _partnames = (
        "project", "sourcefile", "dummy", "resourcetype",
        "groupid", "localid", "helpid", "platform",
        "width", "languageid", "_text", "helptext",
        "quickhelptext", "title", "timestamp",
    )

    def __init__(self, parts=None):
        """Build the line from C{parts}, or with every column empty."""
        if parts is not None:
            self.setparts(parts)
            return
        for name in self._partnames:
            # "_text" is assigned through the public "text" property
            setattr(self, name.lstrip("_"), "")

    def setparts(self, parts):
        """Fill in the columns from the tab-delimited parts.

        A warning is issued when there are not exactly 15 parts; the parts
        are then padded with empty strings or cut off to make 15.
        """
        count = len(parts)
        if count != 15:
            warnings.warn("oo line contains %d parts, it should contain 15: %r" % \
                          (count, parts))
            fixed = list(parts)[:15]
            fixed.extend([""] * (15 - len(fixed)))
            parts = tuple(fixed)
        for name, value in zip(self._partnames, parts):
            setattr(self, name, value)

    def getparts(self):
        """Return a tuple with the 15 columns of this line (text unprocessed)."""
        return tuple([getattr(self, name) for name in self._partnames])

    def gettext(self):
        """Return the text column with its escaping removed."""
        if not self.sourcefile.endswith(".xhp"):
            return unescape_text(self._text)
        return unescape_help_text(self._text)

    def settext(self, text):
        """Store C{text} in the text column, applying the escaping."""
        if not self.sourcefile.endswith(".xhp"):
            self._text = escape_text(text)
        else:
            self._text = escape_help_text(text)

    text = property(gettext, settext)

    def __str__(self):
        """convert to a string. double check that unicode is handled"""
        output = self.getoutput()
        return encode_if_needed_utf8(output)

    def getoutput(self):
        """Return the line in tab-delimited form."""
        return "\t".join(self.getparts())

    def getkey(self):
        """Return the tuple of columns that identifies the resource."""
        return (self.project, self.sourcefile, self.resourcetype,
                self.groupid, self.localid, self.platform)
230 231
class oounit:
    """The translations of one resource: a group of lines sharing a key."""

    def __init__(self):
        """Start without any lines."""
        self.languages = {}
        self.lines = []

    def addline(self, line):
        """Add C{line}, also registering it under its language id."""
        self.languages[line.languageid] = line
        self.lines.append(line)

    def __str__(self):
        """convert to a string. double check that unicode is handled"""
        output = self.getoutput()
        return encode_if_needed_utf8(output)

    def getoutput(self, skip_source=False, fallback_lang=None):
        """Return the lines in tab-delimited form, joined by CRLF.

        @param skip_source: leave out the first line; when no other line
            exists, a copy of the first line with its language id set to
            C{fallback_lang} is output instead
        @param fallback_lang: language id given to that copy
        """
        if not skip_source:
            selected = self.lines
        else:
            selected = self.lines[1:]
            if not selected:
                # Untranslated, so let's do fall-back: (bug 1883)
                fallback = ooline(self.lines[0].getparts())
                fallback.languageid = fallback_lang
                selected = [fallback]
        return "\r\n".join(map(str, selected))
261 262
class oofile:
    """An entire .oo file: its lines, grouped into units by key."""
    UnitClass = oounit

    def __init__(self, input=None):
        """Create the file, parsing C{input} straight away when it is given."""
        self.oolines = []
        self.units = []
        self.ookeys = {}
        self.filename = ""
        self.languages = []
        if input is not None:
            self.parse(input)

    def addline(self, thisline):
        """Add a parsed line, creating the unit for its key if necessary."""
        key = thisline.getkey()
        try:
            unit = self.ookeys[key]
        except KeyError:
            unit = self.UnitClass()
            self.units.append(unit)
            self.ookeys[key] = unit
        unit.addline(thisline)
        self.oolines.append(thisline)
        languageid = thisline.languageid
        if languageid not in self.languages:
            self.languages.append(languageid)

    def parse(self, input):
        """Parse C{input} and add each non-empty line to the file.

        @param input: the source text, or an object with a C{read} method;
            such an object is read completely and then closed
        """
        if not self.filename:
            self.filename = getattr(input, 'name', '')
        src = input
        if hasattr(input, "read"):
            src = input.read()
            input.close()
        for rawline in src.split("\n"):
            line = quote.rstripeol(rawline)
            if line:
                self.addline(ooline(line.split("\t")))

    def __str__(self, skip_source=False, fallback_lang=None):
        """convert to a string. double check that unicode is handled"""
        output = self.getoutput(skip_source, fallback_lang)
        return encode_if_needed_utf8(output)

    def getoutput(self, skip_source=False, fallback_lang=None):
        """Convert all the units back to tab-delimited form.

        Each unit's output is terminated with CRLF. A unit holding more than
        two lines triggers two warnings (its languages and its line keys).
        Both arguments are handed on to each unit's C{getoutput}.
        """
        chunks = []
        for unit in self.units:
            linecount = len(unit.lines)
            if linecount > 2:
                warnings.warn("contains %d lines (should be 2 at most): languages %r" % (linecount, unit.languages))
                unitkeys = [line.getkey() for line in unit.lines]
                warnings.warn("contains %d lines (should be 2 at most): keys %r" % (linecount, unitkeys))
            chunks.append(unit.getoutput(skip_source, fallback_lang) + "\r\n")
        return "".join(chunks)
322 323
class oomultifile:
    """this takes a huge GSI file and represents it as multiple smaller files...

    Each line of the GSI file is assigned to a subfile (see
    L{getsubfilename}); C{subfilelines} maps every subfile name to the
    zero-based numbers of its lines in the GSI file.
    """

    def __init__(self, filename, mode=None, multifilestyle="single"):
        """initialises oomultifile from a seekable inputfile or writable outputfile

        @param filename: path of the GSI file
        @param mode: mode handed to C{open}; when None it is 'r' if the file
            exists and 'w' otherwise
        @param multifilestyle: how lines are grouped into subfiles, see
            L{getsubfilename}
        """
        self.filename = filename
        if mode is None:
            if os.path.exists(filename):
                mode = 'r'
            else:
                mode = 'w'
        self.mode = mode
        self.multifilestyle = multifilestyle
        self.multifilename = os.path.splitext(filename)[0]
        self.multifile = open(filename, mode)
        self.subfilelines = {}
        if mode == "r":
            try:
                self.createsubfileindex()
            except Exception:
                # do not leak the open file when the index cannot be built
                self.multifile.close()
                raise

    def close(self):
        """closes the underlying GSI file"""
        self.multifile.close()

    def createsubfileindex(self):
        """reads in all the lines and works out the subfiles

        Blank lines are skipped (as oofile.parse does) but still counted, so
        the recorded line numbers keep matching the physical file.
        """
        for linenum, line in enumerate(self.multifile):
            if not line.rstrip("\r\n"):
                continue
            subfile = self.getsubfilename(line)
            self.subfilelines.setdefault(subfile, []).append(linenum)

    def getsubfilename(self, line):
        """looks up the subfile name for the line

        With multifilestyle "onefile" every line goes to one subfile named
        after the GSI file; with "toplevel" the first column is used; with
        any other value the first column is joined with the directory part
        of the second column. The extension "oo" is always appended.

        @raise ValueError: when the line holds fewer than two tabs
        """
        if line.count("\t") < 2:
            raise ValueError("invalid tab-delimited line: %r" % line)
        lineparts = line.split("\t", 2)
        module, filename = lineparts[0], lineparts[1]
        if self.multifilestyle == "onefile":
            ooname = self.multifilename
        elif self.multifilestyle == "toplevel":
            ooname = module
        else:
            filename = filename.replace("\\", "/")
            fileparts = [module] + filename.split("/")
            # drop the file name itself, keeping module + directories
            ooname = os.path.join(*fileparts[:-1])
        return ooname + os.extsep + "oo"

    def listsubfiles(self):
        """returns a list of subfiles in the file"""
        return list(self.subfilelines)

    def __iter__(self):
        """iterates through the subfile names"""
        for subfile in self.listsubfiles():
            yield subfile

    def __contains__(self, pathname):
        """checks if this pathname is a valid subfile"""
        return pathname in self.subfilelines

    def getsubfilesrc(self, subfile):
        """returns the lines matching the subfile, joined into one string

        The GSI file is re-read from the start on every call.

        @raise KeyError: when subfile is not in the index
        """
        requiredlines = set(self.subfilelines[subfile])
        self.multifile.seek(0)
        lines = [line for linenum, line in enumerate(self.multifile)
                 if linenum in requiredlines]
        return "".join(lines)

    def openinputfile(self, subfile):
        """returns a pseudo-file object for the given subfile"""
        subfilesrc = self.getsubfilesrc(subfile)
        inputfile = wStringIO.StringIO(subfilesrc)
        inputfile.filename = subfile
        return inputfile

    def openoutputfile(self, subfile):
        """returns a pseudo-file object for the given subfile

        The callback handed to CatchStringOutput writes its contents to the
        GSI file and flushes it.
        """

        def onclose(contents):
            self.multifile.write(contents)
            self.multifile.flush()
        outputfile = wStringIO.CatchStringOutput(onclose)
        outputfile.filename = subfile
        return outputfile

    def getoofile(self, subfile):
        """returns an oofile built up from the given subfile's lines"""
        subfilesrc = self.getsubfilesrc(subfile)
        oosubfile = oofile()
        oosubfile.filename = subfile
        oosubfile.parse(subfilesrc)
        return oosubfile
418