Package translate :: Package storage :: Module oo
[hide private]
[frames] | no frames]

Source Code for Module translate.storage.oo

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  #  
  4  # Copyright 2002-2008 Zuza Software Foundation 
  5  #  
  6  # This file is part of translate. 
  7  # 
  8  # translate is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  #  
 13  # translate is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with translate; if not, write to the Free Software 
 20  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 21   
 22  """ 
 23  Classes that hold units of .oo files (oounit) or entire files (oofile). 
 24   
 25  These are specific .oo files for localisation exported by OpenOffice.org - SDF  
 26  format (previously known as GSI files). For an overview of the format, see 
 27  U{http://l10n.openoffice.org/L10N_Framework/Intermediate_file_format.html} 
 28   
 29  The behaviour in terms of escaping is explained in detail in the programming 
 30  comments. 
 31  """ 
 32  # FIXME: add simple test which reads in a file and writes it out again 
 33   
 34  import os 
 35  import re 
 36  from translate.misc import quote 
 37  from translate.misc import wStringIO 
 38  import warnings 
 39   
 40  # File normalisation 
 41   
 42  normalfilenamechars = "/#.0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" 
 43  normalizetable = "" 
 44  for i in map(chr, range(256)): 
 45      if i in normalfilenamechars: 
 46          normalizetable += i 
 47      else: 
 48          normalizetable += "_" 
 49   
class unormalizechar(dict):
    """Lookup table that maps every code point outside a given set to u"_".

    Only C{__getitem__} is overridden; the characters are kept in the
    C{normalchars} attribute, keyed by code point, and the underlying dict
    itself stays empty.
    """

    def __init__(self, normalchars):
        """Remember the characters that must be left untouched.

        @param normalchars: the characters to keep
        @type normalchars: iterable of single characters
        """
        self.normalchars = dict((ord(char), char) for char in normalchars)

    def __getitem__(self, key):
        """Return the character stored for code point C{key}, or u"_" if none is stored."""
        return self.normalchars.get(key, u"_")

# Unicode counterpart of normalizetable: normalizefilename() uses it for any
# filename that is not a plain str.
unormalizetable = unormalizechar(normalfilenamechars.decode("ascii"))

def normalizefilename(filename):
    """Convert every character that is not in normalfilenamechars to _.

    Besides the roman letters and digits, the characters C{/}, C{#} and C{.}
    are kept.

    @param filename: the name to normalise
    @return: C{filename} with all other characters replaced by C{_}
    """
    # str input goes through the 256-entry string table; anything else
    # goes through the unicode lookup table.
    if isinstance(filename, str):
        table = normalizetable
    else:
        table = unormalizetable
    return filename.translate(table)

def makekey(ookey, long_keys):
    """converts an oo key tuple into a unique identifier

    The key has the form C{<source>#<groupid>.<localid>.<resourcetype>};
    empty id parts are left out together with their separating dot, and the
    result is passed through L{normalizefilename}.

    @param ookey: an oo key (project, sourcefile, resourcetype, groupid,
        localid, platform)
    @type ookey: tuple
    @param long_keys: Use long keys, i.e. prefix the key with the project and
        the full source path instead of only the source file name
    @type long_keys: Boolean
    @rtype: str
    @return: unique ascii identifier
    """
    # platform is unpacked only so that the tuple must have all six parts;
    # it is not part of the identifier.
    project, sourcefile, resourcetype, groupid, localid, platform = ookey
    sourcefile = sourcefile.replace('\\', '/')
    if long_keys:
        sourcebase = os.path.join(project, sourcefile)
    else:
        # split() always yields at least one element, so [-1] is safe
        sourcebase = sourcefile.split('/')[-1]
    if groupid and localid:
        fullid = groupid + "." + localid
    else:
        fullid = groupid + localid
    if resourcetype:
        fullid = fullid + "." + resourcetype
    key = "%s#%s" % (sourcebase, fullid)
    return normalizefilename(key)

# These are functions that deal with escaping and unescaping of the text fields
# of the SDF file. These should only be applied to the text column.
# The fields quickhelptext and title are assumed to carry no escaping.
#
# The escaping of all strings except those coming from .xhp (helpcontent2)
# sourcefiles works as follows:
#   (newline)         ->  \n
#   (carriage return) ->  \r
#   (tab)             ->  \t
# Backslash characters (\) and single quotes (') are not consistently escaped,
# and are therefore left as they are.
#
# For strings coming from .xhp (helpcontent2) sourcefiles the following
# characters are escaped inside XML tags only:
#   <  ->  \<  when used with lowercase tagnames (with some exceptions)
#   >  ->  \>  when used with lowercase tagnames (with some exceptions)
#   "  ->  \"  around XML properties
# The following is consistently escaped in .xhp strings (not only in XML tags):
#   \  ->  \\

def escape_text(text):
    """Escapes unit text to be suitable for the text column of an SDF file.

    Newline, tab and carriage return become the two-character sequences
    C{\\n}, C{\\t} and C{\\r}; everything else (including backslashes) is
    left as it is.
    """
    for char, escaped in (("\n", "\\n"), ("\t", "\\t"), ("\r", "\\r")):
        text = text.replace(char, escaped)
    return text

# A backslash followed by another backslash, n, t or r.
_unescape_text_re = re.compile(r"\\[\\ntr]")
_unescape_text_map = {"\\n": "\n", "\\t": "\t", "\\r": "\r"}


def unescape_text(text):
    """Unescapes SDF text to be suitable for unit consumption.

    C{\\n}, C{\\t} and C{\\r} become newline, tab and carriage return.  A
    doubled backslash is left as it is and shields the character after it,
    so a doubled backslash followed by C{n} is not turned into a newline.
    """
    # One left-to-right pass: a matched "\\\\" pair is not in the map and is
    # therefore emitted unchanged.  This avoids a temporary placeholder
    # character, which would corrupt text that already contains it.
    def replace(match):
        sequence = match.group(0)
        return _unescape_text_map.get(sequence, sequence)
    return _unescape_text_re.sub(replace, text)

# Matches an opening, closing or self-closing tag with a lowercase name and at
# most one lowercase attribute.  Raw string: the pattern contains "\-".
helptagre = re.compile(r'''<[/]??[a-z_\-]+?(?:| +[a-z]+?=".*?") *[/]??>''')


def escape_help_text(text):
    """Escapes the help text as it would be in an SDF file.

    <, >, " are only escaped in <[[:lower:]]> tags. Some HTML tags make it in in
    lowercase so those are dealt with. Some OpenOffice.org help tags are not
    escaped.

    Backslashes are doubled everywhere, not only inside tags.
    """
    escape_tags = ("ahelp", "link", "item", "emph", "defaultinline",
                   "switchinline", "caseinline", "variable", "bookmark_value",
                   "image", "embedvar", "alt")
    always_escaped = ("<br/>", "<help-id-missing/>")
    text = text.replace("\\", "\\\\")
    for tag in helptagre.findall(text):
        escapethistag = tag in always_escaped
        if not escapethistag:
            for escape_tag in escape_tags:
                if tag.startswith("<%s" % escape_tag) or tag == "</%s>" % escape_tag:
                    escapethistag = True
                    break
        if escapethistag:
            escaped_tag = ("\\<" + tag[1:-1] + "\\>").replace('"', '\\"')
            # replaces every occurrence of this exact tag in one go; the
            # escaped form no longer contains the unescaped tag, so a
            # repeated tag is not escaped twice
            text = text.replace(tag, escaped_tag)
    return text

def unescape_help_text(text):
    """Unescapes SDF help text to be suitable for unit consumption.

    Removes the backslash in front of <, > and " and then turns doubled
    backslashes back into single ones.
    """
    # The order matters: the doubled backslash is collapsed last.
    for escaped, plain in ((r"\<", "<"), (r"\>", ">"), (r'\"', '"'), (r"\\", "\\")):
        text = text.replace(escaped, plain)
    return text

def encode_if_needed_utf8(text):
    """Encode a Unicode string as UTF-8; return any other value unchanged.

    @param text: the value to encode
    @return: the UTF-8 encoded byte string if C{text} is a Unicode string,
        otherwise C{text} itself
    """
    # type(u"") is ``unicode`` on Python 2, so the check is the same as
    # before there, but it no longer needs the Python 2-only builtin name.
    if isinstance(text, type(u"")):
        return text.encode('UTF-8')
    return text


class ooline(object):
    """this represents one line, one translation in an .oo file

    A line consists of 15 tab-delimited columns.  The text column is stored
    in its escaped form in C{_text}; the C{text} property escapes and
    unescapes on access.
    """

    def __init__(self, parts=None):
        """construct an ooline from its parts

        @param parts: the 15 column values, or None for an empty line
        """
        if parts is None:
            # text is assigned through the property, after sourcefile has
            # been set, so the setter can pick the escaping style
            self.project, self.sourcefile, self.dummy, self.resourcetype, \
                self.groupid, self.localid, self.helpid, self.platform, \
                self.width, self.languageid, self.text, self.helptext, \
                self.quickhelptext, self.title, self.timestamp = [""] * 15
        else:
            self.setparts(parts)

    def setparts(self, parts):
        """create a line from its tab-delimited parts

        A warning is issued if there are not exactly 15 parts; missing parts
        are filled with empty strings and surplus parts are dropped.
        """
        if len(parts) != 15:
            warnings.warn("oo line contains %d parts, it should contain 15: %r" % \
                (len(parts), parts))
            # pad to at least 15 entries, then cut to exactly 15
            parts = tuple((list(parts) + [""] * 15)[:15])
        # the text column is stored as it is, i.e. still escaped
        self.project, self.sourcefile, self.dummy, self.resourcetype, \
            self.groupid, self.localid, self.helpid, self.platform, \
            self.width, self.languageid, self._text, self.helptext, \
            self.quickhelptext, self.title, self.timestamp = parts

    def getparts(self):
        """return a tuple of the parts in this line (text column still escaped)"""
        return (self.project, self.sourcefile, self.dummy, self.resourcetype,
                self.groupid, self.localid, self.helpid, self.platform,
                self.width, self.languageid, self._text, self.helptext,
                self.quickhelptext, self.title, self.timestamp)

    def gettext(self):
        """Obtains the text column and handle escaping.

        Lines whose sourcefile ends in .xhp use the help text unescaping.
        """
        if self.sourcefile.endswith(".xhp"):
            return unescape_help_text(self._text)
        return unescape_text(self._text)

    def settext(self, text):
        """Sets the text column and handle escaping.

        Lines whose sourcefile ends in .xhp use the help text escaping.
        """
        if self.sourcefile.endswith(".xhp"):
            self._text = escape_help_text(text)
        else:
            self._text = escape_text(text)

    text = property(gettext, settext)

    def __str__(self):
        """convert to a string. double check that unicode is handled"""
        return encode_if_needed_utf8(self.getoutput())

    def getoutput(self):
        """return a line in tab-delimited form"""
        return "\t".join(self.getparts())

    def getkey(self):
        """get the key that identifies the resource"""
        return (self.project, self.sourcefile, self.resourcetype, self.groupid,
                self.localid, self.platform)

class oounit:
    """this represents a number of translations of a resource"""

    def __init__(self):
        """construct the oounit"""
        # languageid -> line; a later line with the same languageid replaces
        # the earlier entry here
        self.languages = {}
        # every line that was added, in the order of addition
        self.lines = []

    def addline(self, line):
        """add a line to the oounit"""
        self.languages[line.languageid] = line
        self.lines.append(line)

    def __str__(self):
        """convert to a string. double check that unicode is handled"""
        return encode_if_needed_utf8(self.getoutput())

    def getoutput(self):
        """return the lines in tab-delimited form, joined by CRLF"""
        return "\r\n".join([str(line) for line in self.lines])

class oofile:
    """this represents an entire .oo file"""
    # class used to group the lines that share one key
    UnitClass = oounit

    def __init__(self, input=None):
        """constructs the oofile

        @param input: optional source to parse straight away, see L{parse}
        """
        self.oolines = []
        self.units = []
        self.ookeys = {}
        self.filename = ""
        self.languages = []
        if input is not None:
            self.parse(input)

    def addline(self, thisline):
        """adds a parsed line to the file

        Lines with the same key end up in the same unit; the languageid of
        the line is recorded the first time it is seen.
        """
        key = thisline.getkey()
        element = self.ookeys.get(key)
        if element is None:
            element = self.UnitClass()
            self.units.append(element)
            self.ookeys[key] = element
        element.addline(thisline)
        self.oolines.append(thisline)
        if thisline.languageid not in self.languages:
            self.languages.append(thisline.languageid)

    def parse(self, input):
        """parses lines and adds them to the file

        @param input: either an object with a C{read} method, which is read
            completely and then closed, or the file contents themselves
        """
        if not self.filename:
            self.filename = getattr(input, 'name', '')
        if hasattr(input, "read"):
            src = input.read()
            input.close()
        else:
            src = input
        for line in src.split("\n"):
            line = quote.rstripeol(line)
            if not line:
                continue
            self.addline(ooline(line.split("\t")))

    def __str__(self):
        """convert to a string. double check that unicode is handled"""
        return encode_if_needed_utf8(self.getoutput())

    def getoutput(self):
        """converts all the lines back to tab-delimited form

        Each unit is followed by CRLF.  A unit with more than two lines
        triggers two warnings, one listing its languages and one its keys.
        """
        lines = []
        for oe in self.units:
            if len(oe.lines) > 2:
                warnings.warn("contains %d lines (should be 2 at most): languages %r" % (len(oe.lines), oe.languages))
                oekeys = [line.getkey() for line in oe.lines]
                warnings.warn("contains %d lines (should be 2 at most): keys %r" % (len(oe.lines), oekeys))
            lines.append(str(oe) + "\r\n")
        return "".join(lines)

class oomultifile:
    """this takes a huge GSI file and represents it as multiple smaller files..."""

    def __init__(self, filename, mode=None, multifilestyle="single"):
        """initialises oomultifile from a seekable inputfile or writable outputfile

        @param filename: path of the multifile
        @param mode: mode passed to C{open}; if None, 'r' is used when the
            file exists and 'w' otherwise
        @param multifilestyle: how lines are grouped into subfiles, see
            L{getsubfilename}
        """
        self.filename = filename
        if mode is None:
            if os.path.exists(filename):
                mode = 'r'
            else:
                mode = 'w'
        self.mode = mode
        self.multifilestyle = multifilestyle
        self.multifilename = os.path.splitext(filename)[0]
        self.multifile = open(filename, mode)
        # subfile name -> list of (0-based) line numbers in the multifile
        self.subfilelines = {}
        if mode == "r":
            self.createsubfileindex()

    def createsubfileindex(self):
        """reads in all the lines and works out the subfiles"""
        for linenum, line in enumerate(self.multifile):
            subfile = self.getsubfilename(line)
            self.subfilelines.setdefault(subfile, []).append(linenum)

    def getsubfilename(self, line):
        """looks up the subfile name for the line

        With multifilestyle "onefile" every line belongs to one subfile named
        after the multifile, with "toplevel" the first column is used, and
        with any other style the first column is joined with the directory
        part of the second column.  The name always gets an "oo" extension.

        @raise ValueError: if the line has fewer than two tabs
        """
        if line.count("\t") < 2:
            raise ValueError("invalid tab-delimited line: %r" % line)
        lineparts = line.split("\t", 2)
        module, filename = lineparts[0], lineparts[1]
        if self.multifilestyle == "onefile":
            ooname = self.multifilename
        elif self.multifilestyle == "toplevel":
            ooname = module
        else:
            filename = filename.replace("\\", "/")
            fileparts = [module] + filename.split("/")
            # drop the last component (the file name itself)
            ooname = os.path.join(*fileparts[:-1])
        return ooname + os.extsep + "oo"

    def listsubfiles(self):
        """returns a list of subfiles in the file"""
        return self.subfilelines.keys()

    def __iter__(self):
        """iterates through the subfile names"""
        for subfile in self.listsubfiles():
            yield subfile

    def __contains__(self, pathname):
        """checks if this pathname is a valid subfile"""
        return pathname in self.subfilelines

    def getsubfilesrc(self, subfile):
        """returns the lines matching the subfile, joined into one string

        @raise KeyError: if C{subfile} is not a known subfile
        """
        requiredlines = set(self.subfilelines[subfile])
        # the index was built by reading the file, so start again at the top
        self.multifile.seek(0)
        return "".join([line for linenum, line in enumerate(self.multifile)
                        if linenum in requiredlines])

    def openinputfile(self, subfile):
        """returns a pseudo-file object for the given subfile"""
        subfilesrc = self.getsubfilesrc(subfile)
        inputfile = wStringIO.StringIO(subfilesrc)
        inputfile.filename = subfile
        return inputfile

    def openoutputfile(self, subfile):
        """returns a pseudo-file object for the given subfile

        The callback handed to the pseudo-file writes the contents to the
        multifile and flushes it.
        """
        def onclose(contents):
            self.multifile.write(contents)
            self.multifile.flush()
        outputfile = wStringIO.CatchStringOutput(onclose)
        outputfile.filename = subfile
        return outputfile

    def getoofile(self, subfile):
        """returns an oofile built up from the given subfile's lines"""
        subfilesrc = self.getsubfilesrc(subfile)
        oosubfile = oofile()
        oosubfile.filename = subfile
        oosubfile.parse(subfilesrc)
        return oosubfile
