Package translate :: Package storage :: Module mo
[hide private]
[frames | no frames]

Source Code for Module translate.storage.mo

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  # 
  4  # Copyright 2007 Zuza Software Foundation 
  5  # 
  6  # the function "__str__" was derived from Python v2.4 
  7  #       (Tools/i18n/msgfmt.py - function "generate"): 
  8  #   Written by Martin v. Löwis <loewis@informatik.hu-berlin.de> 
  9  #   Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation. 
 10  #   All rights reserved. 
 11  #   original license: Python Software Foundation (version 2) 
 12  # 
 13  # 
 14  # This file is part of translate. 
 15  # 
 16  # translate is free software; you can redistribute it and/or modify 
 17  # it under the terms of the GNU General Public License as published by 
 18  # the Free Software Foundation; either version 2 of the License, or 
 19  # (at your option) any later version. 
 20  # 
 21  # translate is distributed in the hope that it will be useful, 
 22  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 23  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 24  # GNU General Public License for more details. 
 25  # 
 26  # You should have received a copy of the GNU General Public License 
 27  # along with translate; if not, write to the Free Software 
 28  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 29  # 
 30   
 31  """Module for parsing Gettext .mo files for translation. 
 32   
 33  The coding of .mo files was produced from U{Gettext documentation 
 34  <http://www.gnu.org/software/gettext/manual/gettext.html#MO-Files>}, 
 35  Python's msgfmt.py and by observing and testing existing .mo files in the wild. 
 36   
 37  The hash algorithm is implemented for MO files; this should result in 
 38  faster access of the MO file.  The hash is optional for Gettext 
 39  and is not needed for reading or writing MO files.  In this implementation 
 40  it is always on, and it sometimes produces results that differ from Gettext 
 41  in very small files. 
 42  """ 
 43   
 44  import array 
 45  import re 
 46  import struct 
 47   
 48  from translate.misc.multistring import multistring 
 49  from translate.storage import base 
 50  from translate.storage import po 
 51  from translate.storage import poheader 
 52   
 53  MO_MAGIC_NUMBER = 0x950412deL 
 54   
 55   
def mounpack(filename='messages.mo'):
    """Print the bytes of a Gettext MO file as a run of ``\\xNN`` escapes.

    This is a debugging helper: the printed text can be pasted into a Python
    string literal.  Nothing is returned.

    :param filename: path of the MO file to dump.
    """
    # MO files are binary data, so read them in binary mode; text mode may
    # translate line endings or attempt to decode the content.
    mo_handle = open(filename, 'rb')
    try:
        data = mo_handle.read()
    finally:
        # Close the handle even when reading fails.
        mo_handle.close()
    # struct yields one integer per byte regardless of whether iterating
    # the data would produce characters or integers.
    byte_values = struct.unpack("%dB" % len(data), data)
    print("\\x%02x" * len(data) % byte_values)
62 63
def my_swap4(result):
    """Return the low 32 bits of ``result`` with their four bytes reversed.

    Bits above the lowest 32 are ignored.
    """
    swapped = 0
    # Walk from the least significant byte upwards; each byte taken pushes
    # the previously taken ones one position towards the high end.
    for shift in (0, 8, 16, 24):
        swapped = (swapped << 8) | ((result >> shift) & 0xff)
    return swapped
71 72
def hashpjw(str_param):
    """Hash the string ``str_param`` to a non-negative integer.

    The running value is shifted left by four bits for every character and
    the character code is added.  Whenever any of the top four bits of a
    32-bit word end up set, they are XOR-ed back in 24 bits lower and then
    cleared.
    """
    HASHWORDBITS = 32
    top_nibble_mask = 0xf << (HASHWORDBITS - 4)
    fold_shift = HASHWORDBITS - 8
    hval = 0
    for char in str_param:
        hval = (hval << 4) + ord(char)
        overflow = hval & top_nibble_mask
        if overflow:
            hval ^= overflow >> fold_shift
            hval ^= overflow
    return hval
86 87
def get_next_prime_number(start):
    """Return the smallest prime number greater than or equal to ``start``.

    :param start: integer to start searching from; values below 2 yield 2.
    """

    def is_prime(num):
        """Tell whether the integer ``num`` is prime, by trial division."""
        if num < 2:
            return False
        if num < 4:
            # 2 and 3
            return True
        if num % 2 == 0:
            return False
        # Only odd divisors up to the square root need checking.  Using
        # integer arithmetic throughout keeps this independent of how the
        # "/" operator divides.
        divider = 3
        while divider * divider <= num:
            if num % divider == 0:
                return False
            divider += 2
        return True

    candidate = start
    while not is_prime(candidate):
        candidate += 1
    return candidate
class mounit(base.TranslationUnit):
    """One translation message of a .mo file."""

    def __init__(self, source=None, encoding=None):
        """Create a unit for ``source`` with empty context and comments.

        ``encoding`` is accepted but not used: the units are kept
        deliberately simple for now.
        """
        self.msgctxt = []
        self.msgidcomments = []
        super(mounit, self).__init__(source)

    def getcontext(self):
        """Return the message context as one string (None if there is none)."""
        # Still need to handle KDE comments
        if self.msgctxt is not None:
            return "".join(self.msgctxt)
        return None

    def isheader(self):
        """Tell whether this unit is the header entry (empty source)."""
        return self.source == u""

    def istranslatable(self):
        """Tell whether this unit can be translated (non-empty source)."""
        if self.source:
            return True
        return False
132 133
class mofile(poheader.poheader, base.TranslationStore):
    """A class representing a .mo file."""
    UnitClass = mounit
    Name = _("Gettext MO file")
    Mimetypes = ["application/x-gettext-catalog", "application/x-mo"]
    Extensions = ["mo", "gmo"]
    _binary = True

    def __init__(self, inputfile=None, unitclass=mounit):
        """Create the store, parsing ``inputfile`` when one is given.

        :param inputfile: passed to ``self.parsestring`` unless it is None.
        :param unitclass: class used for the units of this store.
        """
        self.UnitClass = unitclass
        base.TranslationStore.__init__(self, unitclass=unitclass)
        self.filename = ''
        self._encoding = "UTF-8"
        if inputfile is not None:
            self.parsestring(inputfile)

    def __str__(self):
        """Output a string representation of the MO data file.

        The result is laid out as: the seven-integer header, the key index,
        the value index, the hash table, the NUL-terminated keys and finally
        the NUL-terminated values.  When no unit is written, only the header
        is returned.
        """
        # check the header of this file for the copyright note of this function

        def add_to_hash_table(string, i):
            """Store entry number ``i + 1`` in the first free slot for ``string``."""
            hash_value = hashpjw(string)
            # Taken from gettext-0.17:gettext-tools/src/write-mo.c:408-409
            table_size = max(hash_size, 3)
            hash_cursor = hash_value % table_size
            orig_hash_cursor = hash_cursor
            increment = 1 + (hash_value % (table_size - 2))
            # 0 marks a free slot, which is why entries are stored as i + 1.
            while hash_table[hash_cursor] != 0:
                hash_cursor = (hash_cursor + increment) % table_size
                # Coming back to the first slot would mean the table is full.
                assert hash_cursor != orig_hash_cursor
            hash_table[hash_cursor] = i + 1

        # hash_size should be the smallest prime number that is greater
        # or equal (4 / 3 * N) - where N is the number of keys/units.
        # see gettext-0.17:gettext-tools/src/write-mo.c:406
        hash_size = max(3, get_next_prime_number((len(self.units) * 4) // 3))

        messages = {}
        for unit in self.units:
            # If the unit is not translated, we should rather omit it entirely
            if not unit.istranslated():
                continue
            if isinstance(unit.source, multistring):
                source = "".join(unit.msgidcomments) + \
                         "\0".join(unit.source.strings)
            else:
                source = "".join(unit.msgidcomments) + unit.source
            if unit.msgctxt:
                # The context is stored in front of the source, separated
                # by an EOT byte.
                source = "".join(unit.msgctxt) + "\x04" + source
            if isinstance(unit.target, multistring):
                target = "\0".join(unit.target.strings)
            else:
                target = unit.target
            if unit.target:
                messages[source.encode("utf-8")] = target

        # using "I" works for 32- and 64-bit systems, but not for 16-bit!
        hash_table = array.array("I", [0] * hash_size)
        # the keys are sorted in the .mo file
        keys = sorted(messages)

        # For each string, we need size and file offset.  Each string is
        # NUL terminated; the NUL does not count into the size.
        # TODO: We don't do any encoding detection from the PO Header
        values = []
        key_index = []
        value_index = []
        key_pos = value_pos = 0
        for i, key in enumerate(keys):
            add_to_hash_table(key, i)
            value = messages[key]  # key already encoded for use as dict key
            if isinstance(value, unicode):
                value = value.encode('utf-8')
            values.append(value)
            key_index.append((len(key), key_pos))
            value_index.append((len(value), value_pos))
            key_pos += len(key) + 1
            value_pos += len(value) + 1
        # Joining once avoids re-copying the whole buffer for every entry.
        ids = "".join([key + '\0' for key in keys])
        strs = "".join([value + '\0' for value in values])

        # The header is 7 32-bit unsigned integers
        keystart = 7 * 4 + 16 * len(keys) + hash_size * 4
        # and the values start after the keys
        valuestart = keystart + len(ids)
        # The string table first has the list of keys, then the list of values.
        # Each entry has first the size of the string, then the file offset.
        offsets = []
        for length, position in key_index:
            offsets += [length, position + keystart]
        for length, position in value_index:
            offsets += [length, position + valuestart]

        output = struct.pack("Iiiiiii",
                             MO_MAGIC_NUMBER,                # Magic
                             0,                              # Version
                             len(keys),                      # # of entries
                             7 * 4,                          # start of key index
                             7 * 4 + len(keys) * 8,          # start of value index
                             hash_size,                      # size of hash table
                             7 * 4 + 2 * (len(keys) * 8))    # offset of hash table
        # additional data is not necessary for empty mo files
        if keys:
            output += (array.array("i", offsets).tostring() +
                       hash_table.tostring() + ids + strs)
        return output

    def parse(self, input):
        """Parse the given file or file source string and add its units.

        :param input: the MO data, or an object with a ``read`` method that
            returns it (such an object is closed after reading).
        :raises ValueError: if the data does not start with the MO magic
            number in either byte order.
        :raises base.ParseError: if the major revision of the file is 1 or
            higher.
        """
        if hasattr(input, 'name'):
            self.filename = input.name
        elif not getattr(self, 'filename', ''):
            self.filename = ''
        if hasattr(input, "read"):
            try:
                mosrc = input.read()
            finally:
                # Do not leak the handle when reading fails.
                input.close()
            input = mosrc

        # The magic number tells us which byte order the file was written in.
        little, = struct.unpack("<L", input[:4])
        big, = struct.unpack(">L", input[:4])
        if little == MO_MAGIC_NUMBER:
            endian = "<"
        elif big == MO_MAGIC_NUMBER:
            endian = ">"
        else:
            raise ValueError("This is not an MO file")

        # Header: magic, revision, number of strings, offset of the key
        # index, offset of the value index, hash table size and offset.
        header = struct.unpack("%sLLiiiii" % endian, input[:(7 * 4)])
        revision, lenkeys, startkey, startvalue = header[1:5]
        # The revision is one 32-bit word: major in the upper 16 bits, minor
        # in the lower 16.  Reading it as a whole and splitting it here gives
        # the same answer for both byte orders.
        version_maj = revision >> 16
        version_min = revision & 0xffff
        if version_maj >= 1:
            raise base.ParseError("""Unable to process version %d.%d MO files""" % (version_maj, version_min))

        entry_format = "%sii" % endian
        for i in range(lenkeys):
            nextkey = startkey + (i * 2 * 4)
            nextvalue = startvalue + (i * 2 * 4)
            klength, koffset = struct.unpack(entry_format,
                                             input[nextkey:nextkey + (2 * 4)])
            vlength, voffset = struct.unpack(entry_format,
                                             input[nextvalue:nextvalue + (2 * 4)])
            source = input[koffset:koffset + klength]
            target_data = input[voffset:voffset + vlength]
            context = None
            if "\x04" in source:
                # Split on the first EOT only, so a further EOT byte in the
                # message does not break the unpacking.
                context, source = source.split("\x04", 1)
            # Still need to handle KDE comments
            source = multistring(source.split("\0"), encoding=self._encoding)
            if source == "":
                # The entry with the empty source is the header; take the
                # encoding for the following entries from its charset.
                charset = re.search("charset=([^\\s]+)", target_data)
                if charset:
                    self._encoding = po.encodingToUse(charset.group(1))
            target = multistring(target_data.split("\0"),
                                 encoding=self._encoding)
            newunit = mounit(source)
            newunit.settarget(target)
            if context is not None:
                newunit.msgctxt.append(context)
            self.addunit(newunit)
289