Package translate :: Package storage :: Module mo
[hide private]
[frames] | no frames]

Source Code for Module translate.storage.mo

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  # 
  4  # Copyright 2007 Zuza Software Foundation 
  5  # 
  6  # the function "__str__" was derived from Python v2.4 
  7  #       (Tools/i18n/msgfmt.py - function "generate"): 
  8  #   Written by Martin v. Löwis <loewis@informatik.hu-berlin.de> 
  9  #   Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006 Python Software Foundation. 
 10  #   All rights reserved. 
 11  #   original license: Python Software Foundation (version 2) 
 12  # 
 13  # 
 14  # This file is part of translate. 
 15  # 
 16  # translate is free software; you can redistribute it and/or modify 
 17  # it under the terms of the GNU General Public License as published by 
 18  # the Free Software Foundation; either version 2 of the License, or 
 19  # (at your option) any later version. 
 20  # 
 21  # translate is distributed in the hope that it will be useful, 
 22  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 23  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 24  # GNU General Public License for more details. 
 25  # 
 26  # You should have received a copy of the GNU General Public License 
 27  # along with translate; if not, write to the Free Software 
 28  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 29  # 
 30   
 31  """Module for parsing Gettext .mo files for translation. 
 32   
 33  The coding of .mo files was produced from U{Gettext documentation 
 34  <http://www.gnu.org/software/gettext/manual/gettext.html#MO-Files>}, 
 35  Python's msgfmt.py and by observing and testing existing .mo files in the wild. 
 36   
 37  The hash algorithm is implemented for MO files, which should result in 
 38  faster access to the MO file.  The hash is optional for Gettext 
 39  and is not needed for reading or writing MO files; in this implementation 
 40  it is always on, and in very small files it sometimes produces results 
 41  that differ from those of Gettext. 
 42  """ 
 43   
 44  import struct 
 45  import array 
 46  import re 
 47   
 48  from translate.storage import base 
 49  from translate.storage import po 
 50  from translate.storage import poheader 
 51  from translate.misc.multistring import multistring 
 52   
 53  MO_MAGIC_NUMBER = 0x950412deL 
 54   
 55   
def mounpack(filename='messages.mo'):
    """Helper to unpack Gettext MO files into a Python string.

    Reads the whole file named by ``filename`` and prints its content as a
    single line in which every byte is written as a two-digit hexadecimal
    ``\\xNN`` escape, ready to be pasted into a Python string literal.

    :param filename: path of the MO file to dump.
    """
    # MO files are binary data, so the file must be opened in binary mode;
    # text mode may translate line endings on some platforms.
    f = open(filename, 'rb')
    try:
        s = f.read()
    finally:
        # Release the handle even if reading fails.
        f.close()
    # "B" yields one unsigned integer per byte, whatever string type
    # read() returned.
    byte_values = struct.unpack("%dB" % len(s), s)
    print("\\x%02x" * len(s) % byte_values)
62 63
def my_swap4(result):
    """Return the low 32 bits of ``result`` with their four bytes reversed.

    Bits above the lowest 32 are discarded.
    """
    swapped = 0
    # Take the bytes from least to most significant; each new byte pushes the
    # earlier ones one position higher, so the lowest byte ends up on top.
    for shift in (0, 8, 16, 24):
        swapped = (swapped << 8) | ((result >> shift) & 0xff)
    return swapped
71 72
def hashpjw(str_param):
    """Compute the 32-bit hash of a string that is used for the MO hash table.

    Each character shifts the running value left by four bits and adds the
    character code; whenever the top four bits of the 32-bit word become
    non-zero they are folded back into the low bits and cleared.

    :param str_param: the key to hash.  Its items may be one-character strings
        or integers (as produced by iterating a ``bytearray``).
    :return: the hash value as a non-negative integer.
    """
    HASHWORDBITS = 32
    top_nibble_mask = 0xf << (HASHWORDBITS - 4)
    hval = 0
    for char in str_param:
        # Byte containers may yield integers directly; ord() only accepts
        # one-character strings.
        if not isinstance(char, int):
            char = ord(char)
        hval = (hval << 4) + char
        g = hval & top_nibble_mask
        if g != 0:
            # Fold the overflowing nibble back in, then clear it so the value
            # stays within HASHWORDBITS bits.
            hval ^= g >> (HASHWORDBITS - 8)
            hval ^= g
    return hval
86 87
def get_next_prime_number(start):
    """Find the smallest prime number that is greater than or equal to "start".

    :param start: integer at which the search begins; values below 2 yield 2.
    :return: the first prime ``p`` with ``p >= start``.
    """

    def is_prime(num):
        """Return True if the integer ``num`` is prime."""
        if num < 2:
            return False
        # A composite number always has a divisor no larger than its square
        # root, so trial division can stop there.  Using integer arithmetic
        # only keeps the result independent of how "/" divides.
        divider = 2
        while divider * divider <= num:
            if num % divider == 0:
                return False
            divider += 1
        return True

    candidate = start
    while not is_prime(candidate):
        candidate += 1
    return candidate
class mounit(base.TranslationUnit):
    """A single translation message as stored in a .mo file."""

    def __init__(self, source=None, encoding=None):
        # The units are very simple, so the encoding argument is accepted
        # but not used for now.
        self.msgidcomments = []
        self.msgctxt = []
        super(mounit, self).__init__(source)

    def getcontext(self):
        """Return the message context as one string.

        Returns None when ``msgctxt`` has been set to None.
        """
        # KDE-style comments still need to be handled here.
        context_parts = self.msgctxt
        if context_parts is not None:
            return "".join(context_parts)
        return None

    def isheader(self):
        """Tell whether this unit is the header entry (empty source)."""
        is_empty_source = self.source == u""
        return is_empty_source

    def istranslatable(self):
        """Tell whether this message can be translated (non-empty source)."""
        if self.source:
            return True
        return False
132 133
class mofile(poheader.poheader, base.TranslationStore):
    """A class representing a .mo file."""
    UnitClass = mounit
    Name = _("Gettext MO file")
    Mimetypes = ["application/x-gettext-catalog", "application/x-mo"]
    Extensions = ["mo", "gmo"]
    _binary = True

    def __init__(self, inputfile=None, unitclass=mounit):
        """Create the store and, if ``inputfile`` is given, parse it.

        :param inputfile: optional input handed to ``parsestring``.
        :param unitclass: class recorded as the store's ``UnitClass``.
        """
        self.UnitClass = unitclass
        base.TranslationStore.__init__(self, unitclass=unitclass)
        self.filename = ''
        self._encoding = "UTF-8"
        if inputfile is not None:
            self.parsestring(inputfile)

    def __str__(self):
        """Output a string representation of the MO data file.

        Only units with a non-empty target are written.  The result consists
        of the seven-integer header, the key and value index tables, the hash
        table, the NUL-terminated keys and the NUL-terminated values.  The
        header and the tables are packed in native byte order.
        """
        # check the header of this file for the copyright note of this function

        def add_to_hash_table(string, i):
            """Record entry number ``i`` (stored as i + 1) for key ``string``."""
            V = hashpjw(string)
            # Taken from gettext-0.17:gettext-tools/src/write-mo.c:408-409
            if hash_size <= 2:
                S = 3
            else:
                S = hash_size
            hash_cursor = V % S
            orig_hash_cursor = hash_cursor
            increment = 1 + (V % (S - 2))
            # Probe with a fixed step until a free slot (value 0) is found.
            while True:
                index = hash_table[hash_cursor]
                if index == 0:
                    hash_table[hash_cursor] = i + 1
                    break
                hash_cursor += increment
                hash_cursor = hash_cursor % S
                assert (hash_cursor != orig_hash_cursor)

        # hash_size should be the smallest prime number that is greater
        # or equal (4 / 3 * N) - where N is the number of keys/units.
        # see gettext-0.17:gettext-tools/src/write-mo.c:406
        hash_size = get_next_prime_number(int((len(self.units) * 4) / 3))
        if hash_size <= 2:
            hash_size = 3
        MESSAGES = {}
        for unit in self.units:
            # Plural forms are stored as NUL-separated strings.
            if isinstance(unit.source, multistring):
                source = "".join(unit.msgidcomments) + \
                         "\0".join(unit.source.strings)
            else:
                source = "".join(unit.msgidcomments) + unit.source
            if unit.msgctxt:
                # The context is prepended to the key, separated by EOT.
                source = "".join(unit.msgctxt) + "\x04" + source
            if isinstance(unit.target, multistring):
                target = "\0".join(unit.target.strings)
            else:
                target = unit.target
            if unit.target:
                MESSAGES[source.encode("utf-8")] = target
        # using "I" works for 32- and 64-bit systems, but not for 16-bit!
        hash_table = array.array("I", [0] * hash_size)
        # the keys are sorted in the .mo file
        keys = sorted(MESSAGES)
        offsets = []
        id_chunks = []
        str_chunks = []
        ids_length = 0
        strs_length = 0
        for i, msgid in enumerate(keys):
            # For each string, we need size and file offset.  Each string is
            # NUL terminated; the NUL does not count into the size.
            # TODO: We don't do any encoding detection from the PO Header
            add_to_hash_table(msgid, i)
            string = MESSAGES[msgid]  # msgid already encoded for use as dictionary key
            if isinstance(string, unicode):
                string = string.encode('utf-8')
            offsets.append((ids_length, len(msgid), strs_length, len(string)))
            # Collect the pieces and join them once at the end instead of
            # re-copying a growing string on every iteration.
            id_chunks.append(msgid + '\0')
            str_chunks.append(string + '\0')
            ids_length += len(msgid) + 1
            strs_length += len(string) + 1
        ids = ''.join(id_chunks)
        strs = ''.join(str_chunks)
        # The header is 7 32-bit unsigned integers
        keystart = 7 * 4 + 16 * len(keys) + hash_size * 4
        # and the values start after the keys
        valuestart = keystart + len(ids)
        koffsets = []
        voffsets = []
        # The string table first has the list of keys, then the list of values.
        # Each entry has first the size of the string, then the file offset.
        for o1, l1, o2, l2 in offsets:
            koffsets.extend([l1, o1 + keystart])
            voffsets.extend([l2, o2 + valuestart])
        offsets = koffsets + voffsets
        output = struct.pack("Iiiiiii",
                             MO_MAGIC_NUMBER,        # Magic
                             0,                      # Version
                             len(keys),              # # of entries
                             7 * 4,                  # start of key index
                             7 * 4 + len(keys) * 8,  # start of value index
                             hash_size,              # size of hash table
                             7 * 4 + 2 * (len(keys) * 8))  # offset of hash table
        # additional data is not necessary for empty mo files
        if len(keys) > 0:
            output = output + array.array("i", offsets).tostring()
            output = output + hash_table.tostring()
            output = output + ids
            output = output + strs
        return output

    def parse(self, input):
        """parses the given file or file source string

        :param input: either the content of an MO file, or an object with a
            ``read`` method (it is read completely and then closed).
        :raises ValueError: if the data does not start with the MO magic
            number in either byte order, or if the major revision of the
            file format is 1 or higher.
        """
        if hasattr(input, 'name'):
            self.filename = input.name
        elif not getattr(self, 'filename', ''):
            self.filename = ''
        if hasattr(input, "read"):
            mosrc = input.read()
            input.close()
            input = mosrc
        # The magic number tells us in which byte order the file was written.
        little, = struct.unpack("<L", input[:4])
        big, = struct.unpack(">L", input[:4])
        if little == MO_MAGIC_NUMBER:
            endian = "<"
        elif big == MO_MAGIC_NUMBER:
            endian = ">"
        else:
            raise ValueError("This is not an MO file")
        magic, revision, lenkeys, startkey, \
        startvalue, sizehash, offsethash = struct.unpack("%sLLiiiii" % endian,
                                                         input[:(7 * 4)])
        # The revision is one 32-bit word: the major revision is its upper
        # half and the minor revision its lower half.  Decoding the whole
        # word first gives the same answer for both byte orders; reading two
        # separate 16-bit fields would swap the halves in little-endian files.
        version_maj = revision >> 16
        version_min = revision & 0xffff
        if version_maj >= 1:
            raise ValueError("""Unable to process version %d.%d MO files""" % (version_maj, version_min))
        for i in range(lenkeys):
            # Every index entry is a (length, offset) pair of 32-bit integers.
            nextkey = startkey + (i * 2 * 4)
            nextvalue = startvalue + (i * 2 * 4)
            klength, koffset = struct.unpack("%sii" % endian,
                                             input[nextkey:nextkey + (2 * 4)])
            vlength, voffset = struct.unpack("%sii" % endian,
                                             input[nextvalue:nextvalue + (2 * 4)])
            source = input[koffset:koffset + klength]
            context = None
            if "\x04" in source:
                # Only the first EOT separates the context from the message;
                # splitting once avoids an unpacking error on further EOTs.
                context, source = source.split("\x04", 1)
            # Still need to handle KDE comments
            source = multistring(source.split("\0"), encoding=self._encoding)
            if source == "":
                # The entry with the empty key is the header; take the
                # encoding for the following entries from its charset.
                charset = re.search("charset=([^\\s]+)",
                                    input[voffset:voffset + vlength])
                if charset:
                    self._encoding = po.encodingToUse(charset.group(1))
            target = multistring(input[voffset:voffset + vlength].split("\0"),
                                 encoding=self._encoding)
            newunit = mounit(source)
            newunit.settarget(target)
            if context is not None:
                newunit.msgctxt.append(context)
            self.addunit(newunit)
286