Package translate :: Package storage :: Module wordfast
[hide private]
[frames] | no frames]

Source Code for Module translate.storage.wordfast

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  #  
  4  # Copyright 2007 Zuza Software Foundation 
  5  #  
  6  # This file is part of translate. 
  7  # 
  8  # translate is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  #  
 13  # translate is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with translate; if not, write to the Free Software 
 20  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 21   
 22  """Manage the Wordfast Translation Memory format 
 23   
 24     Wordfast TM format is the Translation Memory format used by the  
 25     U{Wordfast<http://www.wordfast.net/>} computer aided translation tool. 
 26   
 27     It is a bilingual base class derived format with L{WordfastTMFile} 
 28     and L{WordfastUnit} providing file and unit level access. 
 29   
 30     Wordfast tools 
 31     ============== 
 32     Wordfast is a computer aided translation tool.  It is an application 
 33     built on top of Microsoft Word and is implemented as a rather  
 34     sophisticated set of macros.  Understanding that helps us understand 
 35     many of the seemingly strange choices around this format including: 
 36     encoding, escaping and file naming. 
 37   
 38     Implementation 
 39     ============== 
 40     The implementation covers the full requirements of a Wordfast TM file. 
 41     The files are simple Tab Separated Value (TSV) files that can be read  
 42     by Microsoft Excel and other spreadsheet programs.  They use the .txt  
 43     extension which does make it more difficult to automatically identify  
 44     such files. 
 45   
 46     The dialect of the TSV files is specified by L{WordfastDialect}. 
 47   
 48     Encoding 
 49     -------- 
 50     The files are UTF-16 or ISO-8859-1 (Latin1) encoded.  These choices 
 51     are most likely because Microsoft Word is the base editing tool for 
 52     Wordfast. 
 53   
 54     The format is tab separated so we are able to detect UTF-16 vs Latin-1  
 55     by searching for the occurrence of a UTF-16 tab character and then 
 56     continuing with the parsing. 
 57   
 58     Timestamps 
 59     ---------- 
 60     L{WordfastTime} allows for the correct management of the Wordfast 
 61     YYYYMMDD~HHMMSS timestamps.  However, timestamps on individual units are  
 62     not updated when edited. 
 63   
 64     Header 
 65     ------ 
 66     L{WordfastHeader} provides header management support.  The header  
 67     functionality is fully implemented through observing the behaviour of the 
 68     files in real use cases, input from the Wordfast programmers and  
 69     public documentation. 
 70   
 71     Escaping 
 72     -------- 
 73     Wordfast TM implements a form of escaping that covers two aspects: 
 74       1. Placeable: bold, formatting, etc.  These are left as is and ignored. 
 75          It is up to the editor and future placeable implementation to manage 
 76          these. 
 77       2. Escapes: items that may confuse Excel or translators are  
 78          escaped as &'XX;. These are fully implemented and are converted to 
 79          and from Unicode.  By observing behaviour and reading documentation 
 80          we were able to observe all possible escapes. Unfortunately the 
 81          escaping differs slightly between Windows and Mac version.  This 
 82          might cause errors in future. 
 83     Functions allow for L{conversion to Unicode<_wf_to_char>} and L{back to  
 84     Wordfast escapes<_char_to_wf>}. 
 85   
 86     Extended Attributes 
 87     ------------------- 
 88     The last 4 columns allow users to define and manage extended attributes. 
 89     These are left as is and are not directly managed by our implementation. 
 90  """ 
 91   
 92  import csv 
 93  import sys 
 94  import time 
 95  from translate.storage import base 
 96   
 97  WF_TIMEFORMAT = "%Y%m%d~%H%M%S" 
 98  """Time format used by Wordfast""" 
 99   
100  WF_FIELDNAMES_HEADER = ["date", "userlist", "tucount", "src-lang", "version", "target-lang", "license", "attr1list", "attr2list", "attr3list", "attr4list", "attr5list"] 
101  """Field names for the Wordfast header""" 
102   
103  WF_FIELDNAMES = ["date", "user", "reuse", "src-lang", "source", "target-lang", "target", "attr1", "attr2", "attr3", "attr4"] 
104  """Field names for a Wordfast TU""" 
105   
106  WF_FIELDNAMES_HEADER_DEFAULTS = { 
107  "date": "%19000101~121212",  
108  "userlist": "%User ID,TT,TT Translate-Toolkit",  
109  "tucount": "%TU=00000001",  
110  "src-lang": "%EN-US",  
111  "version": "%Wordfast TM v.5.51w9/00",  
112  "target-lang": "",  
113  "license": "%---00000001",  
114  "attr1list": "",  
115  "attr2list": "",  
116  "attr3list": "",  
117  "attr4list": "" } 
118  """Default or minimum header entries for a Wordfast file""" 
119   
# TODO Needs validation.  The Unicode values below still have to be checked
# against a real WF TM file.  For now they look correct; they were taken from
# the Windows CP1252 and Macintosh code points found for the respective
# character sets on Linux.
WF_ESCAPE_MAP = (
    # The ampersand must stay first so that the "&" inside escapes produced
    # for later entries is not escaped again.
    ("&'26;", u"\u0026"),  # & - Ampersand
    ("&'82;", u"\u201A"),  # ‚ - Single low-9 quotation mark
    ("&'85;", u"\u2026"),  # … - Ellipsis
    ("&'91;", u"\u2018"),  # ‘ - Left single quotation mark
    ("&'92;", u"\u2019"),  # ’ - Right single quotation mark
    ("&'93;", u"\u201C"),  # “ - Left double quotation mark
    ("&'94;", u"\u201D"),  # ” - Right double quotation mark
    ("&'96;", u"\u2013"),  # – - En dash (validate)
    ("&'97;", u"\u2014"),  # — - Em dash (validate)
    ("&'99;", u"\u2122"),  # ™ - Trade mark

    # Windows only
    ("&'A0;", u"\u00A0"),  # Non breaking space (NBSP)
    ("&'A9;", u"\u00A9"),  # © - Copyright
    ("&'AE;", u"\u00AE"),  # ® - Registered
    ("&'BC;", u"\u00BC"),  # ¼
    ("&'BD;", u"\u00BD"),  # ½
    ("&'BE;", u"\u00BE"),  # ¾

    # Mac only
    ("&'A8;", u"\u00AE"),  # ® - Registered
    ("&'AA;", u"\u2122"),  # ™ - Trade mark
    ("&'C7;", u"\u00AB"),  # « - Left-pointing double angle quotation mark
    ("&'C8;", u"\u00BB"),  # » - Right-pointing double angle quotation mark
    ("&'C9;", u"\u2026"),  # … - Horizontal ellipsis
    ("&'CA;", u"\u00A0"),  # Non breaking space (NBSP)
    ("&'D0;", u"\u2013"),  # – - En dash (validate)
    ("&'D1;", u"\u2014"),  # — - Em dash (validate)
    ("&'D2;", u"\u201C"),  # “ - Left double quotation mark
    ("&'D3;", u"\u201D"),  # ” - Right double quotation mark
    ("&'D4;", u"\u2018"),  # ‘ - Left single quotation mark
    ("&'D5;", u"\u2019"),  # ’ - Right single quotation mark
    ("&'E2;", u"\u201A"),  # ‚ - Single low-9 quotation mark
    ("&'E3;", u"\u201E"),  # „ - Double low-9 quotation mark

    # Other markers
    #("&'B;", u"\n"), # Soft-break - XXX creates a problem with roundtripping could also be represented by \u2028
)
"""Mapping of Wordfast &'XX; escapes to correct Unicode characters"""

# Two bytes: a NUL byte followed by the ASCII tab byte.
TAB_UTF16 = "\x00\x09"
"""The tab \\t character as it would appear in UTF-16 encoding"""
164   
def _char_to_wf(string):
    """Char -> Wordfast &'XX; escapes

    Every character listed in L{WF_ESCAPE_MAP} is replaced by its escape code
    (the map is walked in order), after which NEWLINE and TAB are written as
    the two-character sequences \\n and \\t.

    Full roundtripping is not possible because of the escaping of NEWLINE \\n
    and TAB \\t

    @param string: UTF-8 encoded text; a false value (None, "") is returned
        unchanged
    @return: the escaped text
    """
    # FIXME there is no platform check to ensure that we use Mac encodings when running on a Mac
    if not string:
        return string
    for escape, character in WF_ESCAPE_MAP:
        string = string.replace(character.encode('utf-8'), escape)
    return string.replace("\n", "\\n").replace("\t", "\\t")
176
def _wf_to_char(string):
    """Wordfast &'XX; escapes -> Char

    Every escape code listed in L{WF_ESCAPE_MAP} is replaced by the UTF-8
    encoding of its character, after which the two-character sequences \\n
    and \\t are turned back into NEWLINE and TAB.

    The ampersand escape is decoded last.  Decoding it first would turn an
    escaped literal such as C{&'26;'85;} (the text C{&'85;}) into C{&'85;}
    and then wrongly into an ellipsis.

    @param string: UTF-8 encoded text containing Wordfast escapes; a false
        value (None, "") is returned unchanged
    @return: the unescaped text
    """
    if not string:
        return string
    ampersand_entries = []
    for code, char in WF_ESCAPE_MAP:
        if char == u"\u0026":
            # Postpone: the "&" it produces must not start a new escape
            ampersand_entries.append((code, char))
            continue
        string = string.replace(code, char.encode('utf-8'))
    for code, char in ampersand_entries:
        string = string.replace(code, char.encode('utf-8'))
    return string.replace("\\n", "\n").replace("\\t", "\t")
184
class WordfastDialect(csv.Dialect):
    """Describe the properties of a Wordfast generated TAB-delimited file."""
    # Fields are separated by tabs, rows end in CRLF and nothing is quoted.
    lineterminator = "\r\n"
    delimiter = "\t"
    quoting = csv.QUOTE_NONE
    if sys.version_info[:3] < (2, 5, 0):
        # We need to define the following items for csv in Python < 2.5
        quotechar = '"'
        escapechar = None
        skipinitialspace = False
        doublequote = False
        # Wordfast does not quote anything, since we escape \t anyway in
        # _char_to_wf this should not be a problem
        quoting = csv.QUOTE_MINIMAL


# Make the dialect available to csv readers/writers as dialect="wordfast"
csv.register_dialect("wordfast", WordfastDialect)
class WordfastTime(object):
    """Manages time stamps in the Wordfast format of YYYYMMDD~hhmmss"""

    def __init__(self, newtime=None):
        """Create a time stamp holder

        @param newtime: a Wordfast time string, a time.struct_time, or a
            false value for "no time"; any other value leaves the time unset
        """
        self._time = None
        if not newtime:
            self.time = None
            return
        if isinstance(newtime, basestring):
            self.timestring = newtime
            return
        if isinstance(newtime, time.struct_time):
            self.time = newtime

    def get_timestring(self):
        """Get the time in the Wordfast time format

        @return: the formatted time, or None when no time is set
        """
        if self._time:
            return time.strftime(WF_TIMEFORMAT, self._time)
        return None

    def set_timestring(self, timestring):
        """Set the time_struct object using a Wordfast time formatted string

        @param timestring: A Wordfast time string (YYYYMMDD~hhmmss)
        @type timestring: String
        """
        parsed = time.strptime(timestring, WF_TIMEFORMAT)
        self._time = parsed
    timestring = property(get_timestring, set_timestring)

    def get_time(self):
        """Get the time_struct object"""
        return self._time

    def set_time(self, newtime):
        """Set the time_struct object

        Anything that is not a time.struct_time clears the stored time.

        @param newtime: a new time object
        @type newtime: time.struct_time
        """
        accepted = newtime and isinstance(newtime, time.struct_time)
        self._time = newtime if accepted else None
    time = property(get_time, set_time)

    def __str__(self):
        # An unset time is rendered as the empty string
        return self.timestring or ""
248
class WordfastHeader(object):
    """A wordfast translation memory header"""

    def __init__(self, header=None):
        """Create a header

        @param header: a dictionary of header fields; a false value creates
            a default header.  Any other non-dict value is ignored and leaves
            the header empty.
        """
        # Start with an empty dict (not a list) so the setters below keep
        # working even when an unusable header value was passed in.
        self._header_dict = {}
        if not header:
            self.header = self._create_default_header()
        elif isinstance(header, dict):
            self.header = header

    def _create_default_header(self):
        """Create a default Wordfast header with the date set to the current time"""
        # Work on a copy: the module-level defaults must not be modified,
        # otherwise every default header would share (and alter) one dict.
        defaultheader = dict(WF_FIELDNAMES_HEADER_DEFAULTS)
        defaultheader['date'] = '%%%s' % WordfastTime(time.localtime()).timestring
        return defaultheader

    def getheader(self):
        """Get the header dictionary"""
        return self._header_dict

    def setheader(self, newheader):
        """Replace the header dictionary (stored as given, not copied)"""
        self._header_dict = newheader
    header = property(getheader, setheader)

    def settargetlang(self, newlang):
        """Store the target language, prefixed with '%'"""
        self._header_dict['target-lang'] = '%%%s' % newlang
    # Write-only property
    targetlang = property(None, settargetlang)

    def settucount(self, count):
        """Store the unit count as '%TU=' followed by 8 zero padded digits

        @param count: number of translation units
        @type count: int
        """
        self._header_dict['tucount'] = '%%TU=%08d' % count
    # Write-only property
    tucount = property(None, settucount)
279
class WordfastUnit(base.TranslationUnit):
    """A Wordfast translation memory unit"""

    def __init__(self, source=None):
        """Create a unit, optionally with an initial source text"""
        self._dict = {}
        if source:
            self.source = source
        super(WordfastUnit, self).__init__(source)

    def _update_timestamp(self):
        """Refresh the timestamp for the unit"""
        now = WordfastTime(time.localtime())
        self._dict['date'] = now.timestring

    def getdict(self):
        """Get the dictionary of values for a Wordfast line"""
        return self._dict

    def setdict(self, newdict):
        """Set the dictionary of values for a Wordfast line

        @param newdict: a new dictionary with Wordfast line elements
        @type newdict: Dict
        """
        # TODO First check that the values are OK
        self._dict = newdict
    dict = property(getdict, setdict)

    def _get_source_or_target(self, key):
        """Return the unescaped, decoded text stored under key

        @param key: 'source' or 'target'
        @return: None when the field is missing or None, "" when it is
            empty, otherwise the unescaped text decoded from UTF-8
        """
        stored = self._dict.get(key, None)
        if stored is None:
            return None
        if not stored:
            return ""
        return _wf_to_char(stored).decode('utf-8')

    def _set_source_or_target(self, key, newvalue):
        """Escape newvalue and store it under key

        The timestamp is refreshed only when a new or different non-None
        value is stored.

        @param key: 'source' or 'target'
        @param newvalue: the new text; unicode is encoded to UTF-8 first
        """
        if newvalue is None:
            # Stored as is; the timestamp is not refreshed in this case
            self._dict[key] = None
            return
        if isinstance(newvalue, unicode):
            newvalue = newvalue.encode('utf-8')
        escaped = _char_to_wf(newvalue)
        if key in self._dict and escaped == self._dict[key]:
            return
        self._dict[key] = escaped
        self._update_timestamp()

    def getsource(self):
        return self._get_source_or_target('source')

    def setsource(self, newsource):
        return self._set_source_or_target('source', newsource)
    source = property(getsource, setsource)

    def gettarget(self):
        return self._get_source_or_target('target')

    def settarget(self, newtarget):
        return self._set_source_or_target('target', newtarget)
    target = property(gettarget, settarget)

    def settargetlang(self, newlang):
        """Store the target language of the unit as given"""
        self._dict['target-lang'] = newlang
    # Write-only property
    targetlang = property(None, settargetlang)

    def __str__(self):
        return str(self._dict)

    def istranslated(self):
        """A unit counts as translated when source and target are both non-empty"""
        has_source = bool(self._dict.get('source', None))
        return has_source and bool(self._dict.get('target', None))
349 350
class WordfastTMFile(base.TranslationStore):
    """A Wordfast translation memory file"""
    Name = _("Wordfast Translation Memory")
    Mimetypes = ["application/x-wordfast"]
    Extensions = ["txt"]

    def __init__(self, inputfile=None, unitclass=WordfastUnit):
        """construct a Wordfast TM, optionally reading in from inputfile.

        @param inputfile: a file object or a string holding the file
            contents; parsed immediately when not None
        @param unitclass: the unit class handed to the base store
        """
        self.UnitClass = unitclass
        base.TranslationStore.__init__(self, unitclass=unitclass)
        self.filename = ''
        self.header = WordfastHeader()
        self._encoding = 'iso-8859-1'
        if inputfile is not None:
            self.parse(inputfile)

    def parse(self, input):
        """parses the given file or file source string

        The first line becomes the header, every following line a
        L{WordfastUnit} that is added to the store.

        @param input: a file object (read completely, then closed) or a
            string holding the raw file contents
        @raise ValueError: if the contents cannot be decoded with the
            detected encoding
        """
        if hasattr(input, 'name'):
            self.filename = input.name
        elif not getattr(self, 'filename', ''):
            self.filename = ''
        if hasattr(input, "read"):
            # Close the file even when reading fails
            try:
                tmsrc = input.read()
            finally:
                input.close()
            input = tmsrc
        # A UTF-16 encoded tab in the first line identifies a UTF-16 file,
        # anything else is treated as Latin-1
        if TAB_UTF16 in input.split("\n")[0]:
            self._encoding = 'utf-16'
        else:
            self._encoding = 'iso-8859-1'
        try:
            input = input.decode(self._encoding).encode('utf-8')
        except UnicodeError:
            # Only encoding problems are translated; anything else (for
            # example KeyboardInterrupt) propagates untouched
            raise ValueError("Wordfast files are either UTF-16 (UCS2) or ISO-8859-1 encoded")
        rows = input.split("\n")
        for header in csv.DictReader(rows[:1], fieldnames=WF_FIELDNAMES_HEADER, dialect="wordfast"):
            self.header = WordfastHeader(header)
        for line in csv.DictReader(rows[1:], fieldnames=WF_FIELDNAMES, dialect="wordfast"):
            newunit = WordfastUnit()
            newunit.dict = line
            self.addunit(newunit)

    def __str__(self):
        """Serialise the store to an encoded Wordfast TM string

        Only units for which istranslated() is true are written, and the
        header unit count is set to their number.

        @return: "" when there is no translated unit; otherwise the header
            line followed by the unit lines, encoded with the encoding
            detected by parse (Latin-1 by default), or UTF-16 if the text
            cannot be represented in that encoding
        """
        output = csv.StringIO()
        writer = csv.DictWriter(output, fieldnames=WF_FIELDNAMES, dialect="wordfast")
        unit_count = 0
        for unit in self.units:
            if unit.istranslated():
                unit_count += 1
                writer.writerow(unit.dict)
        if unit_count == 0:
            return ""
        # The header is written after the units because it carries the count
        self.header.tucount = unit_count
        header_output = csv.StringIO()
        outheader = csv.DictWriter(header_output, fieldnames=WF_FIELDNAMES_HEADER, dialect="wordfast")
        outheader.writerow(self.header.header)
        decoded = (header_output.getvalue() + output.getvalue()).decode('utf-8')
        try:
            return decoded.encode(self._encoding)
        except UnicodeEncodeError:
            return decoded.encode('utf-16')
413