Package translate :: Package storage :: Module poparser
[hide private]
[frames | no frames]

Source Code for Module translate.storage.poparser

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  #  
  4  # Copyright 2002-2007 Zuza Software Foundation 
  5  #  
  6  # This file is part of translate. 
  7  # 
  8  # translate is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  #  
 13  # translate is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with translate; if not, write to the Free Software 
 20  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 21   
 22  import re 
 23   
 24  """ 
 25  From the GNU gettext manual: 
 26       WHITE-SPACE 
 27       #  TRANSLATOR-COMMENTS 
 28       #. AUTOMATIC-COMMENTS 
 29       #| PREVIOUS MSGID                 (Gettext 0.16 - check if this is the correct position - not yet implemented) 
 30       #: REFERENCE... 
 31       #, FLAG... 
 32       msgctxt CONTEXT                   (Gettext 0.15) 
 33       msgid UNTRANSLATED-STRING 
 34       msgstr TRANSLATED-STRING 
 35  """ 
 36   
 37  isspace = str.isspace        # unbound str methods bound to module-level names; called as isspace(s)
 38  find = str.find              # called as find(s, sub[, start])
 39  rfind = str.rfind            # called as rfind(s, sub)
 40  startswith = str.startswith  # called as startswith(s, prefix)
 41  append = list.append         # called as append(lst, item)
 42  decode = str.decode          # called as decode(s, encoding); NOTE(review): str.decode exists on Python 2 only
 43   
class ParseState(object):
    """Cursor over an iterator of input lines with one line of lookahead.

    ``next_line`` holds the line that has not been consumed yet ('' once the
    iterator is exhausted), ``eof`` is set when the iterator raises
    StopIteration, ``encoding`` is what ``decode`` uses when it is not None,
    and ``UnitClass`` is stored for callers that need to create units.
    """

    def __init__(self, input_iterator, UnitClass, encoding = None):
        self.UnitClass = UnitClass
        self.encoding = encoding
        self._input_iterator = input_iterator
        self.eof = False
        self.next_line = ''
        # Prime the lookahead: after this call next_line is the first line
        # that is not whitespace-only (or '' with eof set).
        self.read_line()

    def decode(self, string):
        """Return string decoded with self.encoding, or unchanged if no encoding is set."""
        if self.encoding is None:
            return string
        return decode(string, self.encoding)

    def read_line(self):
        """Return the current lookahead line and advance past whitespace-only lines."""
        consumed = self.next_line
        if not self.eof:
            try:
                self.next_line = self._input_iterator.next()
                while not self.eof and isspace(self.next_line):
                    self.next_line = self._input_iterator.next()
            except StopIteration:
                # Input exhausted: leave an empty lookahead and remember it.
                self.next_line = ''
                self.eof = True
        return consumed

    def new_input(self, _input):
        """Return a fresh ParseState over _input with the same UnitClass and encoding."""
        return ParseState(_input, self.UnitClass, self.encoding)
74
def read_prevmsgid_lines(parse_state):
    """Consume the consecutive lines that start with '#| '.

    These lines carry the previous msgid and msgctxt information. Each line
    is returned with its three-character '#| ' prefix removed; reading stops
    at the first lookahead line that does not start with '#| '.
    """
    collected = []
    while startswith(parse_state.next_line, '#| '):
        append(collected, parse_state.read_line()[3:])
    return collected
85
def parse_prev_msgctxt(parse_state, unit):
    """Parse a 'msgctxt' entry into unit.prev_msgctxt; return True if it is non-empty afterwards."""
    keyword = 'msgctxt'
    target = unit.prev_msgctxt
    parse_message(parse_state, keyword, len(keyword), target)
    return len(target) > 0
89
def parse_prev_msgid(parse_state, unit):
    """Parse a 'msgid' entry into unit.prev_msgid; return True if it is non-empty afterwards."""
    keyword = 'msgid'
    target = unit.prev_msgid
    parse_message(parse_state, keyword, len(keyword), target)
    return len(target) > 0
93
def parse_prev_msgid_plural(parse_state, unit):
    """Parse a 'msgid_plural' entry into unit.prev_msgid_plural; return True if it is non-empty afterwards."""
    keyword = 'msgid_plural'
    target = unit.prev_msgid_plural
    parse_message(parse_state, keyword, len(keyword), target)
    return len(target) > 0
97
def parse_comment(parse_state, unit):
    """Consume one comment line (or one run of '#| ' lines) into ``unit``.

    The character after the leading '#' selects the destination:
    '#.' -> unit.automaticcomments, '#:' -> unit.sourcecomments,
    '#,' -> unit.typecomments, '#| ' -> previous msgctxt / msgid /
    msgid_plural, anything else -> unit.othercomments.

    Returns the consumed line for an ordinary comment, the new lookahead line
    after a '#| ' run (which is '' at end of input), and None when the
    lookahead is not a comment or is an obsolete '#~' line.
    """
    next_line = parse_state.next_line
    if next_line[:1] != '#':
        return None

    # Slice rather than index: a bare '#' (e.g. the final line of a file with
    # no trailing newline) has no second character, and next_line[1] raised
    # IndexError. Such a line now lands in othercomments like '#\n' does.
    next_char = next_line[1:2]

    if next_char == '~':
        # Special case: we refuse to parse obsoletes: they are done
        # elsewhere to ensure we reuse the normal unit parsing code
        return None

    if next_char == '|' and next_line[2:3] == ' ':
        # Read all the lines starting with '#| '
        prevmsgid_lines = read_prevmsgid_lines(parse_state)
        # Create a parse state object that holds these lines
        ps = parse_state.new_input(iter(prevmsgid_lines))
        # Parse the msgctxt, msgid and msgid_plural, each only if present
        parse_prev_msgctxt(ps, unit)
        parse_prev_msgid(ps, unit)
        parse_prev_msgid_plural(ps, unit)
        return parse_state.next_line

    if next_char == '.':
        comments = unit.automaticcomments
    elif next_char == ':':
        comments = unit.sourcecomments
    elif next_char == ',':
        comments = unit.typecomments
    else:
        # Also reached for '#|' without the following space. The
        # previous-msgid reader only consumes lines starting with '#| ', so
        # such a line used to be returned unconsumed and the caller's loop
        # never terminated. Keeping it as an ordinary comment guarantees
        # progress and preserves the text.
        comments = unit.othercomments
    comments.append(parse_state.decode(next_line))
    return parse_state.read_line()
129
def parse_comments(parse_state, unit):
    """Consume comment lines via parse_comment until it returns a false value.

    Returns True if the first parse_comment call returned a true value,
    otherwise None.
    """
    if parse_comment(parse_state, unit):
        while parse_comment(parse_state, unit):
            pass
        return True
    return None
137
def read_obsolete_lines(parse_state):
    """Read all the lines belonging to the current unit if it is obsolete.

    Returns the consumed lines with their '#~ ' prefix removed, or an empty
    list when the lookahead line does not start with '#~ '.
    """
    lines = []
    if not startswith(parse_state.next_line, '#~ '):
        return lines
    append(lines, parse_state.read_line()[3:])

    # Be extra careful that we don't start reading into a new unit. We detect
    # that with '#~ msgid' followed by a space (to ensure msgid_plural works).
    upcoming = parse_state.next_line
    # A msgid directly after this unit's msgctxt still belongs to this unit.
    if startswith(upcoming, '#~ msgid ') and lines[-1].startswith('msgctxt'):
        append(lines, parse_state.read_line()[3:])
        upcoming = parse_state.next_line
    while (startswith(upcoming, '#~ ')
           and not startswith(upcoming, '#~ msgid ')
           and not startswith(upcoming, '#~ msgctxt')):
        append(lines, parse_state.read_line()[3:])
        upcoming = parse_state.next_line
    return lines
155
def parse_obsolete(parse_state, unit):
    """Parse an obsolete ('#~ ') unit, reusing the normal unit parser.

    The obsolete lines are stripped of their prefix and parsed into ``unit``
    through a new parse state. Returns the parsed unit after calling its
    makeobsolete(), or None if there were no obsolete lines or nothing parsed.
    """
    stripped = read_obsolete_lines(parse_state)
    if not stripped:
        return None
    parsed = parse_unit(parse_state.new_input(iter(stripped)), unit)
    if parsed is None:
        return None
    parsed.makeobsolete()
    return parsed
164
def parse_quoted(parse_state, start_pos = 0):
    """Consume the lookahead line if it holds a quoted string; return that string.

    The opening quote must sit at ``start_pos`` or be separated from it by
    whitespace only. The result runs from the first '"' at or after
    ``start_pos`` to the last '"' on the line, both included. If the line has
    only the opening quote, everything from it to the end of the line is
    returned with a closing '"' appended.

    Returns None, without consuming the line, when no quoted string starts at
    the expected position.
    """
    line = parse_state.next_line
    left = line.find('"', start_pos)
    if left < 0:
        # No quote at all. Without this guard the -1 was used as a slice
        # bound, so a line such as 'msgid \n' passed the whitespace check and
        # was consumed, yielding the fabricated string '\n"'.
        return None
    if left != start_pos and not line[start_pos:left].isspace():
        return None
    right = line.rfind('"')
    if left != right:
        return parse_state.read_line()[left:right + 1]
    # If there is no terminating quote
    return parse_state.read_line()[left:] + '"'
175
def parse_msg_comment(parse_state, msg_comment_list, string):
    """Append quoted strings to msg_comment_list until one contains a literal backslash-n.

    Each string is decoded through parse_state before being stored. Returns
    the quoted string that follows the one containing backslash-n (which may
    be None), or None when the quoted strings run out first.
    """
    while string is not None:
        append(msg_comment_list, parse_state.decode(string))
        ends_comment = find(string, '\\n') != -1
        string = parse_quoted(parse_state)
        if ends_comment:
            return string
    return None
183
def parse_multiple_quoted(parse_state, msg_list, msg_comment_list, first_start_pos=0):
    """Read consecutive quoted strings into msg_list.

    The first string is looked for at first_start_pos, later ones at the
    start of their line. A string starting with '"_:' is handed to
    parse_msg_comment, which fills msg_comment_list instead.
    """
    quoted = parse_quoted(parse_state, first_start_pos)
    while quoted is not None:
        if startswith(quoted, '"_:'):
            quoted = parse_msg_comment(parse_state, msg_comment_list, quoted)
        else:
            append(msg_list, parse_state.decode(quoted))
            quoted = parse_quoted(parse_state)
192
def parse_message(parse_state, start_of_string, start_of_string_len, msg_list, msg_comment_list=None):
    """Parse the quoted strings of one keyword entry into msg_list.

    Nothing is consumed unless the lookahead line starts with
    start_of_string. start_of_string_len is the position at which the first
    quoted string is looked for. When msg_comment_list is None, a throwaway
    list receives any '"_:' comment strings.
    """
    comments = [] if msg_comment_list is None else msg_comment_list
    if not startswith(parse_state.next_line, start_of_string):
        return None
    return parse_multiple_quoted(parse_state, msg_list, comments, start_of_string_len)
198
def parse_msgctxt(parse_state, unit):
    """Parse a 'msgctxt' entry into unit.msgctxt; return True if it is non-empty afterwards."""
    keyword = 'msgctxt'
    target = unit.msgctxt
    parse_message(parse_state, keyword, len(keyword), target)
    return len(target) > 0
202
def parse_msgid(parse_state, unit):
    """Parse a 'msgid' entry into unit.msgid / unit.msgidcomments.

    Returns True if either list is non-empty afterwards.
    """
    keyword = 'msgid'
    parse_message(parse_state, keyword, len(keyword), unit.msgid, unit.msgidcomments)
    if len(unit.msgid) > 0:
        return True
    return len(unit.msgidcomments) > 0
206
def parse_msgstr(parse_state, unit):
    """Parse a 'msgstr' entry into unit.msgstr; return True if it is non-empty afterwards."""
    keyword = 'msgstr'
    parse_message(parse_state, keyword, len(keyword), unit.msgstr)
    return len(unit.msgstr) > 0
210
def parse_msgid_plural(parse_state, unit):
    """Parse a 'msgid_plural' entry into unit.msgid_plural / unit.msgid_pluralcomments.

    Returns True if either list is non-empty afterwards.
    """
    keyword = 'msgid_plural'
    parse_message(parse_state, keyword, len(keyword),
                  unit.msgid_plural, unit.msgid_pluralcomments)
    if len(unit.msgid_plural) > 0:
        return True
    return len(unit.msgid_pluralcomments) > 0
# Offset of the plural index inside a 'msgstr[N]' line.
MSGSTR_ARRAY_ENTRY_LEN = len('msgstr[')


def add_to_dict(msgstr_dict, line, right_bracket_pos, entry):
    """Extend msgstr_dict[N] with entry, where N is the integer between 'msgstr[' and ']' in line.

    right_bracket_pos is the position of the closing ']' in line. A missing
    key is created as an empty list first.
    """
    plural_index = int(line[MSGSTR_ARRAY_ENTRY_LEN:right_bracket_pos])
    msgstr_dict.setdefault(plural_index, []).extend(entry)
222
def get_entry(parse_state, right_bracket_pos):
    """Return the quoted strings of a 'msgstr[' entry as a new list.

    The first quoted string is looked for just after right_bracket_pos. The
    list is empty when the lookahead line does not start with 'msgstr['.
    """
    strings = []
    parse_message(parse_state, 'msgstr[', right_bracket_pos + 1, strings)
    return strings
227
def parse_msgstr_array_entry(parse_state, msgstr_dict):
    """Parse one 'msgstr[N]' entry from the lookahead line into msgstr_dict.

    Returns True when an entry with at least one quoted string was stored,
    and False when no ']' is found or the entry is empty.
    """
    line = parse_state.next_line
    closing = find(line, ']', MSGSTR_ARRAY_ENTRY_LEN)
    if closing < 0:
        return False
    strings = get_entry(parse_state, closing)
    if len(strings) == 0:
        return False
    add_to_dict(msgstr_dict, line, closing, strings)
    return True
240
def parse_msgstr_array(parse_state, unit):
    """Parse consecutive 'msgstr[N]' entries and store them on unit.msgstr as a dict.

    At least one entry is required: if the first one fails, False is returned
    and unit.msgstr is left untouched. Otherwise returns True.
    """
    plural_forms = {}
    if not parse_msgstr_array_entry(parse_state, plural_forms):
        return False
    while parse_msgstr_array_entry(parse_state, plural_forms):
        pass
    unit.msgstr = plural_forms
    return True
250
def parse_plural(parse_state, unit):
    """Parse a msgid_plural followed by either a msgstr array or a plain msgstr.

    Returns True only if the msgid_plural and one of the msgstr forms were
    parsed. The msgstr parsers are not attempted without a msgid_plural.
    """
    if not parse_msgid_plural(parse_state, unit):
        return False
    if parse_msgstr_array(parse_state, unit):
        return True
    if parse_msgstr(parse_state, unit):
        return True
    return False
257
def parse_msg_entries(parse_state, unit):
    """Parse the msgctxt (if any), the msgid, and then a msgstr or a plural block.

    Returns True only if a msgid was parsed and was followed by either a
    plain msgstr or a plural block.
    """
    # The context is optional, so its result is deliberately ignored.
    parse_msgctxt(parse_state, unit)
    if not parse_msgid(parse_state, unit):
        return False
    if parse_msgstr(parse_state, unit):
        return True
    if parse_plural(parse_state, unit):
        return True
    return False
265
def parse_unit(parse_state, unit=None):
    """Parse one unit from parse_state.

    A false ``unit`` argument is replaced by a new parse_state.UnitClass().
    If obsolete lines are found, the result of the obsolete parser is
    returned. Otherwise returns the unit if comments or message entries were
    parsed, and None if neither was.
    """
    if not unit:
        unit = parse_state.UnitClass()
    had_comments = parse_comments(parse_state, unit)
    obsolete = parse_obsolete(parse_state, unit)
    if obsolete is not None:
        return obsolete
    had_entries = parse_msg_entries(parse_state, unit)
    if had_comments or had_entries:
        return unit
    return None
277
def set_encoding(parse_state, store, unit):
    """Set store._encoding and parse_state.encoding from the unit's msgstr.

    The charset is taken from the first 'charset=NAME' found in the joined
    strings of unit.msgstr, when that is a non-empty list of str. The encoding
    falls back to 'utf-8' when there is no such declaration, when NAME is the
    placeholder 'CHARSET', or when unit.msgstr is not such a list.
    """
    encoding = 'utf-8'
    msgstr = unit.msgstr
    if isinstance(msgstr, list) and len(msgstr) > 0 and isinstance(msgstr[0], str):
        # The name ends at whitespace, at the backslash of the escaped
        # newline, or at the closing quote of the string. The previous
        # character class was [^\s\\n], which also excluded the letter 'n'
        # and so truncated names such as 'windows-1251' to 'wi'.
        match = re.search(r'charset=([^\s\\"]+)', "".join(msgstr))
        if match and match.group(1) != 'CHARSET':
            encoding = match.group(1)
    store._encoding = encoding
    parse_state.encoding = store._encoding
291
def decode_list(lst, decode):
    """Return a new list holding decode(item) for every item of lst, in order."""
    decoded = []
    for item in lst:
        decoded.append(decode(item))
    return decoded
294
def decode_header(unit, decode):
    """Replace every string-holding attribute of unit with a decoded copy.

    List attributes become a decoded list. Any other attribute is treated as
    a mapping of key -> list and becomes a dict with each list decoded.
    """
    attribute_names = (
        'msgctxt', 'msgid', 'msgid_pluralcomments',
        'msgid_plural', 'msgstr', 'obsoletemsgctxt',
        'obsoletemsgid', 'obsoletemsgid_pluralcomments',
        'obsoletemsgid_plural', 'obsoletemsgstr',
        'othercomments', 'automaticcomments', 'sourcecomments',
        'typecomments', 'msgidcomments', 'obsoletemsgidcomments',
    )
    for name in attribute_names:
        current = getattr(unit, name)
        if isinstance(current, list):
            converted = decode_list(current, decode)
        else:
            converted = {}
            for key, strings in current.items():
                converted[key] = decode_list(strings, decode)
        setattr(unit, name, converted)
307
def parse_header(parse_state, store):
    """Parse the first unit, derive the encoding from it and decode it.

    Returns the decoded first unit, or None if no unit could be parsed, in
    which case no encoding is set.
    """
    header = parse_unit(parse_state)
    if header is not None:
        set_encoding(parse_state, store, header)
        decode_header(header, parse_state.decode)
    return header
315
def parse_units(parse_state, store):
    """Parse units from parse_state and add each one to store.

    The first unit goes through parse_header. Parsing stops at the first
    unit that comes back false. Returns parse_state.eof, i.e. whether the
    input was exhausted when parsing stopped.
    """
    current = parse_header(parse_state, store)
    while current:
        store.addunit(current)
        current = parse_unit(parse_state)
    return parse_state.eof
322