Package translate :: Package misc :: Module quote
[hide private]
[frames | no frames]

Source Code for Module translate.misc.quote

  1  #!/usr/bin/env python 
  2  # -*- coding: utf-8 -*- 
  3  #  
  4  # Copyright 2002-2006 Zuza Software Foundation 
  5  #  
  6  # This file is part of translate. 
  7  # 
  8  # translate is free software; you can redistribute it and/or modify 
  9  # it under the terms of the GNU General Public License as published by 
 10  # the Free Software Foundation; either version 2 of the License, or 
 11  # (at your option) any later version. 
 12  #  
 13  # translate is distributed in the hope that it will be useful, 
 14  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 15  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 16  # GNU General Public License for more details. 
 17  # 
 18  # You should have received a copy of the GNU General Public License 
 19  # along with translate; if not, write to the Free Software 
 20  # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA 
 21   
 22  """string processing utilities for extracting strings with various kinds of delimiters""" 
 23   
 24  import logging 
 25  import htmlentitydefs 
 26   
def find_all(searchin, substr):
    """Return the offsets of every non-overlapping occurrence of substr.

    The search runs left to right; after a match at offset p the next
    search starts at p + len(substr), so matches never overlap.

    An empty substr matches at every offset from 0 to len(searchin)
    inclusive (previously this case looped forever because the search
    position never advanced).
    """
    # always move forward by at least one character so that an empty
    # substr cannot stall the search
    step = len(substr) or 1
    locations = []
    location = searchin.find(substr)
    while location != -1:
        locations.append(location)
        location = searchin.find(substr, location + step)
    return locations
38
def extract(source, startdelim, enddelim, escape=None, startinstring=False, allowreentry=True):
    """Extract the delimited sections of source, delimiters included.

    source        -- the text to scan
    startdelim    -- string that opens a section
    enddelim      -- string that closes a section (may equal startdelim)
    escape        -- optional escape string; a delimiter that directly
                     follows an unescaped escape is not treated as a delimiter
    startinstring -- if true, scanning starts as if a section were already
                     open (used when a section continues from a previous line)
    allowreentry  -- if false, no new section is entered once one section
                     has been entered

    Returns a tuple (extracted, instring): the concatenated sections with
    their delimiters, and whether a section is still open at the end.
    """
    # note that this returns the delimiters as well... even internally
    instring = startinstring
    enteredonce = False
    lenstart = len(startdelim)
    lenend = len(enddelim)
    startdelim_places = find_all(source, startdelim)
    if startdelim == enddelim:
        enddelim_places = startdelim_places[:]
    else:
        enddelim_places = find_all(source, enddelim)
    if escape is not None:
        lenescape = len(escape)
        escape_places = find_all(source, escape)
        # sets give constant-time membership tests; the lists made every
        # lookup linear in the number of escapes/delimiters
        escape_lookup = set(escape_places)
        # filter escaped escapes: in a run of adjacent escapes every second
        # one is itself escaped and therefore not a "true" escape
        true_escape = False
        true_escape_places = set()
        for escape_pos in escape_places:
            if escape_pos - lenescape in escape_lookup:
                true_escape = not true_escape
            else:
                true_escape = True
            if true_escape:
                true_escape_places.add(escape_pos)
        startdelim_places = [pos for pos in startdelim_places if pos - lenescape not in true_escape_places]
        enddelim_places = [pos + lenend for pos in enddelim_places if pos - lenescape not in true_escape_places]
    else:
        enddelim_places = [pos + lenend for pos in enddelim_places]
    # get a sorted list of the significant places in the string
    # (end places are stored as the offset just past the end delimiter)
    significant_places = [0] + startdelim_places + enddelim_places + [len(source)-1]
    significant_places.sort()
    start_lookup = set(startdelim_places)
    end_lookup = set(enddelim_places)
    extracted = ""
    # None means "from the beginning of source" when used as a slice start
    lastpos = None
    for pos in significant_places:
        if instring and pos in end_lookup:
            # make sure that if startdelim == enddelim we don't get confused
            # and count the same string as start and end
            if lastpos == pos - lenstart and lastpos in start_lookup:
                continue
            extracted += source[lastpos:pos]
            instring = False
            lastpos = pos
        if (not instring) and pos in start_lookup and not (enteredonce and not allowreentry):
            instring = True
            enteredonce = True
            lastpos = pos
    if instring:
        # unterminated section: take everything up to the end of source
        extracted += source[lastpos:]
    return (extracted, instring)
90
def extractfromlines(lines, startdelim, enddelim, escape):
    """Run extract() over successive lines, carrying the open/closed state.

    Each line is scanned with the state left by the previous one.  Scanning
    stops after the first line that ends outside a delimited section; the
    pieces extracted so far are returned joined together.
    """
    pieces = []
    still_open = 0
    for line in lines:
        piece, still_open = extract(line, startdelim, enddelim, escape, still_open)
        pieces.append(piece)
        if not still_open:
            break
    return "".join(pieces)
100
def extractstr(source):
    """Return the double-quoted strings found in source, quotes included.

    A backslash is used as the escape string; the "still in string" flag
    reported by extract() is discarded.
    """
    return extract(source, '"', '"', '\\')[0]
105
def extractcomment(lines):
    """Extract <!-- ... --> XML comments (markers included) from lines.

    No escape string is used.
    """
    comment_open, comment_close = "<!--", "-->"
    return extractfromlines(lines, comment_open, comment_close, None)
109
def extractwithoutquotes(source, startdelim, enddelim, escape=None, startinstring=False, includeescapes=True, allowreentry=True):
    """Extract the delimited sections of source without their delimiters.

    source         -- the text to scan
    startdelim     -- string that opens a section
    enddelim       -- string that closes a section (may equal startdelim)
    escape         -- optional escape string; a delimiter that directly
                      follows an unescaped escape is not treated as a delimiter
    startinstring  -- if true, scanning starts as if a section were already open
    includeescapes -- true: escape sequences are left untouched;
                      false: the escape string is removed, the escaped
                      character is kept;
                      callable: called with the escape plus the following
                      character and should return the replacement string
                      (a non-string result is treated as the old boolean
                      protocol: true keeps the sequence, false keeps only
                      the escaped character)
    allowreentry   -- if false, no new section is entered once one section
                      has been entered

    Returns a tuple (extracted, instring): the concatenated section contents
    and whether a section is still open at the end of source.
    """
    instring = startinstring
    enteredonce = False
    lenstart = len(startdelim)
    lenend = len(enddelim)
    startdelim_places = find_all(source, startdelim)
    if startdelim == enddelim:
        enddelim_places = startdelim_places[:]
    else:
        enddelim_places = find_all(source, enddelim)
    # this function is called very often, so membership tests below use
    # sets rather than scanning lists
    if escape is not None:
        lenescape = len(escape)
        escape_places = find_all(source, escape)
        escape_lookup = set(escape_places)
        # filter escaped escapes: in a run of adjacent escapes every second
        # one is itself escaped and therefore not a "true" escape
        true_escape = False
        # kept as an ordered list because it is iterated in order below
        true_escape_places = []
        for escape_pos in escape_places:
            if escape_pos - lenescape in escape_lookup:
                true_escape = not true_escape
            else:
                true_escape = True
            if true_escape:
                true_escape_places.append(escape_pos)
        true_escape_lookup = set(true_escape_places)
        startdelim_places = [pos for pos in startdelim_places if pos - lenescape not in true_escape_lookup]
        enddelim_places = [pos + lenend for pos in enddelim_places if pos - lenescape not in true_escape_lookup]
    else:
        enddelim_places = [pos + lenend for pos in enddelim_places]
    # get a sorted list of the significant places in the string
    # (end places are stored as the offset just past the end delimiter)
    significant_places = [0] + startdelim_places + enddelim_places + [len(source)-1]
    significant_places.sort()
    start_lookup = set(startdelim_places)
    end_lookup = set(enddelim_places)
    extracted = ""
    # NOTE(review): with startinstring=True no start delimiter was seen, yet
    # section_start below still skips len(startdelim) characters from
    # offset 0 -- confirm callers rely on this before changing it
    lastpos = 0
    callable_includeescapes = callable(includeescapes)
    checkescapes = callable_includeescapes or not includeescapes
    for pos in significant_places:
        if instring and pos in end_lookup and lastpos != pos - lenstart:
            section_start, section_end = lastpos + lenstart, pos - lenend
            section = source[section_start:section_end]
            if escape is not None and checkescapes:
                # escape offsets relative to the start of this section
                escape_list = [epos - section_start for epos in true_escape_places if section_start <= epos <= section_end]
                new_section = ""
                last_epos = 0
                for epos in escape_list:
                    new_section += section[last_epos:epos]
                    if callable_includeescapes:
                        replace_escape = includeescapes(section[epos:epos+lenescape+1])
                        # TODO: deprecate old method of returning boolean from includeescape, by removing this if block
                        if not isinstance(replace_escape, basestring):
                            if replace_escape:
                                replace_escape = section[epos:epos+lenescape+1]
                            else:
                                replace_escape = section[epos+lenescape:epos+lenescape+1]
                        new_section += replace_escape
                        # skip the escape and the character it escaped
                        last_epos = epos + lenescape + 1
                    else:
                        # drop only the escape string itself
                        last_epos = epos + lenescape
                section = new_section + section[last_epos:]
            extracted += section
            instring = False
            lastpos = pos
        if (not instring) and pos in start_lookup and not (enteredonce and not allowreentry):
            instring = True
            enteredonce = True
            lastpos = pos
    if instring:
        # unterminated section: take everything after the start delimiter
        section_start = lastpos + lenstart
        section = source[section_start:]
        # NOTE(review): unlike the loop above, escapes here are only
        # processed when includeescapes is false, so a callable
        # includeescapes is not applied to an unterminated section
        if escape is not None and not includeescapes:
            escape_list = [epos - section_start for epos in true_escape_places if section_start <= epos]
            new_section = ""
            last_epos = 0
            for epos in escape_list:
                new_section += section[last_epos:epos]
                if callable_includeescapes and includeescapes(section[epos:epos+lenescape+1]):
                    last_epos = epos
                else:
                    last_epos = epos + lenescape
            section = new_section + section[last_epos:]
        extracted += section
    return (extracted, instring)
194
def escapequotes(source, escapeescapes=0):
    """Return source with every double quote preceded by a backslash.

    If escapeescapes is true, existing backslashes are doubled first so
    that they survive unquoting.
    """
    if escapeescapes:
        source = source.replace('\\', '\\\\')
    return source.replace('"', '\\"')
201
def escapesinglequotes(source):
    """Return source with every single quote doubled."""
    quote = "'"
    return source.replace(quote, quote + quote)
205
def htmlentityencode(source):
    """Encode source using named HTML entities, e.g. U+00A9 -> &copy;

    Every character whose code point has a name in
    htmlentitydefs.codepoint2name is replaced by "&name;".  All other
    characters are copied unchanged (previously they went through str(),
    which fails on Python 2 for non-ASCII unicode characters).
    """
    pieces = []
    for char in source:
        entity = htmlentitydefs.codepoint2name.get(ord(char))
        if entity is None:
            pieces.append(char)
        else:
            pieces.append("&%s;" % entity)
    return "".join(pieces)
216
def htmlentitydecode(source):
    """Decode named HTML entities in source, e.g. &copy; -> U+00A9

    "&name;" is replaced by the corresponding character when name is in
    htmlentitydefs.name2codepoint; unknown names are copied through
    unchanged.  An "&" that is followed by a space before any ";" is
    treated as literal text.  Returns a unicode string.

    Text that only looks like the start of an entity is never lost: an
    "&..." run that is cut short by another "&" or by the end of source
    is copied to the output as-is.
    """
    output = u""
    inentity = False
    possibleentity = ""
    for char in source:
        if char == "&":
            if inentity:
                # the previous "&" never reached a ";": it was literal text
                output += "&" + possibleentity
            inentity = True
            possibleentity = ""
            continue
        if inentity:
            if char == ";":
                if possibleentity in htmlentitydefs.name2codepoint:
                    output += unichr(htmlentitydefs.name2codepoint[possibleentity])
                else:
                    output += "&" + possibleentity + ";"
                inentity = False
            elif char == " ":
                # entity names contain no spaces, so this was literal text
                output += "&" + possibleentity + char
                inentity = False
            else:
                possibleentity += char
        else:
            output += char
    if inentity:
        # source ended in the middle of a would-be entity
        output += "&" + possibleentity
    return output
242
def javapropertiesencode(source):
    """Encode source in the escaped-unicode form used by Java .properties files.

    Characters listed in controlchars are replaced by their escape,
    ASCII characters are copied, and everything else becomes a \\uXXXX
    escape with exactly four hex digits.  Code points above U+FFFF are
    written as a UTF-16 surrogate pair (two \\uXXXX escapes), since a
    single escape with five digits would be misread.

    Returns a unicode string.
    """
    pieces = []
    for char in source:
        charnum = ord(char)
        if char in controlchars:
            pieces.append(controlchars[char])
        elif charnum < 128:
            pieces.append(char)
        elif charnum > 0xFFFF:
            # split into high and low UTF-16 surrogates
            offset = charnum - 0x10000
            high = 0xD800 + (offset >> 10)
            low = 0xDC00 + (offset & 0x3FF)
            pieces.append(u"\\u%04X\\u%04X" % (high, low))
        else:
            pieces.append(u"\\u%04X" % charnum)
    return u"".join(pieces)
255
def mozillapropertiesencode(source):
    """Encode source in the form used by Mozilla .properties files.

    Characters listed in controlchars are replaced by their escape; every
    other character, ASCII or not, is copied unchanged.  Returns a unicode
    string.
    """
    return u"".join([controlchars.get(char, char) for char in source])
# Maps the character that follows a backslash in a .properties value to the
# character the escape stands for.
propertyescapes = {
    # escapes that are self-escaping
    "\\": "\\",
    "'": "'",
    '"': '"',
    # control characters that we keep
    "f": "\f",
    "n": "\n",
    "r": "\r",
    "t": "\t",
}

# Maps a raw character to its backslash escape: the reverse of the control
# character entries above, plus the backslash itself.
controlchars = {
    "\\": "\\\\",
    "\f": "\\f",
    "\n": "\\n",
    "\r": "\\r",
    "\t": "\\t",
}
def escapecontrols(source):
    """Escape the control characters listed in controlchars in source.

    The backslash is always replaced first.  Every escape produced here
    itself contains a backslash, so replacing the backslash after any
    other character would escape the freshly written escapes a second
    time; relying on dictionary iteration order made that possible.
    """
    backslash = "\\"
    if backslash in controlchars:
        source = source.replace(backslash, controlchars[backslash])
    for key, value in controlchars.items():
        if key != backslash:
            source = source.replace(key, value)
    return source
285
def propertiesdecode(source, encoding="utf-8"):
    """Decode source from the escaped-unicode encoding used by .properties files.

    Java uses Latin1 by default, and Mozilla uses UTF-8 by default.

    source   -- the text to decode; a byte string is first decoded
                using encoding
    encoding -- encoding applied to byte-string input (this name was
                previously undefined, so byte strings always failed)
                NOTE(review): "utf-8" is an assumed default -- confirm
                against callers that pass Latin1 data

    Handles backslash escapes from propertyescapes, \\uXXXX / \\UXXXX
    (up to four hex digits) and \\N{name}.  A backslash before a newline
    is a line continuation and produces nothing; a backslash at the very
    end is kept so the caller can detect the continuation; a backslash
    before any other character is dropped.

    Returns a unicode string.  Raises KeyError (from unicodedata.lookup)
    for an unknown character name in \\N{name}.
    """
    # since the .decode("unicode-escape") routine decodes everything, and we don't want to
    # we reimplemented the algorithm from Python Objects/unicode.c in Python here
    # and modified it to retain escaped control characters
    output = u""
    s = 0
    # bytes is the same type as str on Python 2.6+
    if isinstance(source, bytes):
        source = source.decode(encoding)

    def unichr2(i):
        """Returns a Unicode string of one character with ordinal 32 <= i, otherwise an escaped control character"""
        if 32 <= i:
            return unichr(i)
        elif unichr(i) in controlchars:
            # we just return the character, unescaped
            # if people want to escape them they can use escapecontrols
            return unichr(i)
        else:
            return "\\u%04x" % i

    hexdigits = "0123456789abcdef"
    while s < len(source):
        c = source[s]
        if c != '\\':
            output += c
            s += 1
            continue
        s += 1
        if s >= len(source):
            # this is an escape at the end of the line, which implies a continuation...
            # return the escape to inform the parser
            output += c
            continue
        c = source[s]
        s += 1
        if c == '\n':
            pass
        # propertyescapes lookups
        elif c in propertyescapes:
            output += propertyescapes[c]
        # \uXXXX escapes
        # \UXXXX escapes
        elif c in "uU":
            x = 0
            consumed = 0
            # read at most four hex digits; stop at the first character that
            # is not a hex digit or at the end of source, and only count
            # (and skip) the digits that were actually read
            while consumed < 4 and s + consumed < len(source):
                value = hexdigits.find(source[s + consumed].lower())
                if value == -1:
                    break
                x = (x << 4) + value
                consumed += 1
            if consumed == 0:
                logging.warning("Invalid unicode escape: no hex digits after \\%s", c)
                output += "\\" + c
                continue
            s += consumed
            output += unichr2(x)
        elif c == "N":
            if s >= len(source) or source[s] != "{":
                logging.warning("Invalid named unicode escape: no { after \\N")
                output += "\\" + c
                continue
            e = source.find("}", s + 1)
            if e == -1:
                logging.warning("Invalid named unicode escape: no } after \\N{")
                # s still points at "{", so it is copied on the next pass
                output += "\\" + c
                continue
            import unicodedata
            name = source[s + 1:e]
            output += unicodedata.lookup(name)
            s = e + 1
        else:
            output += c  # Drop any \ that we don't specifically handle
    return output
def quotestr(source, escapeescapes=0):
    """Return source as a double-quoted string, escaping embedded double quotes.

    source        -- a string, or a list of strings; each list item is
                     quoted separately and the results are joined with
                     newlines (an empty list gives an empty string rather
                     than failing on an unassigned local)
    escapeescapes -- passed on to escapequotes(); if true, backslashes are
                     doubled as well
    """
    if isinstance(source, list):
        return '\n'.join(['"' + escapequotes(line, escapeescapes) + '"' for line in source])
    return '"' + escapequotes(source, escapeescapes) + '"'
375
def singlequotestr(source):
    """Return source as a single-quoted string, doubling embedded single quotes."""
    escaped = escapesinglequotes(source)
    return "'%s'" % (escaped,)
379
def eitherquotestr(source):
    """Quote source with single or double quotes, depending on its content.

    Single quotes are used when source contains a double quote; otherwise
    double quotes are used.
    """
    quoter = quotestr
    if '"' in source:
        quoter = singlequotestr
    return quoter(source)
386
def findend(string, substring):
    """Return the offset just past the first occurrence of substring.

    Returns -1 when substring does not occur in string.
    """
    start = string.find(substring)
    if start == -1:
        return -1
    return start + len(substring)
392
def rstripeol(string):
    """Return string without trailing carriage-return / line-feed characters."""
    eol_chars = "\r\n"
    return string.rstrip(eol_chars)
395
def stripcomment(comment, startstring="<!--", endstring="-->"):
    """Return the text between the comment markers, stripped of whitespace.

    If startstring is missing the text is taken from the beginning of
    comment; if endstring is missing it is taken up to the end (previously
    the failed search result -1 was used as the slice end, which silently
    dropped the last character).
    """
    cstart = comment.find(startstring)
    if cstart == -1:
        cstart = 0
    else:
        cstart += len(startstring)
    cend = comment.find(endstring, cstart)
    if cend == -1:
        cend = len(comment)
    return comment[cstart:cend].strip()
404
def unstripcomment(comment, startstring="<!-- ", endstring=" -->\n"):
    """Wrap the whitespace-stripped comment text in the given markers."""
    body = comment.strip()
    return "".join([startstring, body, endstring])
407
def encodewithdict(unencoded, encodedict):
    """Replace each key of encodedict found in unencoded with its value.

    Replacements are applied one key after another, in the dictionary's
    iteration order, each on the result of the previous one.
    """
    result = unencoded
    for raw, replacement in encodedict.items():
        if raw not in result:
            continue
        result = result.replace(raw, replacement)
    return result
415
def makeutf8(d):
    """Replace integer values of d with the UTF-8 bytes of that code point.

    Only values whose type is exactly int are converted; all other values
    are left alone.  The dictionary is modified in place and also returned.
    """
    numeric_keys = [key for key in d if type(d[key]) is int]
    for key in numeric_keys:
        d[key] = unichr(d[key]).encode('utf8')
    return d
422
def testcase():
    """Print the results of extract() and extractwithoutquotes() on a sample.

    The sample is scanned for double-quoted strings, first without an
    escape string and then with "!" as the escape string.
    """
    sample = ' "this" " is " "a" " test!" '
    calls = [
        (extract, None, {}),
        (extract, '!', {}),
        (extractwithoutquotes, None, {}),
        (extractwithoutquotes, '!', {}),
        (extractwithoutquotes, '!', {'includeescapes': False}),
    ]
    for function, escape, options in calls:
        print(function(sample, '"', '"', escape, **options))
# Run the manual smoke test when this module is executed as a script.
if __name__ == '__main__':
    testcase()