
Source Code for Module translate.misc.sparse

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""simple parser / string tokenizer

rather than returning a list of token types etc., we simply return a list of
tokens; each tokenizing function takes a string as input and returns a list
of tokens
"""

# Copyright 2002, 2003 St James Software
#
# This file is part of translate.
#
# translate is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# translate is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with translate; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

def stringeval(text):
    """takes away repeated quotes (escapes) and returns the string represented by the text"""
    stringchar = text[0]
    if text[-1] != stringchar or stringchar not in ("'", '"'):
        # the text is not a validly quoted string
        raise ValueError("error parsing escaped string: %r" % text)
    return text[1:-1].replace(stringchar + stringchar, stringchar)

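# A doctest-style sketch of stringeval (the outputs follow from the code
# above): doubled quote characters collapse back into a single quote.
#
#     >>> stringeval("'it''s'")
#     "it's"
#     >>> stringeval('"say ""hi"""')
#     'say "hi"'
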
def stringquote(text):
    """escapes quotes as necessary and returns a string representing the text"""
    if "'" in text:
        if '"' in text:
            return '"' + text.replace('"', '""') + '"'
        else:
            return '"' + text + '"'
    else:
        return "'" + text + "'"

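# stringquote is the inverse operation: it picks whichever quote character
# needs the least escaping, so stringeval(stringquote(s)) should give back s.
#
#     >>> stringquote("hello")
#     "'hello'"
#     >>> stringeval(stringquote("it's")) == "it's"
#     True
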
class ParserError(ValueError):
    """Intelligent parser error"""

    def __init__(self, parser, message, tokennum):
        """takes a message and the number of the token that caused the error"""
        tokenpos = parser.findtokenpos(tokennum)
        line, charpos = parser.getlinepos(tokenpos)
        ValueError.__init__(self, "%s at line %d, char %d (token %r)" %
                            (message, line, charpos, parser.tokens[tokennum]))
        self.parser = parser
        self.tokennum = tokennum

class SimpleParser:
    """a simple parser that tokenizes source text by applying a sequence of tokenizer functions"""

    def __init__(self, defaulttokenlist=None, whitespacechars=" \t\r\n", includewhitespacetokens=0):
        if defaulttokenlist is None:
            self.defaulttokenlist = ['<=', '>=', '==', '!=', '+=', '-=', '*=', '/=', '<>']
            self.defaulttokenlist.extend('(),[]:=+-')
        else:
            self.defaulttokenlist = defaulttokenlist
        self.whitespacechars = whitespacechars
        self.includewhitespacetokens = includewhitespacetokens
        self.standardtokenizers = [self.stringtokenize, self.removewhitespace, self.separatetokens]
        self.quotechars = ('"', "'")
        self.endquotechars = {'"': '"', "'": "'"}
        self.stringescaping = 1

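    # Note that the default token list puts multi-character operators before
    # the single characters they start with, so "+=" is matched as one token
    # rather than as "+" followed by "=". A sketch with a hypothetical custom
    # token list shows why the order matters:
    #
    #     >>> SimpleParser(defaulttokenlist=['::', ':']).separatetokens("a::b")
    #     ['a', '::', 'b']
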
    def stringtokenize(self, text):
        """splits quoted strings out of text as single tokens"""
        tokens = []
        laststart = 0
        instring = 0
        endstringchar, escapechar = '', '\\'
        gotclose, gotescape = 0, 0
        for pos in range(len(text)):
            char = text[pos]
            if instring:
                if self.stringescaping and (gotescape or char == escapechar) and not gotclose:
                    gotescape = not gotescape
                elif char == endstringchar:
                    gotclose = not gotclose
                elif gotclose:
                    tokens.append(text[laststart:pos])
                    instring, laststart, endstringchar = 0, pos, ''
            if not instring:
                if char in self.quotechars:
                    if pos > laststart:
                        tokens.append(text[laststart:pos])
                    instring, laststart, endstringchar, gotclose = 1, pos, self.endquotechars[char], 0
        if laststart < len(text):
            tokens.append(text[laststart:])
        return tokens

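    # For example, the quoted string below survives as a single token while
    # the surrounding text passes through untouched (later tokenizers deal
    # with the whitespace):
    #
    #     >>> parser = SimpleParser()
    #     >>> parser.stringtokenize("name = 'john' surname")
    #     ['name = ', "'john'", ' surname']
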
    def keeptogether(self, text):
        """checks whether a token should be kept together"""
        return self.isstringtoken(text)

    def isstringtoken(self, text):
        """checks whether a token is a string token"""
        return text[:1] in self.quotechars

    def separatetokens(self, text, tokenlist=None):
        """separates out the tokens in tokenlist (operators, punctuation etc.) from the surrounding text"""
        if self.keeptogether(text):
            return [text]
        if tokenlist is None:
            tokenlist = self.defaulttokenlist
        # loop through and put tokens into a list
        tokens = []
        pos = 0
        laststart = 0
        lentext = len(text)
        while pos < lentext:
            foundtoken = 0
            for token in tokenlist:
                lentoken = len(token)
                if text[pos:pos + lentoken] == token:
                    if laststart < pos:
                        tokens.append(text[laststart:pos])
                    tokens.append(token)
                    pos += lentoken
                    foundtoken, laststart = 1, pos
                    break
            if not foundtoken:
                pos += 1
        if laststart < lentext:
            tokens.append(text[laststart:])
        return tokens

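    # Operators and punctuation are split out wherever they occur, with the
    # runs of text between them kept intact:
    #
    #     >>> parser = SimpleParser()
    #     >>> parser.separatetokens("total=(a+b)")
    #     ['total', '=', '(', 'a', '+', 'b', ')']
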
    def removewhitespace(self, text):
        """removes whitespace, using it to separate the text into tokens"""
        if self.keeptogether(text):
            return [text]
        # loop through and put tokens into a list
        tokens = []
        inwhitespace = 0
        laststart = 0
        for pos in range(len(text)):
            char = text[pos]
            if inwhitespace:
                if char not in self.whitespacechars:
                    if laststart < pos and self.includewhitespacetokens:
                        tokens.append(text[laststart:pos])
                    inwhitespace, laststart = 0, pos
            else:
                if char in self.whitespacechars:
                    if laststart < pos:
                        tokens.append(text[laststart:pos])
                    inwhitespace, laststart = 1, pos
        if laststart < len(text) and (not inwhitespace or self.includewhitespacetokens):
            tokens.append(text[laststart:])
        return tokens

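    # By default the whitespace itself is dropped; passing
    # includewhitespacetokens=1 keeps it as tokens of its own:
    #
    #     >>> SimpleParser().removewhitespace("a  b\tc")
    #     ['a', 'b', 'c']
    #     >>> SimpleParser(includewhitespacetokens=1).removewhitespace("a  b")
    #     ['a', '  ', 'b']
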
    def applytokenizer(self, inputlist, tokenizer):
        """apply a tokenizer to a set of text, flattening the result"""
        joined = []
        for text in inputlist:
            # extend rather than append so the result stays a flat token list
            joined.extend(tokenizer(text))
        return joined

    def applytokenizers(self, inputlist, tokenizers):
        """apply a set of tokenizers to a set of text, flattening each time"""
        for tokenizer in tokenizers:
            inputlist = self.applytokenizer(inputlist, tokenizer)
        return inputlist

    def tokenize(self, source, tokenizers=None):
        """tokenize the text string with the standard tokenizers"""
        self.source = source
        if tokenizers is None:
            tokenizers = self.standardtokenizers
        self.tokens = self.applytokenizers([self.source], tokenizers)
        return self.tokens

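    # Putting the three standard tokenizers together: quoted strings are kept
    # whole, whitespace is dropped, and operators are split out:
    #
    #     >>> parser = SimpleParser()
    #     >>> parser.tokenize("name = 'john, jr', age >= 21")
    #     ['name', '=', "'john, jr'", ',', 'age', '>=', '21']
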
    def findtokenpos(self, tokennum):
        """finds the position of the given token in the text"""
        currenttokenpos = 0
        for currenttokennum in range(tokennum + 1):
            currenttokenpos = self.source.find(self.tokens[currenttokennum], currenttokenpos)
        return currenttokenpos

    def getlinepos(self, tokenpos):
        """finds the line and character position of the given character"""
        sourcecut = self.source[:tokenpos]
        line = sourcecut.count("\n") + 1
        charpos = tokenpos - sourcecut.rfind("\n")
        return line, charpos

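    # Continuing the tokenize example above, token 2 is the quoted string,
    # which starts at character 7 of the source, i.e. line 1, char 8
    # (getlinepos counts character positions from 1):
    #
    #     >>> parser.findtokenpos(2)
    #     7
    #     >>> parser.getlinepos(7)
    #     (1, 8)
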
    def raiseerror(self, message, tokennum):
        """raises a ParserError"""
        raise ParserError(self, message, tokennum)

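# A sketch of the error reporting: with an unterminated string in the source,
# raising against token 2 reports the line and column where that token starts.
#
#     >>> parser = SimpleParser()
#     >>> parser.tokenize("a =\n  'oops")
#     ['a', '=', "'oops"]
#     >>> parser.raiseerror("unterminated string", 2)
#     Traceback (most recent call last):
#         ...
#     ParserError: unterminated string at line 2, char 3 (token "'oops")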