4 """simple parser / string tokenizer
5 rather than returning a list of token types etc, we simple return a list of tokens...
6 each tokenizing function takes a string as input and returns a list of tokens
7 """
29 """takes away repeated quotes (escapes) and returns the string represented by the text"""
30 stringchar = text[0]
31 if text[-1] != stringchar or stringchar not in ("'", '"'):
32
33 raise ValueError("error parsing escaped string: %r" % text)
34 return text[1:-1].replace(stringchar+stringchar, stringchar)
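
# Example usage (for illustration): doubled quote characters inside the string
# are collapsed back to a single quote, so
#   stringeval('"say ""hi"""')  ->  'say "hi"'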


def stringquote(text):
    """escapes quotes as necessary and returns a string representing the text"""
    if "'" in text:
        if '"' in text:
            return '"' + text.replace('"', '""') + '"'
        else:
            return '"' + text + '"'
    else:
        return "'" + text + "'"


class ParserError(ValueError):
    """Intelligent parser error"""

    def __init__(self, parser, message, tokennum):
        """takes a message and the number of the token that caused the error"""
        tokenpos = parser.findtokenpos(tokennum)
        line, charpos = parser.getlinepos(tokenpos)
        ValueError.__init__(self, "%s at line %d, char %d (token %r)" %
                            (message, line, charpos, parser.tokens[tokennum]))
        self.parser = parser
        self.tokennum = tokennum


class SimpleParser:
    """this is a simple parser"""

    def __init__(self, defaulttokenlist=None, whitespacechars=" \t\r\n", includewhitespacetokens=0):
        if defaulttokenlist is None:
            self.defaulttokenlist = ['<=', '>=', '==', '!=', '+=', '-=', '*=', '/=', '<>']
            self.defaulttokenlist.extend('(),[]:=+-')
        else:
            self.defaulttokenlist = defaulttokenlist
        self.whitespacechars = whitespacechars
        self.includewhitespacetokens = includewhitespacetokens
        self.standardtokenizers = [self.stringtokenize, self.removewhitespace, self.separatetokens]
        self.quotechars = ('"', "'")
        self.endquotechars = {'"': '"', "'": "'"}
        self.stringescaping = 1
76
78 """makes strings in text into tokens..."""
79 tokens = []
80 laststart = 0
81 instring = 0
82 endstringchar, escapechar = '', '\\'
83 gotclose, gotescape = 0, 0
84 for pos in range(len(text)):
85 char = text[pos]
86 if instring:
87 if self.stringescaping and (gotescape or char == escapechar) and not gotclose:
88 gotescape = not gotescape
89 elif char == endstringchar:
90 gotclose = not gotclose
91 elif gotclose:
92 tokens.append(text[laststart:pos])
93 instring, laststart, endstringchar = 0, pos, ''
94 if not instring:
95 if char in self.quotechars:
96 if pos > laststart:
97 tokens.append(text[laststart:pos])
98 instring, laststart, endstringchar, gotclose = 1, pos, self.endquotechars[char], 0
99 if laststart < len(text):
100 tokens.append(text[laststart:])
101 return tokens
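
    # Example usage (for illustration): quoted substrings become single tokens
    # while the surrounding text is passed through untouched, e.g.
    #   SimpleParser().stringtokenize("a = 'hello world' + b")
    #   ->  ["a = ", "'hello world'", " + b"]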

    def keeptogether(self, text):
        """checks whether a token should be kept together"""
        return self.isstringtoken(text)

    def isstringtoken(self, text):
        """checks whether a token is a string token"""
        return text[:1] in self.quotechars

    def separatetokens(self, text, tokenlist=None):
        """this separates out tokens in tokenlist from whitespace etc"""
        if self.keeptogether(text):
            return [text]
        if tokenlist is None:
            tokenlist = self.defaulttokenlist
        tokens = []
        pos = 0
        laststart = 0
        lentext = len(text)
        while pos < lentext:
            foundtoken = 0
            for token in tokenlist:
                lentoken = len(token)
                if text[pos:pos+lentoken] == token:
                    if laststart < pos:
                        tokens.append(text[laststart:pos])
                    tokens.append(token)
                    pos += lentoken
                    foundtoken, laststart = 1, pos
                    break
            if not foundtoken:
                pos += 1
        if laststart < lentext:
            tokens.append(text[laststart:])
        return tokens
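
    # Example usage (for illustration): multi-character operators come before
    # their single-character prefixes in defaulttokenlist, so '<=' wins over
    # '=', e.g. SimpleParser().separatetokens("a<=b")  ->  ['a', '<=', 'b']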

    def removewhitespace(self, text):
        """this removes whitespace but lets it separate things out into separate tokens"""
        if self.keeptogether(text):
            return [text]
        tokens = []
        inwhitespace = 0
        laststart = 0
        for pos in range(len(text)):
            char = text[pos]
            if inwhitespace:
                if char not in self.whitespacechars:
                    if laststart < pos and self.includewhitespacetokens:
                        tokens.append(text[laststart:pos])
                    inwhitespace, laststart = 0, pos
            else:
                if char in self.whitespacechars:
                    if laststart < pos:
                        tokens.append(text[laststart:pos])
                    inwhitespace, laststart = 1, pos
        if laststart < len(text) and (not inwhitespace or self.includewhitespacetokens):
            tokens.append(text[laststart:])
        return tokens
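
    # Example usage (for illustration): with includewhitespacetokens left at 0,
    # whitespace is dropped and only the words remain, e.g.
    #   SimpleParser().removewhitespace("  hello   world ")  ->  ['hello', 'world']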

    def applytokenizer(self, inputlist, tokenizer):
        """apply a tokenizer to a set of text, flattening the result"""
        tokenizedlists = [tokenizer(text) for text in inputlist]
        joined = []
        # note: a bare map(joined.extend, ...) would be a no-op on Python 3,
        # where map is lazy; an explicit loop works on both versions
        for tokenizedlist in tokenizedlists:
            joined.extend(tokenizedlist)
        return joined

    def applytokenizers(self, inputlist, tokenizers):
        """apply a set of tokenizers to a set of text, flattening each time"""
        for tokenizer in tokenizers:
            inputlist = self.applytokenizer(inputlist, tokenizer)
        return inputlist

    def tokenize(self, source, tokenizers=None):
        """tokenize the text string with the standard tokenizers"""
        self.source = source
        if tokenizers is None:
            tokenizers = self.standardtokenizers
        self.tokens = self.applytokenizers([self.source], tokenizers)
        return self.tokens
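
    # Example usage (for illustration): running the three standard tokenizers
    # in sequence splits on strings, whitespace and operators, e.g.
    #   SimpleParser().tokenize("a = 'hello world' + b")
    #   ->  ['a', '=', "'hello world'", '+', 'b']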

    def findtokenpos(self, tokennum):
        """finds the position of the given token in the text"""
        currenttokenpos = 0
        for currenttokennum in range(tokennum+1):
            currenttokenpos = self.source.find(self.tokens[currenttokennum], currenttokenpos)
        return currenttokenpos

    def getlinepos(self, tokenpos):
        """finds the line and character position of the given character"""
        sourcecut = self.source[:tokenpos]
        line = sourcecut.count("\n") + 1
        charpos = tokenpos - sourcecut.rfind("\n")
        return line, charpos
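
    # Example usage (for illustration): positions are 1-based, so with
    # self.source == "ab\ncd", character offset 3 (the "c") maps to
    # (line, charpos) == (2, 1).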

    def raiseerror(self, message, tokennum):
        """raises a ParserError"""
        raise ParserError(self, message, tokennum)
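
if __name__ == "__main__":
    # Small smoke test (added for illustration; the original module has no demo
    # block): tokenize a line, then map a token back to its source position.
    parser = SimpleParser()
    tokens = parser.tokenize("total = price * (1 + 'tax rate')")
    print(tokens)
    # locate the string token and convert its character offset to (line, char)
    stringtokennum = tokens.index("'tax rate'")
    tokenpos = parser.findtokenpos(stringtokennum)
    print(parser.getlinepos(tokenpos))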