1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 """module for parsing html files for translation"""
24
25 import re
26 from htmlentitydefs import name2codepoint
27 import HTMLParser
28
29 from translate.storage import base
30 from translate.storage.base import ParseError
31
32
33
# Override the parser's module-level pattern for the end of a processing
# instruction so that a PI is only considered closed at "?>" (PHP style).
HTMLParser.piclose = re.compile(r'\?>')
35
36
# Matches text that is wholly enclosed by one element, for example
# '<a href="index.html">Home Page</a>'. The named group "tag" captures the
# element name (and must be repeated in the closing tag); the second group
# captures everything between the opening and closing tags, which is the
# part strip_html() keeps.
strip_html_re = re.compile(r'''
(?s)^ # We allow newlines, and match start of line
<(?P<tag>[^\s?>]+) # Match start of tag and the first character (not ? or >)
(?:
(?:
[^>] # Anything that's not a > is valid tag material
|
(?:<\?.*?\?>) # Matches <? foo ?> lazily; PHP is valid
)* # Repeat over valid tag material
[^?>] # If we have > 1 char, the last char can't be ? or >
)? # The repeated chars are optional, so that <a>, <p> work
> # Match ending > of opening tag

(.*) # Match actual contents of tag

</(?P=tag)> # Match ending tag; can't end with ?> and must be >=1 char
$ # Match end of line
''', re.VERBOSE)
55
56
def strip_html(text):
    """Remove markup that merely wraps the translatable text.

    A tag pair is removed when it encloses the whole text, eg.
    '<a href="index.html">Home Page</a>' becomes 'Home Page'. Wrappers are
    peeled off repeatedly until the text is no longer fully enclosed.

    Tags that occur inside the normal flow of text are kept, eg.
    'This is a link to the <a href="index.html">Home Page</a>.'

    Text consisting solely of one processing instruction (<? ... ?>)
    yields the empty string.
    """
    text = text.strip()

    # Nothing but a single processing instruction: nothing to translate
    if re.match(r'(?s)^<\?.*?\?>$', text) is not None:
        return ""

    enclosed = strip_html_re.match(text)
    if enclosed is None:
        return text
    # Group 2 is the content between the enclosing start and end tag
    return strip_html(enclosed.group(2))
77
78
# Matches any run of two or more whitespace characters
normalize_re = re.compile(r"\s\s+")
80
81
85
86
88 """Escape &, < and >"""
89
90
91
92 return re.sub("&(?![a-zA-Z0-9]+;)", "&", html)
93
94
96 """A unit of translatable/localisable HTML content"""
97
101
107
111 source = property(getsource, setsource)
112
114 self.locations.append(location)
115
117 return self.locations
118
119
120 -class htmlfile(HTMLParser.HTMLParser, base.TranslationStore):
UnitClass = htmlunit

# handle_starttag() opens a new block (startblock) for any tag in this list.
MARKINGTAGS = ["p", "title", "h1", "h2", "h3", "h4", "h5", "h6", "th",
               "td", "div", "li", "dt", "dd", "address", "caption", "pre"]
"""Text in these tags that will be extracted from the HTML document"""

# handle_starttag() also opens a new block for a tag carrying any of these
# attribute names; empty by default.
MARKINGATTRS = []
"""Text from tags with these attributes will be extracted from the HTML
document"""

# Values of these attributes are passed to addhtmlblock() and replaced by
# the callback's output, but only while no block text has been collected.
INCLUDEATTRS = ["alt", "summary", "standby", "abbr", "content"]
"""Text from these attributes are extracted"""

# Entries on tag_path that are in this list are discarded without waiting
# for a matching end tag (see handle_starttag and handle_endtag).
SELF_CLOSING_TAGS = [u"area", u"base", u"basefont", u"br", u"col",
                     u"frame", u"hr", u"img", u"input", u"link", u"meta",
                     u"param"]
"""HTML self-closing tags.  Tags that should be specified as <img /> but
might be <img>.
U{Reference<http://learnwebsitemaking.com/htmlselfclosingtags.html>}"""
140
def __init__(self, includeuntaggeddata=None, inputfile=None,
             callback=None):
    """Set up the parser state and optionally parse a file straight away.

    @param includeuntaggeddata: if true, text found outside any marking
        tag is collected into a block of its own (see handle_data).
    @param inputfile: optional file-like object. When given it is read
        completely, closed, and its content passed to parse(). Its
        ``name`` attribute, if any, becomes self.filename.
    @param callback: callable applied to extracted text when the output
        source is rebuilt; defaults to self._simple_callback.
    """
    self.units = []
    self.filename = getattr(inputfile, 'name', None)
    self.currentblock = u""
    self.currentcomment = u""
    self.currenttag = None
    self.currentpos = -1
    self.tag_path = []
    self.filesrc = u""
    self.currentsrc = u""
    self.pidict = {}
    if callback is None:
        self.callback = self._simple_callback
    else:
        self.callback = callback
    self.includeuntaggeddata = includeuntaggeddata
    HTMLParser.HTMLParser.__init__(self)

    if inputfile is not None:
        # Close the file even when read() fails, so the handle is not
        # leaked on an I/O error.
        try:
            htmlsrc = inputfile.read()
        finally:
            inputfile.close()
        self.parse(htmlsrc)
164
167
# Captures the charset named in the content attribute of a <meta> tag,
# e.g. <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
ENCODING_RE = re.compile(r'''<meta.*
content.*=.*?charset.*?=\s*?
([^\s]*)
\s*?["']\s*?>
''', re.IGNORECASE | re.VERBOSE)
173
def guess_encoding(self, htmlsrc):
    """Return the charset declared in the html text, or None.

    The charset is taken from the first match of ENCODING_RE, i.e. a
    'charset=' declaration inside a meta tag.
    """
    declared = self.ENCODING_RE.findall(htmlsrc)
    if not declared:
        return None
    return declared[0]
185
def do_encoding(self, htmlsrc):
    """Decode the html text using its declared charset.

    The charset comes from guess_encoding(); when none is found the text
    is decoded as UTF-8.
    """
    return htmlsrc.decode(self.guess_encoding(htmlsrc) or 'utf-8')
193
def pi_escape(self, text):
    """Hide "tag-like" characters inside processing instructions.

    In the body of every ``<?...?>`` section the characters ``<`` and
    ``>`` are replaced with ``%lt;`` and ``%gt;``. The escaped => original
    mapping is stored in self.pidict for later use in restoring the real
    PHP. Only the processing instructions themselves are rewritten; text
    outside them is left untouched even if it equals a PI body.

    The purpose of this is to remove all potential "tag-like" code from
    inside PHP. The following PHP::
      $a < $b ? $c : ($d > $e ? $f : $g)
    looks like it contains an HTML tag::
      < $b ? $c : ($d >
    to nearly any regex. Hence, we escape the contents of PHP to help our
    regexes out.

    @param text: the HTML source.
    @return: the source with every processing instruction body escaped.
    """
    def escape(match):
        pi = match.group(1)
        pi_escaped = pi.replace("<", "%lt;").replace(">", "%gt;")
        self.pidict[pi_escaped] = pi
        return "<?%s?>" % pi_escaped

    # Substituting match by match keeps the change inside the PI, unlike a
    # global str.replace() of the PI body.
    return re.sub(r'(?s)<\?(.*?)\?>', escape, text)
217
219 """Replaces the PHP placeholders in text with the real code"""
220 for pi_escaped, pi in self.pidict.items():
221 text = text.replace(pi_escaped, pi)
222 return text
223
def parse(self, htmlsrc):
    """Decode htmlsrc, escape its processing instructions and feed the
    result to the HTML parser."""
    decoded = self.do_encoding(htmlsrc)
    self.feed(self.pi_escape(decoded))
228
239
241 """Check if the supplied HTML snippet has any content that needs to be
242 translated."""
243
244 text = text.strip()
245 result = re.findall('(?i).*(charset.*=.*)', text)
246 if len(result) == 1:
247 return False
248
249
250 if text == ' ':
251 return False
252
253 pattern = '<\?.*?\?>'
254 result = re.sub(pattern, '', text).strip()
255 pattern = '<[^>]*>'
256 result = re.sub(pattern, '', result).strip()
257 if result:
258 return True
259 else:
260 return False
261
def buildtag(self, tag, attrs=None, startend=False):
    """Create an HTML tag.

    @param tag: the tag name.
    @param attrs: optional sequence of (name, value) pairs. A value of
        None, which is how HTMLParser reports a valueless attribute such
        as ``checked``, is written as the bare attribute name instead of
        ``checked="None"``.
    @param startend: if true, build the self-closing form ``<tag />``.
    @return: the tag as a unicode string.
    """
    selfclosing = u""
    if startend:
        selfclosing = u" /"
    if not attrs:
        # None or any empty sequence: no attribute section at all
        return u"<%(tag)s%(selfclosing)s>" % {"tag": tag,
                                              "selfclosing": selfclosing}
    rendered = []
    for name, value in attrs:
        if value is None:
            rendered.append(name)
        else:
            rendered.append('%s="%s"' % (name, value))
    return u"<%(tag)s %(attrs)s%(selfclosing)s>" % \
        {"tag": tag,
         "attrs": " ".join(rendered),
         "selfclosing": selfclosing}
275
276
277
def startblock(self, tag, attrs=None):
    """Flush the block collected so far and begin a new one at this tag.

    The pending block text goes to addhtmlblock(). The pending source is
    appended to self.filesrc, with its stripped text swapped for the
    callback's output when the callback returns something true.
    """
    self.addhtmlblock(self.currentblock)

    stripped = strip_html(self.currentsrc)
    if not self.callback(normalize_html(stripped)):
        self.filesrc += self.currentsrc
    else:
        self.filesrc += self.currentsrc.replace(
            stripped,
            self.callback(normalize_html(stripped).replace("\n", " ")))

    # Reset the per-block state for the block that starts here
    self.currentblock = ""
    self.currentcomment = ""
    self.currenttag = tag
    self.currentpos = self.getpos()[0]
    self.currentsrc = self.buildtag(tag, attrs)
290
def endblock(self):
    """Flush the block collected so far and leave block mode.

    The pending block text goes to addhtmlblock(). The pending source is
    appended to self.filesrc, with its stripped text swapped for the
    callback's output unless the callback returns None.
    """
    self.addhtmlblock(self.currentblock)

    stripped = strip_html(self.currentsrc)
    if self.callback(normalize_html(stripped)) is None:
        self.filesrc += self.currentsrc
    else:
        self.filesrc += self.currentsrc.replace(
            stripped,
            self.callback(normalize_html(stripped.replace("\n", " "))))

    # No block is open any more
    self.currentblock = ""
    self.currentcomment = ""
    self.currenttag = None
    self.currentpos = -1
    self.currentsrc = ""
303
def handle_starttag(self, tag, attrs):
    """Track the opened tag and route it to a block or the output source.

    A tag in MARKINGTAGS, or one carrying an attribute in MARKINGATTRS,
    starts a new block. Values of INCLUDEATTRS attributes are extracted
    and replaced in attrs by the callback's output.
    """
    # A self-closing tag on top of the path is not waiting for an end tag
    if self.tag_path and self.tag_path[-1] in self.SELF_CLOSING_TAGS:
        self.tag_path.pop()
    self.tag_path.append(tag)

    newblock = tag in self.MARKINGTAGS
    for index, (attrname, attrvalue) in enumerate(attrs):
        if attrname in self.MARKINGATTRS:
            newblock = True
        if attrname in self.INCLUDEATTRS and self.currentblock == "":
            self.addhtmlblock(attrvalue)
            attrs[index] = (
                attrname,
                self.callback(normalize_html(attrvalue).replace("\n", " ")))

    if newblock:
        self.startblock(tag, attrs)
        return
    if self.currenttag is None:
        # Outside any block: copy the (rebuilt) tag to the output source
        self.filesrc += self.buildtag(tag, attrs)
        return
    # Inside a block: keep the tag exactly as it was written
    starttag = self.get_starttag_text()
    self.currentblock += starttag
    self.currentsrc += starttag
328
def handle_startendtag(self, tag, attrs):
    """Handle a self-closed tag such as <img />.

    Values of INCLUDEATTRS attributes are extracted and replaced in attrs
    by the callback's output. The tag is then added to the open block, or
    to the output source when no block is open.
    """
    for index, (attrname, attrvalue) in enumerate(attrs):
        if attrname in self.INCLUDEATTRS and self.currentblock == "":
            self.addhtmlblock(attrvalue)
            attrs[index] = (
                attrname,
                self.callback(normalize_html(attrvalue).replace("\n", " ")))

    if self.currenttag is None:
        self.filesrc += self.buildtag(tag, attrs, startend=True)
        return
    # Inside a block: keep the tag exactly as it was written
    starttag = self.get_starttag_text()
    self.currentblock += starttag
    self.currentsrc += starttag
341
def handle_endtag(self, tag):
    """Handle a closing tag and check it against the open-tag path.

    The closing tag ends the current block when it matches the block's
    tag; otherwise it is appended to the open block or, outside a block,
    to the output source. Self-closing tags left on tag_path are skipped
    before the match is checked.

    @raise ParseError: if no open tag is left to close, or if the closing
        tag does not match the innermost open tag.
    """
    closing = '</%s>' % tag
    if tag == self.currenttag:
        self.currentsrc += closing
        self.endblock()
    elif self.currenttag is not None:
        self.currentblock += closing
        self.currentsrc += closing
    else:
        self.filesrc += closing

    try:
        popped = self.tag_path.pop()
        # The path may also run out while skipping self-closing tags, so
        # that loop must be covered by the same handler.
        while popped in self.SELF_CLOSING_TAGS:
            popped = self.tag_path.pop()
    except IndexError:
        raise ParseError("Mismatched tags: no more tags: line %s" %
                         self.getpos()[0])
    if popped != tag:
        raise ParseError("Mismatched closing tag: "
                         "expected '%s' got '%s' at line %s" %
                         (popped, tag, self.getpos()[0]))
362
def handle_data(self, data):
    """Route character data to the open block or to the output source.

    Inside a block the raw data joins the block text and the callback's
    output joins the block source. Outside a block the data either opens
    an untagged block (when includeuntaggeddata is set) or has the
    callback's output appended to the output source.
    """
    if self.currenttag is None and not self.includeuntaggeddata:
        self.filesrc += self.callback(data)
        return
    if self.currenttag is None:
        self.startblock(None)
        self.currentblock += data
        self.currentsrc += data
        return
    self.currentblock += data
    self.currentsrc += self.callback(data)
373
def handle_charref(self, name):
    """Handle numeric character references.

    Both the decimal form (&#8217; gives name "8217") and the hexadecimal
    form (&#x2019; gives name "x2019") are converted to the character they
    denote and passed on to handle_data().
    """
    if name[:1] in ("x", "X"):
        codepoint = int(name[1:], 16)
    else:
        codepoint = int(name)
    self.handle_data(unichr(codepoint))
377
def handle_entityref(self, name):
    """Handle named entities of the form &aaaa; e.g. &rsquo;

    gt, lt and amp are passed on in their escaped form. Other known
    entities are converted to the character they name. Unknown entities
    are passed on verbatim as "&name;" rather than being handed to
    unichr(), which only accepts a code point.
    """
    if name in ('gt', 'lt', 'amp'):
        self.handle_data("&%s;" % name)
    elif name in name2codepoint:
        self.handle_data(unichr(name2codepoint[name]))
    else:
        self.handle_data(u"&%s;" % name)
384
392
395
396
399