1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 """module for parsing html files for translation"""
24
25 import re
26 from translate.storage import base
27 from HTMLParser import HTMLParser
28
30 """A unit of translatable/localisable HTML content"""
34
38
41 source = property(getsource, setsource)
42
44 self.locations.append(location)
45
48
49
class htmlfile(HTMLParser, base.TranslationStore):
    """Translation store that is populated by running HTML through HTMLParser."""

    # Unit type associated with this store.
    UnitClass = htmlunit

    # Start tags that open a new block when they are encountered.
    markingtags = [
        "p", "title", "h1", "h2", "h3", "h4", "h5", "h6", "th", "td",
        "div", "li", "dt", "dd", "address", "caption",
    ]

    # Attribute names that, when present on a start tag, also open a new block.
    markingattrs = []

    # Attribute names whose values are handed to addhtmlblock() on their own.
    includeattrs = ["alt", "summary", "standby", "abbr", "content"]
55
def __init__(self, includeuntaggeddata=None, inputfile=None):
    """Set up the parser state and, if a file is given, parse it at once.

    includeuntaggeddata -- when true, character data that arrives while no
        block is open is collected as well (see the data handler).
    inputfile -- optional file-like object.  It is read completely, closed,
        and its contents are passed to parse().  Its ``name`` attribute, if
        any, is remembered as ``self.filename``.
    """
    self.units = []
    self.filename = getattr(inputfile, 'name', None)
    self.currentblock = ""
    self.currentblocknum = 0
    self.currenttag = None
    self.includeuntaggeddata = includeuntaggeddata
    HTMLParser.__init__(self)

    if inputfile is not None:
        # Close the file even when read() raises, so the handle is not leaked.
        try:
            htmlsrc = inputfile.read()
        finally:
            inputfile.close()
        self.parse(htmlsrc)
69
def guess_encoding(self, htmlsrc):
    """Return the charset declared in the html text, or None.

    The text is searched for 'charset=' within a meta tag; the first match
    of the pattern supplies the encoding name.
    """
    charset_re = '''(?i)<meta.*content.*=.*charset.*=\\s*([^\\s]*)\\s*["']'''
    found = re.search(charset_re, htmlsrc)
    if found is None:
        return None
    return found.group(1)
82
def do_encoding(self, htmlsrc):
    """Decode the html text with the charset the document itself declares.

    When guess_encoding() finds no charset the text is returned untouched.
    """
    declared = self.guess_encoding(htmlsrc)
    return htmlsrc.decode(declared) if declared else htmlsrc
90
def phprep(self, text):
    """Replace the contents of every PHP block with a placeholder hash.

    Each ``<?foo?>`` becomes ``<?md5(foo)?>``.  The hash => code mapping is
    stored in ``self.phpdict`` (reset on every call) for later use in
    restoring the real PHP.  The rewritten text is returned.

    The purpose of this is to remove all potential "tag-like" code from
    inside PHP.  The hash looks nothing like an HTML tag, but the following
    PHP::
        $a < $b ? $c : ($d > $e ? $f : $g)
    looks like it contains an HTML tag::
        < $b ? $c : ($d >
    to nearly any regex.  Hence, we replace all contents of PHP with simple
    strings to help our regexes out.

    Only the text between ``<?`` and ``?>`` is substituted.  Text outside
    PHP blocks that happens to equal a block's contents is left alone, and
    an empty block (``<??>``) is handled like any other.
    """
    # NOTE(review): hashlib.md5 stands in for translate.misc.hash.md5_f here;
    # this assumes that wrapper is plain MD5, as the docstring says -- confirm.
    import hashlib

    self.phpdict = {}

    def placeholder(match):
        code = match.group(1)
        # md5 needs bytes; text that was already decoded is hashed as UTF-8.
        raw = code if isinstance(code, bytes) else code.encode('utf-8')
        digest = hashlib.md5(raw).hexdigest()
        self.phpdict[digest] = code
        return '<?%s?>' % digest

    # Substituting per match (instead of str.replace on the block contents)
    # guarantees nothing outside the PHP delimiters is rewritten.
    return re.sub(r'(?s)<\?(.*?)\?>', placeholder, text)
117
123
def parse(self, htmlsrc):
    """Decode the html text, mask its PHP blocks and feed it to the parser."""
    decoded = self.do_encoding(htmlsrc)
    self.feed(self.phprep(decoded))
128
136
def strip_html(self, text):
    """Strip unnecessary html from the text.

    HTML tags are deemed unnecessary if they fully enclose the translatable
    text, eg. '<a href="index.html">Home Page</a>'.  Enclosing tags are
    peeled off repeatedly, so '<b><i>Hi</i></b>' yields 'Hi'.

    HTML tags that occur within the normal flow of text will not be removed,
    eg. 'This is a link to the <a href="index.html">Home Page</a>.'

    Text that consists of nothing but PHP blocks (optionally separated by
    whitespace) yields the empty string.  Text that merely starts and ends
    with a PHP block, eg. '<?a?>Hello<?b?>', is kept.
    """
    # Each PHP block is matched without running past its closing "?>", so
    # ordinary text between two blocks is never mistaken for PHP.
    php_only = re.compile(r'(?s)^(?:<\?(?:(?!\?>).)*\?>\s*)+$')

    enclosing_tag = re.compile(r'''(?s)^  # We allow newlines, and match start of line
        <[^?>]              # Match start of tag and the first character (not ? or >)
        (?:
          (?:
            [^>]            # Anything that's not a > is valid tag material
              |
            (?:<\?.*?\?>)   # Matches <? foo ?> lazily; PHP is valid
          )*                # Repeat over valid tag material
          [^?>]             # If we have > 1 char, the last char can't be ? or >
        )?                  # The repeated chars are optional, so that <a>, <p> work
        >                   # Match ending > of opening tag

        (.*)                # Match actual contents of tag

        </.*[^?]>           # Match ending tag; can't end with ?> and must be >=1 char
        $                   # Match end of line
        ''', re.VERBOSE)

    text = text.strip()
    while True:
        # If all that is left is PHP, there is nothing to translate.
        if php_only.match(text):
            return ""
        enclosed = enclosing_tag.match(text)
        if enclosed is None:
            return text
        # Drop the enclosing tag pair and examine what it contained.
        text = enclosed.group(1).strip()
177
# NOTE(review): the header line of this method was not visible when this block
# was revised; the name and signature below are reconstructed -- confirm.
def has_translatable_content(self, text):
    """Check if the supplied HTML snippet has any content that needs to be translated.

    Returns False when, after stripping surrounding whitespace, the snippet
    contains exactly one ``charset ... =`` declaration, is a lone ``&nbsp;``
    entity, or is empty once PHP blocks and HTML tags are removed.
    Returns True otherwise.
    """
    text = text.strip()

    # A single charset declaration is presumably the value of a meta
    # "content" attribute rather than prose.
    if len(re.findall('(?i).*(charset.*=.*)', text)) == 1:
        return False

    # A lone non-breaking space carries nothing to translate.  (Comparing
    # the stripped text against a plain ' ' could never succeed.)
    if text == '&nbsp;':
        return False

    # Remove PHP first; (?s) lets a block span several lines, so that a '>'
    # inside multi-line PHP is not taken for the end of an HTML tag.
    remainder = re.sub(r'(?s)<\?.*?\?>', '', text).strip()
    remainder = re.sub(r'<[^>]*>', '', remainder).strip()
    return bool(remainder)
198
199
200
def startblock(self, tag):
    """Hand the collected text to addhtmlblock() and begin a block for tag."""
    self.addhtmlblock(self.currentblock)
    self.currentblock, self.currenttag = "", tag
205
def endblock(self):
    """Hand the collected text to addhtmlblock() and leave the current block."""
    self.addhtmlblock(self.currentblock)
    self.currentblock, self.currenttag = "", None
210
# NOTE(review): header reconstructed from the HTMLParser handler API -- confirm.
def handle_starttag(self, tag, attrs):
    """Handle an opening tag.

    Values of attributes listed in includeattrs are passed to addhtmlblock().
    A tag in markingtags, or one carrying an attribute in markingattrs,
    starts a new block; any other tag is appended verbatim to the block
    that is currently open, if there is one.
    """
    opens_block = tag in self.markingtags
    for name, value in attrs:
        opens_block = opens_block or name in self.markingattrs
        if name in self.includeattrs:
            self.addhtmlblock(value)

    if opens_block:
        self.startblock(tag)
        return
    if self.currenttag is not None:
        self.currentblock += self.get_starttag_text()
225
# NOTE(review): header reconstructed from the HTMLParser handler API -- confirm.
def handle_startendtag(self, tag, attrs):
    """Handle a self-closing tag.

    Values of attributes listed in includeattrs are passed to addhtmlblock();
    the tag text itself is appended to the open block, if there is one.
    """
    for name, value in attrs:
        if name not in self.includeattrs:
            continue
        self.addhtmlblock(value)
    if self.currenttag is None:
        return
    self.currentblock += self.get_starttag_text()
232
# NOTE(review): header reconstructed from the HTMLParser handler API -- confirm.
def handle_endtag(self, tag):
    """Handle a closing tag.

    The tag that opened the current block ends it; any other closing tag is
    appended to the open block, if there is one.
    """
    if tag == self.currenttag:
        self.endblock()
        return
    if self.currenttag is not None:
        self.currentblock += '</%s>' % tag
238
# NOTE(review): header reconstructed from the HTMLParser handler API -- confirm.
def handle_data(self, data):
    """Collect character data into the current block.

    Data that arrives while no block is open is dropped unless
    includeuntaggeddata is set, in which case startblock(None) is called
    first and the data is collected.
    """
    if self.currenttag is None:
        if not self.includeuntaggeddata:
            return
        self.startblock(None)
    self.currentblock += data
245
248
251
255
258
261