1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 """module for parsing html files for translation"""
24
25 import re
26 from translate.storage import base
27 from HTMLParser import HTMLParser
28
30 """A unit of translatable/localisable HTML content"""
34
38
42 source = property(getsource, setsource)
43
45 self.locations.append(location)
46
49
50
51 -class htmlfile(HTMLParser, base.TranslationStore):
52 UnitClass = htmlunit
53 markingtags = ["p", "title", "h1", "h2", "h3", "h4", "h5", "h6", "th", "td", "div", "li", "dt", "dd", "address", "caption"]
54 markingattrs = []
55 includeattrs = ["alt", "summary", "standby", "abbr", "content"]
56
def __init__(self, includeuntaggeddata=None, inputfile=None):
    """Initialise the parser state and optionally parse *inputfile*.

    :param includeuntaggeddata: stored on the instance; when true, data
        encountered while no tag is current starts a new block instead
        of being dropped.
    :param inputfile: optional file-like object. When given, it is read
        completely, closed, and its content is passed to ``self.parse``.
    """
    self.units = []
    # File-like objects without a ``name`` attribute yield None here.
    self.filename = getattr(inputfile, 'name', None)
    self.currentblock = ""
    self.currentblocknum = 0
    self.currentcomment = ""
    self.currenttag = None
    self.includeuntaggeddata = includeuntaggeddata
    HTMLParser.__init__(self)

    if inputfile is not None:
        # Close the file even if read() raises, so the handle never leaks.
        try:
            htmlsrc = inputfile.read()
        finally:
            inputfile.close()
        self.parse(htmlsrc)
71
def guess_encoding(self, htmlsrc):
    """Find the character set declared by the html text.

    The text is searched for a meta tag carrying a 'charset=' value.
    The first value found is returned; None is returned when the text
    declares no charset.
    """
    charset_re = '''(?i)<meta.*content.*=.*charset.*=\\s*([^\\s]*)\\s*["']'''
    found = re.search(charset_re, htmlsrc)
    if found is None:
        return None
    return found.group(1)
84
def do_encoding(self, htmlsrc):
    """Decode the html text with the charset it declares, if any.

    When ``self.guess_encoding`` finds no charset the text is returned
    untouched.
    """
    charset = self.guess_encoding(htmlsrc)
    if not charset:
        return htmlsrc
    return htmlsrc.decode(charset)
92
def phprep(self, text):
    """Replace the body of every PHP block with a placeholder hash.

    Each <?foo?> becomes <?hash(foo)?>, where the hash is the hex digest
    produced by translate.misc.hash.md5_f. The hash => code conversions
    are stored in self.phpdict for later use in restoring the real PHP.

    The purpose of this is to remove all potential "tag-like" code from
    inside PHP. The hash looks nothing like an HTML tag, but the following
    PHP::
        $a < $b ? $c : ($d > $e ? $f : $g)
    looks like it contains an HTML tag::
        < $b ? $c : ($d >
    to nearly any regex. Hence, we replace all contents of PHP with simple
    strings to help our regexes out.

    :param text: the html source, possibly containing <? ... ?> blocks.
    :return: the text with every PHP body swapped for its hash. Only the
        text is returned; the mapping lives in self.phpdict.
    """
    from translate.misc import hash as misc_hash

    self.phpdict = {}

    def placeholder(match):
        """Record one PHP body and return its hashed stand-in."""
        cmd = match.group(1)
        digest = misc_hash.md5_f(cmd).hexdigest()
        self.phpdict[digest] = cmd
        return '<?%s?>' % digest

    # Substitute match by match so only text between the <? ?> delimiters
    # changes. A blanket str.replace() on the body would also rewrite the
    # same characters in ordinary text and in hashes inserted earlier, and
    # an empty body ("<??>") would splice the hash between every character.
    return re.sub(r'(?s)<\?(.*?)\?>', placeholder, text)
119
125
def parse(self, htmlsrc):
    """Decode the html source, mask its PHP and feed it to the parser.

    The source goes through ``self.do_encoding`` and ``self.phprep`` in
    that order, and the result is handed to ``self.feed``.
    """
    decoded = self.do_encoding(htmlsrc)
    self.feed(self.phprep(decoded))
130
139
def strip_html(self, text):
    """Peel away markup that merely wraps the translatable text.

    A tag pair is considered unnecessary when it fully encloses the text,
    eg. '<a href="index.html">Home Page</a>' becomes 'Home Page'.

    Tags that occur within the normal flow of text are kept, eg.
    'This is a link to the <a href="index.html">Home Page</a>.'

    A snippet consisting only of PHP yields the empty string.
    """
    text = text.strip()

    # Nothing but a PHP block: there is no text left to keep.
    if re.search('(?s)^<\?.*?\?>$', text) is not None:
        return ""

    # One opening tag at the very start, one closing tag at the very end,
    # and everything in between captured as the content.
    wrapper = re.compile(r'''
    (?s)^       # We allow newlines, and match start of line
    <[^?>]      # Match start of tag and the first character (not ? or >)
    (?:
      (?:
        [^>]    # Anything that's not a > is valid tag material
          |
        (?:<\?.*?\?>) # Matches <? foo ?> lazily; PHP is valid
      )*        # Repeat over valid tag material
      [^?>]     # If we have > 1 char, the last char can't be ? or >
    )?          # The repeated chars are optional, so that <a>, <p> work
    >           # Match ending > of opening tag

    (.*)        # Match actual contents of tag

    </.*[^?]>   # Match ending tag; can't end with ?> and must be >=1 char
    $           # Match end of line
    ''', re.VERBOSE)
    wrapped = wrapper.search(text)
    if wrapped is None:
        return text
    # The content may itself be wrapped in another tag pair, so go again.
    return self.strip_html(wrapped.group(1))
180
182 """Check if the supplied HTML snippet has any content that needs to be translated."""
183
184 text = text.strip()
185 result = re.findall('(?i).*(charset.*=.*)', text)
186 if len(result) == 1:
187 return False
188
189
190 if text == ' ':
191 return False
192
193 pattern = '<\?.*?\?>'
194 result = re.sub(pattern, '', text).strip()
195 pattern = '<[^>]*>'
196 result = re.sub(pattern, '', result).strip()
197 if result:
198 return True
199 else:
200 return False
201
202
203
def startblock(self, tag):
    """Flush the block gathered so far and begin a new one under *tag*.

    The pending text goes to ``self.addhtmlblock`` before the block and
    comment buffers are emptied and *tag* becomes the current tag.
    """
    pending = self.currentblock
    self.addhtmlblock(pending)
    self.currentblock = self.currentcomment = ""
    self.currenttag = tag
209
def endblock(self):
    """Flush the block gathered so far and leave the current tag.

    The pending text goes to ``self.addhtmlblock`` before the block and
    comment buffers are emptied and the current tag is cleared.
    """
    pending = self.currentblock
    self.addhtmlblock(pending)
    self.currentblock = self.currentcomment = ""
    self.currenttag = None
215
217 newblock = 0
218 if tag in self.markingtags:
219 newblock = 1
220 for attrname, attrvalue in attrs:
221 if attrname in self.markingattrs:
222 newblock = 1
223 if attrname in self.includeattrs:
224 self.addhtmlblock(attrvalue)
225
226 if newblock:
227 self.startblock(tag)
228 elif self.currenttag is not None:
229 self.currentblock += self.get_starttag_text()
230
232 for attrname, attrvalue in attrs:
233 if attrname in self.includeattrs:
234 self.addhtmlblock(attrvalue)
235 if self.currenttag is not None:
236 self.currentblock += self.get_starttag_text()
237
239 if tag == self.currenttag:
240 self.endblock()
241 elif self.currenttag is not None:
242 self.currentblock += '</%s>' % tag
243
245 if self.currenttag is not None:
246 self.currentblock += data
247 elif self.includeuntaggeddata:
248 self.startblock(None)
249 self.currentblock += data
250
253
256
263
266
269