]> git.proxmox.com Git - mirror_edk2.git/blame - AppPkg/Applications/Python/Python-2.7.10/Lib/HTMLParser.py
EmbeddedPkg: Extend NvVarStoreFormattedLib LIBRARY_CLASS
[mirror_edk2.git] / AppPkg / Applications / Python / Python-2.7.10 / Lib / HTMLParser.py
CommitLineData
3257aa99
DM
1"""A parser for HTML and XHTML."""\r
2\r
3# This file is based on sgmllib.py, but the API is slightly different.\r
4\r
5# XXX There should be a way to distinguish between PCDATA (parsed\r
6# character data -- the normal case), RCDATA (replaceable character\r
7# data -- only char and entity references and end tags are special)\r
8# and CDATA (character data -- only end tags are special).\r
9\r
10\r
11import markupbase\r
12import re\r
13\r
# Regular expressions used for parsing
#
# Every pattern is written as a raw string so that regex escapes such as
# ``\s`` reach the regex engine untouched instead of relying on Python
# passing unknown string escapes through.

# Next character that ends a run of plain text: start of markup or a reference.
interesting_normal = re.compile(r'[&<]')
# An '&' that looks like the beginning of an entity or character reference.
incomplete = re.compile(r'&[a-zA-Z#]')

# Both reference patterns consume one terminating character (usually ';');
# the caller decides whether that character belongs to the reference.
entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile(r'&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile(r'<[a-zA-Z]')
piclose = re.compile(r'>')
commentclose = re.compile(r'--\s*>')

# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
# note: if you change tagfind/attrfind remember to update locatestarttagend too
tagfind = re.compile(r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
# this regex is currently unused, but left for backward compatibility
tagfind_tolerant = re.compile(r'[a-zA-Z][^\t\n\r\f />\x00]*')

# group 1: attribute name, group 2: the whole "= value" part (optional),
# group 3: the value, still including any surrounding quotes.
attrfind = re.compile(
    r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')

# Matches a start tag up to, but not including, its closing '>' or '/>'.
locatestarttagend = re.compile(r"""
  <[a-zA-Z][^\t\n\r\f />\x00]*       # tag name
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile(r'>')
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
55\r
56\r
class HTMLParseError(Exception):
    """Exception raised for all parse errors.

    Attributes:
        msg    -- human readable description of the problem
        lineno -- first element of *position*, or None when unknown
        offset -- second element of *position* (zero based), or None
    """

    def __init__(self, msg, position=(None, None)):
        """Store the message and the (lineno, offset) pair.

        *msg* must be non-empty (checked with an assertion).
        """
        assert msg
        # Initialise the base class so that ``args`` carries everything
        # needed to rebuild the exception (copy/pickle call cls(*args))
        # and so that repr() shows the message.
        Exception.__init__(self, msg, position)
        self.msg = msg
        self.lineno = position[0]
        self.offset = position[1]

    def __str__(self):
        """Return the message followed by the known parts of the position."""
        result = self.msg
        if self.lineno is not None:
            result = result + ", at line %d" % self.lineno
        if self.offset is not None:
            # offset is stored zero based; report a one based column
            result = result + ", column %d" % (self.offset + 1)
        return result
73\r
74\r
class HTMLParser(markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  Entity references are
    passed by calling self.handle_entityref() with the entity
    reference as the argument.  Numeric character references are
    passed to self.handle_charref() with the string containing the
    reference as the argument.
    """

    # Elements whose content is passed through as data until the
    # matching end tag is seen (see set_cdata_mode()).
    CDATA_CONTENT_ELEMENTS = ("script", "style")

    def __init__(self):
        """Initialize and reset this instance."""
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.lasttag = '???'
        self.interesting = interesting_normal
        self.cdata_elem = None
        markupbase.ParserBase.reset(self)

    def feed(self, data):
        r"""Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    def error(self, message):
        """Raise HTMLParseError for *message* at the position from getpos()."""
        raise HTMLParseError(message, self.getpos())

    # Source text of the most recently parsed start tag (name-mangled).
    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self, elem):
        """Treat everything up to the end tag of *elem* as plain data."""
        self.cdata_elem = elem.lower()
        # Only the matching end tag is "interesting" while in CDATA mode.
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    def clear_cdata_mode(self):
        """Leave CDATA mode and look for '<' and '&' again."""
        self.interesting = interesting_normal
        self.cdata_elem = None

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        """Process self.rawdata; keep whatever cannot be handled yet.

        The unconsumed tail is stored back into self.rawdata.  Raises
        HTMLParseError (through self.error()) when *end* is true and the
        buffer stops in the middle of an entity or character reference.
        """
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            match = self.interesting.search(rawdata, i)  # < or &
            if match:
                j = match.start()
            else:
                if self.cdata_elem:
                    # still waiting for the closing tag of the CDATA element
                    break
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n:
                break
            startswith = rawdata.startswith
            if startswith('<', i):
                if starttagopen.match(rawdata, i):  # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    k = self.parse_html_declaration(i)
                elif (i + 1) < n:
                    # a lone '<' that does not start any markup
                    self.handle_data("<")
                    k = i + 1
                else:
                    break
                if k < 0:
                    # the construct is not terminated in the buffer
                    if not end:
                        break
                    # at EOF: emit the unterminated construct as data, up
                    # to and including the next '>', or up to the next '<'
                    k = rawdata.find('>', i + 1)
                    if k < 0:
                        k = rawdata.find('<', i + 1)
                        if k < 0:
                            k = i + 1
                    else:
                        k += 1
                    self.handle_data(rawdata[i:k])
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    # strip the leading '&#' and the terminating character
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    if not startswith(';', k-1):
                        # the terminator was not ';' -- leave it in the input
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    if ";" in rawdata[i:]:  # bail by consuming '&#'
                        self.handle_data(rawdata[i:i+2])
                        i = self.updatepos(i, i+2)
                    break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    if not startswith(';', k-1):
                        # the terminator was not ';' -- leave it in the input
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        self.error("EOF in middle of entity or char ref")
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n and not self.cdata_elem:
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]

    # Internal -- parse html declarations, return length or -1 if not terminated
    # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
    # See also parse_declaration in _markupbase
    def parse_html_declaration(self, i):
        """Dispatch a '<!...' construct starting at index *i*.

        Returns the index just past the construct, or -1 when it is not
        terminated yet.  Comments and marked sections are delegated to
        parse_comment() / parse_marked_section(), which are not defined
        in this class (presumably inherited from markupbase.ParserBase).
        """
        rawdata = self.rawdata
        if rawdata[i:i+2] != '<!':
            self.error('unexpected call to parse_html_declaration()')
        if rawdata[i:i+4] == '<!--':
            # this case is actually already handled in goahead()
            return self.parse_comment(i)
        elif rawdata[i:i+3] == '<![':
            return self.parse_marked_section(i)
        elif rawdata[i:i+9].lower() == '<!doctype':
            # find the closing >
            gtpos = rawdata.find('>', i+9)
            if gtpos == -1:
                return -1
            self.handle_decl(rawdata[i+2:gtpos])
            return gtpos+1
        else:
            return self.parse_bogus_comment(i)

    # Internal -- parse bogus comment, return length or -1 if not terminated
    # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
    def parse_bogus_comment(self, i, report=1):
        """Consume '<!...>' or '</...>' at *i* as a comment.

        The text between the two-character opener and the next '>' is
        passed to handle_comment() when *report* is true.  Returns the
        index just past the '>', or -1 when no '>' is present yet.
        """
        rawdata = self.rawdata
        if rawdata[i:i+2] not in ('<!', '</'):
            self.error('unexpected call to parse_bogus_comment()')
        pos = rawdata.find('>', i+2)
        if pos == -1:
            return -1
        if report:
            self.handle_comment(rawdata[i+2:pos])
        return pos + 1

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        """Parse '<?...>' at *i*; the text between '<?' and the first '>'
        is passed to handle_pi().  Returns the index past '>' or -1."""
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2)  # >
        if not match:
            return -1
        self.handle_pi(rawdata[i+2:match.start()])
        return match.end()

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        """Parse the start tag at *i* and call the matching handler.

        Returns the index just past the tag, or a negative value when
        the tag is not complete in the buffer.  Attribute names are
        lower-cased; quoted values lose their quotes and are passed
        through unescape(); attributes without a value get None.
        """
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()

        while k < endpos:
            m = attrfind.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # bare attribute, e.g. <input disabled>
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # junk between the attributes and the end of the tag: hand the
            # whole tag text over as character data
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        """Return the index just past the start tag at *i*, or -1.

        Raises AssertionError when locatestarttagend does not match at
        *i*.
        """
        rawdata = self.rawdata
        m = locatestarttagend.match(rawdata, i)
        if m:
            j = m.end()
            following = rawdata[j:j+1]
            if following == ">":
                return j + 1
            if following == "/":
                if rawdata.startswith("/>", j):
                    return j + 2
                # a '/' that is not (yet) followed by '>': buffer boundary
                return -1
            if following == "":
                # end of input
                return -1
            if following in ("abcdefghijklmnopqrstuvwxyz=/"
                             "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            if j > i:
                return j
            else:
                return i + 1
        raise AssertionError("we should not get here!")

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        """Parse the end tag at *i* and call handle_endtag().

        Returns the index just past the tag, or -1 when no '>' has been
        seen yet.  While in CDATA mode, anything that is not the end tag
        of the current CDATA element is reported through handle_data().
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1)  # >
        if not match:
            return -1
        gtpos = match.end()
        match = endtagfind.match(rawdata, i)  # </ + tag + >
        if not match:
            if self.cdata_elem is not None:
                self.handle_data(rawdata[i:gtpos])
                return gtpos
            # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
            namematch = tagfind.match(rawdata, i+2)
            if not namematch:
                # w3.org/TR/html5/tokenization.html#end-tag-open-state
                if rawdata[i:i+3] == '</>':
                    return i+3
                else:
                    return self.parse_bogus_comment(i)
            tagname = namematch.group(1).lower()
            # consume and ignore other stuff between the name and the >
            # Note: this is not 100% correct, since we might have things like
            # </tag attr=">">, but looking for > after the name should cover
            # most of the cases and is much simpler
            gtpos = rawdata.find('>', namematch.end())
            self.handle_endtag(tagname)
            return gtpos+1

        elem = match.group(1).lower()  # script or style
        if self.cdata_elem is not None:
            if elem != self.cdata_elem:
                self.handle_data(rawdata[i:gtpos])
                return gtpos

        self.handle_endtag(elem)
        self.clear_cdata_mode()
        return gtpos

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        """Default: call handle_starttag() followed by handle_endtag()."""
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        """Called with the lower-cased tag name and a list of
        (name, value) attribute pairs.  Does nothing by default."""
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        """Called with the lower-cased tag name.  Does nothing by default."""
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        """Called with the text between '&#' and the terminator."""
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        """Called with the entity name (text after '&')."""
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        """Called with a chunk of character data."""
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        """Called with the text of a comment."""
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        """Called with the declaration text between '<!' and '>'."""
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        """Called with the text between '<?' and '>'."""
        pass

    def unknown_decl(self, data):
        """Hook that does nothing here; not called from within this class."""
        pass

    # Internal -- helper to remove special character quoting
    # Name -> unicode character table, built on first use by unescape().
    entitydefs = None

    def unescape(self, s):
        """Return *s* with '&name;', '&#NNN;' and '&#xHH;' replaced.

        References that cannot be resolved (unknown name, malformed or
        out-of-range number) are left in the result unchanged.
        """
        if '&' not in s:
            return s

        def replaceEntities(match):
            ref = match.group(1)
            if ref[0] == "#":
                digits = ref[1:]
                try:
                    if digits[0] in ('x', 'X'):
                        code = int(digits[1:], 16)
                    else:
                        code = int(digits)
                    return unichr(code)
                except (ValueError, OverflowError):
                    # ValueError: not a number or outside the unicode range;
                    # OverflowError: number too large for unichr() to accept.
                    return '&#' + digits + ';'
            # Cannot use name2codepoint directly, because HTMLParser supports
            # apos, which is not part of HTML 4
            import htmlentitydefs
            if HTMLParser.entitydefs is None:
                table = HTMLParser.entitydefs = {'apos': u"'"}
                for name, codepoint in htmlentitydefs.name2codepoint.iteritems():
                    table[name] = unichr(codepoint)
            try:
                return self.entitydefs[ref]
            except KeyError:
                return '&' + ref + ';'

        return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s)