]> git.proxmox.com Git - mirror_edk2.git/blame - AppPkg/Applications/Python/Python-2.7.2/Lib/HTMLParser.py
EmbeddedPkg: Extend NvVarStoreFormattedLib LIBRARY_CLASS
[mirror_edk2.git] / AppPkg / Applications / Python / Python-2.7.2 / Lib / HTMLParser.py
CommitLineData
4710c53d 1"""A parser for HTML and XHTML."""\r
2\r
3# This file is based on sgmllib.py, but the API is slightly different.\r
4\r
5# XXX There should be a way to distinguish between PCDATA (parsed\r
6# character data -- the normal case), RCDATA (replaceable character\r
7# data -- only char and entity references and end tags are special)\r
8# and CDATA (character data -- only end tags are special).\r
9\r
10\r
11import markupbase\r
12import re\r
13\r
# Regular expressions used for parsing

# Characters that interrupt plain character data in normal mode.
interesting_normal = re.compile('[&<]')
# In CDATA mode only "</" (or a "<" at the very end of the buffer) matters.
interesting_cdata = re.compile(r'<(/|\Z)')
# Start of something that may become an entity or character reference.
incomplete = re.compile('&[a-zA-Z#]')

# Named entity reference; group 1 is the name.  The match also consumes
# the single character following the name (';' or any other terminator).
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
# Decimal or hexadecimal character reference, plus one terminator character.
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

# "<" immediately followed by a letter: the opening of a start tag.
starttagopen = re.compile('<[a-zA-Z]')
# End of a processing instruction.
piclose = re.compile('>')
# End of a comment.
commentclose = re.compile(r'--\s*>')
# Tag name.
tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*')
# One attribute: group 1 is the name, group 2 the optional "= value" part,
# group 3 the value itself (still including any surrounding quotes).
attrfind = re.compile(
    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')

# Matches a start tag from "<" up to, but not including, its closing
# ">" or "/>".
locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
# Closing ">" of an end tag.
endendtag = re.compile('>')
# Complete end tag; group 1 is the tag name.  Written as a raw string so
# that "\s" reaches the regex engine without relying on Python leaving
# unknown string escapes untouched.
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
47\r
48\r
class HTMLParseError(Exception):
    """Exception raised for all parse errors.

    Attributes:
        msg    -- the error message (must be truthy)
        lineno -- first element of *position*, or None
        offset -- second element of *position*, or None
    """

    def __init__(self, msg, position=(None, None)):
        """Store the message and the (lineno, offset) pair from *position*."""
        assert msg
        lineno, offset = position[0], position[1]
        self.msg = msg
        self.lineno = lineno
        self.offset = offset

    def __str__(self):
        """Return the message followed by whichever position parts are set.

        The offset is stored as-is and printed as a column number one
        greater than the stored value.
        """
        pieces = [self.msg]
        if self.lineno is not None:
            pieces.append(", at line %d" % self.lineno)
        if self.offset is not None:
            pieces.append(", column %d" % (self.offset + 1))
        return "".join(pieces)
65\r
66\r
class HTMLParser(markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  Entity references are
    passed by calling self.handle_entityref() with the entity
    reference as the argument.  Numeric character references are
    passed to self.handle_charref() with the string containing the
    reference as the argument.
    """

    # Start tags with one of these names switch the parser into CDATA mode
    # (see parse_starttag).
    CDATA_CONTENT_ELEMENTS = ("script", "style")

    def __init__(self):
        """Initialize and reset this instance."""
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.lasttag = '???'
        self.interesting = interesting_normal
        markupbase.ParserBase.reset(self)

    def feed(self, data):
        r"""Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    def error(self, message):
        """Raise HTMLParseError for *message* at the position from getpos()."""
        raise HTMLParseError(message, self.getpos())

    # Source text of the most recently parsed start tag; None until one
    # has been parsed completely.
    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self):
        """Make goahead() stop only where interesting_cdata matches."""
        self.interesting = interesting_cdata

    def clear_cdata_mode(self):
        """Return to normal scanning, where '&' and '<' are both special."""
        self.interesting = interesting_normal

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        """Consume self.rawdata, dispatching to the parse_*/handle_* methods.

        Whatever cannot be handled yet stays in self.rawdata for the next
        call.  When *end* is true an unterminated construct is reported
        through self.error() and any remaining text is passed to
        handle_data().
        """
        rawdata = self.rawdata
        # rawdata is never rebound below, so the bound method can be
        # fetched once instead of on every iteration.
        startswith = rawdata.startswith
        i = 0
        n = len(rawdata)
        while i < n:
            match = self.interesting.search(rawdata, i) # < or &
            if match:
                j = match.start()
            else:
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n:
                break
            if startswith('<', i):
                if starttagopen.match(rawdata, i): # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    k = self.parse_declaration(i)
                elif (i + 1) < n:
                    # A '<' that opens nothing we recognize is plain data.
                    self.handle_data("<")
                    k = i + 1
                else:
                    # Lone '<' at the end of the buffer: wait for more data.
                    break
                if k < 0:
                    # The construct is not complete yet.
                    if end:
                        self.error("EOF in middle of construct")
                    break
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    # charref also swallows one terminator character;
                    # hand it back unless it is the ';'.
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    if ";" in rawdata[i:]: #bail by consuming &#
                        # Emit the "&#" found at i.  (Slicing from 0 and
                        # moving to the absolute index 2 would repeat text
                        # already handled and rewind the position.)
                        self.handle_data(rawdata[i:i+2])
                        i = self.updatepos(i, i + 2)
                    break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    # Same terminator handling as for character references.
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        self.error("EOF in middle of entity or char ref")
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n:
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        """Parse the processing instruction that starts at index *i*.

        Passes the text between '<?' and the next '>' to handle_pi() and
        returns the index just past that '>', or -1 if no '>' is buffered.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2) # >
        if not match:
            return -1
        self.handle_pi(rawdata[i+2:match.start()])
        return match.end()

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        """Parse the start tag at index *i* and call the matching handler.

        Returns the index just past the tag, or a negative value if the
        tag is not completely buffered yet.  Calls self.error() when
        unparsable text is left between the attributes and the closing
        '>' or '/>'.
        """
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()

        while k < endpos:
            m = attrfind.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # Attribute given without "= value".
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # Quoted value: drop the quotes and expand references.
                attrvalue = attrvalue[1:-1]
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            self.error("junk characters in start tag: %r"
                       % (rawdata[k:endpos][:20],))
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode()
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        """Return the index just past the start tag at *i*, or -1.

        -1 means the tag may still be completed by more input.  Input
        that cannot become a valid start tag is reported via self.error().
        """
        rawdata = self.rawdata
        m = locatestarttagend.match(rawdata, i)
        if m:
            j = m.end()
            nextchar = rawdata[j:j+1]
            if nextchar == ">":
                return j + 1
            if nextchar == "/":
                if rawdata.startswith("/>", j):
                    return j + 2
                if rawdata.startswith("/", j):
                    # buffer boundary
                    # NOTE(review): nextchar == "/" makes this test always
                    # true, so the two statements below cannot be reached.
                    return -1
                # else bogus input
                self.updatepos(i, j + 1)
                self.error("malformed empty start tag")
            if nextchar == "":
                # end of input
                return -1
            if nextchar in ("abcdefghijklmnopqrstuvwxyz=/"
                            "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            self.updatepos(i, j)
            self.error("malformed start tag")
        raise AssertionError("we should not get here!")

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        """Parse the end tag at index *i*.

        Calls handle_endtag() with the lower-cased tag name, leaves CDATA
        mode, and returns the index just past the closing '>'.  Returns
        -1 if no '>' is buffered yet; calls self.error() if the text up
        to that '>' is not a well-formed end tag.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1) # >
        if not match:
            return -1
        j = match.end()
        match = endtagfind.match(rawdata, i) # </ + tag + >
        if not match:
            self.error("bad end tag: %r" % (rawdata[i:j],))
        tag = match.group(1)
        self.handle_endtag(tag.lower())
        self.clear_cdata_mode()
        return j

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        """Handle '<tag .../>' as a start tag immediately followed by its end tag."""
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        """Called with the lower-cased tag name and a list of (name, value) pairs."""
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        """Called with the lower-cased name of an end tag."""
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        """Called with the text between '&#' and the terminator, e.g. '65' or 'x41'."""
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        """Called with the entity name, e.g. 'amp' for '&amp;'."""
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        """Called with a chunk of character data."""
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        """Overridable comment hook; not called from within this class."""
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        """Overridable declaration hook; not called from within this class."""
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        """Called with the text between '<?' and the closing '>'."""
        pass

    def unknown_decl(self, data):
        """Report an unrecognized declaration through self.error()."""
        self.error("unknown declaration: %r" % (data,))

    # Internal -- helper to remove special character quoting
    # Shared name -> character table, built on first use by unescape().
    entitydefs = None
    def unescape(self, s):
        """Return *s* with entity and character references expanded.

        Only references terminated by ';' are replaced.  A reference that
        cannot be resolved (unknown name, invalid or out-of-range number)
        is left in the text.
        """
        if '&' not in s:
            return s
        def replaceEntities(match):
            ref = match.groups()[0]
            try:
                if ref[0] == "#":
                    ref = ref[1:]
                    if ref[0] in ['x','X']:
                        c = int(ref[1:], 16)
                    else:
                        c = int(ref)
                    return unichr(c)
            except ValueError:
                return '&#'+ref+';'
            else:
                # Cannot use name2codepoint directly, because HTMLParser supports apos,
                # which is not part of HTML 4
                import htmlentitydefs
                if HTMLParser.entitydefs is None:
                    entitydefs = HTMLParser.entitydefs = {'apos':u"'"}
                    for k, v in htmlentitydefs.name2codepoint.iteritems():
                        entitydefs[k] = unichr(v)
                try:
                    return self.entitydefs[ref]
                except KeyError:
                    return '&'+ref+';'

        return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s)