]> git.proxmox.com Git - mirror_edk2.git/blob - AppPkg/Applications/Python/Python-2.7.10/Lib/HTMLParser.py
AppPkg: Removing ipf which is no longer supported from edk2.
[mirror_edk2.git] / AppPkg / Applications / Python / Python-2.7.10 / Lib / HTMLParser.py
1 """A parser for HTML and XHTML."""
2
3 # This file is based on sgmllib.py, but the API is slightly different.
4
5 # XXX There should be a way to distinguish between PCDATA (parsed
6 # character data -- the normal case), RCDATA (replaceable character
7 # data -- only char and entity references and end tags are special)
8 # and CDATA (character data -- only end tags are special).
9
10
11 import markupbase
12 import re
13
# Regular expressions used for parsing

# In normal (non-CDATA) mode only '&' and '<' can start markup; the parser
# scans for the next one of these and treats everything before it as data.
interesting_normal = re.compile('[&<]')
# Start of something that could be an entity or character reference but
# is not (yet) complete.
incomplete = re.compile('&[a-zA-Z#]')

# A named entity reference / numeric character reference.  Both patterns
# also consume ONE character after the reference (normally ';'); callers
# check whether that character really is ';' and step back when it is not.
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

# '<' followed by a letter: the opening of a start tag.
starttagopen = re.compile('<[a-zA-Z]')
# Terminator of a processing instruction.
piclose = re.compile('>')
# Terminator of a comment; whitespace is tolerated between '--' and '>'.
commentclose = re.compile(r'--\s*>')

# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
# note: if you change tagfind/attrfind remember to update locatestarttagend too
# Group 1 is the tag name; trailing whitespace and any '/' that is not
# part of a closing '/>' are consumed as well.
tagfind = re.compile('([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
# this regex is currently unused, but left for backward compatibility
tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*')

# One attribute.  Group 1 is the attribute name, group 2 the whole
# "= value" part (absent for value-less attributes) and group 3 the value
# itself, still carrying its quotes when it was quoted.
attrfind = re.compile(
    r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')

# Matches a start tag from '<' up to, but not including, its closing '>'
# or '/>'.  Mirrors tagfind and attrfind, so keep the three in sync.
locatestarttagend = re.compile(r"""
  <[a-zA-Z][^\t\n\r\f />\x00]*       # tag name
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
""", re.VERBOSE)
# The '>' that closes an end tag.
endendtag = re.compile('>')
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
# Group 1 is the tag name of a well-formed end tag.
endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
55
56
class HTMLParseError(Exception):
    """Error signalled by the parser when markup cannot be processed.

    Attributes:
        msg    -- description of the problem (must be truthy)
        lineno -- first element of *position*, or None if unknown
        offset -- second element of *position* (0-based), or None if unknown
    """

    def __init__(self, msg, position=(None, None)):
        assert msg
        self.msg = msg
        self.lineno = position[0]
        self.offset = position[1]

    def __str__(self):
        # Start from the bare message and append whichever location
        # parts are known; the column is reported 1-based.
        parts = [self.msg]
        if self.lineno is not None:
            parts.append(", at line %d" % self.lineno)
        if self.offset is not None:
            parts.append(", column %d" % (self.offset + 1))
        return "".join(parts)
73
74
class HTMLParser(markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  Entity references are
    passed by calling self.handle_entityref() with the entity
    reference as the argument.  Numeric character references are
    passed to self.handle_charref() with the string containing the
    reference as the argument.
    """

    # Elements whose content is treated as raw character data: once one
    # of these start tags is seen, only the matching end tag is markup.
    CDATA_CONTENT_ELEMENTS = ("script", "style")


    def __init__(self):
        """Initialize and reset this instance."""
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.lasttag = '???'
        self.interesting = interesting_normal
        self.cdata_elem = None
        markupbase.ParserBase.reset(self)

    def feed(self, data):
        r"""Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    def error(self, message):
        """Raise HTMLParseError for *message* at the current position."""
        raise HTMLParseError(message, self.getpos())

    # Source text of the most recently parsed start tag, or None.
    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self, elem):
        """Enter CDATA mode for element *elem*.

        From now on the only thing the scanner looks for is the end tag
        of *elem* (matched case-insensitively).
        """
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    def clear_cdata_mode(self):
        """Leave CDATA mode and scan for '&' and '<' again."""
        self.interesting = interesting_normal
        self.cdata_elem = None

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            match = self.interesting.search(rawdata, i) # < or &
            if match:
                j = match.start()
            else:
                # In CDATA mode keep buffering until the end tag shows up.
                if self.cdata_elem:
                    break
                j = n
            # Everything before the next interesting character is data.
            if i < j: self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n: break
            startswith = rawdata.startswith
            if startswith('<', i):
                if starttagopen.match(rawdata, i): # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    k = self.parse_html_declaration(i)
                elif (i + 1) < n:
                    # A lone '<' that starts no construct is plain data.
                    self.handle_data("<")
                    k = i + 1
                else:
                    # '<' is the last buffered char; wait for more input.
                    break
                if k < 0:
                    # The construct is not terminated in the buffer.
                    if not end:
                        break
                    # At EOF: emit the unterminated construct as data, up
                    # to and including the next '>', else up to the next
                    # '<', else just the single '<'.
                    k = rawdata.find('>', i + 1)
                    if k < 0:
                        k = rawdata.find('<', i + 1)
                        if k < 0:
                            k = i + 1
                    else:
                        k += 1
                    self.handle_data(rawdata[i:k])
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    # Strip the leading '&#' and the one trailing char
                    # that charref always consumes.
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    # Give the trailing char back unless it was ';'.
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    if ";" in rawdata[i:]: # bail by consuming '&#'
                        self.handle_data(rawdata[i:i+2])
                        i = self.updatepos(i, i+2)
                    break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    # Give the trailing char back unless it was ';'.
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        self.error("EOF in middle of entity or char ref")
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n and not self.cdata_elem:
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        # Keep whatever was not consumed for the next call.
        self.rawdata = rawdata[i:]

    # Internal -- parse html declarations, return length or -1 if not terminated
    # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
    # See also parse_declaration in _markupbase
    def parse_html_declaration(self, i):
        rawdata = self.rawdata
        if rawdata[i:i+2] != '<!':
            self.error('unexpected call to parse_html_declaration()')
        if rawdata[i:i+4] == '<!--':
            # this case is actually already handled in goahead()
            return self.parse_comment(i)
        elif rawdata[i:i+3] == '<![':
            return self.parse_marked_section(i)
        elif rawdata[i:i+9].lower() == '<!doctype':
            # find the closing >
            gtpos = rawdata.find('>', i+9)
            if gtpos == -1:
                return -1
            # Report the declaration without its '<!' and '>' delimiters.
            self.handle_decl(rawdata[i+2:gtpos])
            return gtpos+1
        else:
            return self.parse_bogus_comment(i)

    # Internal -- parse bogus comment, return length or -1 if not terminated
    # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
    def parse_bogus_comment(self, i, report=1):
        rawdata = self.rawdata
        if rawdata[i:i+2] not in ('<!', '</'):
            # The message used to name parse_comment(), which is a
            # different method; report the function actually misused.
            self.error('unexpected call to parse_bogus_comment()')
        pos = rawdata.find('>', i+2)
        if pos == -1:
            return -1
        if report:
            # Pass on the text between the two-char opener and the '>'.
            self.handle_comment(rawdata[i+2:pos])
        return pos + 1

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2) # >
        if not match:
            return -1
        j = match.start()
        self.handle_pi(rawdata[i+2: j])
        j = match.end()
        return j

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()

        while k < endpos:
            m = attrfind.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # Attribute given without '= value'.
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                # Drop the surrounding quotes.
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # Junk between the last attribute and the closing bracket:
            # hand the whole tag text to handle_data() untouched.
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        rawdata = self.rawdata
        m = locatestarttagend.match(rawdata, i)
        if m:
            j = m.end()
            # The character right after the matched tag text ('' at the
            # end of the buffer).
            nextchar = rawdata[j:j+1]
            if nextchar == ">":
                return j + 1
            if nextchar == "/":
                if rawdata.startswith("/>", j):
                    return j + 2
                if rawdata.startswith("/", j):
                    # buffer boundary
                    return -1
                # else bogus input
                # NOTE: nextchar == "/" makes the test above always true,
                # so the two statements below cannot be reached.
                self.updatepos(i, j + 1)
                self.error("malformed empty start tag")
            if nextchar == "":
                # end of input
                return -1
            if nextchar in ("abcdefghijklmnopqrstuvwxyz=/"
                            "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            if j > i:
                return j
            else:
                return i + 1
        raise AssertionError("we should not get here!")

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1) # >
        if not match:
            return -1
        gtpos = match.end()
        match = endtagfind.match(rawdata, i) # </ + tag + >
        if not match:
            if self.cdata_elem is not None:
                # Inside script/style a malformed end tag is just data.
                self.handle_data(rawdata[i:gtpos])
                return gtpos
            # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
            namematch = tagfind.match(rawdata, i+2)
            if not namematch:
                # w3.org/TR/html5/tokenization.html#end-tag-open-state
                if rawdata[i:i+3] == '</>':
                    return i+3
                else:
                    return self.parse_bogus_comment(i)
            tagname = namematch.group(1).lower()
            # consume and ignore other stuff between the name and the >
            # Note: this is not 100% correct, since we might have things like
            # </tag attr=">">, but looking for > after the name should cover
            # most of the cases and is much simpler
            gtpos = rawdata.find('>', namematch.end())
            self.handle_endtag(tagname)
            return gtpos+1

        elem = match.group(1).lower() # script or style
        if self.cdata_elem is not None:
            if elem != self.cdata_elem:
                # An end tag for some other element inside CDATA content
                # is reported as data.
                self.handle_data(rawdata[i:gtpos])
                return gtpos

        self.handle_endtag(elem)
        self.clear_cdata_mode()
        return gtpos

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        pass

    def unknown_decl(self, data):
        pass

    # Internal -- helper to remove special character quoting
    # Name -> character table shared by all instances; built on first use.
    entitydefs = None
    def unescape(self, s):
        """Return *s* with entity and character references replaced.

        Numeric references (&#65; / &#x41;) become the corresponding
        character and known named references become their character.
        References that cannot be resolved -- unknown names, malformed
        numbers, or numbers unichr() rejects -- are left in the text
        unchanged.
        """
        if '&' not in s:
            return s
        def replaceEntities(match):
            ref = match.group(1)
            if ref[0] == "#":
                digits = ref[1:]
                try:
                    if digits[0] in ('x', 'X'):
                        c = int(digits[1:], 16)
                    else:
                        c = int(digits)
                    return unichr(c)
                except (ValueError, OverflowError):
                    # ValueError: malformed number or code point out of
                    # range.  OverflowError: value too large for unichr()
                    # to even accept; it used to escape and abort parsing.
                    return '&#' + digits + ';'
            # Cannot use name2codepoint directly, because HTMLParser supports apos,
            # which is not part of HTML 4
            import htmlentitydefs
            if HTMLParser.entitydefs is None:
                entitydefs = HTMLParser.entitydefs = {'apos':u"'"}
                for k, v in htmlentitydefs.name2codepoint.iteritems():
                    entitydefs[k] = unichr(v)
            try:
                return self.entitydefs[ref]
            except KeyError:
                return '&' + ref + ';'

        return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s)
475 return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s)