AppPkg/Applications/Python/Python-2.7.10/Lib/HTMLParser.py

   1 """A parser for HTML and XHTML."""
   2
   3 # This file is based on sgmllib.py, but the API is slightly different.
   4
   5 # XXX There should be a way to distinguish between PCDATA (parsed
   6 # character data -- the normal case), RCDATA (replaceable character
   7 # data -- only char and entity references and end tags are special)
   8 # and CDATA (character data -- only end tags are special).
   9
  10
  11 import markupbase
  12 import re
  13
  14 # Regular expressions used for parsing
  15
  16 interesting_normal = re.compile('[&<]')
  17 incomplete = re.compile('&[a-zA-Z#]')
  18
  19 entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
  20 charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')
  21
  22 starttagopen = re.compile('<[a-zA-Z]')
  23 piclose = re.compile('>')
  24 commentclose = re.compile(r'--\s*>')
  25
  26 # see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
  27 # and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
  28 # note: if you change tagfind/attrfind remember to update locatestarttagend too
  29 tagfind = re.compile('([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*')
  30 # this regex is currently unused, but left for backward compatibility
  31 tagfind_tolerant = re.compile('[a-zA-Z][^\t\n\r\f />\x00]*')
  32
  33 attrfind = re.compile(
  34     r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
  35     r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
  36
  37 locatestarttagend = re.compile(r"""
  38   <[a-zA-Z][^\t\n\r\f />\x00]*       # tag name
  39   (?:[\s/]*                          # optional whitespace before attribute name
  40     (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
  41       (?:\s*=+\s*                    # value indicator
  42         (?:'[^']*'                   # LITA-enclosed value
  43           |"[^"]*"                   # LIT-enclosed value
  44           |(?!['"])[^>\s]*           # bare value
  45          )
  46        )?(?:\s|/(?!>))*
  47      )*
  48    )?
  49   \s*                                # trailing whitespace
  50 """, re.VERBOSE)
  51 endendtag = re.compile('>')
  52 # the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
  53 # </ and the tag name, so maybe this should be fixed
  54 endtagfind = re.compile('</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')
  55
  56
  57 class HTMLParseError(Exception):
  58     """Exception raised for all parse errors."""
  59
  60     def __init__(self, msg, position=(None, None)):
  61         assert msg
  62         self.msg = msg
  63         self.lineno = position[0]
  64         self.offset = position[1]
  65
  66     def __str__(self):
  67         result = self.msg
  68         if self.lineno is not None:
  69             result = result + ", at line %d" % self.lineno
  70         if self.offset is not None:
  71             result = result + ", column %d" % (self.offset + 1)
  72         return result
  73
  74
  75 class HTMLParser(markupbase.ParserBase):
  76     """Find tags and other markup and call handler functions.
  77
  78     Usage:
  79         p = HTMLParser()
  80         p.feed(data)
  81         ...
  82         p.close()
  83
  84     Start tags are handled by calling self.handle_starttag() or
  85     self.handle_startendtag(); end tags by self.handle_endtag().  The
  86     data between tags is passed from the parser to the derived class
  87     by calling self.handle_data() with the data as argument (the data
  88     may be split up in arbitrary chunks).  Entity references are
  89     passed by calling self.handle_entityref() with the entity
  90     reference as the argument.  Numeric character references are
  91     passed to self.handle_charref() with the string containing the
  92     reference as the argument.
  93     """
  94
  95     CDATA_CONTENT_ELEMENTS = ("script", "style")
  96
  97
  98     def __init__(self):
  99         """Initialize and reset this instance."""
 100         self.reset()
 101
 102     def reset(self):
 103         """Reset this instance.  Loses all unprocessed data."""
 104         self.rawdata = ''
 105         self.lasttag = '???'
 106         self.interesting = interesting_normal
 107         self.cdata_elem = None
 108         markupbase.ParserBase.reset(self)
 109
 110     def feed(self, data):
 111         r"""Feed data to the parser.
 112
 113         Call this as often as you want, with as little or as much text
 114         as you want (may include '\n').
 115         """
 116         self.rawdata = self.rawdata + data
 117         self.goahead(0)
 118
 119     def close(self):
 120         """Handle any buffered data."""
 121         self.goahead(1)
 122
 123     def error(self, message):
 124         raise HTMLParseError(message, self.getpos())
 125
 126     __starttag_text = None
 127
 128     def get_starttag_text(self):
 129         """Return full source of start tag: '<...>'."""
 130         return self.__starttag_text
 131
 132     def set_cdata_mode(self, elem):
 133         self.cdata_elem = elem.lower()
 134         self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
 135
 136     def clear_cdata_mode(self):
 137         self.interesting = interesting_normal
 138         self.cdata_elem = None
 139
 140     # Internal -- handle data as far as reasonable.  May leave state
 141     # and data to be processed by a subsequent call.  If 'end' is
 142     # true, force handling all data as if followed by EOF marker.
 143     def goahead(self, end):
 144         rawdata = self.rawdata
 145         i = 0
 146         n = len(rawdata)
 147         while i < n:
 148             match = self.interesting.search(rawdata, i) # < or &
 149             if match:
 150                 j = match.start()
 151             else:
 152                 if self.cdata_elem:
 153                     break
 154                 j = n
 155             if i < j: self.handle_data(rawdata[i:j])
 156             i = self.updatepos(i, j)
 157             if i == n: break
 158             startswith = rawdata.startswith
 159             if startswith('<', i):
 160                 if starttagopen.match(rawdata, i): # < + letter
 161                     k = self.parse_starttag(i)
 162                 elif startswith("</", i):
 163                     k = self.parse_endtag(i)
 164                 elif startswith("<!--", i):
 165                     k = self.parse_comment(i)
 166                 elif startswith("<?", i):
 167                     k = self.parse_pi(i)
 168                 elif startswith("<!", i):
 169                     k = self.parse_html_declaration(i)
 170                 elif (i + 1) < n:
 171                     self.handle_data("<")
 172                     k = i + 1
 173                 else:
 174                     break
 175                 if k < 0:
 176                     if not end:
 177                         break
 178                     k = rawdata.find('>', i + 1)
 179                     if k < 0:
 180                         k = rawdata.find('<', i + 1)
 181                         if k < 0:
 182                             k = i + 1
 183                     else:
 184                         k += 1
 185                     self.handle_data(rawdata[i:k])
 186                 i = self.updatepos(i, k)
 187             elif startswith("&#", i):
 188                 match = charref.match(rawdata, i)
 189                 if match:
 190                     name = match.group()[2:-1]
 191                     self.handle_charref(name)
 192                     k = match.end()
 193                     if not startswith(';', k-1):
 194                         k = k - 1
 195                     i = self.updatepos(i, k)
 196                     continue
 197                 else:
 198                     if ";" in rawdata[i:]:  # bail by consuming '&#'
 199                         self.handle_data(rawdata[i:i+2])
 200                         i = self.updatepos(i, i+2)
 201                     break
 202             elif startswith('&', i):
 203                 match = entityref.match(rawdata, i)
 204                 if match:
 205                     name = match.group(1)
 206                     self.handle_entityref(name)
 207                     k = match.end()
 208                     if not startswith(';', k-1):
 209                         k = k - 1
 210                     i = self.updatepos(i, k)
 211                     continue
 212                 match = incomplete.match(rawdata, i)
 213                 if match:
 214                     # match.group() will contain at least 2 chars
 215                     if end and match.group() == rawdata[i:]:
 216                         self.error("EOF in middle of entity or char ref")
 217                     # incomplete
 218                     break
 219                 elif (i + 1) < n:
 220                     # not the end of the buffer, and can't be confused
 221                     # with some other construct
 222                     self.handle_data("&")
 223                     i = self.updatepos(i, i + 1)
 224                 else:
 225                     break
 226             else:
 227                 assert 0, "interesting.search() lied"
 228         # end while
 229         if end and i < n and not self.cdata_elem:
 230             self.handle_data(rawdata[i:n])
 231             i = self.updatepos(i, n)
 232         self.rawdata = rawdata[i:]
 233
 234     # Internal -- parse html declarations, return length or -1 if not terminated
 235     # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
 236     # See also parse_declaration in _markupbase
 237     def parse_html_declaration(self, i):
 238         rawdata = self.rawdata
 239         if rawdata[i:i+2] != '<!':
 240             self.error('unexpected call to parse_html_declaration()')
 241         if rawdata[i:i+4] == '<!--':
 242             # this case is actually already handled in goahead()
 243             return self.parse_comment(i)
 244         elif rawdata[i:i+3] == '<![':
 245             return self.parse_marked_section(i)
 246         elif rawdata[i:i+9].lower() == '<!doctype':
 247             # find the closing >
 248             gtpos = rawdata.find('>', i+9)
 249             if gtpos == -1:
 250                 return -1
 251             self.handle_decl(rawdata[i+2:gtpos])
 252             return gtpos+1
 253         else:
 254             return self.parse_bogus_comment(i)
 255
 256     # Internal -- parse bogus comment, return length or -1 if not terminated
 257     # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
 258     def parse_bogus_comment(self, i, report=1):
 259         rawdata = self.rawdata
 260         if rawdata[i:i+2] not in ('<!', '</'):
 261             self.error('unexpected call to parse_comment()')
 262         pos = rawdata.find('>', i+2)
 263         if pos == -1:
 264             return -1
 265         if report:
 266             self.handle_comment(rawdata[i+2:pos])
 267         return pos + 1
 268
 269     # Internal -- parse processing instr, return end or -1 if not terminated
 270     def parse_pi(self, i):
 271         rawdata = self.rawdata
 272         assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
 273         match = piclose.search(rawdata, i+2) # >
 274         if not match:
 275             return -1
 276         j = match.start()
 277         self.handle_pi(rawdata[i+2: j])
 278         j = match.end()
 279         return j
 280
 281     # Internal -- handle starttag, return end or -1 if not terminated
 282     def parse_starttag(self, i):
 283         self.__starttag_text = None
 284         endpos = self.check_for_whole_start_tag(i)
 285         if endpos < 0:
 286             return endpos
 287         rawdata = self.rawdata
 288         self.__starttag_text = rawdata[i:endpos]
 289
 290         # Now parse the data between i+1 and j into a tag and attrs
 291         attrs = []
 292         match = tagfind.match(rawdata, i+1)
 293         assert match, 'unexpected call to parse_starttag()'
 294         k = match.end()
 295         self.lasttag = tag = match.group(1).lower()
 296
 297         while k < endpos:
 298             m = attrfind.match(rawdata, k)
 299             if not m:
 300                 break
 301             attrname, rest, attrvalue = m.group(1, 2, 3)
 302             if not rest:
 303                 attrvalue = None
 304             elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
 305                  attrvalue[:1] == '"' == attrvalue[-1:]:
 306                 attrvalue = attrvalue[1:-1]
 307             if attrvalue:
 308                 attrvalue = self.unescape(attrvalue)
 309             attrs.append((attrname.lower(), attrvalue))
 310             k = m.end()
 311
 312         end = rawdata[k:endpos].strip()
 313         if end not in (">", "/>"):
 314             lineno, offset = self.getpos()
 315             if "\n" in self.__starttag_text:
 316                 lineno = lineno + self.__starttag_text.count("\n")
 317                 offset = len(self.__starttag_text) \
 318                          - self.__starttag_text.rfind("\n")
 319             else:
 320                 offset = offset + len(self.__starttag_text)
 321             self.handle_data(rawdata[i:endpos])
 322             return endpos
 323         if end.endswith('/>'):
 324             # XHTML-style empty tag: <span attr="value" />
 325             self.handle_startendtag(tag, attrs)
 326         else:
 327             self.handle_starttag(tag, attrs)
 328             if tag in self.CDATA_CONTENT_ELEMENTS:
 329                 self.set_cdata_mode(tag)
 330         return endpos
 331
 332     # Internal -- check to see if we have a complete starttag; return end
 333     # or -1 if incomplete.
 334     def check_for_whole_start_tag(self, i):
 335         rawdata = self.rawdata
 336         m = locatestarttagend.match(rawdata, i)
 337         if m:
 338             j = m.end()
 339             next = rawdata[j:j+1]
 340             if next == ">":
 341                 return j + 1
 342             if next == "/":
 343                 if rawdata.startswith("/>", j):
 344                     return j + 2
 345                 if rawdata.startswith("/", j):
 346                     # buffer boundary
 347                     return -1
 348                 # else bogus input
 349                 self.updatepos(i, j + 1)
 350                 self.error("malformed empty start tag")
 351             if next == "":
 352                 # end of input
 353                 return -1
 354             if next in ("abcdefghijklmnopqrstuvwxyz=/"
 355                         "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
 356                 # end of input in or before attribute value, or we have the
 357                 # '/' from a '/>' ending
 358                 return -1
 359             if j > i:
 360                 return j
 361             else:
 362                 return i + 1
 363         raise AssertionError("we should not get here!")
 364
 365     # Internal -- parse endtag, return end or -1 if incomplete
 366     def parse_endtag(self, i):
 367         rawdata = self.rawdata
 368         assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
 369         match = endendtag.search(rawdata, i+1) # >
 370         if not match:
 371             return -1
 372         gtpos = match.end()
 373         match = endtagfind.match(rawdata, i) # </ + tag + >
 374         if not match:
 375             if self.cdata_elem is not None:
 376                 self.handle_data(rawdata[i:gtpos])
 377                 return gtpos
 378             # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
 379             namematch = tagfind.match(rawdata, i+2)
 380             if not namematch:
 381                 # w3.org/TR/html5/tokenization.html#end-tag-open-state
 382                 if rawdata[i:i+3] == '</>':
 383                     return i+3
 384                 else:
 385                     return self.parse_bogus_comment(i)
 386             tagname = namematch.group(1).lower()
 387             # consume and ignore other stuff between the name and the >
 388             # Note: this is not 100% correct, since we might have things like
 389             # </tag attr=">">, but looking for > after tha name should cover
 390             # most of the cases and is much simpler
 391             gtpos = rawdata.find('>', namematch.end())
 392             self.handle_endtag(tagname)
 393             return gtpos+1
 394
 395         elem = match.group(1).lower() # script or style
 396         if self.cdata_elem is not None:
 397             if elem != self.cdata_elem:
 398                 self.handle_data(rawdata[i:gtpos])
 399                 return gtpos
 400
 401         self.handle_endtag(elem)
 402         self.clear_cdata_mode()
 403         return gtpos
 404
 405     # Overridable -- finish processing of start+end tag: <tag.../>
 406     def handle_startendtag(self, tag, attrs):
 407         self.handle_starttag(tag, attrs)
 408         self.handle_endtag(tag)
 409
 410     # Overridable -- handle start tag
 411     def handle_starttag(self, tag, attrs):
 412         pass
 413
 414     # Overridable -- handle end tag
 415     def handle_endtag(self, tag):
 416         pass
 417
 418     # Overridable -- handle character reference
 419     def handle_charref(self, name):
 420         pass
 421
 422     # Overridable -- handle entity reference
 423     def handle_entityref(self, name):
 424         pass
 425
 426     # Overridable -- handle data
 427     def handle_data(self, data):
 428         pass
 429
 430     # Overridable -- handle comment
 431     def handle_comment(self, data):
 432         pass
 433
 434     # Overridable -- handle declaration
 435     def handle_decl(self, decl):
 436         pass
 437
 438     # Overridable -- handle processing instruction
 439     def handle_pi(self, data):
 440         pass
 441
 442     def unknown_decl(self, data):
 443         pass
 444
 445     # Internal -- helper to remove special character quoting
 446     entitydefs = None
 447     def unescape(self, s):
 448         if '&' not in s:
 449             return s
 450         def replaceEntities(s):
 451             s = s.groups()[0]
 452             try:
 453                 if s[0] == "#":
 454                     s = s[1:]
 455                     if s[0] in ['x','X']:
 456                         c = int(s[1:], 16)
 457                     else:
 458                         c = int(s)
 459                     return unichr(c)
 460             except ValueError:
 461                 return '&#'+s+';'
 462             else:
 463                 # Cannot use name2codepoint directly, because HTMLParser supports apos,
 464                 # which is not part of HTML 4
 465                 import htmlentitydefs
 466                 if HTMLParser.entitydefs is None:
 467                     entitydefs = HTMLParser.entitydefs = {'apos':u"'"}
 468                     for k, v in htmlentitydefs.name2codepoint.iteritems():
 469                         entitydefs[k] = unichr(v)
 470                 try:
 471                     return self.entitydefs[s]
 472                 except KeyError:
 473                     return '&'+s+';'
 474
 475         return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));", replaceEntities, s)