]> git.proxmox.com Git - mirror_edk2.git/blob - AppPkg/Applications/Python/Python-2.7.2/Lib/sgmllib.py
EmbeddedPkg: Extend NvVarStoreFormattedLib LIBRARY_CLASS
[mirror_edk2.git] / AppPkg / Applications / Python / Python-2.7.2 / Lib / sgmllib.py
"""A parser for SGML, using the derived class as a static DTD."""

# XXX This only supports those SGML features used by HTML.

# XXX There should be a way to distinguish between PCDATA (parsed
# character data -- the normal case), RCDATA (replaceable character
# data -- only char and entity references and end tags are special)
# and CDATA (character data -- only end tags are special).  RCDATA is
# not supported at all.
10
11
12 from warnings import warnpy3k
13 warnpy3k("the sgmllib module has been removed in Python 3.0",
14 stacklevel=2)
15 del warnpy3k
16
17 import markupbase
18 import re
19
20 __all__ = ["SGMLParser", "SGMLParseError"]
21
# Regular expressions used for parsing

# Next character that may start markup: '&' (reference) or '<' (tag).
interesting = re.compile('[&<]')
# A possibly unfinished reference, start tag, end tag or '<!' declaration;
# used to decide whether the tail of the buffer must wait for more data.
incomplete = re.compile('&([a-zA-Z][a-zA-Z0-9]*|#[0-9]*)?|'
                        '<([a-zA-Z][^<>]*|'
                           '/([a-zA-Z][^<>]*)?|'
                           '![^<>]*)?')

# '&name' plus the one non-alphanumeric character that terminates it
# (group 1 is the name; the terminator is part of the match).
entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
# '&#digits' plus the one non-digit character that terminates it.
charref = re.compile('&#([0-9]+)[^0-9]')

# '<' followed by a letter, or the SGML empty start tag '<>'.
starttagopen = re.compile('<[>a-zA-Z]')
# Beginning of the SGML short tag form '<tag/'.
shorttagopen = re.compile('<[a-zA-Z][-.a-zA-Z0-9]*/')
# Complete short tag '<tag/data/' (group 1 = tag, group 2 = data).
shorttag = re.compile('<([a-zA-Z][-.a-zA-Z0-9]*)/([^/]*)/')
# End of a processing instruction.
piclose = re.compile('>')
# Either bracket; used to find where a tag stops.
endbracket = re.compile('[<>]')
# A tag name.
tagfind = re.compile('[a-zA-Z][-_.a-zA-Z0-9]*')
# One attribute: group 1 = name, group 2 = '=value' part (optional),
# group 3 = the value, still including any surrounding quotes.
attrfind = re.compile(
    r'\s*([a-zA-Z_][-:.a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[][\-a-zA-Z0-9./,:;+*%?!&$\(\)_#=~\'"@]*))?')
42
43
class SGMLParseError(RuntimeError):
    """Exception raised for all parse errors."""
47
48
49 # SGML parser base class -- find tags and call handler functions.
50 # Usage: p = SGMLParser(); p.feed(data); ...; p.close().
51 # The dtd is defined by deriving a class which defines methods
52 # with special names to handle tags: start_foo and end_foo to handle
53 # <foo> and </foo>, respectively, or do_foo to handle <foo> by itself.
54 # (Tags are converted to lower case for this purpose.) The data
55 # between tags is passed to the parser by calling self.handle_data()
56 # with some data as argument (the data may be split up in arbitrary
57 # chunks). Entity references are passed by calling
58 # self.handle_entityref() with the entity reference as argument.
59
60 class SGMLParser(markupbase.ParserBase):
61 # Definition of entities -- derived classes may override
62 entity_or_charref = re.compile('&(?:'
63 '([a-zA-Z][-.a-zA-Z0-9]*)|#([0-9]+)'
64 ')(;?)')
65
66 def __init__(self, verbose=0):
67 """Initialize and reset this instance."""
68 self.verbose = verbose
69 self.reset()
70
71 def reset(self):
72 """Reset this instance. Loses all unprocessed data."""
73 self.__starttag_text = None
74 self.rawdata = ''
75 self.stack = []
76 self.lasttag = '???'
77 self.nomoretags = 0
78 self.literal = 0
79 markupbase.ParserBase.reset(self)
80
81 def setnomoretags(self):
82 """Enter literal mode (CDATA) till EOF.
83
84 Intended for derived classes only.
85 """
86 self.nomoretags = self.literal = 1
87
88 def setliteral(self, *args):
89 """Enter literal mode (CDATA).
90
91 Intended for derived classes only.
92 """
93 self.literal = 1
94
95 def feed(self, data):
96 """Feed some data to the parser.
97
98 Call this as often as you want, with as little or as much text
99 as you want (may include '\n'). (This just saves the text,
100 all the processing is done by goahead().)
101 """
102
103 self.rawdata = self.rawdata + data
104 self.goahead(0)
105
106 def close(self):
107 """Handle the remaining data."""
108 self.goahead(1)
109
110 def error(self, message):
111 raise SGMLParseError(message)
112
113 # Internal -- handle data as far as reasonable. May leave state
114 # and data to be processed by a subsequent call. If 'end' is
115 # true, force handling all data as if followed by EOF marker.
116 def goahead(self, end):
117 rawdata = self.rawdata
118 i = 0
119 n = len(rawdata)
120 while i < n:
121 if self.nomoretags:
122 self.handle_data(rawdata[i:n])
123 i = n
124 break
125 match = interesting.search(rawdata, i)
126 if match: j = match.start()
127 else: j = n
128 if i < j:
129 self.handle_data(rawdata[i:j])
130 i = j
131 if i == n: break
132 if rawdata[i] == '<':
133 if starttagopen.match(rawdata, i):
134 if self.literal:
135 self.handle_data(rawdata[i])
136 i = i+1
137 continue
138 k = self.parse_starttag(i)
139 if k < 0: break
140 i = k
141 continue
142 if rawdata.startswith("</", i):
143 k = self.parse_endtag(i)
144 if k < 0: break
145 i = k
146 self.literal = 0
147 continue
148 if self.literal:
149 if n > (i + 1):
150 self.handle_data("<")
151 i = i+1
152 else:
153 # incomplete
154 break
155 continue
156 if rawdata.startswith("<!--", i):
157 # Strictly speaking, a comment is --.*--
158 # within a declaration tag <!...>.
159 # This should be removed,
160 # and comments handled only in parse_declaration.
161 k = self.parse_comment(i)
162 if k < 0: break
163 i = k
164 continue
165 if rawdata.startswith("<?", i):
166 k = self.parse_pi(i)
167 if k < 0: break
168 i = i+k
169 continue
170 if rawdata.startswith("<!", i):
171 # This is some sort of declaration; in "HTML as
172 # deployed," this should only be the document type
173 # declaration ("<!DOCTYPE html...>").
174 k = self.parse_declaration(i)
175 if k < 0: break
176 i = k
177 continue
178 elif rawdata[i] == '&':
179 if self.literal:
180 self.handle_data(rawdata[i])
181 i = i+1
182 continue
183 match = charref.match(rawdata, i)
184 if match:
185 name = match.group(1)
186 self.handle_charref(name)
187 i = match.end(0)
188 if rawdata[i-1] != ';': i = i-1
189 continue
190 match = entityref.match(rawdata, i)
191 if match:
192 name = match.group(1)
193 self.handle_entityref(name)
194 i = match.end(0)
195 if rawdata[i-1] != ';': i = i-1
196 continue
197 else:
198 self.error('neither < nor & ??')
199 # We get here only if incomplete matches but
200 # nothing else
201 match = incomplete.match(rawdata, i)
202 if not match:
203 self.handle_data(rawdata[i])
204 i = i+1
205 continue
206 j = match.end(0)
207 if j == n:
208 break # Really incomplete
209 self.handle_data(rawdata[i:j])
210 i = j
211 # end while
212 if end and i < n:
213 self.handle_data(rawdata[i:n])
214 i = n
215 self.rawdata = rawdata[i:]
216 # XXX if end: check for empty stack
217
218 # Extensions for the DOCTYPE scanner:
219 _decl_otherchars = '='
220
221 # Internal -- parse processing instr, return length or -1 if not terminated
222 def parse_pi(self, i):
223 rawdata = self.rawdata
224 if rawdata[i:i+2] != '<?':
225 self.error('unexpected call to parse_pi()')
226 match = piclose.search(rawdata, i+2)
227 if not match:
228 return -1
229 j = match.start(0)
230 self.handle_pi(rawdata[i+2: j])
231 j = match.end(0)
232 return j-i
233
234 def get_starttag_text(self):
235 return self.__starttag_text
236
237 # Internal -- handle starttag, return length or -1 if not terminated
238 def parse_starttag(self, i):
239 self.__starttag_text = None
240 start_pos = i
241 rawdata = self.rawdata
242 if shorttagopen.match(rawdata, i):
243 # SGML shorthand: <tag/data/ == <tag>data</tag>
244 # XXX Can data contain &... (entity or char refs)?
245 # XXX Can data contain < or > (tag characters)?
246 # XXX Can there be whitespace before the first /?
247 match = shorttag.match(rawdata, i)
248 if not match:
249 return -1
250 tag, data = match.group(1, 2)
251 self.__starttag_text = '<%s/' % tag
252 tag = tag.lower()
253 k = match.end(0)
254 self.finish_shorttag(tag, data)
255 self.__starttag_text = rawdata[start_pos:match.end(1) + 1]
256 return k
257 # XXX The following should skip matching quotes (' or ")
258 # As a shortcut way to exit, this isn't so bad, but shouldn't
259 # be used to locate the actual end of the start tag since the
260 # < or > characters may be embedded in an attribute value.
261 match = endbracket.search(rawdata, i+1)
262 if not match:
263 return -1
264 j = match.start(0)
265 # Now parse the data between i+1 and j into a tag and attrs
266 attrs = []
267 if rawdata[i:i+2] == '<>':
268 # SGML shorthand: <> == <last open tag seen>
269 k = j
270 tag = self.lasttag
271 else:
272 match = tagfind.match(rawdata, i+1)
273 if not match:
274 self.error('unexpected call to parse_starttag')
275 k = match.end(0)
276 tag = rawdata[i+1:k].lower()
277 self.lasttag = tag
278 while k < j:
279 match = attrfind.match(rawdata, k)
280 if not match: break
281 attrname, rest, attrvalue = match.group(1, 2, 3)
282 if not rest:
283 attrvalue = attrname
284 else:
285 if (attrvalue[:1] == "'" == attrvalue[-1:] or
286 attrvalue[:1] == '"' == attrvalue[-1:]):
287 # strip quotes
288 attrvalue = attrvalue[1:-1]
289 attrvalue = self.entity_or_charref.sub(
290 self._convert_ref, attrvalue)
291 attrs.append((attrname.lower(), attrvalue))
292 k = match.end(0)
293 if rawdata[j] == '>':
294 j = j+1
295 self.__starttag_text = rawdata[start_pos:j]
296 self.finish_starttag(tag, attrs)
297 return j
298
299 # Internal -- convert entity or character reference
300 def _convert_ref(self, match):
301 if match.group(2):
302 return self.convert_charref(match.group(2)) or \
303 '&#%s%s' % match.groups()[1:]
304 elif match.group(3):
305 return self.convert_entityref(match.group(1)) or \
306 '&%s;' % match.group(1)
307 else:
308 return '&%s' % match.group(1)
309
310 # Internal -- parse endtag
311 def parse_endtag(self, i):
312 rawdata = self.rawdata
313 match = endbracket.search(rawdata, i+1)
314 if not match:
315 return -1
316 j = match.start(0)
317 tag = rawdata[i+2:j].strip().lower()
318 if rawdata[j] == '>':
319 j = j+1
320 self.finish_endtag(tag)
321 return j
322
323 # Internal -- finish parsing of <tag/data/ (same as <tag>data</tag>)
324 def finish_shorttag(self, tag, data):
325 self.finish_starttag(tag, [])
326 self.handle_data(data)
327 self.finish_endtag(tag)
328
329 # Internal -- finish processing of start tag
330 # Return -1 for unknown tag, 0 for open-only tag, 1 for balanced tag
331 def finish_starttag(self, tag, attrs):
332 try:
333 method = getattr(self, 'start_' + tag)
334 except AttributeError:
335 try:
336 method = getattr(self, 'do_' + tag)
337 except AttributeError:
338 self.unknown_starttag(tag, attrs)
339 return -1
340 else:
341 self.handle_starttag(tag, method, attrs)
342 return 0
343 else:
344 self.stack.append(tag)
345 self.handle_starttag(tag, method, attrs)
346 return 1
347
348 # Internal -- finish processing of end tag
349 def finish_endtag(self, tag):
350 if not tag:
351 found = len(self.stack) - 1
352 if found < 0:
353 self.unknown_endtag(tag)
354 return
355 else:
356 if tag not in self.stack:
357 try:
358 method = getattr(self, 'end_' + tag)
359 except AttributeError:
360 self.unknown_endtag(tag)
361 else:
362 self.report_unbalanced(tag)
363 return
364 found = len(self.stack)
365 for i in range(found):
366 if self.stack[i] == tag: found = i
367 while len(self.stack) > found:
368 tag = self.stack[-1]
369 try:
370 method = getattr(self, 'end_' + tag)
371 except AttributeError:
372 method = None
373 if method:
374 self.handle_endtag(tag, method)
375 else:
376 self.unknown_endtag(tag)
377 del self.stack[-1]
378
379 # Overridable -- handle start tag
380 def handle_starttag(self, tag, method, attrs):
381 method(attrs)
382
383 # Overridable -- handle end tag
384 def handle_endtag(self, tag, method):
385 method()
386
387 # Example -- report an unbalanced </...> tag.
388 def report_unbalanced(self, tag):
389 if self.verbose:
390 print '*** Unbalanced </' + tag + '>'
391 print '*** Stack:', self.stack
392
393 def convert_charref(self, name):
394 """Convert character reference, may be overridden."""
395 try:
396 n = int(name)
397 except ValueError:
398 return
399 if not 0 <= n <= 127:
400 return
401 return self.convert_codepoint(n)
402
403 def convert_codepoint(self, codepoint):
404 return chr(codepoint)
405
406 def handle_charref(self, name):
407 """Handle character reference, no need to override."""
408 replacement = self.convert_charref(name)
409 if replacement is None:
410 self.unknown_charref(name)
411 else:
412 self.handle_data(replacement)
413
414 # Definition of entities -- derived classes may override
415 entitydefs = \
416 {'lt': '<', 'gt': '>', 'amp': '&', 'quot': '"', 'apos': '\''}
417
418 def convert_entityref(self, name):
419 """Convert entity references.
420
421 As an alternative to overriding this method; one can tailor the
422 results by setting up the self.entitydefs mapping appropriately.
423 """
424 table = self.entitydefs
425 if name in table:
426 return table[name]
427 else:
428 return
429
430 def handle_entityref(self, name):
431 """Handle entity references, no need to override."""
432 replacement = self.convert_entityref(name)
433 if replacement is None:
434 self.unknown_entityref(name)
435 else:
436 self.handle_data(replacement)
437
438 # Example -- handle data, should be overridden
439 def handle_data(self, data):
440 pass
441
442 # Example -- handle comment, could be overridden
443 def handle_comment(self, data):
444 pass
445
446 # Example -- handle declaration, could be overridden
447 def handle_decl(self, decl):
448 pass
449
450 # Example -- handle processing instruction, could be overridden
451 def handle_pi(self, data):
452 pass
453
454 # To be overridden -- handlers for unknown objects
455 def unknown_starttag(self, tag, attrs): pass
456 def unknown_endtag(self, tag): pass
457 def unknown_charref(self, ref): pass
458 def unknown_entityref(self, ref): pass
459
460
461 class TestSGMLParser(SGMLParser):
462
463 def __init__(self, verbose=0):
464 self.testdata = ""
465 SGMLParser.__init__(self, verbose)
466
467 def handle_data(self, data):
468 self.testdata = self.testdata + data
469 if len(repr(self.testdata)) >= 70:
470 self.flush()
471
472 def flush(self):
473 data = self.testdata
474 if data:
475 self.testdata = ""
476 print 'data:', repr(data)
477
478 def handle_comment(self, data):
479 self.flush()
480 r = repr(data)
481 if len(r) > 68:
482 r = r[:32] + '...' + r[-32:]
483 print 'comment:', r
484
485 def unknown_starttag(self, tag, attrs):
486 self.flush()
487 if not attrs:
488 print 'start tag: <' + tag + '>'
489 else:
490 print 'start tag: <' + tag,
491 for name, value in attrs:
492 print name + '=' + '"' + value + '"',
493 print '>'
494
495 def unknown_endtag(self, tag):
496 self.flush()
497 print 'end tag: </' + tag + '>'
498
499 def unknown_entityref(self, ref):
500 self.flush()
501 print '*** unknown entity ref: &' + ref + ';'
502
503 def unknown_charref(self, ref):
504 self.flush()
505 print '*** unknown char ref: &#' + ref + ';'
506
507 def unknown_decl(self, data):
508 self.flush()
509 print '*** unknown decl: [' + data + ']'
510
511 def close(self):
512 SGMLParser.close(self)
513 self.flush()
514
515
516 def test(args = None):
517 import sys
518
519 if args is None:
520 args = sys.argv[1:]
521
522 if args and args[0] == '-s':
523 args = args[1:]
524 klass = SGMLParser
525 else:
526 klass = TestSGMLParser
527
528 if args:
529 file = args[0]
530 else:
531 file = 'test.html'
532
533 if file == '-':
534 f = sys.stdin
535 else:
536 try:
537 f = open(file, 'r')
538 except IOError, msg:
539 print file, ":", msg
540 sys.exit(1)
541
542 data = f.read()
543 if f is not sys.stdin:
544 f.close()
545
546 x = klass()
547 for c in data:
548 x.feed(c)
549 x.close()
550
551
# Run the smoke test when the module is executed as a script.
if __name__ == '__main__':
    test()