]> git.proxmox.com Git - mirror_edk2.git/blob - AppPkg/Applications/Python/Python-2.7.10/Lib/xml/dom/expatbuilder.py
95c23a0a17ff2b2a637ba8480ede47007d99ca6c
[mirror_edk2.git] / AppPkg / Applications / Python / Python-2.7.10 / Lib / xml / dom / expatbuilder.py
1 """Facility to use the Expat parser to load a minidom instance
2 from a string or file.
3
4 This avoids all the overhead of SAX and pulldom to gain performance.
5 """
6
7 # Warning!
8 #
9 # This module is tightly bound to the implementation details of the
10 # minidom DOM and can't be used with other DOM implementations. This
11 # is due, in part, to a lack of appropriate methods in the DOM (there is
12 # no way to create Entity and Notation nodes via the DOM Level 2
13 # interface), and for performance. The latter is the cause of some fairly
14 # cryptic code.
15 #
16 # Performance hacks:
17 #
18 # - .character_data_handler() has an extra case in which continuing
19 # data is appended to an existing Text node; this can be a
20 # speedup since pyexpat can break up character data into multiple
21 # callbacks even though we set the buffer_text attribute on the
22 # parser. This also gives us the advantage that we don't need a
23 # separate normalization pass.
24 #
25 # - Determining that a node exists is done using an identity comparison
26 # with None rather than a truth test; this avoids searching for and
27 # calling any methods on the node object if it exists. (A rather
28 # nice speedup is achieved this way as well!)
29
30 from xml.dom import xmlbuilder, minidom, Node
31 from xml.dom import EMPTY_NAMESPACE, EMPTY_PREFIX, XMLNS_NAMESPACE
32 from xml.parsers import expat
33 from xml.dom.minidom import _append_child, _set_attribute_node
34 from xml.dom.NodeFilter import NodeFilter
35
36 from xml.dom.minicompat import *
37
# Node-type codes that the handlers compare against, bound once as
# module globals.
TEXT_NODE, CDATA_SECTION_NODE, DOCUMENT_NODE = (
    Node.TEXT_NODE,
    Node.CDATA_SECTION_NODE,
    Node.DOCUMENT_NODE,
)

# Return codes a DOMBuilderFilter may hand back to the builder.
FILTER_ACCEPT, FILTER_REJECT, FILTER_SKIP, FILTER_INTERRUPT = (
    xmlbuilder.DOMBuilderFilter.FILTER_ACCEPT,
    xmlbuilder.DOMBuilderFilter.FILTER_REJECT,
    xmlbuilder.DOMBuilderFilter.FILTER_SKIP,
    xmlbuilder.DOMBuilderFilter.FILTER_INTERRUPT,
)

theDOMImplementation = minidom.getDOMImplementation()

# Expat typename -> TypeInfo
_typeinfo_map = dict(
    (expat_name, minidom.TypeInfo(None, dom_name))
    for expat_name, dom_name in (
        ("CDATA", "cdata"),
        ("ENUM", "enumeration"),
        ("ENTITY", "entity"),
        ("ENTITIES", "entities"),
        ("ID", "id"),
        ("IDREF", "idref"),
        ("IDREFS", "idrefs"),
        ("NMTOKEN", "nmtoken"),
        ("NMTOKENS", "nmtokens"),
    )
)
61
class ElementInfo(object):
    """Per-element-type information gathered from DTD declarations.

    ``_model`` is the content model handed to the element declaration
    handler (or None), and ``_attr_info`` is a list of attribute records
    appended by the attribute-list declaration handler.  In each record,
    index 1 is the attribute name and index -2 is its declared type.
    """

    __slots__ = '_attr_info', '_model', 'tagName'

    def __init__(self, tagName, model=None):
        self.tagName = tagName
        self._attr_info = []
        self._model = model

    def __getstate__(self):
        # __slots__ classes have no __dict__, so state is a plain tuple.
        return self._attr_info, self._model, self.tagName

    def __setstate__(self, state):
        self._attr_info, self._model, self.tagName = state

    def getAttributeType(self, aname):
        """Return the TypeInfo for attribute `aname`, or minidom._no_type
        when no declaration for it has been recorded."""
        for record in self._attr_info:
            if record[1] != aname:
                continue
            declared = record[-2]
            # An enumerated type is reported by Expat as "(a|b|...)".
            if declared[0] == "(":
                return _typeinfo_map["ENUM"]
            return _typeinfo_map[declared]
        return minidom._no_type

    def getAttributeTypeNS(self, namespaceURI, localName):
        """Namespace-aware lookup; always reports minidom._no_type."""
        return minidom._no_type

    def isElementContent(self):
        """Return true if a content model is known and it is neither ANY
        nor MIXED."""
        if not self._model:
            return False
        content_type = self._model[0]
        return content_type not in (expat.model.XML_CTYPE_ANY,
                                    expat.model.XML_CTYPE_MIXED)

    def isEmpty(self):
        """Return true if the element was declared EMPTY."""
        if not self._model:
            return False
        return self._model[0] == expat.model.XML_CTYPE_EMPTY

    def isId(self, aname):
        """Return true if attribute `aname` was declared with type ID."""
        for record in self._attr_info:
            if record[1] == aname:
                return record[-2] == "ID"
        return False

    def isIdNS(self, euri, ename, auri, aname):
        # not sure this is meaningful
        return self.isId((auri, aname))
112
def _intern(builder, s):
    """Return the canonical copy of string `s` from the builder's intern
    table, registering `s` itself when it has not been seen before."""
    setdefault = builder._intern_setdefault
    return setdefault(s, s)
115
def _parse_ns_name(builder, name):
    """Split a space-separated namespaced name as reported by Expat.

    `name` is either "uri localname" or "uri localname prefix".  Returns
    the tuple (uri, localname, prefix, qname) with the strings run
    through the builder's intern table; prefix is EMPTY_PREFIX when the
    name carries none.
    """
    assert ' ' in name
    pieces = name.split(' ')
    canonical = builder._intern_setdefault
    if len(pieces) != 3:
        uri, localname = pieces
        prefix = EMPTY_PREFIX
        localname = canonical(localname, localname)
        qname = localname
    else:
        uri, localname, prefix = pieces
        prefix = canonical(prefix, prefix)
        qname = "%s:%s" % (prefix, localname)
        qname = canonical(qname, qname)
        localname = canonical(localname, localname)
    return canonical(uri, uri), localname, prefix, qname
131
132
class ExpatBuilder:
    """Document builder that uses Expat to build a ParsedXML.DOM document
    instance.

    parseFile() and parseString() obtain a pyexpat parser via
    getParser(), which wires this object's ``*_handler`` methods into it.
    They return the finished minidom Document, then reset the builder
    and drop the parser.
    """

    def __init__(self, options=None):
        """Create a builder.

        options -- an xmlbuilder.Options instance; a default one is
                   created when None is passed.
        """
        if options is None:
            options = xmlbuilder.Options()
        self._options = options
        if self._options.filter is not None:
            self._filter = FilterVisibilityController(self._options.filter)
        else:
            self._filter = None
            # This *really* doesn't do anything in this case, so
            # override it with something fast & minimal.
            self._finish_start_element = id
        self._parser = None
        self.reset()

    def createParser(self):
        """Create a new parser object."""
        return expat.ParserCreate()

    def getParser(self):
        """Return the parser object, creating a new one if needed."""
        if not self._parser:
            self._parser = self.createParser()
            self._intern_setdefault = self._parser.intern.setdefault
            self._parser.buffer_text = True
            self._parser.ordered_attributes = True
            self._parser.specified_attributes = True
            self.install(self._parser)
        return self._parser

    def reset(self):
        """Free all data structures used during DOM construction."""
        self.document = theDOMImplementation.createDocument(
            EMPTY_NAMESPACE, None, None)
        self.curNode = self.document
        self._elem_info = self.document._elem_info
        self._cdata = False

    def install(self, parser):
        """Install the callbacks needed to build the DOM into the parser."""
        # This creates circular references!
        parser.StartDoctypeDeclHandler = self.start_doctype_decl_handler
        parser.StartElementHandler = self.first_element_handler
        parser.EndElementHandler = self.end_element_handler
        parser.ProcessingInstructionHandler = self.pi_handler
        if self._options.entities:
            parser.EntityDeclHandler = self.entity_decl_handler
        parser.NotationDeclHandler = self.notation_decl_handler
        if self._options.comments:
            parser.CommentHandler = self.comment_handler
        if self._options.cdata_sections:
            parser.StartCdataSectionHandler = self.start_cdata_section_handler
            parser.EndCdataSectionHandler = self.end_cdata_section_handler
            parser.CharacterDataHandler = self.character_data_handler_cdata
        else:
            parser.CharacterDataHandler = self.character_data_handler
        parser.ExternalEntityRefHandler = self.external_entity_ref_handler
        parser.XmlDeclHandler = self.xml_decl_handler
        parser.ElementDeclHandler = self.element_decl_handler
        parser.AttlistDeclHandler = self.attlist_decl_handler

    def parseFile(self, file):
        """Parse a document from a file object, returning the document
        node."""
        parser = self.getParser()
        first_buffer = True
        try:
            while 1:
                buffer = file.read(16*1024)
                if not buffer:
                    break
                parser.Parse(buffer, 0)
                # Only the first chunk is scanned for an internal subset.
                if first_buffer and self.document.documentElement:
                    self._setup_subset(buffer)
                first_buffer = False
            parser.Parse("", True)
        except ParseEscape:
            # A filter asked to stop; return what has been built so far.
            pass
        doc = self.document
        self.reset()
        self._parser = None
        return doc

    def parseString(self, string):
        """Parse a document from a string, returning the document node."""
        parser = self.getParser()
        try:
            parser.Parse(string, True)
            self._setup_subset(string)
        except ParseEscape:
            # A filter asked to stop; return what has been built so far.
            pass
        doc = self.document
        self.reset()
        self._parser = None
        return doc

    def _setup_subset(self, buffer):
        """Load the internal subset if there might be one."""
        if self.document.doctype:
            extractor = InternalSubsetExtractor()
            extractor.parseString(buffer)
            subset = extractor.getSubset()
            self.document.doctype.internalSubset = subset

    def start_doctype_decl_handler(self, doctypeName, systemId, publicId,
                                   has_internal_subset):
        """Create the DocumentType node and attach it to the document.

        If the filter rejects the doctype it is detached again and the
        entity/notation handlers are disabled.  While an internal subset
        is being read, comment and PI handlers are switched off until
        end_doctype_decl_handler() restores them.
        """
        doctype = self.document.implementation.createDocumentType(
            doctypeName, publicId, systemId)
        doctype.ownerDocument = self.document
        _append_child(self.document, doctype)
        self.document.doctype = doctype
        if self._filter and self._filter.acceptNode(doctype) == FILTER_REJECT:
            self.document.doctype = None
            del self.document.childNodes[-1]
            doctype = None
            self._parser.EntityDeclHandler = None
            self._parser.NotationDeclHandler = None
        if has_internal_subset:
            if doctype is not None:
                doctype.entities._seq = []
                doctype.notations._seq = []
            self._parser.CommentHandler = None
            self._parser.ProcessingInstructionHandler = None
            self._parser.EndDoctypeDeclHandler = self.end_doctype_decl_handler

    def end_doctype_decl_handler(self):
        """Re-enable the handlers suspended during the internal subset."""
        if self._options.comments:
            self._parser.CommentHandler = self.comment_handler
        self._parser.ProcessingInstructionHandler = self.pi_handler
        if not (self._elem_info or self._filter):
            # No element declarations and no filter: nothing to do at
            # element end, so substitute a cheap no-op.
            self._finish_end_element = id

    def pi_handler(self, target, data):
        """Append a ProcessingInstruction node unless the filter rejects
        it."""
        node = self.document.createProcessingInstruction(target, data)
        _append_child(self.curNode, node)
        if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
            self.curNode.removeChild(node)

    def character_data_handler_cdata(self, data):
        """Character data handler used when CDATA sections are kept as
        separate nodes."""
        childNodes = self.curNode.childNodes
        if self._cdata:
            if (self._cdata_continue
                    and childNodes[-1].nodeType == CDATA_SECTION_NODE):
                # Continuation of the CDATA section currently open.
                childNodes[-1].appendData(data)
                return
            node = self.document.createCDATASection(data)
            self._cdata_continue = True
        elif childNodes and childNodes[-1].nodeType == TEXT_NODE:
            # Merge with the preceding Text node; writing through
            # __dict__ bypasses attribute-setting overhead.
            node = childNodes[-1]
            value = node.data + data
            d = node.__dict__
            d['data'] = d['nodeValue'] = value
            return
        else:
            node = minidom.Text()
            d = node.__dict__
            d['data'] = d['nodeValue'] = data
            d['ownerDocument'] = self.document
        _append_child(self.curNode, node)

    def character_data_handler(self, data):
        """Character data handler used when CDATA sections are not
        tracked; adjacent data is merged into one Text node."""
        childNodes = self.curNode.childNodes
        if childNodes and childNodes[-1].nodeType == TEXT_NODE:
            node = childNodes[-1]
            d = node.__dict__
            d['data'] = d['nodeValue'] = node.data + data
            return
        node = minidom.Text()
        d = node.__dict__
        d['data'] = d['nodeValue'] = node.data + data
        d['ownerDocument'] = self.document
        _append_child(self.curNode, node)

    def entity_decl_handler(self, entityName, is_parameter_entity, value,
                            base, systemId, publicId, notationName):
        """Record a general entity declaration on the doctype."""
        if is_parameter_entity:
            # we don't care about parameter entities for the DOM
            return
        if not self._options.entities:
            return
        node = self.document._create_entity(entityName, publicId,
                                            systemId, notationName)
        if value is not None:
            # internal entity
            # node *should* be readonly, but we'll cheat
            child = self.document.createTextNode(value)
            node.childNodes.append(child)
        self.document.doctype.entities._seq.append(node)
        if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
            del self.document.doctype.entities._seq[-1]

    def notation_decl_handler(self, notationName, base, systemId, publicId):
        """Record a notation declaration on the doctype unless the filter
        rejects it."""
        node = self.document._create_notation(notationName, publicId, systemId)
        self.document.doctype.notations._seq.append(node)
        # Drop the node only when the filter rejects it, matching the
        # entity, comment and PI handlers.
        if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
            del self.document.doctype.notations._seq[-1]

    def comment_handler(self, data):
        """Append a Comment node unless the filter rejects it."""
        node = self.document.createComment(data)
        _append_child(self.curNode, node)
        if self._filter and self._filter.acceptNode(node) == FILTER_REJECT:
            self.curNode.removeChild(node)

    def start_cdata_section_handler(self):
        self._cdata = True
        self._cdata_continue = False

    def end_cdata_section_handler(self):
        self._cdata = False
        self._cdata_continue = False

    def external_entity_ref_handler(self, context, base, systemId, publicId):
        # Returning 1 tells Expat to carry on; the entity is not loaded.
        return 1

    def first_element_handler(self, name, attributes):
        """Handle the document element, then hand all later start tags
        to start_element_handler() directly."""
        if self._filter is None and not self._elem_info:
            self._finish_end_element = id
        self.getParser().StartElementHandler = self.start_element_handler
        self.start_element_handler(name, attributes)

    def start_element_handler(self, name, attributes):
        """Create an Element for `name`, make it current, and attach its
        attributes (a flat [name, value, name, value, ...] list)."""
        node = self.document.createElement(name)
        _append_child(self.curNode, node)
        self.curNode = node

        if attributes:
            for i in range(0, len(attributes), 2):
                a = minidom.Attr(attributes[i], EMPTY_NAMESPACE,
                                 None, EMPTY_PREFIX)
                value = attributes[i+1]
                d = a.childNodes[0].__dict__
                d['data'] = d['nodeValue'] = value
                d = a.__dict__
                d['value'] = d['nodeValue'] = value
                d['ownerDocument'] = self.document
                _set_attribute_node(node, a)

        if node is not self.document.documentElement:
            self._finish_start_element(node)

    def _finish_start_element(self, node):
        """Consult the filter about a newly opened element and install a
        Rejecter or Skipper when it should not appear in the tree."""
        if self._filter:
            # To be general, we'd have to call isSameNode(), but this
            # is sufficient for minidom:
            if node is self.document.documentElement:
                return
            filt = self._filter.startContainer(node)
            if filt == FILTER_REJECT:
                # ignore this node & all descendants
                Rejecter(self)
            elif filt == FILTER_SKIP:
                # ignore this node, but make its children become
                # children of the parent node
                Skipper(self)
            else:
                return
            self.curNode = node.parentNode
            node.parentNode.removeChild(node)
            node.unlink()

    # If this ever changes, Namespaces.end_element_handler() needs to
    # be changed to match.
    #
    def end_element_handler(self, name):
        curNode = self.curNode
        self.curNode = curNode.parentNode
        self._finish_end_element(curNode)

    def _finish_end_element(self, curNode):
        """Post-process a closed element: strip ignorable whitespace when
        its content model is known, then apply the filter."""
        info = self._elem_info.get(curNode.tagName)
        if info:
            self._handle_white_text_nodes(curNode, info)
        if self._filter:
            if curNode is self.document.documentElement:
                return
            if self._filter.acceptNode(curNode) == FILTER_REJECT:
                self.curNode.removeChild(curNode)
                curNode.unlink()

    def _handle_white_text_nodes(self, node, info):
        """Remove whitespace-only Text children of `node` when it has
        element content and the options ask for that whitespace to go."""
        if (self._options.whitespace_in_element_content
                or not info.isElementContent()):
            return

        # We have element type information and should remove ignorable
        # whitespace; identify the text nodes which contain only
        # whitespace.
        L = []
        for child in node.childNodes:
            if child.nodeType == TEXT_NODE and not child.data.strip():
                L.append(child)

        # Remove ignorable whitespace from the tree.
        for child in L:
            node.removeChild(child)

    def element_decl_handler(self, name, model):
        """Store the content model declared for element type `name`."""
        info = self._elem_info.get(name)
        if info is None:
            self._elem_info[name] = ElementInfo(name, model)
        else:
            # An attlist declaration created the record first.
            assert info._model is None
            info._model = model

    def attlist_decl_handler(self, elem, name, type, default, required):
        """Append one attribute declaration to the record for `elem`."""
        info = self._elem_info.get(elem)
        if info is None:
            info = ElementInfo(elem)
            self._elem_info[elem] = info
        info._attr_info.append(
            [None, name, None, None, default, 0, type, required])

    def xml_decl_handler(self, version, encoding, standalone):
        """Copy the XML declaration's fields onto the document."""
        self.document.version = version
        self.document.encoding = encoding
        # This is still a little ugly, thanks to the pyexpat API. ;-(
        # A negative value leaves document.standalone untouched.
        if standalone >= 0:
            self.document.standalone = bool(standalone)
457
458
# Don't include FILTER_INTERRUPT, since that's checked separately
# where allowed.
_ALLOWED_FILTER_RETURNS = (FILTER_ACCEPT, FILTER_REJECT, FILTER_SKIP)


class FilterVisibilityController(object):
    """Wrapper around a DOMBuilderFilter which implements the checks
    to make the whatToShow filter attribute work."""

    __slots__ = 'filter',

    def __init__(self, filter):
        self.filter = filter

    def startContainer(self, node):
        """Ask the wrapped filter about an element that was just opened.

        Returns FILTER_ACCEPT without consulting the filter when the
        node's type is masked out by whatToShow.  Raises ParseEscape if
        the filter returns FILTER_INTERRUPT, and ValueError for any other
        value outside _ALLOWED_FILTER_RETURNS.
        """
        mask = self._nodetype_mask[node.nodeType]
        if self.filter.whatToShow & mask:
            val = self.filter.startContainer(node)
            if val == FILTER_INTERRUPT:
                raise ParseEscape
            if val not in _ALLOWED_FILTER_RETURNS:
                raise ValueError(
                    "startContainer() returned illegal value: " + repr(val))
            return val
        else:
            return FILTER_ACCEPT

    def acceptNode(self, node):
        """Ask the wrapped filter about a completed node.

        FILTER_SKIP is translated here: the node's children are moved to
        its parent and FILTER_REJECT is returned, so the caller only has
        to know how to remove a node.  Raises ParseEscape if the filter
        returns FILTER_INTERRUPT, and ValueError for any other value
        outside _ALLOWED_FILTER_RETURNS.
        """
        mask = self._nodetype_mask[node.nodeType]
        if self.filter.whatToShow & mask:
            val = self.filter.acceptNode(node)
            if val == FILTER_INTERRUPT:
                raise ParseEscape
            if val == FILTER_SKIP:
                # move all child nodes to the parent, and remove this node
                parent = node.parentNode
                for child in node.childNodes[:]:
                    parent.appendChild(child)
                # node is handled by the caller
                return FILTER_REJECT
            if val not in _ALLOWED_FILTER_RETURNS:
                raise ValueError(
                    "acceptNode() returned illegal value: " + repr(val))
            return val
        else:
            return FILTER_ACCEPT

    # DOM node type -> the whatToShow bit that makes it visible.
    _nodetype_mask = {
        Node.ELEMENT_NODE:                NodeFilter.SHOW_ELEMENT,
        Node.ATTRIBUTE_NODE:              NodeFilter.SHOW_ATTRIBUTE,
        Node.TEXT_NODE:                   NodeFilter.SHOW_TEXT,
        Node.CDATA_SECTION_NODE:          NodeFilter.SHOW_CDATA_SECTION,
        Node.ENTITY_REFERENCE_NODE:       NodeFilter.SHOW_ENTITY_REFERENCE,
        Node.ENTITY_NODE:                 NodeFilter.SHOW_ENTITY,
        Node.PROCESSING_INSTRUCTION_NODE: NodeFilter.SHOW_PROCESSING_INSTRUCTION,
        Node.COMMENT_NODE:                NodeFilter.SHOW_COMMENT,
        Node.DOCUMENT_NODE:               NodeFilter.SHOW_DOCUMENT,
        Node.DOCUMENT_TYPE_NODE:          NodeFilter.SHOW_DOCUMENT_TYPE,
        Node.DOCUMENT_FRAGMENT_NODE:      NodeFilter.SHOW_DOCUMENT_FRAGMENT,
        Node.NOTATION_NODE:               NodeFilter.SHOW_NOTATION,
        }
519
520
class FilterCrutch(object):
    """Base for objects that temporarily take over a builder's parser.

    On construction the parser's current start/end element handlers are
    saved and replaced by this object's own; subclasses supply
    start_element_handler() and end_element_handler() and put the saved
    ones back when they are done.
    """

    __slots__ = '_builder', '_level', '_old_start', '_old_end'

    def __init__(self, builder):
        parser = builder._parser
        self._builder = builder
        # Nesting depth below the element that triggered the takeover.
        self._level = 0
        self._old_start, self._old_end = (parser.StartElementHandler,
                                          parser.EndElementHandler)
        parser.StartElementHandler = self.start_element_handler
        parser.EndElementHandler = self.end_element_handler
532
class Rejecter(FilterCrutch):
    """Swallow a rejected element together with everything inside it."""

    __slots__ = ()

    # Content callbacks that are switched off while rejecting.
    def __init__(self, builder):
        FilterCrutch.__init__(self, builder)
        parser = builder._parser
        silenced = (
            "ProcessingInstructionHandler",
            "CommentHandler",
            "CharacterDataHandler",
            "StartCdataSectionHandler",
            "EndCdataSectionHandler",
            "ExternalEntityRefHandler",
        )
        for handler_name in silenced:
            setattr(parser, handler_name, None)

    def start_element_handler(self, *args):
        # A nested element opened inside the rejected one.
        self._level += 1

    def end_element_handler(self, *args):
        if self._level != 0:
            # Closing a nested element; still inside the rejected one.
            self._level -= 1
            return
        # restore the old handlers
        parser = self._builder._parser
        self._builder.install(parser)
        parser.StartElementHandler = self._old_start
        parser.EndElementHandler = self._old_end
560
class Skipper(FilterCrutch):
    """Drop a skipped element but let its children be built normally."""

    __slots__ = ()

    def start_element_handler(self, *args):
        before = self._builder.curNode
        self._old_start(*args)
        # Count the element only if the builder actually descended
        # into it.
        if self._builder.curNode is not before:
            self._level += 1

    def end_element_handler(self, *args):
        if self._level != 0:
            self._level -= 1
            self._old_end(*args)
            return
        # We're popping back out of the node we're skipping, so we
        # shouldn't need to do anything but reset the handlers.
        parser = self._builder._parser
        parser.StartElementHandler = self._old_start
        parser.EndElementHandler = self._old_end
        self._builder = None
580
581
# Framework document used by the fragment builder.
# The finished template takes three strings, in order: the doctype
# external identifier, the internal subset, and the namespace
# attributes for the wrapper element.

_FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID = (
    "http://xml.python.org/entities/fragment-builder/internal")

_FRAGMENT_BUILDER_TEMPLATE = "\n".join((
    '<!DOCTYPE wrapper',
    '%s [',
    '<!ENTITY fragment-builder-internal',
    'SYSTEM "' + _FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID + '">',
    '%s',
    ']>',
    '<wrapper %s',
    '>&fragment-builder-internal;</wrapper>',
))
599
600
class FragmentBuilder(ExpatBuilder):
    """Builder which constructs document fragments given XML source
    text and a context node.

    The context node is expected to provide information about the
    namespace declarations which are in scope at the start of the
    fragment.
    """

    def __init__(self, context, options=None):
        """Create a fragment builder.

        context -- a Document, or a node whose ownerDocument is the
                   document the fragment belongs to.
        options -- passed through to ExpatBuilder.
        """
        if context.nodeType == DOCUMENT_NODE:
            self.originalDocument = context
        else:
            self.originalDocument = context.ownerDocument
        self.context = context
        ExpatBuilder.__init__(self, options)

    def reset(self):
        ExpatBuilder.reset(self)
        self.fragment = None

    def parseFile(self, file):
        """Parse a document fragment from a file object, returning the
        fragment node."""
        return self.parseString(file.read())

    def parseString(self, string):
        """Parse a document fragment from a string, returning the
        fragment node."""
        self._source = string
        parser = self.getParser()
        doctype = self.originalDocument.doctype
        ident = ""
        if doctype:
            subset = doctype.internalSubset or self._getDeclarations()
            if doctype.publicId:
                ident = ('PUBLIC "%s" "%s"'
                         % (doctype.publicId, doctype.systemId))
            elif doctype.systemId:
                ident = 'SYSTEM "%s"' % doctype.systemId
        else:
            subset = ""
        nsattrs = self._getNSattrs() # get ns decls from node's ancestors
        document = _FRAGMENT_BUILDER_TEMPLATE % (ident, subset, nsattrs)
        # The wrapper document pulls the real source in through an
        # external entity; see external_entity_ref_handler().
        try:
            parser.Parse(document, 1)
            fragment = self.fragment
        finally:
            # Reset on success and on any failure alike.
            self.reset()
        # NOTE: unlike ExpatBuilder.parseString(), self._parser is not
        # cleared here; that assignment was commented out in the
        # original code.
        return fragment

    def _getDeclarations(self):
        """Re-create the internal subset from the DocumentType node.

        This is only needed if we don't already have the
        internalSubset as a string.
        """
        # Use originalDocument rather than context.ownerDocument: the
        # latter is None when the context is the Document itself.
        doctype = self.originalDocument.doctype
        s = ""
        if doctype:
            for i in range(doctype.notations.length):
                notation = doctype.notations.item(i)
                if s:
                    s = s + "\n "
                s = "%s<!NOTATION %s" % (s, notation.nodeName)
                if notation.publicId:
                    s = '%s PUBLIC "%s"\n "%s">' \
                        % (s, notation.publicId, notation.systemId)
                else:
                    s = '%s SYSTEM "%s">' % (s, notation.systemId)
            for i in range(doctype.entities.length):
                entity = doctype.entities.item(i)
                if s:
                    s = s + "\n "
                s = "%s<!ENTITY %s" % (s, entity.nodeName)
                if entity.publicId:
                    s = '%s PUBLIC "%s"\n "%s"' \
                        % (s, entity.publicId, entity.systemId)
                elif entity.systemId:
                    s = '%s SYSTEM "%s"' % (s, entity.systemId)
                else:
                    # Internal entity: its value is the text child.
                    s = '%s "%s"' % (s, entity.firstChild.data)
                if entity.notationName:
                    s = "%s NOTATION %s" % (s, entity.notationName)
                s = s + ">"
        return s

    def _getNSattrs(self):
        # No namespace support here; FragmentBuilderNS overrides this.
        return ""

    def external_entity_ref_handler(self, context, base, systemId, publicId):
        """Parse the fragment source when Expat reaches the placeholder
        entity in the wrapper document; defer to ExpatBuilder for any
        other external entity."""
        if systemId == _FRAGMENT_BUILDER_INTERNAL_SYSTEM_ID:
            # this entref is the one that we made to put the subtree
            # in; all of our given input is parsed in here.
            old_document = self.document
            old_cur_node = self.curNode
            parser = self._parser.ExternalEntityParserCreate(context)
            # put the real document back, parse into the fragment to return
            self.document = self.originalDocument
            self.fragment = self.document.createDocumentFragment()
            self.curNode = self.fragment
            try:
                parser.Parse(self._source, 1)
            finally:
                self.curNode = old_cur_node
                self.document = old_document
                self._source = None
            return -1
        else:
            return ExpatBuilder.external_entity_ref_handler(
                self, context, base, systemId, publicId)
716
717
class Namespaces:
    """Mix-in class for builders; adds support for namespaces."""

    def _initNamespaces(self):
        # Pending (prefix, uri) declarations reported by Expat.  They are
        # turned into xmlns attributes on the next element that starts
        # and then cleared.
        self._ns_ordered_prefixes = []

    def createParser(self):
        """Create a new namespace-handling parser."""
        ns_parser = expat.ParserCreate(namespace_separator=" ")
        ns_parser.namespace_prefixes = True
        return ns_parser

    def install(self, parser):
        """Insert the namespace-handlers onto the parser."""
        ExpatBuilder.install(self, parser)
        if not self._options.namespace_declarations:
            return
        parser.StartNamespaceDeclHandler = self.start_namespace_decl_handler

    def start_namespace_decl_handler(self, prefix, uri):
        """Push this namespace declaration on our storage."""
        self._ns_ordered_prefixes.append((prefix, uri))

    def start_element_handler(self, name, attributes):
        """Create a namespace-aware Element, make it current, and attach
        pending xmlns declarations followed by the ordinary attributes
        (a flat [name, value, name, value, ...] list)."""
        if ' ' not in name:
            uri = EMPTY_NAMESPACE
            qname = name
            localname = None
            prefix = EMPTY_PREFIX
        else:
            uri, localname, prefix, qname = _parse_ns_name(self, name)
        node = minidom.Element(qname, uri, prefix, localname)
        node.ownerDocument = self.document
        _append_child(self.curNode, node)
        self.curNode = node

        pending = self._ns_ordered_prefixes
        if pending:
            for prefix, uri in pending:
                if prefix:
                    attr = minidom.Attr(_intern(self, 'xmlns:' + prefix),
                                        XMLNS_NAMESPACE, prefix, "xmlns")
                else:
                    attr = minidom.Attr("xmlns", XMLNS_NAMESPACE,
                                        "xmlns", EMPTY_PREFIX)
                # Fill in values through __dict__ directly.
                text_state = attr.childNodes[0].__dict__
                text_state['data'] = text_state['nodeValue'] = uri
                attr_state = attr.__dict__
                attr_state['value'] = attr_state['nodeValue'] = uri
                attr_state['ownerDocument'] = self.document
                _set_attribute_node(node, attr)
            del pending[:]

        if attributes:
            by_qname = node._attrs
            by_ns_name = node._attrsNS
            for aname, value in zip(attributes[0::2], attributes[1::2]):
                if ' ' in aname:
                    uri, localname, prefix, qname = _parse_ns_name(self, aname)
                    attr = minidom.Attr(qname, uri, localname, prefix)
                    by_qname[qname] = attr
                    by_ns_name[(uri, localname)] = attr
                else:
                    attr = minidom.Attr(aname, EMPTY_NAMESPACE,
                                        aname, EMPTY_PREFIX)
                    by_qname[aname] = attr
                    by_ns_name[(EMPTY_NAMESPACE, aname)] = attr
                text_state = attr.childNodes[0].__dict__
                text_state['data'] = text_state['nodeValue'] = value
                attr_state = attr.__dict__
                attr_state['ownerDocument'] = self.document
                attr_state['value'] = attr_state['nodeValue'] = value
                attr_state['ownerElement'] = node

    if __debug__:
        # This only adds some asserts to the original
        # end_element_handler(), so we only define this when -O is not
        # used.  If changing one, be sure to check the other to see if
        # it needs to be changed as well.
        #
        def end_element_handler(self, name):
            closing = self.curNode
            if ' ' in name:
                uri, localname, prefix, qname = _parse_ns_name(self, name)
                assert (closing.namespaceURI == uri
                        and closing.localName == localname
                        and closing.prefix == prefix), \
                        "element stack messed up! (namespace)"
            else:
                assert closing.nodeName == name, \
                       "element stack messed up - bad nodeName"
                assert closing.namespaceURI == EMPTY_NAMESPACE, \
                       "element stack messed up - bad namespaceURI"
            self.curNode = closing.parentNode
            self._finish_end_element(closing)
816
817
class ExpatBuilderNS(Namespaces, ExpatBuilder):
    """Document builder that supports namespaces."""

    def reset(self):
        # Reset the plain builder state first, then clear the list of
        # pending namespace declarations kept by the mix-in.
        ExpatBuilder.reset(self)
        self._initNamespaces()
824
825
class FragmentBuilderNS(Namespaces, FragmentBuilder):
    """Fragment builder that supports namespaces."""

    def reset(self):
        FragmentBuilder.reset(self)
        self._initNamespaces()

    def _getNSattrs(self):
        """Return string of namespace attributes from this element and
        ancestors."""
        # XXX This needs to be re-written to walk the ancestors of the
        # context to build up the namespace information from
        # declarations, elements, and attributes found in context.
        # Otherwise we have to store a bunch more data on the DOM
        # (though that *might* be more reliable -- not clear).
        decls = ""
        seen_prefixes = []
        node = self.context
        while node:
            if hasattr(node, '_ns_prefix_uri'):
                for prefix, uri in node._ns_prefix_uri.items():
                    # A prefix already declared closer to the context
                    # wins, so later occurrences are ignored.
                    if prefix in seen_prefixes:
                        continue
                    seen_prefixes.append(prefix)
                    declname = ("xmlns:" + prefix) if prefix else "xmlns"
                    if decls:
                        decls = "%s\n %s='%s'" % (decls, declname, uri)
                    else:
                        decls = " %s='%s'" % (declname, uri)
            node = node.parentNode
        return decls
861
862
class ParseEscape(Exception):
    """Raised from inside a parser callback to stop parsing early.

    InternalSubsetExtractor raises it once it has what it needs, and
    FilterVisibilityController raises it when a filter returns
    FILTER_INTERRUPT.
    """
866
class InternalSubsetExtractor(ExpatBuilder):
    """XML processor which can rip out the internal document type subset."""

    # Stays None unless a doctype with an internal subset is seen.
    subset = None

    def getSubset(self):
        """Return the internal subset as a string."""
        return self.subset

    def parseFile(self, file):
        # Return value is discarded; the result is read via getSubset().
        try:
            ExpatBuilder.parseFile(self, file)
        except ParseEscape:
            pass

    def parseString(self, string):
        # Return value is discarded; the result is read via getSubset().
        try:
            ExpatBuilder.parseString(self, string)
        except ParseEscape:
            pass

    def install(self, parser):
        # Only two events matter: the doctype declaration and the first
        # element, which ends the search.
        parser.StartDoctypeDeclHandler = self.start_doctype_decl_handler
        parser.StartElementHandler = self.start_element_handler

    def start_doctype_decl_handler(self, name, publicId, systemId,
                                   has_internal_subset):
        # NOTE(review): ExpatBuilder names the 2nd and 3rd arguments of
        # this callback systemId, publicId; neither is used here.
        if not has_internal_subset:
            raise ParseEscape()
        parser = self.getParser()
        # Collect the raw text Expat passes to the default handler until
        # the doctype declaration ends.
        self.subset = []
        parser.DefaultHandler = self.subset.append
        parser.EndDoctypeDeclHandler = self.end_doctype_decl_handler

    def end_doctype_decl_handler(self):
        collected = ''.join(self.subset)
        # Normalize line endings to "\n".
        self.subset = collected.replace('\r\n', '\n').replace('\r', '\n')
        raise ParseEscape()

    def start_element_handler(self, name, attrs):
        # Reached the document element without a subset; stop.
        raise ParseEscape()
909
910
def parse(file, namespaces=True):
    """Parse a document, returning the resulting Document node.

    'file' may be either a file name or an open file object.
    """
    builder = ExpatBuilderNS() if namespaces else ExpatBuilder()

    if not isinstance(file, StringTypes):
        # Already an open file object; the caller keeps ownership.
        return builder.parseFile(file)

    with open(file, 'rb') as fp:
        return builder.parseFile(fp)
930
931
def parseString(string, namespaces=True):
    """Parse a document from a string, returning the resulting
    Document node.
    """
    builder = ExpatBuilderNS() if namespaces else ExpatBuilder()
    return builder.parseString(string)
941
942
def parseFragment(file, context, namespaces=True):
    """Parse a fragment of a document, given the context from which it
    was originally extracted.  context should be the parent of the
    node(s) which are in the fragment.

    'file' may be either a file name or an open file object.
    """
    if namespaces:
        builder = FragmentBuilderNS(context)
    else:
        builder = FragmentBuilder(context)

    if not isinstance(file, StringTypes):
        # Already an open file object; the caller keeps ownership.
        return builder.parseFile(file)

    with open(file, 'rb') as fp:
        return builder.parseFile(fp)
964
965
def parseFragmentString(string, context, namespaces=True):
    """Parse a fragment of a document from a string, given the context
    from which it was originally extracted.  context should be the
    parent of the node(s) which are in the fragment.
    """
    builder_class = FragmentBuilderNS if namespaces else FragmentBuilder
    return builder_class(context).parseString(string)
976
977
def makeBuilder(options):
    """Create a builder based on an Options object."""
    builder_class = ExpatBuilderNS if options.namespaces else ExpatBuilder
    return builder_class(options)