3 # $Id: ElementTree.py 3440 2008-07-18 14:45:01Z fredrik $
5 # light-weight XML support for Python 2.3 and later.
7 # history (since 1.2.6):
8 # 2005-11-12 fl added tostringlist/fromstringlist helpers
9 # 2006-07-05 fl merged in selected changes from the 1.3 sandbox
10 # 2006-07-05 fl removed support for 2.1 and earlier
11 # 2007-06-21 fl added deprecation/future warnings
12 # 2007-08-25 fl added doctype hook, added parser version attribute etc
13 # 2007-08-26 fl added new serializer code (better namespace handling, etc)
14 # 2007-08-27 fl warn for broken /tag searches on tree level
15 # 2007-09-02 fl added html/text methods to serializer (experimental)
16 # 2007-09-05 fl added method argument to tostring/tostringlist
17 # 2007-09-06 fl improved error handling
18 # 2007-09-13 fl added itertext, iterfind; assorted cleanups
19 # 2007-12-15 fl added C14N hooks, copy method (experimental)
21 # Copyright (c) 1999-2008 by Fredrik Lundh. All rights reserved.
23 # fredrik@pythonware.com
24 # http://www.pythonware.com
26 # --------------------------------------------------------------------
27 # The ElementTree toolkit is
29 # Copyright (c) 1999-2008 by Fredrik Lundh
31 # By obtaining, using, and/or copying this software and/or its
32 # associated documentation, you agree that you have read, understood,
33 # and will comply with the following terms and conditions:
35 # Permission to use, copy, modify, and distribute this software and
36 # its associated documentation for any purpose and without fee is
37 # hereby granted, provided that the above copyright notice appears in
38 # all copies, and that both that copyright notice and this permission
39 # notice appear in supporting documentation, and that the name of
40 # Secret Labs AB or the author not be used in advertising or publicity
41 # pertaining to distribution of the software without specific, written
44 # SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
45 # TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
46 # ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
47 # BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
48 # DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
49 # WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
50 # ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
52 # --------------------------------------------------------------------
54 # Licensed to PSF under a Contributor Agreement.
55 # See http://www.python.org/psf/license for licensing details.
61 "Element", "ElementTree",
62 "fromstring", "fromstringlist",
63 "iselement", "iterparse",
64 "parse", "ParseError",
65 "PI", "ProcessingInstruction",
68 "tostring", "tostringlist",
72 "XMLParser", "XMLTreeBuilder",
78 # The <b>Element</b> type is a flexible container object, designed to
79 # store hierarchical data structures in memory. The type can be
80 # described as a cross between a list and a dictionary.
82 # Each element has a number of properties associated with it:
84 # <li>a <i>tag</i>. This is a string identifying what kind of data
85 # this element represents (the element type, in other words).</li>
86 # <li>a number of <i>attributes</i>, stored in a Python dictionary.</li>
87 # <li>a <i>text</i> string.</li>
88 # <li>an optional <i>tail</i> string.</li>
89 # <li>a number of <i>child elements</i>, stored in a Python sequence</li>
92 # To create an element instance, use the {@link #Element} constructor
93 # or the {@link #SubElement} factory function.
95 # The {@link #ElementTree} class can be used to wrap an element
96 # structure, and convert it from and to XML.
104 class _SimpleElementPath(object):
105 # emulate pre-1.2 find/findtext/findall behaviour
106 def find(self
, element
, tag
, namespaces
=None):
111 def findtext(self
, element
, tag
, default
=None, namespaces
=None):
112 elem
= self
.find(element
, tag
)
115 return elem
.text
or ""
116 def iterfind(self
, element
, tag
, namespaces
=None):
118 for elem
in element
.iter(tag
[3:]):
def findall(self, element, tag, namespaces=None):
    """Return all matches produced by ``self.iterfind`` as a list.

    @param element The element handed to ``iterfind`` as search root.
    @param tag The tag or path expression handed to ``iterfind``.
    @keyparam namespaces Optional value forwarded unchanged to ``iterfind``.
    @return A list holding everything ``iterfind`` yields, in order.
    """
    # iterfind may be lazy; materialize it so callers get a real list.
    return list(self.iterfind(element, tag, namespaces))
127 from . import ElementPath
129 ElementPath
= _SimpleElementPath()
132 # Parser error. This is a subclass of <b>SyntaxError</b>.
134 # In addition to the exception value, an exception instance contains a
135 # specific exception code in the <b>code</b> attribute, and the line and
136 # column of the error in the <b>position</b> attribute.
class ParseError(SyntaxError):
    """Parser error; a subclass of SyntaxError.

    The class adds no behaviour of its own. Per the module comments, raised
    instances are expected to carry a ``code`` attribute (the specific
    exception code) and a ``position`` attribute (line and column of the
    error); those attributes are set by the raising code, not here.
    """
141 # --------------------------------------------------------------------
144 # Checks if an object appears to be a valid element object.
146 # @param element An element instance.
147 # @return A true value if this is an element object.
def iselement(element):
    """Check whether an object appears to be a valid element object.

    @param element The object to inspect.
    @return True if the object is an ``Element`` instance or exposes a
        ``tag`` attribute; False otherwise.
    """
    # FIXME: not sure about this; might be a better idea to look
    # for tag/attrib/text attributes
    return isinstance(element, Element) or hasattr(element, "tag")
156 # Element class. This class defines the Element interface, and
157 # provides a reference implementation of this interface.
159 # The element name, attribute names, and attribute values can be
160 # either ASCII strings (ordinary Python strings containing only 7-bit
161 # ASCII characters) or Unicode strings.
163 # @param tag The element name.
164 # @param attrib An optional dictionary, containing element attributes.
165 # @param **extra Additional attributes, given as keyword arguments.
169 # @see ProcessingInstruction
171 class Element(object):
172 # <tag attrib>text<child/>...</tag>tail
175 # (Attribute) Element tag.
180 # (Attribute) Element attribute dictionary. Where possible, use
181 # {@link #Element.get},
182 # {@link #Element.set},
183 # {@link #Element.keys}, and
184 # {@link #Element.items} to access
185 # element attributes.
190 # (Attribute) Text before first subelement. This is either a
191 # string or the value None. Note that if there was no text, this
192 # attribute may be either None or an empty string, depending on
198 # (Attribute) Text after this element's end tag, but before the
199 # next sibling element's start tag. This is either a string or
200 # the value None. Note that if there was no text, this attribute
201 # may be either None or an empty string, depending on the parser.
203 tail
= None # text after end tag, if any
207 def __init__(self
, tag
, attrib
={}, **extra
):
208 attrib
= attrib
.copy()
215 return "<Element %s at 0x%x>" % (repr(self
.tag
), id(self
))
218 # Creates a new element object of the same type as this element.
220 # @param tag Element tag.
221 # @param attrib Element attributes, given as a dictionary.
222 # @return A new element instance.
def makeelement(self, tag, attrib):
    """Create a new element object of the same type as this element.

    @param tag Element tag.
    @param attrib Element attributes, given as a dictionary.
    @return A new instance of ``self.__class__`` built from both arguments.
    """
    # Going through __class__ keeps subclasses producing their own type.
    return self.__class__(tag, attrib)
228 # (Experimental) Copies the current element. This creates a
229 # shallow copy; subelements will be shared with the original tree.
231 # @return A new element instance.
234 elem
= self
.makeelement(self
.tag
, self
.attrib
)
235 elem
.text
= self
.text
236 elem
.tail
= self
.tail
241 # Returns the number of subelements. Note that this only counts
242 # full elements; to check if there's any content in an element, you
243 # have to check both the length and the <b>text</b> attribute.
245 # @return The number of subelements.
248 return len(self
._children
)
250 def __nonzero__(self
):
252 "The behavior of this method will change in future versions. "
253 "Use specific 'len(elem)' or 'elem is not None' test instead.",
254 FutureWarning
, stacklevel
=2
256 return len(self
._children
) != 0 # emulate old behaviour, for now
259 # Returns the given subelement, by index.
261 # @param index What subelement to return.
262 # @return The given subelement.
263 # @exception IndexError If the given element does not exist.
def __getitem__(self, index):
    """Return the subelement stored at *index*.

    @param index What subelement to return; passed straight to the
        subscript operation on ``self._children``.
    @return The given subelement.
    @exception IndexError If the given element does not exist.
    """
    return self._children[index]
269 # Replaces the given subelement, by index.
271 # @param index What subelement to replace.
272 # @param element The new element value.
273 # @exception IndexError If the given element does not exist.
def __setitem__(self, index, element):
    """Replace the subelement stored at *index*.

    @param index What subelement to replace; passed straight to the
        subscript assignment on ``self._children``.
    @param element The new element value.
    @exception IndexError If the given element does not exist.
    """
    # No validation happens here: the iselement() assertions (for both the
    # slice and the single-item case) are deliberately left disabled.
    self._children[index] = element
284 # Deletes the given subelement, by index.
286 # @param index What subelement to delete.
287 # @exception IndexError If the given element does not exist.
def __delitem__(self, index):
    """Delete the subelement stored at *index*.

    @param index What subelement to delete; passed straight to ``del`` on
        ``self._children``.
    @exception IndexError If the given element does not exist.
    """
    del self._children[index]
293 # Adds a subelement to the end of this element. In document order,
294 # the new element will appear after the last existing subelement (or
295 # directly after the text, if it's the first subelement), but before
296 # the end tag for this element.
298 # @param element The element to add.
def append(self, element):
    """Add a subelement to the end of this element.

    @param element The element to add. It is not validated; the
        iselement() assertion is deliberately left disabled.
    """
    self._children.append(element)
305 # Appends subelements from a sequence.
307 # @param elements A sequence object with zero or more elements.
def extend(self, elements):
    """Append subelements from a sequence.

    @param elements A sequence object with zero or more elements. The
        items are not validated; the iselement() assertions are
        deliberately left disabled.
    """
    self._children.extend(elements)
316 # Inserts a subelement at the given position in this element.
318 # @param index Where to insert the new subelement.
def insert(self, index, element):
    """Insert a subelement at the given position in this element.

    @param index Where to insert the new subelement.
    @param element The element to insert. It is not validated; the
        iselement() assertion is deliberately left disabled.
    """
    self._children.insert(index, element)
325 # Removes a matching subelement. Unlike the <b>find</b> methods,
326 # this method compares elements based on identity, not on tag
327 # value or contents. To remove subelements by other means, the
328 # easiest way is often to use a list comprehension to select what
329 # elements to keep, and use slice assignment to update the parent
332 # @param element What element to remove.
333 # @exception ValueError If a matching element could not be found.
def remove(self, element):
    """Remove a matching subelement.

    The call is delegated to ``self._children.remove``, so the first
    matching child is dropped (assumes ``_children`` is a list; its
    initialization is not visible in this part of the file).

    @param element What element to remove.
    @exception ValueError If a matching element could not be found.
    """
    self._children.remove(element)
340 # (Deprecated) Returns all subelements. The elements are returned
343 # @return A list of subelements.
344 # @defreturn list of Element instances
346 def getchildren(self
):
348 "This method will be removed in future versions. "
349 "Use 'list(elem)' or iteration over elem instead.",
350 DeprecationWarning, stacklevel
=2
352 return self
._children
355 # Finds the first matching subelement, by tag name or path.
357 # @param path What element to look for.
358 # @keyparam namespaces Optional namespace prefix map.
359 # @return The first matching element, or None if no element was found.
360 # @defreturn Element or None
def find(self, path, namespaces=None):
    """Find the first matching subelement, by tag name or path.

    The search itself is delegated to ``ElementPath.find``.

    @param path What element to look for.
    @keyparam namespaces Optional namespace prefix map.
    @return Whatever ``ElementPath.find`` returns; documented as the first
        matching element, or None if no element was found.
    """
    return ElementPath.find(self, path, namespaces)
366 # Finds text for the first matching subelement, by tag name or path.
368 # @param path What element to look for.
369 # @param default What to return if the element was not found.
370 # @keyparam namespaces Optional namespace prefix map.
371 # @return The text content of the first matching element, or the
372 # default value if no element was found. Note that if the element
373 # is found, but has no text content, this method returns an
def findtext(self, path, default=None, namespaces=None):
    """Find text for the first matching subelement, by tag name or path.

    The search itself is delegated to ``ElementPath.findtext``.

    @param path What element to look for.
    @param default What to return if the element was not found.
    @keyparam namespaces Optional namespace prefix map.
    @return Whatever ``ElementPath.findtext`` returns; documented as the
        text content of the first match, or the default value if no
        element was found.
    """
    return ElementPath.findtext(self, path, default, namespaces)
381 # Finds all matching subelements, by tag name or path.
383 # @param path What element to look for.
384 # @keyparam namespaces Optional namespace prefix map.
385 # @return A list or other sequence containing all matching elements,
387 # @defreturn list of Element instances
def findall(self, path, namespaces=None):
    """Find all matching subelements, by tag name or path.

    The search itself is delegated to ``ElementPath.findall``.

    @param path What element to look for.
    @keyparam namespaces Optional namespace prefix map.
    @return Whatever ``ElementPath.findall`` returns; documented as a list
        or other sequence containing all matching elements.
    """
    return ElementPath.findall(self, path, namespaces)
393 # Finds all matching subelements, by tag name or path.
395 # @param path What element to look for.
396 # @keyparam namespaces Optional namespace prefix map.
397 # @return An iterator or sequence containing all matching elements,
399 # @defreturn a generated sequence of Element instances
def iterfind(self, path, namespaces=None):
    """Find all matching subelements, by tag name or path.

    The search itself is delegated to ``ElementPath.iterfind``.

    @param path What element to look for.
    @keyparam namespaces Optional namespace prefix map.
    @return Whatever ``ElementPath.iterfind`` returns; documented as an
        iterator or sequence containing all matching elements.
    """
    return ElementPath.iterfind(self, path, namespaces)
405 # Resets an element. This function removes all subelements, clears
406 # all attributes, and sets the <b>text</b> and <b>tail</b> attributes
412 self
.text
= self
.tail
= None
415 # Gets an element attribute. Equivalent to <b>attrib.get</b>, but
416 # some implementations may handle this a bit more efficiently.
418 # @param key What attribute to look for.
419 # @param default What to return if the attribute was not found.
420 # @return The attribute value, or the default value, if the
421 # attribute was not found.
422 # @defreturn string or None
def get(self, key, default=None):
    """Get an element attribute; equivalent to ``self.attrib.get``.

    @param key What attribute to look for.
    @param default What to return if the attribute was not found.
    @return The attribute value, or the default value if the attribute
        was not found.
    """
    return self.attrib.get(key, default)
428 # Sets an element attribute. Equivalent to <b>attrib[key] = value</b>,
429 # but some implementations may handle this a bit more efficiently.
431 # @param key What attribute to set.
432 # @param value The attribute value.
def set(self, key, value):
    """Set an element attribute; equivalent to ``self.attrib[key] = value``.

    @param key What attribute to set.
    @param value The attribute value.
    """
    self.attrib[key] = value
438 # Gets a list of attribute names. The names are returned in an
439 # arbitrary order (just like for an ordinary Python dictionary).
440 # Equivalent to <b>attrib.keys()</b>.
442 # @return A list of element attribute names.
443 # @defreturn list of strings
446 return self
.attrib
.keys()
449 # Gets element attributes, as a sequence. The attributes are
450 # returned in an arbitrary order. Equivalent to <b>attrib.items()</b>.
452 # @return A list of (name, value) tuples for all attributes.
453 # @defreturn list of (string, string) tuples
456 return self
.attrib
.items()
459 # Creates a tree iterator. The iterator loops over this element
460 # and all subelements, in document order, and returns all elements
461 # with a matching tag.
463 # If the tree structure is modified during iteration, new or removed
464 # elements may or may not be included. To get a stable set, use the
465 # list() function on the iterator, and loop over the resulting list.
467 # @param tag What tags to look for (default is to return all elements).
468 # @return An iterator containing all the matching elements.
469 # @defreturn iterator
471 def iter(self
, tag
=None):
474 if tag
is None or self
.tag
== tag
:
476 for e
in self
._children
:
477 for e
in e
.iter(tag
):
481 def getiterator(self
, tag
=None):
482 # Change for a DeprecationWarning in 1.4
484 "This method will be removed in future versions. "
485 "Use 'elem.iter()' or 'list(elem.iter())' instead.",
486 PendingDeprecationWarning
, stacklevel
=2
488 return list(self
.iter(tag
))
491 # Creates a text iterator. The iterator loops over this element
492 # and all subelements, in document order, and returns all inner
495 # @return An iterator containing all inner text.
496 # @defreturn iterator
500 if not isinstance(tag
, basestring
) and tag
is not None:
505 for s
in e
.itertext():
511 _Element
= _ElementInterface
= Element
514 # Subelement factory. This function creates an element instance, and
515 # appends it to an existing element.
517 # The element name, attribute names, and attribute values can be
518 # either 8-bit ASCII strings or Unicode strings.
520 # @param parent The parent element.
521 # @param tag The subelement name.
522 # @param attrib An optional dictionary, containing element attributes.
523 # @param **extra Additional attributes, given as keyword arguments.
524 # @return An element instance.
527 def SubElement(parent
, tag
, attrib
={}, **extra
):
528 attrib
= attrib
.copy()
530 element
= parent
.makeelement(tag
, attrib
)
531 parent
.append(element
)
535 # Comment element factory. This factory function creates a special
536 # element that will be serialized as an XML comment by the standard
539 # The comment string can be either an 8-bit ASCII string or a Unicode
542 # @param text A string containing the comment string.
543 # @return An element instance, representing a comment.
546 def Comment(text
=None):
547 element
= Element(Comment
)
552 # PI element factory. This factory function creates a special element
553 # that will be serialized as an XML processing instruction by the standard
556 # @param target A string containing the PI target.
557 # @param text A string containing the PI contents, if any.
558 # @return An element instance, representing a PI.
561 def ProcessingInstruction(target
, text
=None):
562 element
= Element(ProcessingInstruction
)
563 element
.text
= target
565 element
.text
= element
.text
+ " " + text
568 PI
= ProcessingInstruction
571 # QName wrapper. This can be used to wrap a QName attribute value, in
572 # order to get proper namespace handling on output.
574 # @param text A string containing the QName value, in the form {uri}local,
575 # or, if the tag argument is given, the URI part of a QName.
576 # @param tag Optional tag. If given, the first argument is interpreted as
577 # a URI, and this argument is interpreted as a local name.
578 # @return An opaque object, representing the QName.
581 def __init__(self
, text_or_uri
, tag
=None):
583 text_or_uri
= "{%s}%s" % (text_or_uri
, tag
)
584 self
.text
= text_or_uri
588 return hash(self
.text
)
589 def __cmp__(self
, other
):
590 if isinstance(other
, QName
):
591 return cmp(self
.text
, other
.text
)
592 return cmp(self
.text
, other
)
594 # --------------------------------------------------------------------
597 # ElementTree wrapper class. This class represents an entire element
598 # hierarchy, and adds some extra support for serialization to and from
601 # @param element Optional root element.
602 # @keyparam file Optional file handle or file name. If given, the
603 # tree is initialized with the contents of this XML file.
605 class ElementTree(object):
607 def __init__(self
, element
=None, file=None):
608 # assert element is None or iselement(element)
609 self
._root
= element
# first node
614 # Gets the root element for this tree.
616 # @return An element instance.
623 # Replaces the root element for this tree. This discards the
624 # current contents of the tree, and replaces it with the given
625 # element. Use with care.
627 # @param element An element instance.
629 def _setroot(self
, element
):
630 # assert iselement(element)
634 # Loads an external XML document into this element tree.
636 # @param source A file name or file object. If a file object is
637 # given, it only has to implement a <b>read(n)</b> method.
638 # @keyparam parser An optional parser instance. If not given, the
639 # standard {@link XMLParser} parser is used.
640 # @return The document root element.
642 # @exception ParseError If the parser fails to parse the document.
644 def parse(self
, source
, parser
=None):
646 if not hasattr(source
, "read"):
647 source
= open(source
, "rb")
651 parser
= XMLParser(target
=TreeBuilder())
653 data
= source
.read(65536)
657 self
._root
= parser
.close()
664 # Creates a tree iterator for the root element. The iterator loops
665 # over all elements in this tree, in document order.
667 # @param tag What tags to look for (default is to return all elements)
668 # @return An iterator.
669 # @defreturn iterator
def iter(self, tag=None):
    """Create a tree iterator for the root element.

    The call is forwarded to ``self._root.iter``. No check is made that a
    root has been set (the assertion is deliberately left disabled), so
    calling this while ``_root`` is None raises AttributeError.

    @param tag What tags to look for (default is to return all elements).
    @return Whatever the root element's ``iter`` returns.
    """
    return self._root.iter(tag)
676 def getiterator(self
, tag
=None):
677 # Change for a DeprecationWarning in 1.4
679 "This method will be removed in future versions. "
680 "Use 'tree.iter()' or 'list(tree.iter())' instead.",
681 PendingDeprecationWarning
, stacklevel
=2
683 return list(self
.iter(tag
))
686 # Same as getroot().find(path), starting at the root of the
689 # @param path What element to look for.
690 # @keyparam namespaces Optional namespace prefix map.
691 # @return The first matching element, or None if no element was found.
692 # @defreturn Element or None
694 def find(self
, path
, namespaces
=None):
695 # assert self._root is not None
699 "This search is broken in 1.3 and earlier, and will be "
700 "fixed in a future version. If you rely on the current "
701 "behaviour, change it to %r" % path
,
702 FutureWarning
, stacklevel
=2
704 return self
._root
.find(path
, namespaces
)
707 # Same as getroot().findtext(path), starting at the root of the tree.
709 # @param path What element to look for.
710 # @param default What to return if the element was not found.
711 # @keyparam namespaces Optional namespace prefix map.
712 # @return The text content of the first matching element, or the
713 # default value if no element was found. Note that if the element
714 # is found, but has no text content, this method returns an
718 def findtext(self
, path
, default
=None, namespaces
=None):
719 # assert self._root is not None
723 "This search is broken in 1.3 and earlier, and will be "
724 "fixed in a future version. If you rely on the current "
725 "behaviour, change it to %r" % path
,
726 FutureWarning
, stacklevel
=2
728 return self
._root
.findtext(path
, default
, namespaces
)
731 # Same as getroot().findall(path), starting at the root of the tree.
733 # @param path What element to look for.
734 # @keyparam namespaces Optional namespace prefix map.
735 # @return A list or iterator containing all matching elements,
737 # @defreturn list of Element instances
739 def findall(self
, path
, namespaces
=None):
740 # assert self._root is not None
744 "This search is broken in 1.3 and earlier, and will be "
745 "fixed in a future version. If you rely on the current "
746 "behaviour, change it to %r" % path
,
747 FutureWarning
, stacklevel
=2
749 return self
._root
.findall(path
, namespaces
)
752 # Finds all matching subelements, by tag name or path.
753 # Same as getroot().iterfind(path).
755 # @param path What element to look for.
756 # @keyparam namespaces Optional namespace prefix map.
757 # @return An iterator or sequence containing all matching elements,
759 # @defreturn a generated sequence of Element instances
761 def iterfind(self
, path
, namespaces
=None):
762 # assert self._root is not None
766 "This search is broken in 1.3 and earlier, and will be "
767 "fixed in a future version. If you rely on the current "
768 "behaviour, change it to %r" % path
,
769 FutureWarning
, stacklevel
=2
771 return self
._root
.iterfind(path
, namespaces
)
774 # Writes the element tree to a file, as XML.
776 # @def write(file, **options)
777 # @param file A file name, or a file object opened for writing.
778 # @param **options Options, given as keyword arguments.
779 # @keyparam encoding Optional output encoding (default is US-ASCII).
780 # @keyparam xml_declaration Controls if an XML declaration should
781 # be added to the file. Use False for never, True for always,
782 # None for only if not US-ASCII or UTF-8. None is default.
783 # @keyparam default_namespace Sets the default XML namespace (for "xmlns").
784 # @keyparam method Optional output method ("xml", "html", "text" or
785 # "c14n"; default is "xml").
787 def write(self
, file_or_filename
,
790 xml_declaration
=None,
791 default_namespace
=None,
793 # assert self._root is not None
796 elif method
not in _serialize
:
797 # FIXME: raise an ImportError for c14n if ElementC14N is missing?
798 raise ValueError("unknown method %r" % method
)
799 if hasattr(file_or_filename
, "write"):
800 file = file_or_filename
802 file = open(file_or_filename
, "wb")
808 encoding
= "us-ascii"
809 elif xml_declaration
or (xml_declaration
is None and
810 encoding
not in ("utf-8", "us-ascii")):
812 write("<?xml version='1.0' encoding='%s'?>\n" % encoding
)
814 _serialize_text(write
, self
._root
, encoding
)
816 qnames
, namespaces
= _namespaces(
817 self
._root
, encoding
, default_namespace
819 serialize
= _serialize
[method
]
820 serialize(write
, self
._root
, encoding
, qnames
, namespaces
)
821 if file_or_filename
is not file:
def write_c14n(self, file):
    """Write the tree using the "c14n" output method.

    Kept for lxml.etree compatibility; new code should call ``write`` with
    ``method="c14n"`` directly.

    @param file Forwarded unchanged as the first argument of ``write``.
    @return Whatever ``self.write`` returns.
    """
    return self.write(file, method="c14n")
828 # --------------------------------------------------------------------
829 # serialization support
831 def _namespaces(elem
, encoding
, default_namespace
=None):
832 # identify namespaces used in this tree
834 # maps qnames to *encoded* prefix:local names
835 qnames
= {None: None}
837 # maps uri:s to prefixes
839 if default_namespace
:
840 namespaces
[default_namespace
] = ""
843 return text
.encode(encoding
)
845 def add_qname(qname
):
846 # calculate serialized qname representation
849 uri
, tag
= qname
[1:].rsplit("}", 1)
850 prefix
= namespaces
.get(uri
)
852 prefix
= _namespace_map
.get(uri
)
854 prefix
= "ns%d" % len(namespaces
)
856 namespaces
[uri
] = prefix
858 qnames
[qname
] = encode("%s:%s" % (prefix
, tag
))
860 qnames
[qname
] = encode(tag
) # default element
862 if default_namespace
:
863 # FIXME: can this be handled in XML 1.0?
865 "cannot use non-qualified names with "
866 "default_namespace option"
868 qnames
[qname
] = encode(qname
)
870 _raise_serialization_error(qname
)
872 # populate qname and namespaces table
875 except AttributeError:
876 iterate
= elem
.getiterator
# cET compatibility
877 for elem
in iterate():
879 if isinstance(tag
, QName
):
880 if tag
.text
not in qnames
:
882 elif isinstance(tag
, basestring
):
883 if tag
not in qnames
:
885 elif tag
is not None and tag
is not Comment
and tag
is not PI
:
886 _raise_serialization_error(tag
)
887 for key
, value
in elem
.items():
888 if isinstance(key
, QName
):
890 if key
not in qnames
:
892 if isinstance(value
, QName
) and value
.text
not in qnames
:
893 add_qname(value
.text
)
895 if isinstance(text
, QName
) and text
.text
not in qnames
:
897 return qnames
, namespaces
899 def _serialize_xml(write
, elem
, encoding
, qnames
, namespaces
):
903 write("<!--%s-->" % _encode(text
, encoding
))
904 elif tag
is ProcessingInstruction
:
905 write("<?%s?>" % _encode(text
, encoding
))
910 write(_escape_cdata(text
, encoding
))
912 _serialize_xml(write
, e
, encoding
, qnames
, None)
916 if items
or namespaces
:
918 for v
, k
in sorted(namespaces
.items(),
919 key
=lambda x
: x
[1]): # sort on prefix
922 write(" xmlns%s=\"%s\"" % (
924 _escape_attrib(v
, encoding
)
926 for k
, v
in sorted(items
): # lexical order
927 if isinstance(k
, QName
):
929 if isinstance(v
, QName
):
932 v
= _escape_attrib(v
, encoding
)
933 write(" %s=\"%s\"" % (qnames
[k
], v
))
934 if text
or len(elem
):
937 write(_escape_cdata(text
, encoding
))
939 _serialize_xml(write
, e
, encoding
, qnames
, None)
940 write("</" + tag
+ ">")
944 write(_escape_cdata(elem
.tail
, encoding
))
# Lower-case tag names for which the HTML serializer writes no end tag.
HTML_EMPTY = ("area", "base", "basefont", "br", "col", "frame", "hr",
              "img", "input", "isindex", "link", "meta", "param")

try:
    # Prefer a set for constant-time membership tests.
    HTML_EMPTY = set(HTML_EMPTY)
except NameError:
    # The module header targets Python 2.3, which has no built-in set;
    # the tuple supports the same "in" tests there.
    pass
954 def _serialize_html(write
, elem
, encoding
, qnames
, namespaces
):
958 write("<!--%s-->" % _escape_cdata(text
, encoding
))
959 elif tag
is ProcessingInstruction
:
960 write("<?%s?>" % _escape_cdata(text
, encoding
))
965 write(_escape_cdata(text
, encoding
))
967 _serialize_html(write
, e
, encoding
, qnames
, None)
971 if items
or namespaces
:
973 for v
, k
in sorted(namespaces
.items(),
974 key
=lambda x
: x
[1]): # sort on prefix
977 write(" xmlns%s=\"%s\"" % (
979 _escape_attrib(v
, encoding
)
981 for k
, v
in sorted(items
): # lexical order
982 if isinstance(k
, QName
):
984 if isinstance(v
, QName
):
987 v
= _escape_attrib_html(v
, encoding
)
988 # FIXME: handle boolean attributes
989 write(" %s=\"%s\"" % (qnames
[k
], v
))
993 if ltag
== "script" or ltag
== "style":
994 write(_encode(text
, encoding
))
996 write(_escape_cdata(text
, encoding
))
998 _serialize_html(write
, e
, encoding
, qnames
, None)
999 if ltag
not in HTML_EMPTY
:
1000 write("</" + tag
+ ">")
1002 write(_escape_cdata(elem
.tail
, encoding
))
def _serialize_text(write, elem, encoding):
    """Write the text content of *elem*, followed by its tail, if any.

    @param write Callable invoked once per encoded fragment.
    @param elem Element providing ``itertext()`` and a ``tail`` attribute.
    @param encoding Encoding name passed to each fragment's ``encode``.
    """
    for part in elem.itertext():
        write(part.encode(encoding))
    # tail is documented as "either a string or the value None"; skip it
    # when absent or empty instead of calling encode() on None.
    if elem.tail:
        write(elem.tail.encode(encoding))
1011 "xml": _serialize_xml
,
1012 "html": _serialize_html
,
1013 "text": _serialize_text
,
1014 # this optional method is imported at the end of the module
1015 # "c14n": _serialize_c14n,
1019 # Registers a namespace prefix. The registry is global, and any
1020 # existing mapping for either the given prefix or the namespace URI
1023 # @param prefix Namespace prefix.
1024 # @param uri Namespace uri. Tags and attributes in this namespace
1025 # will be serialized with the given prefix, if at all possible.
1026 # @exception ValueError If the prefix is reserved, or is otherwise
def register_namespace(prefix, uri):
    """Register a namespace prefix in the global ``_namespace_map``.

    Any existing mapping for either the given prefix or the given
    namespace URI is removed before the new one is stored.

    @param prefix Namespace prefix.
    @param uri Namespace uri.
    @exception ValueError If the prefix matches the ``ns<digits>`` form,
        which is reserved for internal use.
    """
    if re.match(r"ns\d+$", prefix):
        raise ValueError("Prefix format reserved for internal use")
    # Iterate over a snapshot: entries are deleted inside the loop, and
    # deleting from a dict while iterating its live items view raises
    # RuntimeError on Python 3.
    for k, v in list(_namespace_map.items()):
        if k == uri or v == prefix:
            del _namespace_map[k]
    _namespace_map[uri] = prefix
1038 # "well-known" namespace prefixes
1039 "http://www.w3.org/XML/1998/namespace": "xml",
1040 "http://www.w3.org/1999/xhtml": "html",
1041 "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
1042 "http://schemas.xmlsoap.org/wsdl/": "wsdl",
1044 "http://www.w3.org/2001/XMLSchema": "xs",
1045 "http://www.w3.org/2001/XMLSchema-instance": "xsi",
1047 "http://purl.org/dc/elements/1.1/": "dc",
1050 def _raise_serialization_error(text
):
1052 "cannot serialize %r (type %s)" % (text
, type(text
).__name
__)
def _encode(text, encoding):
    """Encode *text*, replacing unencodable characters by character references.

    @param text The string to encode.
    @param encoding Target encoding name.
    @return The encoded string.
    """
    try:
        return text.encode(encoding, "xmlcharrefreplace")
    except (TypeError, AttributeError):
        # Not a string-like object: report it through the module's
        # serialization-error helper.
        _raise_serialization_error(text)
def _escape_cdata(text, encoding):
    """Escape character data and encode it.

    Replaces ``&``, ``<`` and ``>`` by their predefined entities, then
    encodes the result; characters the encoding cannot represent become
    numeric character references.

    @param text The character data to escape.
    @param encoding Target encoding name.
    @return The escaped, encoded string.
    """
    try:
        # it's worth avoiding do-nothing calls for strings that are
        # shorter than 500 characters, or so.  assume that's, by far,
        # the most common case in most applications.
        if "&" in text:
            # must run first, so the entities added below are not re-escaped
            text = text.replace("&", "&amp;")
        if "<" in text:
            text = text.replace("<", "&lt;")
        if ">" in text:
            text = text.replace(">", "&gt;")
        return text.encode(encoding, "xmlcharrefreplace")
    except (TypeError, AttributeError):
        # Not a string-like object: report it through the module's
        # serialization-error helper.
        _raise_serialization_error(text)
def _escape_attrib(text, encoding):
    """Escape an attribute value and encode it.

    Replaces ``&``, ``<``, ``>`` and the double quote by their predefined
    entities and a newline by a character reference, then encodes the
    result; characters the encoding cannot represent become numeric
    character references.

    @param text The attribute value to escape.
    @param encoding Target encoding name.
    @return The escaped, encoded string.
    """
    try:
        if "&" in text:
            # must run first, so the entities added below are not re-escaped
            text = text.replace("&", "&amp;")
        if "<" in text:
            text = text.replace("<", "&lt;")
        if ">" in text:
            text = text.replace(">", "&gt;")
        if "\"" in text:
            text = text.replace("\"", "&quot;")
        if "\n" in text:
            # A literal newline inside an attribute is normalized to a
            # space by XML parsers; a character reference survives.
            text = text.replace("\n", "&#10;")
        return text.encode(encoding, "xmlcharrefreplace")
    except (TypeError, AttributeError):
        # Not a string-like object: report it through the module's
        # serialization-error helper.
        _raise_serialization_error(text)
def _escape_attrib_html(text, encoding):
    """Escape an attribute value for HTML output and encode it.

    Replaces ``&``, ``>`` and the double quote by their entities (``<`` is
    left untouched), then encodes the result; characters the encoding
    cannot represent become numeric character references.

    @param text The attribute value to escape.
    @param encoding Target encoding name.
    @return The escaped, encoded string.
    """
    try:
        if "&" in text:
            # must run first, so the entities added below are not re-escaped
            text = text.replace("&", "&amp;")
        if ">" in text:
            text = text.replace(">", "&gt;")
        if "\"" in text:
            text = text.replace("\"", "&quot;")
        return text.encode(encoding, "xmlcharrefreplace")
    except (TypeError, AttributeError):
        # Not a string-like object: report it through the module's
        # serialization-error helper.
        _raise_serialization_error(text)
1107 # --------------------------------------------------------------------
1110 # Generates a string representation of an XML element, including all
1113 # @param element An Element instance.
1114 # @keyparam encoding Optional output encoding (default is US-ASCII).
1115 # @keyparam method Optional output method ("xml", "html", "text" or
1116 # "c14n"; default is "xml").
1117 # @return An encoded string containing the XML data.
1120 def tostring(element
, encoding
=None, method
=None):
1125 file.write
= data
.append
1126 ElementTree(element
).write(file, encoding
, method
=method
)
1127 return "".join(data
)
1130 # Generates a string representation of an XML element, including all
1131 # subelements. The string is returned as a sequence of string fragments.
1133 # @param element An Element instance.
1134 # @keyparam encoding Optional output encoding (default is US-ASCII).
1135 # @keyparam method Optional output method ("xml", "html", "text" or
1136 # "c14n"; default is "xml").
1137 # @return A sequence object containing the XML data.
1138 # @defreturn sequence
1141 def tostringlist(element
, encoding
=None, method
=None):
1146 file.write
= data
.append
1147 ElementTree(element
).write(file, encoding
, method
=method
)
1148 # FIXME: merge small fragments into larger parts
##
# Writes an element tree or element structure to sys.stdout.  This
# function should be used for debugging only.
# <p>
# The exact output format is implementation dependent.  In this
# version, it's written as an ordinary XML file.
#
# @param elem An element tree or an individual element.

def dump(elem):
    # debugging
    if not isinstance(elem, ElementTree):
        # wrap a bare element so that write() is available
        elem = ElementTree(elem)
    elem.write(sys.stdout)
    # make sure the output ends with a newline
    tail = elem.getroot().tail
    if not tail or tail[-1] != "\n":
        sys.stdout.write("\n")
# --------------------------------------------------------------------

##
# Parses an XML document into an element tree.
#
# @param source A filename or file object containing XML data.
# @param parser An optional parser instance.  If not given, the
#     standard {@link XMLParser} parser is used.
# @return An ElementTree instance

def parse(source, parser=None):
    tree = ElementTree()
    tree.parse(source, parser)
    return tree
##
# Parses an XML document into an element tree incrementally, and reports
# what's going on to the user.
#
# @param source A filename or file object containing XML data.
# @param events A list of events to report back.  If omitted, only "end"
#     events are reported.
# @param parser An optional parser instance.  If not given, the
#     standard {@link XMLParser} parser is used.
# @return A (event, elem) iterator.

def iterparse(source, events=None, parser=None):
    close_source = False
    if not hasattr(source, "read"):
        # a filename was given: open it here and remember that the
        # iterator is responsible for closing it
        source = open(source, "rb")
        close_source = True
    try:
        if not parser:
            parser = XMLParser(target=TreeBuilder())
        return _IterParseIterator(source, events, parser, close_source)
    except:
        # setup failed (e.g. unknown event name): do not leak a file
        # that was opened by this function
        if close_source:
            source.close()
        raise
1205 class _IterParseIterator(object):
1207 def __init__(self
, source
, events
, parser
, close_source
=False):
1209 self
._close
_file
= close_source
1213 self
.root
= self
._root
= None
1214 self
._parser
= parser
1215 # wire up the parser for event reporting
1216 parser
= self
._parser
._parser
1217 append
= self
._events
.append
1220 for event
in events
:
1221 if event
== "start":
1223 parser
.ordered_attributes
= 1
1224 parser
.specified_attributes
= 1
1225 def handler(tag
, attrib_in
, event
=event
, append
=append
,
1226 start
=self
._parser
._start
_list
):
1227 append((event
, start(tag
, attrib_in
)))
1228 parser
.StartElementHandler
= handler
1229 except AttributeError:
1230 def handler(tag
, attrib_in
, event
=event
, append
=append
,
1231 start
=self
._parser
._start
):
1232 append((event
, start(tag
, attrib_in
)))
1233 parser
.StartElementHandler
= handler
1234 elif event
== "end":
1235 def handler(tag
, event
=event
, append
=append
,
1236 end
=self
._parser
._end
):
1237 append((event
, end(tag
)))
1238 parser
.EndElementHandler
= handler
1239 elif event
== "start-ns":
1240 def handler(prefix
, uri
, event
=event
, append
=append
):
1242 uri
= (uri
or "").encode("ascii")
1243 except UnicodeError:
1245 append((event
, (prefix
or "", uri
or "")))
1246 parser
.StartNamespaceDeclHandler
= handler
1247 elif event
== "end-ns":
1248 def handler(prefix
, event
=event
, append
=append
):
1249 append((event
, None))
1250 parser
.EndNamespaceDeclHandler
= handler
1252 raise ValueError("unknown event %r" % event
)
1257 item
= self
._events
[self
._index
]
1266 if self
._parser
is None:
1267 self
.root
= self
._root
1268 if self
._close
_file
:
1274 data
= self
._file
.read(16384)
1277 self
._parser
.feed(data
)
1278 except SyntaxError as exc
:
1281 self
._root
= self
._parser
.close()
##
# Parses an XML document from a string constant.  This function can
# be used to embed "XML literals" in Python code.
#
# @param source A string containing XML data.
# @param parser An optional parser instance.  If not given, the
#     standard {@link XMLParser} parser is used.
# @return An Element instance.
# @defreturn Element

def XML(text, parser=None):
    if not parser:
        parser = XMLParser(target=TreeBuilder())
    parser.feed(text)
    return parser.close()
##
# Parses an XML document from a string constant, and also returns
# a dictionary which maps from element id:s to elements.
#
# @param source A string containing XML data.
# @param parser An optional parser instance.  If not given, the
#     standard {@link XMLParser} parser is used.
# @return A tuple containing an Element instance and a dictionary.
# @defreturn (Element, dictionary)

def XMLID(text, parser=None):
    if not parser:
        parser = XMLParser(target=TreeBuilder())
    parser.feed(text)
    tree = parser.close()
    ids = {}
    for elem in tree.iter():
        # elements without an "id" attribute (or with an empty one) are
        # left out of the mapping
        elem_id = elem.get("id")
        if elem_id:
            ids[elem_id] = elem
    return tree, ids
##
# Parses an XML document from a string constant.  Same as {@link #XML}.
#
# @def fromstring(text)
# @param source A string containing XML data.
# @return An Element instance.
# @defreturn Element

fromstring = XML
##
# Parses an XML document from a sequence of string fragments.
#
# @param sequence A list or other sequence containing XML data fragments.
# @param parser An optional parser instance.  If not given, the
#     standard {@link XMLParser} parser is used.
# @return An Element instance.
# @defreturn Element

def fromstringlist(sequence, parser=None):
    if not parser:
        parser = XMLParser(target=TreeBuilder())
    # the fragments are fed one at a time, in order
    for text in sequence:
        parser.feed(text)
    return parser.close()
# --------------------------------------------------------------------

##
# Generic element structure builder.  This builder converts a sequence
# of {@link #TreeBuilder.start}, {@link #TreeBuilder.data}, and {@link
# #TreeBuilder.end} method calls to a well-formed element structure.
# <p>
# You can use this class to build an element structure using a custom XML
# parser, or a parser for some other XML-like format.
#
# @param element_factory Optional element factory.  This factory
#    is called to create new Element instances, as necessary.

class TreeBuilder(object):

    def __init__(self, element_factory=None):
        self._data = [] # data collector
        self._elem = [] # element stack
        self._last = None # last element
        self._tail = None # true if we're after an end tag
        if element_factory is None:
            element_factory = Element
        self._factory = element_factory

    ##
    # Flushes the builder buffers, and returns the toplevel document
    # element.
    #
    # @return An Element instance.
    # @defreturn Element

    def close(self):
        assert len(self._elem) == 0, "missing end tags"
        assert self._last is not None, "missing toplevel element"
        return self._last

    # Attaches the collected character data to the most recent element:
    # as its tail when an end tag was seen last, otherwise as its text.
    # Data collected before any element exists is discarded.

    def _flush(self):
        if self._data:
            if self._last is not None:
                text = "".join(self._data)
                if self._tail:
                    assert self._last.tail is None, "internal error (tail)"
                    self._last.tail = text
                else:
                    assert self._last.text is None, "internal error (text)"
                    self._last.text = text
            self._data = []

    ##
    # Adds text to the current element.
    #
    # @param data A string.  This should be either an 8-bit string
    #    containing ASCII text, or a Unicode string.

    def data(self, data):
        self._data.append(data)

    ##
    # Opens a new element.
    #
    # @param tag The element name.
    # @param attrib A dictionary containing element attributes.
    # @return The opened element.
    # @defreturn Element

    def start(self, tag, attrs):
        self._flush()
        self._last = elem = self._factory(tag, attrs)
        if self._elem:
            # attach to the element that is currently open
            self._elem[-1].append(elem)
        self._elem.append(elem)
        self._tail = None
        return elem

    ##
    # Closes the current element.
    #
    # @param tag The element name.
    # @return The closed element.
    # @defreturn Element

    def end(self, tag):
        self._flush()
        self._last = self._elem.pop()
        assert self._last.tag == tag,\
               "end tag mismatch (expected %s, got %s)" % (
                   self._last.tag, tag)
        self._tail = 1
        return self._last
##
# Element structure builder for XML source data, based on the
# <b>expat</b> parser.
#
# @keyparam target Target object.  If omitted, the builder uses an
#     instance of the standard {@link #TreeBuilder} class.
# @keyparam html Predefine HTML entities.  This flag is not supported
#     by the current implementation.
# @keyparam encoding Optional encoding.  If given, the value overrides
#     the encoding specified in the XML file.

class XMLParser(object):

    def __init__(self, html=0, target=None, encoding=None):
        try:
            from xml.parsers import expat
        except ImportError:
            try:
                import pyexpat as expat
            except ImportError:
                raise ImportError(
                    "No module named expat; use SimpleXMLTreeBuilder instead"
                    )
        # "}" is the namespace separator: expat reports qualified names
        # as "uri}local", which _fixname() turns into "{uri}local"
        parser = expat.ParserCreate(encoding, "}")
        if target is None:
            target = TreeBuilder()
        # underscored names are provided for compatibility only
        self.parser = self._parser = parser
        self.target = self._target = target
        self._error = expat.error
        self._names = {} # name memo cache
        # callbacks
        parser.DefaultHandlerExpand = self._default
        parser.StartElementHandler = self._start
        parser.EndElementHandler = self._end
        parser.CharacterDataHandler = self._data
        # optional callbacks
        parser.CommentHandler = self._comment
        parser.ProcessingInstructionHandler = self._pi
        # let expat do the buffering, if supported
        try:
            self._parser.buffer_text = 1
        except AttributeError:
            pass
        # use new-style attribute handling, if supported
        try:
            self._parser.ordered_attributes = 1
            self._parser.specified_attributes = 1
            parser.StartElementHandler = self._start_list
        except AttributeError:
            pass
        self._doctype = None
        self.entity = {}
        try:
            self.version = "Expat %d.%d.%d" % expat.version_info
        except AttributeError:
            pass # unknown

    # Converts an expat error into a ParseError that carries the error
    # code and the (line, column) position, and raises it.

    def _raiseerror(self, value):
        err = ParseError(value)
        err.code = value.code
        err.position = value.lineno, value.offset
        raise err

    def _fixtext(self, text):
        # convert text string to ascii, if possible
        try:
            return text.encode("ascii")
        except UnicodeError:
            return text

    def _fixname(self, key):
        # expand qname, and convert name string to ascii, if possible
        try:
            name = self._names[key]
        except KeyError:
            name = key
            if "}" in name:
                name = "{" + name
            self._names[key] = name = self._fixtext(name)
        return name

    # Start-tag handler for dictionary-style attributes.

    def _start(self, tag, attrib_in):
        fixname = self._fixname
        fixtext = self._fixtext
        tag = fixname(tag)
        attrib = {}
        for key, value in attrib_in.items():
            attrib[fixname(key)] = fixtext(value)
        return self.target.start(tag, attrib)

    # Start-tag handler for list-style attributes, where names and
    # values alternate: [name, value, name, value, ...].

    def _start_list(self, tag, attrib_in):
        fixname = self._fixname
        fixtext = self._fixtext
        tag = fixname(tag)
        attrib = {}
        if attrib_in:
            for i in range(0, len(attrib_in), 2):
                attrib[fixname(attrib_in[i])] = fixtext(attrib_in[i+1])
        return self.target.start(tag, attrib)

    def _data(self, text):
        return self.target.data(self._fixtext(text))

    def _end(self, tag):
        return self.target.end(self._fixname(tag))

    def _comment(self, data):
        # comments are forwarded only if the target accepts them
        try:
            comment = self.target.comment
        except AttributeError:
            pass
        else:
            return comment(self._fixtext(data))

    def _pi(self, target, data):
        # processing instructions are forwarded only if the target
        # accepts them
        try:
            pi = self.target.pi
        except AttributeError:
            pass
        else:
            return pi(self._fixtext(target), self._fixtext(data))

    # Default handler: resolves entity references through the "entity"
    # dictionary, and collects the pieces of a doctype declaration so
    # that the doctype hook can be called.

    def _default(self, text):
        prefix = text[:1]
        if prefix == "&":
            # deal with undefined entities
            try:
                self.target.data(self.entity[text[1:-1]])
            except KeyError:
                from xml.parsers import expat
                err = expat.error(
                    "undefined entity %s: line %d, column %d" %
                    (text, self._parser.ErrorLineNumber,
                    self._parser.ErrorColumnNumber)
                    )
                err.code = 11 # XML_ERROR_UNDEFINED_ENTITY
                err.lineno = self._parser.ErrorLineNumber
                err.offset = self._parser.ErrorColumnNumber
                raise err
        elif prefix == "<" and text[:9] == "<!DOCTYPE":
            self._doctype = [] # inside a doctype declaration
        elif self._doctype is not None:
            # parse doctype contents
            if prefix == ">":
                self._doctype = None
                return
            text = text.strip()
            if not text:
                return
            self._doctype.append(text)
            n = len(self._doctype)
            if n > 2:
                kind = self._doctype[1]
                if kind == "PUBLIC" and n == 4:
                    name, kind, pubid, system = self._doctype
                elif kind == "SYSTEM" and n == 3:
                    name, kind, system = self._doctype
                    pubid = None
                else:
                    return
                if pubid:
                    # strip the surrounding quotes
                    pubid = pubid[1:-1]
                if hasattr(self.target, "doctype"):
                    self.target.doctype(name, pubid, system[1:-1])
                elif self.doctype is not self._XMLParser__doctype:
                    # warn about deprecated call
                    self._XMLParser__doctype(name, pubid, system[1:-1])
                    self.doctype(name, pubid, system[1:-1])
                self._doctype = None

    ##
    # (Deprecated) Handles a doctype declaration.
    #
    # @param name Doctype name.
    # @param pubid Public identifier.
    # @param system System identifier.

    def doctype(self, name, pubid, system):
        """This method of XMLParser is deprecated."""
        import warnings
        warnings.warn(
            "This method of XMLParser is deprecated.  Define doctype() "
            "method on the TreeBuilder target.",
            DeprecationWarning,
            )

    # sentinel, if doctype is redefined in a subclass
    __doctype = doctype

    ##
    # Feeds data to the parser.
    #
    # @param data Encoded data.

    def feed(self, data):
        try:
            self._parser.Parse(data, 0)
        except self._error as v:
            self._raiseerror(v)

    ##
    # Finishes feeding data to the parser.
    #
    # @return An element structure.
    # @defreturn Element

    def close(self):
        try:
            self._parser.Parse("", 1) # end of data
        except self._error as v:
            self._raiseerror(v)
        tree = self.target.close()
        del self.target, self._parser # get rid of circular references
        return tree
# alternative name for the same class
XMLTreeBuilder = XMLParser
1662 # workaround circular import.
1664 from ElementC14N
import _serialize_c14n
1665 _serialize
["c14n"] = _serialize_c14n