3 # $Id: ElementTree.py 3440 2008-07-18 14:45:01Z fredrik $
5 # light-weight XML support for Python 2.3 and later.
7 # history (since 1.2.6):
8 # 2005-11-12 fl added tostringlist/fromstringlist helpers
9 # 2006-07-05 fl merged in selected changes from the 1.3 sandbox
10 # 2006-07-05 fl removed support for 2.1 and earlier
11 # 2007-06-21 fl added deprecation/future warnings
12 # 2007-08-25 fl added doctype hook, added parser version attribute etc
13 # 2007-08-26 fl added new serializer code (better namespace handling, etc)
14 # 2007-08-27 fl warn for broken /tag searches on tree level
15 # 2007-09-02 fl added html/text methods to serializer (experimental)
16 # 2007-09-05 fl added method argument to tostring/tostringlist
17 # 2007-09-06 fl improved error handling
18 # 2007-09-13 fl added itertext, iterfind; assorted cleanups
19 # 2007-12-15 fl added C14N hooks, copy method (experimental)
21 # Copyright (c) 1999-2008 by Fredrik Lundh. All rights reserved.
23 # fredrik@pythonware.com
24 # http://www.pythonware.com
26 # --------------------------------------------------------------------
27 # The ElementTree toolkit is
29 # Copyright (c) 1999-2008 by Fredrik Lundh
31 # By obtaining, using, and/or copying this software and/or its
32 # associated documentation, you agree that you have read, understood,
33 # and will comply with the following terms and conditions:
35 # Permission to use, copy, modify, and distribute this software and
36 # its associated documentation for any purpose and without fee is
37 # hereby granted, provided that the above copyright notice appears in
38 # all copies, and that both that copyright notice and this permission
39 # notice appear in supporting documentation, and that the name of
40 # Secret Labs AB or the author not be used in advertising or publicity
41 # pertaining to distribution of the software without specific, written
44 # SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD
45 # TO THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANT-
46 # ABILITY AND FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR
47 # BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY
48 # DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
49 # WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
50 # ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
52 # --------------------------------------------------------------------
54 # Licensed to PSF under a Contributor Agreement.
55 # See http://www.python.org/psf/license for licensing details.
61 "Element", "ElementTree",
62 "fromstring", "fromstringlist",
63 "iselement", "iterparse",
64 "parse", "ParseError",
65 "PI", "ProcessingInstruction",
68 "tostring", "tostringlist",
72 "XMLParser", "XMLTreeBuilder",
78 # The <b>Element</b> type is a flexible container object, designed to
79 # store hierarchical data structures in memory. The type can be
80 # described as a cross between a list and a dictionary.
82 # Each element has a number of properties associated with it:
84 # <li>a <i>tag</i>. This is a string identifying what kind of data
85 # this element represents (the element type, in other words).</li>
86 # <li>a number of <i>attributes</i>, stored in a Python dictionary.</li>
87 # <li>a <i>text</i> string.</li>
88 # <li>an optional <i>tail</i> string.</li>
89 # <li>a number of <i>child elements</i>, stored in a Python sequence</li>
92 # To create an element instance, use the {@link #Element} constructor
93 # or the {@link #SubElement} factory function.
95 # The {@link #ElementTree} class can be used to wrap an element
96 # structure, and convert it from and to XML.
class _SimpleElementPath(object):
    """Minimal stand-in used when the ElementPath module is unavailable.

    Emulates the pre-1.2 find/findtext/findall behaviour: tags are matched
    against direct children only, except that iterfind() also understands
    a leading ".//" and then walks the whole subtree.  The ``namespaces``
    arguments exist for interface compatibility and are not consulted.
    """

    def find(self, element, tag, namespaces=None):
        """Return the first direct child of *element* tagged *tag*, or None."""
        for child in element:
            if child.tag == tag:
                return child
        return None

    def findtext(self, element, tag, default=None, namespaces=None):
        """Return the text of the first matching child.

        Returns *default* when no child matches, and "" when the child
        exists but has no text.
        """
        match = self.find(element, tag)
        if match is None:
            return default
        return match.text or ""

    def iterfind(self, element, tag, namespaces=None):
        """Yield matching elements in document order."""
        if tag[:3] == ".//":
            # descendant search: delegate to the element's own tree iterator
            for node in element.iter(tag[3:]):
                yield node
        for child in element:
            if child.tag == tag:
                yield child

    def findall(self, element, tag, namespaces=None):
        """Return a list with everything iterfind() yields."""
        return list(self.iterfind(element, tag, namespaces))

try:
    from . import ElementPath
except ImportError:
    # no full path engine available; fall back to the simple emulation
    ElementPath = _SimpleElementPath()
132 # Parser error. This is a subclass of <b>SyntaxError</b>.
134 # In addition to the exception value, an exception instance contains a
135 # specific exception code in the <b>code</b> attribute, and the line and
136 # column of the error in the <b>position</b> attribute.
class ParseError(SyntaxError):
    """Parser error; a subclass of SyntaxError.

    In addition to the exception value, an instance carries a specific
    exception code in its ``code`` attribute and the line and column of
    the error in its ``position`` attribute.
    """
    pass
141 # --------------------------------------------------------------------
144 # Checks if an object appears to be a valid element object.
# @param element An element instance.
147 # @return A true value if this is an element object.
def iselement(element):
    """Return a true value if *element* appears to be an element object.

    An object qualifies when it is an Element instance or simply exposes
    a ``tag`` attribute.
    """
    # FIXME: not sure about this; might be a better idea to look
    # for tag/attrib/text attributes
    if isinstance(element, Element):
        return True
    return hasattr(element, "tag")
156 # Element class. This class defines the Element interface, and
157 # provides a reference implementation of this interface.
159 # The element name, attribute names, and attribute values can be
160 # either ASCII strings (ordinary Python strings containing only 7-bit
161 # ASCII characters) or Unicode strings.
163 # @param tag The element name.
164 # @param attrib An optional dictionary, containing element attributes.
165 # @param **extra Additional attributes, given as keyword arguments.
169 # @see ProcessingInstruction
class Element(object):
    """Reference implementation of the Element interface.

    An element models ``<tag attrib>text<child/>...</tag>tail``: it has a
    tag, an attribute dictionary, optional text and tail strings, and an
    ordered sequence of child elements.

    tag    -- the element name.
    attrib -- optional dictionary of attributes; it is copied, so the
              caller's dictionary is never modified.
    extra  -- additional attributes given as keyword arguments.
    """

    # (Attribute) Element tag.
    tag = None

    # (Attribute) Element attribute dictionary.  Where possible, use
    # get(), set(), keys() and items() to access element attributes.
    attrib = None

    # (Attribute) Text before the first subelement: a string or None.
    # With no text this may be None or "", depending on the parser.
    text = None

    # (Attribute) Text after this element's end tag but before the next
    # sibling's start tag: a string or None.  With no text this may be
    # None or "", depending on the parser.
    tail = None  # text after end tag, if any

    def __init__(self, tag, attrib={}, **extra):
        # work on a private copy; the shared default dict is never mutated
        attrib = attrib.copy()
        attrib.update(extra)
        self.tag = tag
        self.attrib = attrib
        self._children = []

    def __repr__(self):
        return "<Element %s at 0x%x>" % (repr(self.tag), id(self))

    def makeelement(self, tag, attrib):
        """Create a new element of the same type as this element."""
        return self.__class__(tag, attrib)

    def copy(self):
        """(Experimental) Return a shallow copy of this element.

        Subelements are shared with the original tree.
        """
        elem = self.makeelement(self.tag, self.attrib)
        elem.text = self.text
        elem.tail = self.tail
        elem[:] = self
        return elem

    def __len__(self):
        """Return the number of subelements (text is not counted)."""
        return len(self._children)

    def __nonzero__(self):
        warnings.warn(
            "The behavior of this method will change in future versions. "
            "Use specific 'len(elem)' or 'elem is not None' test instead.",
            FutureWarning, stacklevel=2
            )
        return len(self._children) != 0  # emulate old behaviour, for now

    def __getitem__(self, index):
        """Return the subelement at *index*; IndexError if out of range."""
        return self._children[index]

    def __setitem__(self, index, element):
        """Replace the subelement (or slice of subelements) at *index*."""
        self._children[index] = element

    def __delitem__(self, index):
        """Delete the subelement at *index*."""
        del self._children[index]

    def append(self, element):
        """Add *element* after the last existing subelement."""
        self._children.append(element)

    def extend(self, elements):
        """Append all subelements from the sequence *elements*."""
        self._children.extend(elements)

    def insert(self, index, element):
        """Insert *element* at position *index*."""
        self._children.insert(index, element)

    def remove(self, element):
        """Remove the matching subelement.

        Delegates to list.remove(); raises ValueError if no matching
        subelement is present.
        """
        self._children.remove(element)

    def getchildren(self):
        """(Deprecated) Return the list of subelements in document order."""
        warnings.warn(
            "This method will be removed in future versions. "
            "Use 'list(elem)' or iteration over elem instead.",
            DeprecationWarning, stacklevel=2
            )
        return self._children

    def find(self, path, namespaces=None):
        """Return the first subelement matching *path*, or None."""
        return ElementPath.find(self, path, namespaces)

    def findtext(self, path, default=None, namespaces=None):
        """Return the text of the first subelement matching *path*.

        Returns *default* when nothing matches.
        """
        return ElementPath.findtext(self, path, default, namespaces)

    def findall(self, path, namespaces=None):
        """Return all subelements matching *path*, in document order."""
        return ElementPath.findall(self, path, namespaces)

    def iterfind(self, path, namespaces=None):
        """Return an iterator over all subelements matching *path*."""
        return ElementPath.iterfind(self, path, namespaces)

    def clear(self):
        """Remove all subelements and attributes; reset text and tail."""
        self.attrib.clear()
        self._children = []
        self.text = self.tail = None

    def get(self, key, default=None):
        """Return attribute *key*, or *default* if it is not set."""
        return self.attrib.get(key, default)

    def set(self, key, value):
        """Set attribute *key* to *value*."""
        self.attrib[key] = value

    def keys(self):
        """Return the attribute names (same as attrib.keys())."""
        return self.attrib.keys()

    def items(self):
        """Return the attributes as (name, value) pairs (attrib.items())."""
        return self.attrib.items()

    def iter(self, tag=None):
        """Iterate over this element and all subelements in document order.

        Only elements whose tag equals *tag* are produced; None or "*"
        selects every element.
        """
        if tag == "*":
            tag = None
        if tag is None or self.tag == tag:
            yield self
        for child in self._children:
            for node in child.iter(tag):
                yield node

    # compatibility
    def getiterator(self, tag=None):
        """Return list(self.iter(tag)); slated for removal."""
        # Change for a DeprecationWarning in 1.4
        warnings.warn(
            "This method will be removed in future versions. "
            "Use 'elem.iter()' or 'list(elem.iter())' instead.",
            PendingDeprecationWarning, stacklevel=2
            )
        return list(self.iter(tag))

    def itertext(self):
        """Iterate over all inner text, in document order.

        Produces nothing for elements whose tag is neither a string nor
        None (for example comment and processing-instruction elements).
        """
        tag = self.tag
        if not isinstance(tag, basestring) and tag is not None:
            return
        if self.text:
            yield self.text
        for child in self:
            for chunk in child.itertext():
                yield chunk
            if child.tail:
                yield child.tail

# compatibility
_Element = _ElementInterface = Element
514 # Subelement factory. This function creates an element instance, and
515 # appends it to an existing element.
517 # The element name, attribute names, and attribute values can be
518 # either 8-bit ASCII strings or Unicode strings.
520 # @param parent The parent element.
521 # @param tag The subelement name.
522 # @param attrib An optional dictionary, containing element attributes.
523 # @param **extra Additional attributes, given as keyword arguments.
524 # @return An element instance.
def SubElement(parent, tag, attrib={}, **extra):
    """Create an element and append it to *parent*.

    parent -- the parent element; supplies makeelement() and append().
    tag    -- the subelement name.
    attrib -- optional attribute dictionary (copied, never modified).
    extra  -- additional attributes given as keyword arguments.

    Returns the newly created element.
    """
    merged = attrib.copy()
    merged.update(extra)
    element = parent.makeelement(tag, merged)
    parent.append(element)
    return element
535 # Comment element factory. This factory function creates a special
536 # element that will be serialized as an XML comment by the standard
539 # The comment string can be either an 8-bit ASCII string or a Unicode
542 # @param text A string containing the comment string.
543 # @return An element instance, representing a comment.
def Comment(text=None):
    """Create a comment element.

    The element uses this factory function itself as its tag, which is
    what the serializers test for in order to emit an XML comment.

    text -- the comment string.
    """
    element = Element(Comment)
    element.text = text
    return element
552 # PI element factory. This factory function creates a special element
553 # that will be serialized as an XML processing instruction by the standard
556 # @param target A string containing the PI target.
557 # @param text A string containing the PI contents, if any.
558 # @return An element instance, representing a PI.
def ProcessingInstruction(target, text=None):
    """Create a processing-instruction element.

    The element uses this factory function itself as its tag; its text
    holds the target, followed by a space and *text* when text is given.

    target -- the PI target.
    text   -- the PI contents, if any.
    """
    element = Element(ProcessingInstruction)
    element.text = target
    if text:
        element.text = element.text + " " + text
    return element

PI = ProcessingInstruction
571 # QName wrapper. This can be used to wrap a QName attribute value, in
572 # order to get proper namespace handling on output.
574 # @param text A string containing the QName value, in the form {uri}local,
575 # or, if the tag argument is given, the URI part of a QName.
576 # @param tag Optional tag. If given, the first argument is interpreted as
577 # an URI, and this argument is interpreted as a local name.
578 # @return An opaque object, representing the QName.
class QName(object):
    """Wrapper for a qualified name, stored as "{uri}local" in ``text``.

    text_or_uri -- the full "{uri}local" name or, when *tag* is given,
                   just the URI part.
    tag         -- optional local name; combined with the URI when given.

    Instances hash and compare like their ``text``, both against other
    QName objects and against plain strings.
    """

    def __init__(self, text_or_uri, tag=None):
        if tag:
            text_or_uri = "{%s}%s" % (text_or_uri, tag)
        self.text = text_or_uri

    def __str__(self):
        return self.text

    def __hash__(self):
        return hash(self.text)

    def __cmp__(self, other):
        # three-way comparison, used by interpreters that support it
        if isinstance(other, QName):
            return cmp(self.text, other.text)
        return cmp(self.text, other)

    # Rich comparisons mirror __cmp__ so that equality and ordering also
    # work on interpreters that do not consult __cmp__.
    def __eq__(self, other):
        if isinstance(other, QName):
            return self.text == other.text
        return self.text == other

    def __ne__(self, other):
        return not self.__eq__(other)

    def __lt__(self, other):
        if isinstance(other, QName):
            return self.text < other.text
        return self.text < other

    def __le__(self, other):
        if isinstance(other, QName):
            return self.text <= other.text
        return self.text <= other

    def __gt__(self, other):
        if isinstance(other, QName):
            return self.text > other.text
        return self.text > other

    def __ge__(self, other):
        if isinstance(other, QName):
            return self.text >= other.text
        return self.text >= other
594 # --------------------------------------------------------------------
597 # ElementTree wrapper class. This class represents an entire element
598 # hierarchy, and adds some extra support for serialization to and from
601 # @param element Optional root element.
602 # @keyparam file Optional file handle or file name. If given, the
603 # tree is initialized with the contents of this XML file.
class ElementTree(object):
    """Wrapper for a whole element hierarchy, with XML load/save support.

    element -- optional root element.
    file    -- optional file handle or file name; when given, the tree is
               initialised by parsing this XML file.
    """

    def __init__(self, element=None, file=None):
        # assert element is None or iselement(element)
        self._root = element  # first node
        if file:
            self.parse(file)

    def getroot(self):
        """Return the root element of this tree."""
        return self._root

    def _setroot(self, element):
        """Replace the root element, discarding the current contents."""
        # assert iselement(element)
        self._root = element

    def parse(self, source, parser=None):
        """Load an external XML document into this tree.

        source -- a file name, or a file object implementing read(n).
        parser -- optional parser instance; when omitted, an XMLParser
                  feeding a TreeBuilder is used.

        Returns the document root element.  A file opened here from a
        file name is closed again, even when parsing fails.
        """
        opened_here = False
        if not hasattr(source, "read"):
            source = open(source, "rb")
            opened_here = True
        try:
            if not parser:
                parser = XMLParser(target=TreeBuilder())
            while 1:
                data = source.read(65536)
                if not data:
                    break
                parser.feed(data)
            self._root = parser.close()
            return self._root
        finally:
            if opened_here:
                source.close()

    def iter(self, tag=None):
        """Iterate over the elements of the tree in document order.

        tag -- which tags to look for (default: all elements).
        """
        # assert self._root is not None
        return self._root.iter(tag)

    # compatibility
    def getiterator(self, tag=None):
        """Return list(self.iter(tag)); slated for removal."""
        # Change for a DeprecationWarning in 1.4
        warnings.warn(
            "This method will be removed in future versions. "
            "Use 'tree.iter()' or 'list(tree.iter())' instead.",
            PendingDeprecationWarning, stacklevel=2
            )
        return list(self.iter(tag))

    def find(self, path, namespaces=None):
        """Same as getroot().find(path).

        A path with a leading "/" is rewritten to start with "./" and a
        FutureWarning is issued.
        """
        # assert self._root is not None
        if path[:1] == "/":
            path = "." + path
            warnings.warn(
                "This search is broken in 1.3 and earlier, and will be "
                "fixed in a future version. If you rely on the current "
                "behaviour, change it to %r" % path,
                FutureWarning, stacklevel=2
                )
        return self._root.find(path, namespaces)

    def findtext(self, path, default=None, namespaces=None):
        """Same as getroot().findtext(path, default).

        A path with a leading "/" is rewritten to start with "./" and a
        FutureWarning is issued.
        """
        # assert self._root is not None
        if path[:1] == "/":
            path = "." + path
            warnings.warn(
                "This search is broken in 1.3 and earlier, and will be "
                "fixed in a future version. If you rely on the current "
                "behaviour, change it to %r" % path,
                FutureWarning, stacklevel=2
                )
        return self._root.findtext(path, default, namespaces)

    def findall(self, path, namespaces=None):
        """Same as getroot().findall(path).

        A path with a leading "/" is rewritten to start with "./" and a
        FutureWarning is issued.
        """
        # assert self._root is not None
        if path[:1] == "/":
            path = "." + path
            warnings.warn(
                "This search is broken in 1.3 and earlier, and will be "
                "fixed in a future version. If you rely on the current "
                "behaviour, change it to %r" % path,
                FutureWarning, stacklevel=2
                )
        return self._root.findall(path, namespaces)

    def iterfind(self, path, namespaces=None):
        """Same as getroot().iterfind(path).

        A path with a leading "/" is rewritten to start with "./" and a
        FutureWarning is issued.
        """
        # assert self._root is not None
        if path[:1] == "/":
            path = "." + path
            warnings.warn(
                "This search is broken in 1.3 and earlier, and will be "
                "fixed in a future version. If you rely on the current "
                "behaviour, change it to %r" % path,
                FutureWarning, stacklevel=2
                )
        return self._root.iterfind(path, namespaces)

    def write(self, file_or_filename,
              # keyword arguments
              encoding=None,
              xml_declaration=None,
              default_namespace=None,
              method=None):
        """Write the element tree to a file.

        file_or_filename  -- a file name, or an object with a write()
                             method.
        encoding          -- output encoding; when omitted, "utf-8" for
                             the "c14n" method and "us-ascii" otherwise.
        xml_declaration   -- with an explicit encoding and the "xml"
                             method, a declaration is written if this is
                             true, or if it is None and the encoding is
                             neither "utf-8" nor "us-ascii".
        default_namespace -- optional default namespace URI.
        method            -- a key of the serializer table (default
                             "xml"); unknown methods raise ValueError.

        A file opened here from a file name is closed again, even when
        serialization fails.
        """
        # assert self._root is not None
        if not method:
            method = "xml"
        elif method not in _serialize:
            # FIXME: raise an ImportError for c14n if ElementC14N is missing?
            raise ValueError("unknown method %r" % method)
        if hasattr(file_or_filename, "write"):
            stream = file_or_filename
        else:
            stream = open(file_or_filename, "wb")
        try:
            write = stream.write
            if not encoding:
                if method == "c14n":
                    encoding = "utf-8"
                else:
                    encoding = "us-ascii"
            elif xml_declaration or (xml_declaration is None and
                                     encoding not in ("utf-8", "us-ascii")):
                if method == "xml":
                    write("<?xml version='1.0' encoding='%s'?>\n" % encoding)
            if method == "text":
                _serialize_text(write, self._root, encoding)
            else:
                qnames, namespaces = _namespaces(
                    self._root, encoding, default_namespace
                    )
                serialize = _serialize[method]
                serialize(write, self._root, encoding, qnames, namespaces)
        finally:
            # only close what was opened here, never a caller's object
            if file_or_filename is not stream:
                stream.close()

    def write_c14n(self, file):
        """lxml.etree compatibility; same as write(file, method="c14n")."""
        # lxml.etree compatibility.  use output method instead
        return self.write(file, method="c14n")
823 # --------------------------------------------------------------------
824 # serialization support
def _namespaces(elem, encoding, default_namespace=None):
    """Identify the namespaces used in the tree rooted at *elem*.

    Returns a (qnames, namespaces) pair: *qnames* maps every tag,
    attribute name and QName value to its *encoded* "prefix:local"
    spelling, and *namespaces* maps each namespace URI to its prefix.

    Raises ValueError when *default_namespace* is set and an unqualified
    name is encountered.
    """
    # maps qnames to *encoded* prefix:local names
    qnames = {None: None}

    # maps uri:s to prefixes
    namespaces = {}
    if default_namespace:
        namespaces[default_namespace] = ""

    def encode(text):
        return text.encode(encoding)

    def add_qname(qname):
        # calculate serialized qname representation
        try:
            if qname[:1] == "{":
                uri, tag = qname[1:].rsplit("}", 1)
                prefix = namespaces.get(uri)
                if prefix is None:
                    prefix = _namespace_map.get(uri)
                    if prefix is None:
                        # invent a prefix of the reserved "nsN" form
                        prefix = "ns%d" % len(namespaces)
                    if prefix != "xml":
                        namespaces[uri] = prefix
                if prefix:
                    qnames[qname] = encode("%s:%s" % (prefix, tag))
                else:
                    qnames[qname] = encode(tag)  # default element
            else:
                if default_namespace:
                    # FIXME: can this be handled in XML 1.0?
                    raise ValueError(
                        "cannot use non-qualified names with "
                        "default_namespace option"
                        )
                qnames[qname] = encode(qname)
        except TypeError:
            _raise_serialization_error(qname)

    # populate qname and namespaces table
    try:
        iterate = elem.iter
    except AttributeError:
        iterate = elem.getiterator  # cET compatibility
    for elem in iterate():
        tag = elem.tag
        if isinstance(tag, QName):
            if tag.text not in qnames:
                add_qname(tag.text)
        elif isinstance(tag, basestring):
            if tag not in qnames:
                add_qname(tag)
        elif tag is not None and tag is not Comment and tag is not PI:
            _raise_serialization_error(tag)
        for key, value in elem.items():
            if isinstance(key, QName):
                key = key.text
            if key not in qnames:
                add_qname(key)
            if isinstance(value, QName) and value.text not in qnames:
                add_qname(value.text)
        text = elem.text
        if isinstance(text, QName) and text.text not in qnames:
            add_qname(text.text)
    return qnames, namespaces
def _serialize_xml(write, elem, encoding, qnames, namespaces):
    """Serialize *elem* and its subtree as XML through *write*.

    write      -- callable receiving each output fragment.
    elem       -- the element to serialize.
    encoding   -- output encoding.
    qnames     -- maps tags and attribute names to their serialized
                  names, as built by _namespaces().
    namespaces -- uri-to-prefix map, declared on this element only;
                  children are serialized with None.
    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % _encode(text, encoding))
    elif tag is ProcessingInstruction:
        write("<?%s?>" % _encode(text, encoding))
    else:
        tag = qnames[tag]
        if tag is None:
            # tag-less element: emit its content only
            if text:
                write(_escape_cdata(text, encoding))
            for e in elem:
                _serialize_xml(write, e, encoding, qnames, None)
        else:
            write("<" + tag)
            items = elem.items()
            if items or namespaces:
                if namespaces:
                    for v, k in sorted(namespaces.items(),
                                       key=lambda x: x[1]):  # sort on prefix
                        if k:
                            k = ":" + k
                        write(" xmlns%s=\"%s\"" % (
                            k.encode(encoding),
                            _escape_attrib(v, encoding)
                            ))
                for k, v in sorted(items):  # lexical order
                    if isinstance(k, QName):
                        k = k.text
                    if isinstance(v, QName):
                        v = qnames[v.text]
                    else:
                        v = _escape_attrib(v, encoding)
                    write(" %s=\"%s\"" % (qnames[k], v))
            if text or len(elem):
                write(">")
                if text:
                    write(_escape_cdata(text, encoding))
                for e in elem:
                    _serialize_xml(write, e, encoding, qnames, None)
                write("</" + tag + ">")
            else:
                write(" />")
    if elem.tail:
        write(_escape_cdata(elem.tail, encoding))
# HTML elements that are written without an end tag by the HTML
# serializer.  Each name needs its own comma: adjacent string literals
# are concatenated, which would fuse "meta" and "param" into "metaparam".
HTML_EMPTY = ("area", "base", "basefont", "br", "col", "frame", "hr",
              "img", "input", "isindex", "link", "meta", "param")

try:
    HTML_EMPTY = set(HTML_EMPTY)
except NameError:
    # no built-in set type available; keep the tuple
    pass
def _serialize_html(write, elem, encoding, qnames, namespaces):
    """Serialize *elem* and its subtree as HTML through *write*.

    Takes the same arguments as _serialize_xml().  Differences visible
    here: a start tag is always closed with ">", script and style text is
    written unescaped, and elements listed in HTML_EMPTY get no end tag.
    """
    tag = elem.tag
    text = elem.text
    if tag is Comment:
        write("<!--%s-->" % _escape_cdata(text, encoding))
    elif tag is ProcessingInstruction:
        write("<?%s?>" % _escape_cdata(text, encoding))
    else:
        tag = qnames[tag]
        if tag is None:
            # tag-less element: emit its content only
            if text:
                write(_escape_cdata(text, encoding))
            for e in elem:
                _serialize_html(write, e, encoding, qnames, None)
        else:
            write("<" + tag)
            items = elem.items()
            if items or namespaces:
                if namespaces:
                    for v, k in sorted(namespaces.items(),
                                       key=lambda x: x[1]):  # sort on prefix
                        if k:
                            k = ":" + k
                        write(" xmlns%s=\"%s\"" % (
                            k.encode(encoding),
                            _escape_attrib(v, encoding)
                            ))
                for k, v in sorted(items):  # lexical order
                    if isinstance(k, QName):
                        k = k.text
                    if isinstance(v, QName):
                        v = qnames[v.text]
                    else:
                        v = _escape_attrib_html(v, encoding)
                    # FIXME: handle boolean attributes
                    write(" %s=\"%s\"" % (qnames[k], v))
            write(">")
            tag = tag.lower()
            if text:
                if tag == "script" or tag == "style":
                    write(_encode(text, encoding))
                else:
                    write(_escape_cdata(text, encoding))
            for e in elem:
                _serialize_html(write, e, encoding, qnames, None)
            if tag not in HTML_EMPTY:
                write("</" + tag + ">")
    if elem.tail:
        write(_escape_cdata(elem.tail, encoding))
def _serialize_text(write, elem, encoding):
    """Write the text content of *elem* through *write*, encoded.

    Emits every fragment produced by elem.itertext(), followed by the
    element's own tail when it has one.
    """
    for part in elem.itertext():
        write(part.encode(encoding))
    if elem.tail:
        write(elem.tail.encode(encoding))
# Output method name -> serializer function, as looked up by
# ElementTree.write().
_serialize = {
    "xml": _serialize_xml,
    "html": _serialize_html,
    "text": _serialize_text,
# this optional method is imported at the end of the module
#   "c14n": _serialize_c14n,
}
1014 # Registers a namespace prefix. The registry is global, and any
1015 # existing mapping for either the given prefix or the namespace URI
1018 # @param prefix Namespace prefix.
1019 # @param uri Namespace uri. Tags and attributes in this namespace
1020 # will be serialized with the given prefix, if at all possible.
1021 # @exception ValueError If the prefix is reserved, or is otherwise
def register_namespace(prefix, uri):
    """Register a namespace prefix in the global registry.

    Any existing mapping for either *prefix* or *uri* is removed first.

    prefix -- namespace prefix.
    uri    -- namespace URI.

    Raises ValueError if *prefix* has the reserved "ns<digits>" form that
    the serializer uses for generated prefixes.
    """
    if re.match(r"ns\d+$", prefix):
        raise ValueError("Prefix format reserved for internal use")
    # iterate over a snapshot: entries are deleted inside the loop
    for k, v in list(_namespace_map.items()):
        if k == uri or v == prefix:
            del _namespace_map[k]
    _namespace_map[uri] = prefix

_namespace_map = {
    # "well-known" namespace prefixes
    "http://www.w3.org/XML/1998/namespace": "xml",
    "http://www.w3.org/1999/xhtml": "html",
    "http://www.w3.org/1999/02/22-rdf-syntax-ns#": "rdf",
    "http://schemas.xmlsoap.org/wsdl/": "wsdl",
    # xml schema
    "http://www.w3.org/2001/XMLSchema": "xs",
    "http://www.w3.org/2001/XMLSchema-instance": "xsi",
    # dublin core
    "http://purl.org/dc/elements/1.1/": "dc",
}
def _raise_serialization_error(text):
    """Raise TypeError reporting that *text* cannot be serialized."""
    raise TypeError(
        "cannot serialize %r (type %s)" % (text, type(text).__name__)
        )
def _encode(text, encoding):
    """Encode *text*, without escaping markup characters.

    Characters the target encoding cannot represent are written as
    numeric character references.  Objects that cannot be encoded are
    reported through _raise_serialization_error().
    """
    try:
        return text.encode(encoding, "xmlcharrefreplace")
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
def _escape_cdata(text, encoding):
    """Escape character data and encode it.

    "&", "<" and ">" are replaced by their entity references; characters
    the target encoding cannot represent become numeric character
    references.  Objects that cannot be escaped or encoded are reported
    through _raise_serialization_error().
    """
    try:
        # it's worth avoiding do-nothing calls for strings that are
        # shorter than 500 character, or so.  assume that's, by far,
        # the most common case in most applications.
        if "&" in text:
            # must run first, or the entities added below get re-escaped
            text = text.replace("&", "&amp;")
        if "<" in text:
            text = text.replace("<", "&lt;")
        if ">" in text:
            text = text.replace(">", "&gt;")
        return text.encode(encoding, "xmlcharrefreplace")
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
def _escape_attrib(text, encoding):
    """Escape an attribute value and encode it.

    "&", "<", ">" and the double quote are replaced by entity references,
    and a newline by the character reference "&#10;" so that it is not
    turned into a space when the attribute is parsed again.  Characters
    the target encoding cannot represent become numeric character
    references.  Objects that cannot be escaped or encoded are reported
    through _raise_serialization_error().
    """
    try:
        if "&" in text:
            # must run first, or the entities added below get re-escaped
            text = text.replace("&", "&amp;")
        if "<" in text:
            text = text.replace("<", "&lt;")
        if ">" in text:
            text = text.replace(">", "&gt;")
        if "\"" in text:
            text = text.replace("\"", "&quot;")
        if "\n" in text:
            text = text.replace("\n", "&#10;")
        return text.encode(encoding, "xmlcharrefreplace")
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
def _escape_attrib_html(text, encoding):
    """Escape an attribute value for HTML output and encode it.

    "&", ">" and the double quote are replaced by entity references; "<"
    is left untouched.  Characters the target encoding cannot represent
    become numeric character references.  Objects that cannot be escaped
    or encoded are reported through _raise_serialization_error().
    """
    try:
        if "&" in text:
            # must run first, or the entities added below get re-escaped
            text = text.replace("&", "&amp;")
        if ">" in text:
            text = text.replace(">", "&gt;")
        if "\"" in text:
            text = text.replace("\"", "&quot;")
        return text.encode(encoding, "xmlcharrefreplace")
    except (TypeError, AttributeError):
        _raise_serialization_error(text)
1102 # --------------------------------------------------------------------
1105 # Generates a string representation of an XML element, including all
1108 # @param element An Element instance.
1109 # @keyparam encoding Optional output encoding (default is US-ASCII).
1110 # @keyparam method Optional output method ("xml", "html", "text" or
1111 # "c14n"; default is "xml").
1112 # @return An encoded string containing the XML data.
def tostring(element, encoding=None, method=None):
    """Return the serialized form of *element*, including all subelements.

    element  -- an Element instance.
    encoding -- optional output encoding, passed on to ElementTree.write().
    method   -- optional output method, passed on to ElementTree.write().

    Returns the fragments written by ElementTree.write(), joined into a
    single string.
    """
    class _Sink:
        # bare object whose write attribute collects the output fragments
        pass
    data = []
    sink = _Sink()
    sink.write = data.append
    ElementTree(element).write(sink, encoding, method=method)
    return "".join(data)
1125 # Generates a string representation of an XML element, including all
1126 # subelements. The string is returned as a sequence of string fragments.
1128 # @param element An Element instance.
1129 # @keyparam encoding Optional output encoding (default is US-ASCII).
1130 # @keyparam method Optional output method ("xml", "html", "text" or
1131 # "c14n"; default is "xml").
1132 # @return A sequence object containing the XML data.
1133 # @defreturn sequence
def tostringlist(element, encoding=None, method=None):
    """Return the serialized form of *element* as a list of fragments.

    element  -- an Element instance.
    encoding -- optional output encoding, passed on to ElementTree.write().
    method   -- optional output method, passed on to ElementTree.write().

    Returns the fragments in the order ElementTree.write() produced them.
    """
    class _Sink:
        # bare object whose write attribute collects the output fragments
        pass
    data = []
    sink = _Sink()
    sink.write = data.append
    ElementTree(element).write(sink, encoding, method=method)
    # FIXME: merge small fragments into larger parts
    return data
##
# Writes an element tree or element structure to sys.stdout.  This
# function should be used for debugging only.
# <p>
# The exact output format is implementation dependent.  In this
# version, it's written as an ordinary XML file.
#
# @param elem An element tree or an individual element.

def dump(elem):
    # debugging
    if not isinstance(elem, ElementTree):
        # a bare element was passed; wrap it so that write() is available
        elem = ElementTree(elem)
    elem.write(sys.stdout)
    # make sure the output ends with a newline: add one unless the root
    # element's tail already ends with "\n"
    tail = elem.getroot().tail
    if not tail or tail[-1] != "\n":
        sys.stdout.write("\n")
1164 # --------------------------------------------------------------------
##
# Parses an XML document into an element tree.
#
# @param source A filename or file object containing XML data.
# @param parser An optional parser instance.  If not given, the
#     standard {@link XMLParser} parser is used.
# @return An ElementTree instance

def parse(source, parser=None):
    # create an empty tree and let it load itself from the source; the
    # parser argument is passed through unchanged (None included)
    tree = ElementTree()
    tree.parse(source, parser)
    return tree
##
# Parses an XML document into an element tree incrementally, and reports
# what's going on to the user.
#
# @param source A filename or file object containing XML data.
# @param events A list of events to report back.  If omitted, only "end"
#     events are reported.
# @param parser An optional parser instance.  If not given, the
#     standard {@link XMLParser} parser is used.
# @return A (event, elem) iterator.

def iterparse(source, events=None, parser=None):
    # anything without a read() method is treated as a filename and
    # opened in binary mode
    # NOTE(review): a file opened here is never closed explicitly by
    # this function; confirm whether callers rely on garbage collection
    if not hasattr(source, "read"):
        source = open(source, "rb")
    if not parser:
        parser = XMLParser(target=TreeBuilder())
    return _IterParseIterator(source, events, parser)
1198 class _IterParseIterator(object):
1200 def __init__(self
, source
, events
, parser
):
1204 self
.root
= self
._root
= None
1205 self
._parser
= parser
1206 # wire up the parser for event reporting
1207 parser
= self
._parser
._parser
1208 append
= self
._events
.append
1211 for event
in events
:
1212 if event
== "start":
1214 parser
.ordered_attributes
= 1
1215 parser
.specified_attributes
= 1
1216 def handler(tag
, attrib_in
, event
=event
, append
=append
,
1217 start
=self
._parser
._start
_list
):
1218 append((event
, start(tag
, attrib_in
)))
1219 parser
.StartElementHandler
= handler
1220 except AttributeError:
1221 def handler(tag
, attrib_in
, event
=event
, append
=append
,
1222 start
=self
._parser
._start
):
1223 append((event
, start(tag
, attrib_in
)))
1224 parser
.StartElementHandler
= handler
1225 elif event
== "end":
1226 def handler(tag
, event
=event
, append
=append
,
1227 end
=self
._parser
._end
):
1228 append((event
, end(tag
)))
1229 parser
.EndElementHandler
= handler
1230 elif event
== "start-ns":
1231 def handler(prefix
, uri
, event
=event
, append
=append
):
1233 uri
= (uri
or "").encode("ascii")
1234 except UnicodeError:
1236 append((event
, (prefix
or "", uri
or "")))
1237 parser
.StartNamespaceDeclHandler
= handler
1238 elif event
== "end-ns":
1239 def handler(prefix
, event
=event
, append
=append
):
1240 append((event
, None))
1241 parser
.EndNamespaceDeclHandler
= handler
1243 raise ValueError("unknown event %r" % event
)
1248 item
= self
._events
[self
._index
]
1250 if self
._parser
is None:
1251 self
.root
= self
._root
1256 data
= self
._file
.read(16384)
1258 self
._parser
.feed(data
)
1260 self
._root
= self
._parser
.close()
1263 self
._index
= self
._index
+ 1
##
# Parses an XML document from a string constant.  This function can
# be used to embed "XML literals" in Python code.
#
# @param source A string containing XML data.
# @param parser An optional parser instance.  If not given, the
#     standard {@link XMLParser} parser is used.
# @return An Element instance.
# @defreturn Element

def XML(text, parser=None):
    if not parser:
        parser = XMLParser(target=TreeBuilder())
    # the whole document is fed in one go; close() returns whatever the
    # parser's target built
    parser.feed(text)
    return parser.close()
##
# Parses an XML document from a string constant, and also returns
# a dictionary which maps from element id:s to elements.
#
# @param source A string containing XML data.
# @param parser An optional parser instance.  If not given, the
#     standard {@link XMLParser} parser is used.
# @return A tuple containing an Element instance and a dictionary.
# @defreturn (Element, dictionary)

def XMLID(text, parser=None):
    if not parser:
        parser = XMLParser(target=TreeBuilder())
    parser.feed(text)
    tree = parser.close()
    ids = {}
    for elem in tree.iter():
        # elements without an "id" attribute (or with an empty one) are
        # left out; if an id occurs more than once, the last element in
        # iteration order is the one kept
        elem_id = elem.get("id")
        if elem_id:
            ids[elem_id] = elem
    return tree, ids
##
# Parses an XML document from a string constant.  Same as {@link #XML}.
#
# @def fromstring(text)
# @param source A string containing XML data.
# @return An Element instance.
# @defreturn Element

# fromstring is a second name for the XML function, not a wrapper
fromstring = XML
##
# Parses an XML document from a sequence of string fragments.
#
# @param sequence A list or other sequence containing XML data fragments.
# @param parser An optional parser instance.  If not given, the
#     standard {@link XMLParser} parser is used.
# @return An Element instance.
# @defreturn Element

def fromstringlist(sequence, parser=None):
    if not parser:
        parser = XMLParser(target=TreeBuilder())
    # fragments are fed one at a time, in order, without being joined
    for text in sequence:
        parser.feed(text)
    return parser.close()
1334 # --------------------------------------------------------------------
##
# Generic element structure builder.  This builder converts a sequence
# of {@link #TreeBuilder.start}, {@link #TreeBuilder.data}, and {@link
# #TreeBuilder.end} method calls to a well-formed element structure.
# <p>
# You can use this class to build an element structure using a custom XML
# parser, or a parser for some other XML-like format.
#
# @param element_factory Optional element factory.  This factory
#    is called to create new Element instances, as necessary.

class TreeBuilder(object):

    def __init__(self, element_factory=None):
        self._data = [] # data collector
        self._elem = [] # element stack
        self._last = None # last element
        self._tail = None # true if we're after an end tag
        if element_factory is None:
            element_factory = Element
        self._factory = element_factory

    ##
    # Returns the toplevel document element.  Every element opened with
    # start() must have been closed with end() before this is called.
    #
    # @return An Element instance.
    # @defreturn Element

    def close(self):
        assert len(self._elem) == 0, "missing end tags"
        assert self._last is not None, "missing toplevel element"
        return self._last

    def _flush(self):
        # Attach the text collected by data() to the most recent element
        # and empty the collector.  The text becomes that element's tail
        # if the last call was end(), and its text if it was start().
        # Text collected before the first element is discarded.
        if self._data:
            if self._last is not None:
                text = "".join(self._data)
                if self._tail:
                    assert self._last.tail is None, "internal error (tail)"
                    self._last.tail = text
                else:
                    assert self._last.text is None, "internal error (text)"
                    self._last.text = text
            self._data = []

    ##
    # Adds text to the current element.
    #
    # @param data A string.  This should be either an 8-bit string
    #    containing ASCII text, or a Unicode string.

    def data(self, data):
        # fragments are only collected here; _flush() joins and attaches
        # them when the next start() or end() arrives
        self._data.append(data)

    ##
    # Opens a new element.
    #
    # @param tag The element name.
    # @param attrib A dictionary containing element attributes.
    # @return The opened element.
    # @defreturn Element

    def start(self, tag, attrs):
        self._flush()
        self._last = elem = self._factory(tag, attrs)
        if self._elem:
            # not the toplevel element: add it to the innermost open one
            self._elem[-1].append(elem)
        self._elem.append(elem)
        self._tail = 0
        return elem

    ##
    # Closes the current element.
    #
    # @param tag The element name.
    # @return The closed element.
    # @defreturn Element

    def end(self, tag):
        self._flush()
        self._last = self._elem.pop()
        assert self._last.tag == tag,\
               "end tag mismatch (expected %s, got %s)" % (
                   self._last.tag, tag)
        self._tail = 1
        return self._last
##
# Element structure builder for XML source data, based on the
# <b>expat</b> parser.
#
# @keyparam target Target object.  If omitted, the builder uses an
#     instance of the standard {@link #TreeBuilder} class.
# @keyparam html Predefine HTML entities.  This flag is not supported
#     by the current implementation.
# @keyparam encoding Optional encoding.  If given, the value overrides
#     the encoding specified in the XML file.

class XMLParser(object):

    def __init__(self, html=0, target=None, encoding=None):
        try:
            from xml.parsers import expat
        except ImportError:
            try:
                import pyexpat as expat
            except ImportError:
                raise ImportError(
                    "No module named expat; use SimpleXMLTreeBuilder instead"
                    )
        # "}" is passed as the namespace separator, so qualified names
        # are reported as "uri}local"; _fixname adds the leading "{"
        parser = expat.ParserCreate(encoding, "}")
        if target is None:
            target = TreeBuilder()
        # underscored names are provided for compatibility only
        self.parser = self._parser = parser
        self.target = self._target = target
        self._error = expat.error
        self._names = {} # name memo cache
        # callbacks
        parser.DefaultHandlerExpand = self._default
        parser.StartElementHandler = self._start
        parser.EndElementHandler = self._end
        parser.CharacterDataHandler = self._data
        # optional callbacks
        parser.CommentHandler = self._comment
        parser.ProcessingInstructionHandler = self._pi
        # let expat do the buffering, if supported
        try:
            self._parser.buffer_text = 1
        except AttributeError:
            pass
        # use new-style attribute handling, if supported
        try:
            self._parser.ordered_attributes = 1
            self._parser.specified_attributes = 1
            parser.StartElementHandler = self._start_list
        except AttributeError:
            pass
        self._doctype = None # list of doctype tokens while inside one
        self.entity = {} # entity name -> replacement text, see _default
        try:
            self.version = "Expat %d.%d.%d" % expat.version_info
        except AttributeError:
            pass # unknown

    def _raiseerror(self, value):
        # re-raise a low-level parser error as ParseError, copying the
        # error code and the (line, column) position
        err = ParseError(value)
        err.code = value.code
        err.position = value.lineno, value.offset
        raise err

    def _fixtext(self, text):
        # convert text string to ascii, if possible
        try:
            return text.encode("ascii")
        except UnicodeError:
            return text

    def _fixname(self, key):
        # expand qname, and convert name string to ascii, if possible
        try:
            name = self._names[key]
        except KeyError:
            name = key
            if "}" in name:
                # "uri}local" -> "{uri}local"
                name = "{" + name
            self._names[key] = name = self._fixtext(name)
        return name

    def _start(self, tag, attrib_in):
        # start handler for attributes delivered as a dictionary
        fixname = self._fixname
        fixtext = self._fixtext
        tag = fixname(tag)
        attrib = {}
        for key, value in attrib_in.items():
            attrib[fixname(key)] = fixtext(value)
        return self.target.start(tag, attrib)

    def _start_list(self, tag, attrib_in):
        # start handler for attributes delivered as a flat
        # [name, value, name, value, ...] list
        fixname = self._fixname
        fixtext = self._fixtext
        tag = fixname(tag)
        attrib = {}
        if attrib_in:
            for i in range(0, len(attrib_in), 2):
                attrib[fixname(attrib_in[i])] = fixtext(attrib_in[i+1])
        return self.target.start(tag, attrib)

    def _data(self, text):
        return self.target.data(self._fixtext(text))

    def _end(self, tag):
        return self.target.end(self._fixname(tag))

    def _comment(self, data):
        # comments are forwarded only if the target has a comment method
        try:
            comment = self.target.comment
        except AttributeError:
            pass
        else:
            return comment(self._fixtext(data))

    def _pi(self, target, data):
        # processing instructions are forwarded only if the target has a
        # pi method
        try:
            pi = self.target.pi
        except AttributeError:
            pass
        else:
            return pi(self._fixtext(target), self._fixtext(data))

    def _default(self, text):
        # Handler for everything no other callback takes.  It is used
        # for two things: resolving entities through self.entity, and
        # collecting the pieces of a doctype declaration.
        prefix = text[:1]
        if prefix == "&":
            # deal with undefined entities
            try:
                self.target.data(self.entity[text[1:-1]])
            except KeyError:
                from xml.parsers import expat
                err = expat.error(
                    "undefined entity %s: line %d, column %d" %
                    (text, self._parser.ErrorLineNumber,
                    self._parser.ErrorColumnNumber)
                    )
                err.code = 11 # XML_ERROR_UNDEFINED_ENTITY
                err.lineno = self._parser.ErrorLineNumber
                err.offset = self._parser.ErrorColumnNumber
                raise err
        elif prefix == "<" and text[:9] == "<!DOCTYPE":
            self._doctype = [] # inside a doctype declaration
        elif self._doctype is not None:
            # parse doctype contents
            if prefix == ">":
                self._doctype = None
                return
            text = text.strip()
            if not text:
                return
            self._doctype.append(text)
            n = len(self._doctype)
            if n > 2:
                kind = self._doctype[1]
                if kind == "PUBLIC" and n == 4:
                    name, kind, pubid, system = self._doctype
                elif kind == "SYSTEM" and n == 3:
                    name, kind, system = self._doctype
                    pubid = None
                else:
                    return
                # identifiers arrive with their quotes; strip them
                if pubid:
                    pubid = pubid[1:-1]
                if hasattr(self.target, "doctype"):
                    self.target.doctype(name, pubid, system[1:-1])
                # bound methods must be compared with "!=": every
                # attribute access creates a new bound method object, so
                # an identity test would be true even when doctype has
                # not been redefined
                elif self.doctype != self._XMLParser__doctype:
                    # warn about deprecated call
                    self._XMLParser__doctype(name, pubid, system[1:-1])
                    self.doctype(name, pubid, system[1:-1])
                self._doctype = None

    ##
    # (Deprecated) Handles a doctype declaration.
    #
    # @param name Doctype name.
    # @param pubid Public identifier.
    # @param system System identifier.

    def doctype(self, name, pubid, system):
        """This method of XMLParser is deprecated."""
        warnings.warn(
            "This method of XMLParser is deprecated. Define doctype() "
            "method on the TreeBuilder target.",
            DeprecationWarning,
            )

    # sentinel, if doctype is redefined in a subclass
    __doctype = doctype

    ##
    # Feeds data to the parser.
    #
    # @param data Encoded data.

    def feed(self, data):
        try:
            self._parser.Parse(data, 0)
        except self._error:
            # sys.exc_info() fetches the exception without relying on
            # version-specific "except ..., v" / "except ... as v" syntax
            self._raiseerror(sys.exc_info()[1])

    ##
    # Finishes feeding data to the parser.
    #
    # @return An element structure.
    # @defreturn Element

    def close(self):
        try:
            self._parser.Parse("", 1) # end of data
        except self._error:
            self._raiseerror(sys.exc_info()[1])
        tree = self.target.close()
        del self.target, self._parser # get rid of circular references
        return tree
# compatibility: XMLTreeBuilder is a second name for the XMLParser class
XMLTreeBuilder = XMLParser

# workaround circular import.
try:
    from ElementC14N import _serialize_c14n
    _serialize["c14n"] = _serialize_c14n
except ImportError:
    # ElementC14N is not importable; the "c14n" entry is left unregistered
    pass