--- /dev/null
+"""\r
+SAX driver for the pyexpat C module. This driver works with\r
+pyexpat.__version__ == '2.22'.\r
+"""\r
+\r
+version = "0.20"\r
+\r
+from xml.sax._exceptions import *\r
+from xml.sax.handler import feature_validation, feature_namespaces\r
+from xml.sax.handler import feature_namespace_prefixes\r
+from xml.sax.handler import feature_external_ges, feature_external_pes\r
+from xml.sax.handler import feature_string_interning\r
+from xml.sax.handler import property_xml_string, property_interning_dict\r
+\r
+# xml.parsers.expat does not raise ImportError in Jython\r
+# (hence the explicit platform check before the import is attempted).\r
+import sys\r
+if sys.platform[:4] == "java":\r
+ raise SAXReaderNotAvailable("expat not available in Java", None)\r
+# sys is not used after this check, so the name is removed again.\r
+del sys\r
+\r
+# Both a failed import and an expat module that lacks ParserCreate are\r
+# reported to the importer as SAXReaderNotAvailable.\r
+try:\r
+ from xml.parsers import expat\r
+except ImportError:\r
+ raise SAXReaderNotAvailable("expat not supported", None)\r
+else:\r
+ if not hasattr(expat, "ParserCreate"):\r
+ raise SAXReaderNotAvailable("expat not supported", None)\r
+from xml.sax import xmlreader, saxutils, handler\r
+\r
+# Module-level aliases for the attribute container classes that the\r
+# element handlers of ExpatParser instantiate.\r
+AttributesImpl = xmlreader.AttributesImpl\r
+AttributesNSImpl = xmlreader.AttributesNSImpl\r
+\r
+# If we're using a sufficiently recent version of Python, we can use\r
+# weak references to avoid cycles between the parser and content\r
+# handler, otherwise we'll just have to pretend.\r
+#\r
+# Result: _mkproxy is weakref.proxy when _weakref can be imported;\r
+# otherwise it is an identity function that hands back its argument.\r
+try:\r
+ import _weakref\r
+except ImportError:\r
+ def _mkproxy(o):\r
+ return o\r
+else:\r
+ import weakref\r
+ _mkproxy = weakref.proxy\r
+ # Only _mkproxy is kept; the module names are dropped again.\r
+ del weakref, _weakref\r
+\r
+class _ClosedParser:\r
+ # Empty attribute holder. ExpatParser.close() copies ErrorColumnNumber\r
+ # and ErrorLineNumber onto an instance of this class and stores it in\r
+ # place of the expat parser, so the locator methods keep working.\r
+ pass\r
+\r
+# --- ExpatLocator\r
+\r
+class ExpatLocator(xmlreader.Locator):\r
+ """Locator for use with the ExpatParser class.\r
+\r
+ This uses a weak reference to the parser object to avoid creating\r
+ a circular reference between the parser and the content handler.\r
+ """\r
+ def __init__(self, parser):\r
+ # _mkproxy yields a weakref.proxy when weak references are available,\r
+ # otherwise the parser object itself.\r
+ self._ref = _mkproxy(parser)\r
+\r
+ def getColumnNumber(self):\r
+ """Return the parser's ErrorColumnNumber, or None without a parser."""\r
+ parser = self._ref\r
+ # parser._parser is None until ExpatParser.reset() has created one.\r
+ if parser._parser is None:\r
+ return None\r
+ return parser._parser.ErrorColumnNumber\r
+\r
+ def getLineNumber(self):\r
+ """Return the parser's ErrorLineNumber, or 1 without a parser."""\r
+ parser = self._ref\r
+ if parser._parser is None:\r
+ return 1\r
+ return parser._parser.ErrorLineNumber\r
+\r
+ def getPublicId(self):\r
+ """Return the public id of the parser's current input source."""\r
+ parser = self._ref\r
+ # NOTE(review): self._ref is the parser or a proxy for it (see\r
+ # __init__); it is unclear when it would be identical to None --\r
+ # confirm whether this guard can ever trigger.\r
+ if parser is None:\r
+ return None\r
+ return parser._source.getPublicId()\r
+\r
+ def getSystemId(self):\r
+ """Return the system id of the parser's current input source."""\r
+ parser = self._ref\r
+ # NOTE(review): same guard as in getPublicId -- confirm it is\r
+ # reachable.\r
+ if parser is None:\r
+ return None\r
+ return parser._source.getSystemId()\r
+\r
+\r
+# --- ExpatParser\r
+\r
+class ExpatParser(xmlreader.IncrementalParser, xmlreader.Locator):\r
+ """SAX driver for the pyexpat C module."""\r
+\r
+ def __init__(self, namespaceHandling=0, bufsize=2**16-20):\r
+ """Set up the reader's state; no expat parser is created here.\r
+\r
+ namespaceHandling -- initial state of the namespaces feature.\r
+ bufsize -- passed through to IncrementalParser.__init__.\r
+ """\r
+ xmlreader.IncrementalParser.__init__(self, bufsize)\r
+ self._source = xmlreader.InputSource()\r
+ # The expat parser object; created by reset().\r
+ self._parser = None\r
+ self._namespaces = namespaceHandling\r
+ # Value of the lexical-handler property (see get/setProperty).\r
+ self._lex_handler_prop = None\r
+ # Non-zero between the first feed() and close().\r
+ self._parsing = 0\r
+ # (parser, source) pairs saved by external_entity_ref().\r
+ self._entity_stack = []\r
+ # State of the external-general-entities feature; on by default.\r
+ self._external_ges = 1\r
+ # Interning dict handed to expat; None means string interning is off.\r
+ self._interning = None\r
+\r
+ # XMLReader methods\r
+\r
+ def parse(self, source):\r
+ "Parse an XML document from a URL or an InputSource."\r
+ source = saxutils.prepare_input_source(source)\r
+\r
+ self._source = source\r
+ # reset() builds a fresh expat parser configured for this source.\r
+ self.reset()\r
+ # The content handler gets a locator that refers back to this reader.\r
+ self._cont_handler.setDocumentLocator(ExpatLocator(self))\r
+ xmlreader.IncrementalParser.parse(self, source)\r
+\r
+ def prepareParser(self, source):\r
+ """Pass the source's system id, if any, to expat via SetBase.\r
+\r
+ A unicode system id is encoded as UTF-8 first.\r
+ NOTE(review): presumably invoked by IncrementalParser.parse --\r
+ confirm against xmlreader.\r
+ """\r
+ if source.getSystemId() is not None:\r
+ base = source.getSystemId()\r
+ if isinstance(base, unicode):\r
+ base = base.encode('utf-8')\r
+ self._parser.SetBase(base)\r
+\r
+ # Redefined setContentHandler to allow changing handlers during parsing\r
+\r
+ def setContentHandler(self, handler):\r
+ """Install handler; during a parse, rebind expat's direct callbacks."""\r
+ xmlreader.IncrementalParser.setContentHandler(self, handler)\r
+ if self._parsing:\r
+ # Some expat callbacks point straight at content handler methods\r
+ # (see _reset_cont_handler), so they must be re-pointed.\r
+ self._reset_cont_handler()\r
+\r
+ def getFeature(self, name):\r
+ """Return the current state of the feature called name.\r
+\r
+ validation, external parameter entities and namespace prefixes\r
+ always report 0. Raises SAXNotRecognizedException for any name not\r
+ handled here.\r
+ """\r
+ if name == feature_namespaces:\r
+ return self._namespaces\r
+ elif name == feature_string_interning:\r
+ return self._interning is not None\r
+ elif name in (feature_validation, feature_external_pes,\r
+ feature_namespace_prefixes):\r
+ return 0\r
+ elif name == feature_external_ges:\r
+ return self._external_ges\r
+ raise SAXNotRecognizedException("Feature '%s' not recognized" % name)\r
+\r
+ def setFeature(self, name, state):\r
+ """Set the feature called name to state.\r
+\r
+ Raises SAXNotSupportedException while a parse is in progress, or\r
+ when a true state is requested for validation, external parameter\r
+ entities or namespace prefixes. Raises SAXNotRecognizedException\r
+ for any name not handled here.\r
+ """\r
+ if self._parsing:\r
+ raise SAXNotSupportedException("Cannot set features while parsing")\r
+\r
+ if name == feature_namespaces:\r
+ self._namespaces = state\r
+ elif name == feature_external_ges:\r
+ self._external_ges = state\r
+ elif name == feature_string_interning:\r
+ # Enabling creates an interning dict only if none exists yet;\r
+ # disabling drops the dict.\r
+ if state:\r
+ if self._interning is None:\r
+ self._interning = {}\r
+ else:\r
+ self._interning = None\r
+ elif name == feature_validation:\r
+ # A false state is accepted silently for the unsupported features.\r
+ if state:\r
+ raise SAXNotSupportedException(\r
+ "expat does not support validation")\r
+ elif name == feature_external_pes:\r
+ if state:\r
+ raise SAXNotSupportedException(\r
+ "expat does not read external parameter entities")\r
+ elif name == feature_namespace_prefixes:\r
+ if state:\r
+ raise SAXNotSupportedException(\r
+ "expat does not report namespace prefixes")\r
+ else:\r
+ raise SAXNotRecognizedException(\r
+ "Feature '%s' not recognized" % name)\r
+\r
+ def getProperty(self, name):\r
+ """Return the value of the property called name.\r
+\r
+ The XML string property is read from expat's GetInputContext and is\r
+ only available while self._parser is set; otherwise\r
+ SAXNotSupportedException is raised. Unknown names raise\r
+ SAXNotRecognizedException.\r
+ """\r
+ if name == handler.property_lexical_handler:\r
+ return self._lex_handler_prop\r
+ elif name == property_interning_dict:\r
+ return self._interning\r
+ elif name == property_xml_string:\r
+ if self._parser:\r
+ # NOTE(review): after close() has stored a _ClosedParser in\r
+ # self._parser, that object has no GetInputContext, so the\r
+ # "This version of expat..." error below is raised -- confirm\r
+ # this message is intended for that case.\r
+ if hasattr(self._parser, "GetInputContext"):\r
+ return self._parser.GetInputContext()\r
+ else:\r
+ raise SAXNotRecognizedException(\r
+ "This version of expat does not support getting"\r
+ " the XML string")\r
+ else:\r
+ raise SAXNotSupportedException(\r
+ "XML string cannot be returned when not parsing")\r
+ raise SAXNotRecognizedException("Property '%s' not recognized" % name)\r
+\r
+ def setProperty(self, name, value):\r
+ """Set the property called name to value.\r
+\r
+ The XML string property is read-only (SAXNotSupportedException);\r
+ unknown names raise SAXNotRecognizedException.\r
+ """\r
+ if name == handler.property_lexical_handler:\r
+ self._lex_handler_prop = value\r
+ # During a parse the expat callbacks must be rebound right away.\r
+ if self._parsing:\r
+ self._reset_lex_handler_prop()\r
+ elif name == property_interning_dict:\r
+ self._interning = value\r
+ elif name == property_xml_string:\r
+ raise SAXNotSupportedException("Property '%s' cannot be set" %\r
+ name)\r
+ else:\r
+ raise SAXNotRecognizedException("Property '%s' not recognized" %\r
+ name)\r
+\r
+ # IncrementalParser methods\r
+\r
+ def feed(self, data, isFinal = 0):\r
+ """Hand data to expat, starting the document on the first call.\r
+\r
+ An expat.error is wrapped in a SAXParseException and passed to the\r
+ error handler's fatalError method.\r
+ """\r
+ if not self._parsing:\r
+ # First chunk: build a new parser and announce the document.\r
+ self.reset()\r
+ self._parsing = 1\r
+ self._cont_handler.startDocument()\r
+\r
+ try:\r
+ # The isFinal parameter is internal to the expat reader.\r
+ # If it is set to true, expat will check validity of the entire\r
+ # document. When feeding chunks, they are not normally final -\r
+ # except when invoked from close.\r
+ self._parser.Parse(data, isFinal)\r
+ except expat.error, e:\r
+ # self is passed as the locator of the exception.\r
+ exc = SAXParseException(expat.ErrorString(e.code), e, self)\r
+ # FIXME: when to invoke error()?\r
+ self._err_handler.fatalError(exc)\r
+\r
+ def close(self):\r
+ """Finish the parse and release the expat parser.\r
+\r
+ Returns immediately when an external entity is being processed,\r
+ when there is no parser, or when the parser was already closed.\r
+ """\r
+ if (self._entity_stack or self._parser is None or\r
+ isinstance(self._parser, _ClosedParser)):\r
+ # If we are completing an external entity, do nothing here\r
+ return\r
+ try:\r
+ # An empty final chunk tells expat the document is complete.\r
+ self.feed("", isFinal = 1)\r
+ self._cont_handler.endDocument()\r
+ self._parsing = 0\r
+ # break cycle created by expat handlers pointing to our methods\r
+ self._parser = None\r
+ finally:\r
+ self._parsing = 0\r
+ # self._parser is still set here only if the statements above did\r
+ # not run through to the "self._parser = None" assignment.\r
+ if self._parser is not None:\r
+ # Keep ErrorColumnNumber and ErrorLineNumber after closing.\r
+ parser = _ClosedParser()\r
+ parser.ErrorColumnNumber = self._parser.ErrorColumnNumber\r
+ parser.ErrorLineNumber = self._parser.ErrorLineNumber\r
+ self._parser = parser\r
+\r
+ def _reset_cont_handler(self):\r
+ """Point expat's PI and character-data callbacks at the handler."""\r
+ # These two go directly to the content handler's bound methods,\r
+ # bypassing processing_instruction() and character_data() below.\r
+ self._parser.ProcessingInstructionHandler = \\r
+ self._cont_handler.processingInstruction\r
+ self._parser.CharacterDataHandler = self._cont_handler.characters\r
+\r
+ def _reset_lex_handler_prop(self):\r
+ """Bind, or clear, the expat callbacks used for lexical events."""\r
+ lex = self._lex_handler_prop\r
+ parser = self._parser\r
+ if lex is None:\r
+ parser.CommentHandler = None\r
+ parser.StartCdataSectionHandler = None\r
+ parser.EndCdataSectionHandler = None\r
+ parser.StartDoctypeDeclHandler = None\r
+ parser.EndDoctypeDeclHandler = None\r
+ else:\r
+ parser.CommentHandler = lex.comment\r
+ parser.StartCdataSectionHandler = lex.startCDATA\r
+ parser.EndCdataSectionHandler = lex.endCDATA\r
+ # Routed through start_doctype_decl, which reorders the arguments\r
+ # before calling the lexical handler's startDTD.\r
+ parser.StartDoctypeDeclHandler = self.start_doctype_decl\r
+ parser.EndDoctypeDeclHandler = lex.endDTD\r
+\r
+ def reset(self):\r
+ """Create a new expat parser and attach all callbacks to it."""\r
+ if self._namespaces:\r
+ # Namespace mode: " " is passed as expat's second argument, and\r
+ # the *_ns handlers split the names they receive on whitespace.\r
+ self._parser = expat.ParserCreate(self._source.getEncoding(), " ",\r
+ intern=self._interning)\r
+ self._parser.namespace_prefixes = 1\r
+ self._parser.StartElementHandler = self.start_element_ns\r
+ self._parser.EndElementHandler = self.end_element_ns\r
+ else:\r
+ self._parser = expat.ParserCreate(self._source.getEncoding(),\r
+ intern = self._interning)\r
+ self._parser.StartElementHandler = self.start_element\r
+ self._parser.EndElementHandler = self.end_element\r
+\r
+ self._reset_cont_handler()\r
+ self._parser.UnparsedEntityDeclHandler = self.unparsed_entity_decl\r
+ self._parser.NotationDeclHandler = self.notation_decl\r
+ self._parser.StartNamespaceDeclHandler = self.start_namespace_decl\r
+ self._parser.EndNamespaceDeclHandler = self.end_namespace_decl\r
+\r
+ # NOTE(review): _decl_handler_prop is assigned here but not read\r
+ # anywhere in the visible code -- confirm whether it is still needed.\r
+ self._decl_handler_prop = None\r
+ if self._lex_handler_prop:\r
+ self._reset_lex_handler_prop()\r
+# self._parser.DefaultHandler =\r
+# self._parser.DefaultHandlerExpand =\r
+# self._parser.NotStandaloneHandler =\r
+ self._parser.ExternalEntityRefHandler = self.external_entity_ref\r
+ try:\r
+ self._parser.SkippedEntityHandler = self.skipped_entity_handler\r
+ except AttributeError:\r
+ # This pyexpat does not support SkippedEntity\r
+ pass\r
+ self._parser.SetParamEntityParsing(\r
+ expat.XML_PARAM_ENTITY_PARSING_UNLESS_STANDALONE)\r
+\r
+ # A fresh parser starts outside any parse and outside any entity.\r
+ self._parsing = 0\r
+ self._entity_stack = []\r
+\r
+ # Locator methods\r
+\r
+ def getColumnNumber(self):\r
+ """Return expat's ErrorColumnNumber, or None without a parser."""\r
+ if self._parser is None:\r
+ return None\r
+ return self._parser.ErrorColumnNumber\r
+\r
+ def getLineNumber(self):\r
+ """Return expat's ErrorLineNumber, or 1 without a parser."""\r
+ if self._parser is None:\r
+ return 1\r
+ return self._parser.ErrorLineNumber\r
+\r
+ def getPublicId(self):\r
+ """Return the public id of the current input source."""\r
+ return self._source.getPublicId()\r
+\r
+ def getSystemId(self):\r
+ """Return the system id of the current input source."""\r
+ return self._source.getSystemId()\r
+\r
+ # event handlers\r
+ def start_element(self, name, attrs):\r
+ """Forward a start tag (non-namespace mode) to startElement."""\r
+ self._cont_handler.startElement(name, AttributesImpl(attrs))\r
+\r
+ def end_element(self, name):\r
+ """Forward an end tag (non-namespace mode) to endElement."""\r
+ self._cont_handler.endElement(name)\r
+\r
+ def start_element_ns(self, name, attrs):\r
+ """Forward a start tag (namespace mode) to startElementNS.\r
+\r
+ Element and attribute names are split on whitespace into one, two\r
+ or three parts and converted to two-item name tuples; the element's\r
+ qname is always passed as None.\r
+ """\r
+ pair = name.split()\r
+ if len(pair) == 1:\r
+ # no namespace\r
+ pair = (None, name)\r
+ elif len(pair) == 3:\r
+ # Three parts: the third one is dropped for the element name.\r
+ pair = pair[0], pair[1]\r
+ else:\r
+ # default namespace\r
+ pair = tuple(pair)\r
+\r
+ # newattrs maps name tuples to values, qnames maps them to qnames.\r
+ newattrs = {}\r
+ qnames = {}\r
+ for (aname, value) in attrs.items():\r
+ parts = aname.split()\r
+ length = len(parts)\r
+ if length == 1:\r
+ # no namespace\r
+ qname = aname\r
+ apair = (None, aname)\r
+ elif length == 3:\r
+ # The third part is used as the prefix of the qname.\r
+ qname = "%s:%s" % (parts[2], parts[1])\r
+ apair = parts[0], parts[1]\r
+ else:\r
+ # default namespace\r
+ qname = parts[1]\r
+ apair = tuple(parts)\r
+\r
+ newattrs[apair] = value\r
+ qnames[apair] = qname\r
+\r
+ self._cont_handler.startElementNS(pair, None,\r
+ AttributesNSImpl(newattrs, qnames))\r
+\r
+ def end_element_ns(self, name):\r
+ """Forward an end tag (namespace mode) to endElementNS.\r
+\r
+ The name is converted exactly as in start_element_ns.\r
+ """\r
+ pair = name.split()\r
+ if len(pair) == 1:\r
+ pair = (None, name)\r
+ elif len(pair) == 3:\r
+ pair = pair[0], pair[1]\r
+ else:\r
+ pair = tuple(pair)\r
+\r
+ self._cont_handler.endElementNS(pair, None)\r
+\r
+ # this is not used (call directly to ContentHandler)\r
+ def processing_instruction(self, target, data):\r
+ self._cont_handler.processingInstruction(target, data)\r
+\r
+ # this is not used (call directly to ContentHandler)\r
+ def character_data(self, data):\r
+ self._cont_handler.characters(data)\r
+\r
+ def start_namespace_decl(self, prefix, uri):\r
+ """Forward a namespace declaration to startPrefixMapping."""\r
+ self._cont_handler.startPrefixMapping(prefix, uri)\r
+\r
+ def end_namespace_decl(self, prefix):\r
+ """Forward the end of a namespace scope to endPrefixMapping."""\r
+ self._cont_handler.endPrefixMapping(prefix)\r
+\r
+ def start_doctype_decl(self, name, sysid, pubid, has_internal_subset):\r
+ """Forward a DOCTYPE start to the lexical handler's startDTD."""\r
+ # Arguments arrive as (sysid, pubid) but startDTD is called with\r
+ # (pubid, sysid); has_internal_subset is not forwarded.\r
+ self._lex_handler_prop.startDTD(name, pubid, sysid)\r
+\r
+ def unparsed_entity_decl(self, name, base, sysid, pubid, notation_name):\r
+ """Forward an unparsed entity declaration to the DTD handler."""\r
+ # base is not forwarded; pubid and sysid are swapped into SAX order.\r
+ self._dtd_handler.unparsedEntityDecl(name, pubid, sysid, notation_name)\r
+\r
+ def notation_decl(self, name, base, sysid, pubid):\r
+ """Forward a notation declaration to the DTD handler."""\r
+ # base is not forwarded; pubid and sysid are swapped into SAX order.\r
+ self._dtd_handler.notationDecl(name, pubid, sysid)\r
+\r
+ def external_entity_ref(self, context, base, sysid, pubid):\r
+ """Resolve and parse an external entity with a child expat parser.\r
+\r
+ Returns 1 when the entity was parsed, or when the external general\r
+ entities feature is off (the entity is then not read at all).\r
+ Returns 0 when parsing the entity raised any exception.\r
+ """\r
+ if not self._external_ges:\r
+ return 1\r
+\r
+ source = self._ent_handler.resolveEntity(pubid, sysid)\r
+ # The current source's system id (or "") is passed as the base.\r
+ source = saxutils.prepare_input_source(source,\r
+ self._source.getSystemId() or\r
+ "")\r
+\r
+ # Save the outer parser/source and switch to the entity's own.\r
+ self._entity_stack.append((self._parser, self._source))\r
+ self._parser = self._parser.ExternalEntityParserCreate(context)\r
+ self._source = source\r
+\r
+ # NOTE(review): the bare except below swallows every exception type,\r
+ # and on that path the saved (parser, source) pair stays on\r
+ # _entity_stack while self._parser/self._source still refer to the\r
+ # entity -- confirm this is intended.\r
+ try:\r
+ xmlreader.IncrementalParser.parse(self, source)\r
+ except:\r
+ return 0 # FIXME: save error info here?\r
+\r
+ # Success: restore the outer parser and source.\r
+ (self._parser, self._source) = self._entity_stack[-1]\r
+ del self._entity_stack[-1]\r
+ return 1\r
+\r
+ def skipped_entity_handler(self, name, is_pe):\r
+ """Forward a skipped entity to the content handler's skippedEntity."""\r
+ if is_pe:\r
+ # The SAX spec requires to report skipped PEs with a '%'\r
+ name = '%'+name\r
+ self._cont_handler.skippedEntity(name)\r
+\r
+# ---\r
+\r
+def create_parser(*args, **kwargs):\r
+ """Return a new ExpatParser built from the given arguments.\r
+\r
+ All positional and keyword arguments are passed on unchanged to the\r
+ ExpatParser constructor.\r
+ """\r
+ return ExpatParser(*args, **kwargs)\r
+\r
+# ---\r
+\r
+# Demo when run as a script: parse a document fetched from a fixed URL,\r
+# sending the SAX events to an XMLGenerator and errors to a plain\r
+# ErrorHandler.\r
+if __name__ == "__main__":\r
+ import xml.sax.saxutils\r
+ p = create_parser()\r
+ p.setContentHandler(xml.sax.saxutils.XMLGenerator())\r
+ p.setErrorHandler(xml.sax.ErrorHandler())\r
+ p.parse("http://www.ibiblio.org/xml/examples/shakespeare/hamlet.xml")\r