]>
Commit | Line | Data |
---|---|---|
3257aa99 DM |
1 | import xml.sax\r |
2 | import xml.sax.handler\r | |
3 | import types\r | |
4 | \r | |
5 | try:\r | |
6 | _StringTypes = [types.StringType, types.UnicodeType]\r | |
7 | except AttributeError:\r | |
8 | _StringTypes = [types.StringType]\r | |
9 | \r | |
# Event type tags.  Every event handed out by DOMEventStream is an
# (event_type, node) tuple whose first item is one of these strings.

# Document boundaries.
START_DOCUMENT = 'START_DOCUMENT'
END_DOCUMENT = 'END_DOCUMENT'

# Element boundaries.
START_ELEMENT = 'START_ELEMENT'
END_ELEMENT = 'END_ELEMENT'

# Everything else that can show up in the event stream.
COMMENT = 'COMMENT'
PROCESSING_INSTRUCTION = 'PROCESSING_INSTRUCTION'
IGNORABLE_WHITESPACE = 'IGNORABLE_WHITESPACE'
CHARACTERS = 'CHARACTERS'
18 | \r | |
class PullDOM(xml.sax.ContentHandler):
    """SAX content handler that turns parse callbacks into a queue of DOM events.

    Events are kept in a singly linked list of two-item cells
    ``[payload, next_cell]`` where ``payload`` is an ``(event_type, node)``
    tuple.  ``firstEvent`` is a sentinel cell whose second slot points at the
    oldest queued event; ``lastEvent`` is the cell that new events are linked
    onto.
    """

    _locator = None
    document = None

    def __init__(self, documentFactory=None):
        """Create empty event, element-stack and namespace bookkeeping.

        documentFactory -- object providing ``createDocument(uri, qname,
        doctype)``.  When left as None, startDocument() substitutes
        minidom's DOM implementation.
        """
        from xml.dom import XML_NAMESPACE
        self.documentFactory = documentFactory
        self.firstEvent = [None, None]
        self.lastEvent = self.firstEvent
        self.elementStack = []
        self.push = self.elementStack.append
        try:
            self.pop = self.elementStack.pop
        except AttributeError:
            # use class' pop instead
            pass
        self._ns_contexts = [{XML_NAMESPACE: 'xml'}]  # contains uri -> prefix dicts
        self._current_context = self._ns_contexts[-1]
        self.pending_events = []
        # (name, uri) pairs announced by startPrefixMapping() that still have
        # to be turned into xmlns attributes on the next element.
        self._xmlns_attrs = []

    def pop(self):
        """Remove and return the top of the element stack.

        Only used when the bound ``list.pop`` could not be installed by
        __init__.
        """
        result = self.elementStack[-1]
        del self.elementStack[-1]
        return result

    def setDocumentLocator(self, locator):
        """Remember the locator object supplied by the parser."""
        self._locator = locator

    def startPrefixMapping(self, prefix, uri):
        """Enter a namespace scope that binds ``prefix`` to ``uri``.

        The declaration is queued so that startElementNS() can attach it as
        an xmlns attribute, and a snapshot of the current uri -> prefix table
        is saved for endPrefixMapping() to restore.
        """
        # A false prefix denotes the default namespace, whose declaring
        # attribute is plain "xmlns".
        self._xmlns_attrs.append((prefix or 'xmlns', uri))
        self._ns_contexts.append(self._current_context.copy())
        self._current_context[uri] = prefix or None

    def endPrefixMapping(self, prefix):
        """Leave a namespace scope by restoring the most recently saved table."""
        self._current_context = self._ns_contexts.pop()

    def startElementNS(self, name, tagName, attrs):
        """Create an element node for a namespace-aware start tag and queue it.

        name    -- ``(uri, localname)`` pair.
        tagName -- qualified name, or None if the parser did not supply one.
        attrs   -- attribute collection; only its ``items()`` method is used,
                   yielding ``((uri, localname), value)`` pairs.

        The first element seen also creates the document (see
        buildDocument).  ``attrs`` itself is left unmodified.
        """
        xmlns_uri = 'http://www.w3.org/2000/xmlns/'
        # Merge the queued namespace declarations with the regular attributes
        # in a dict of our own.  Writing them into the parser's attribute
        # object would mutate an argument we do not own and would depend on
        # its private ``_attrs`` member.
        all_attrs = dict(attrs.items())
        for aname, value in self._xmlns_attrs:
            all_attrs[(xmlns_uri, aname)] = value
        self._xmlns_attrs = []

        uri, localname = name
        if uri:
            # When using namespaces, the reader may or may not
            # provide us with the original name. If not, create
            # *a* valid tagName from the current context.
            if tagName is None:
                prefix = self._current_context[uri]
                if prefix:
                    tagName = prefix + ":" + localname
                else:
                    tagName = localname
            if self.document:
                node = self.document.createElementNS(uri, tagName)
            else:
                node = self.buildDocument(uri, tagName)
        else:
            # When the tagname is not prefixed, it just appears as
            # localname
            if self.document:
                node = self.document.createElement(localname)
            else:
                node = self.buildDocument(None, localname)

        for aname, value in all_attrs.items():
            a_uri, a_localname = aname
            if a_uri == xmlns_uri:
                # Namespace declaration: "xmlns" or "xmlns:<prefix>".
                if a_localname == 'xmlns':
                    qname = a_localname
                else:
                    qname = 'xmlns:' + a_localname
                attr = self.document.createAttributeNS(a_uri, qname)
                node.setAttributeNodeNS(attr)
            elif a_uri:
                # Namespaced attribute: rebuild a qualified name from the
                # prefix currently bound to its URI.
                prefix = self._current_context[a_uri]
                if prefix:
                    qname = prefix + ":" + a_localname
                else:
                    qname = a_localname
                attr = self.document.createAttributeNS(a_uri, qname)
                node.setAttributeNodeNS(attr)
            else:
                attr = self.document.createAttribute(a_localname)
                node.setAttributeNode(attr)
            attr.value = value

        self.lastEvent[1] = [(START_ELEMENT, node), None]
        self.lastEvent = self.lastEvent[1]
        self.push(node)

    def endElementNS(self, name, tagName):
        """Queue END_ELEMENT for the element on top of the stack and pop it."""
        self.lastEvent[1] = [(END_ELEMENT, self.pop()), None]
        self.lastEvent = self.lastEvent[1]

    def startElement(self, name, attrs):
        """Create an element node for a non-namespace start tag and queue it.

        ``attrs.items()`` must yield ``(name, value)`` pairs.  The first
        element seen also creates the document (see buildDocument).
        """
        if self.document:
            node = self.document.createElement(name)
        else:
            node = self.buildDocument(None, name)

        for aname, value in attrs.items():
            attr = self.document.createAttribute(aname)
            attr.value = value
            node.setAttributeNode(attr)

        self.lastEvent[1] = [(START_ELEMENT, node), None]
        self.lastEvent = self.lastEvent[1]
        self.push(node)

    def endElement(self, name):
        """Queue END_ELEMENT for the element on top of the stack and pop it."""
        self.lastEvent[1] = [(END_ELEMENT, self.pop()), None]
        self.lastEvent = self.lastEvent[1]

    def comment(self, s):
        """Queue a COMMENT event.

        Before the document exists no node can be created, so the raw text
        is parked in ``pending_events`` for buildDocument() to convert.
        """
        if self.document:
            node = self.document.createComment(s)
            self.lastEvent[1] = [(COMMENT, node), None]
            self.lastEvent = self.lastEvent[1]
        else:
            event = [(COMMENT, s), None]
            self.pending_events.append(event)

    def processingInstruction(self, target, data):
        """Queue a PROCESSING_INSTRUCTION event.

        Before the document exists the raw ``(target, data)`` is parked in
        ``pending_events`` for buildDocument() to convert.
        """
        if self.document:
            node = self.document.createProcessingInstruction(target, data)
            self.lastEvent[1] = [(PROCESSING_INSTRUCTION, node), None]
            self.lastEvent = self.lastEvent[1]
        else:
            event = [(PROCESSING_INSTRUCTION, target, data), None]
            self.pending_events.append(event)

    def ignorableWhitespace(self, chars):
        """Queue an IGNORABLE_WHITESPACE event carrying a new text node."""
        node = self.document.createTextNode(chars)
        self.lastEvent[1] = [(IGNORABLE_WHITESPACE, node), None]
        self.lastEvent = self.lastEvent[1]

    def characters(self, chars):
        """Queue a CHARACTERS event carrying a new text node."""
        node = self.document.createTextNode(chars)
        self.lastEvent[1] = [(CHARACTERS, node), None]
        self.lastEvent = self.lastEvent[1]

    def startDocument(self):
        """Pick the default document factory if the caller gave none.

        The document itself is created later, by buildDocument().
        """
        if self.documentFactory is None:
            import xml.dom.minidom
            self.documentFactory = xml.dom.minidom.Document.implementation

    def buildDocument(self, uri, tagname):
        """Create the document for root element ``tagname`` and return that element.

        Queues START_DOCUMENT, pushes the document on the element stack and
        then links every event parked in ``pending_events`` into the queue,
        replacing its raw payload with a real node.  Raises AssertionError
        for a parked event that is neither a comment nor a processing
        instruction.
        """
        # Can't do that in startDocument, since we need the tagname
        # XXX: obtain DocumentType
        node = self.documentFactory.createDocument(uri, tagname, None)
        self.document = node
        self.lastEvent[1] = [(START_DOCUMENT, node), None]
        self.lastEvent = self.lastEvent[1]
        self.push(node)
        # Put everything we have seen so far into the document
        for e in self.pending_events:
            if e[0][0] == PROCESSING_INSTRUCTION:
                _, target, data = e[0]
                n = self.document.createProcessingInstruction(target, data)
                e[0] = (PROCESSING_INSTRUCTION, n)
            elif e[0][0] == COMMENT:
                n = self.document.createComment(e[0][1])
                e[0] = (COMMENT, n)
            else:
                raise AssertionError("Unknown pending event ", e[0][0])
            # The parked cell itself becomes the next link of the queue.
            self.lastEvent[1] = e
            self.lastEvent = e
        self.pending_events = None
        return node.firstChild

    def endDocument(self):
        """Queue END_DOCUMENT and pop the top of the element stack."""
        self.lastEvent[1] = [(END_DOCUMENT, self.document), None]
        self.pop()

    def clear(self):
        """Explicitly release parsing structures by dropping the document reference."""
        self.document = None
201 | \r | |
class ErrorHandler:
    """Minimal SAX error handler: warnings are printed, errors are raised."""

    def warning(self, exception):
        """Print the warning to standard output and carry on."""
        # Parenthesised form: prints the same text as the Python 2 print
        # statement, and is also valid syntax on Python 3.
        print(exception)

    def error(self, exception):
        """Raise the recoverable error passed in by the parser."""
        raise exception

    def fatalError(self, exception):
        """Raise the fatal error passed in by the parser."""
        raise exception
209 | \r | |
class DOMEventStream:
    """Iterator over the ``(event_type, node)`` events of one XML input.

    Input is pulled from ``stream`` in ``bufsize`` pieces and fed to
    ``parser`` only when the event queue runs dry.  Parsers that have no
    ``feed`` method are run to completion on first use instead (see _slurp).
    """

    def __init__(self, stream, parser, bufsize):
        """Bind stream, parser and read size, then install a fresh PullDOM.

        stream  -- object with a ``read(size)`` method.
        parser  -- SAX parser; used incrementally if it has ``feed``.
        bufsize -- size passed to every ``stream.read`` call.
        """
        self.stream = stream
        self.parser = parser
        self.bufsize = bufsize
        if not hasattr(self.parser, 'feed'):
            # Not an incremental parser: shadow getEvent on this instance.
            self.getEvent = self._slurp
        self.reset()

    def reset(self):
        """Attach a new PullDOM handler to the parser with namespaces enabled."""
        self.pulldom = PullDOM()
        # This content handler relies on namespace support
        self.parser.setFeature(xml.sax.handler.feature_namespaces, 1)
        self.parser.setContentHandler(self.pulldom)

    def __getitem__(self, pos):
        """Sequence-protocol iteration: next event, IndexError at the end.

        ``pos`` is ignored; events are always delivered in order.
        """
        rc = self.getEvent()
        if rc:
            return rc
        raise IndexError

    def next(self):
        """Return the next event, raising StopIteration when there is none."""
        rc = self.getEvent()
        if rc:
            return rc
        raise StopIteration

    def __iter__(self):
        return self

    def expandNode(self, node):
        """Read ahead and build the subtree below ``node``.

        Events are consumed until one carrying ``node`` itself is seen or
        the stream ends.  Every other node is appended to the innermost
        element that is still open.
        """
        event = self.getEvent()
        parents = [node]
        while event:
            token, cur_node = event
            if cur_node is node:
                return
            if token != END_ELEMENT:
                parents[-1].appendChild(cur_node)
            if token == START_ELEMENT:
                parents.append(cur_node)
            elif token == END_ELEMENT:
                del parents[-1]
            event = self.getEvent()

    def getEvent(self):
        """Return the next queued event, or None once the input is exhausted."""
        # use IncrementalParser interface, so we get the desired
        # pull effect
        if not self.pulldom.firstEvent[1]:
            # Queue is empty: let the handler link new events directly
            # behind the sentinel again.
            self.pulldom.lastEvent = self.pulldom.firstEvent
        while not self.pulldom.firstEvent[1]:
            buf = self.stream.read(self.bufsize)
            if not buf:
                self.parser.close()
                return None
            self.parser.feed(buf)
        rc = self.pulldom.firstEvent[1][0]
        self.pulldom.firstEvent[1] = self.pulldom.firstEvent[1][1]
        return rc

    def _slurp(self):
        """ Fallback replacement for getEvent() using the
            standard SAX2 interface, which means we slurp the
            SAX events into memory (no performance gain, but
            we are compatible to all SAX parsers).
        """
        self.parser.parse(self.stream)
        self.getEvent = self._emit
        return self._emit()

    def _emit(self):
        """ Fallback replacement for getEvent() that emits
            the events that _slurp() read previously.

            Returns None once every event has been handed out.
        """
        head = self.pulldom.firstEvent[1]
        if not head:
            # Queue drained.  Indexing None here would raise TypeError,
            # whereas next()/__getitem__/expandNode expect a false value
            # as the end-of-stream signal.
            return None
        self.pulldom.firstEvent[1] = head[1]
        return head[0]

    def clear(self):
        """clear(): Explicitly release parsing objects"""
        self.pulldom.clear()
        del self.pulldom
        self.parser = None
        self.stream = None
294 | \r | |
class SAX2DOM(PullDOM):
    """PullDOM variant that also links every new node into the tree.

    Each overridden callback lets PullDOM create and queue the node, then
    appends that node to the node currently on top of the element stack.
    """

    def startElementNS(self, name, tagName, attrs):
        """Queue the element, then append it to the node below it on the stack."""
        PullDOM.startElementNS(self, name, tagName, attrs)
        curNode = self.elementStack[-1]
        parentNode = self.elementStack[-2]
        parentNode.appendChild(curNode)

    def startElement(self, name, attrs):
        """Queue the element, then append it to the node below it on the stack."""
        PullDOM.startElement(self, name, attrs)
        curNode = self.elementStack[-1]
        parentNode = self.elementStack[-2]
        parentNode.appendChild(curNode)

    def processingInstruction(self, target, data):
        """Queue the processing instruction and append it to the open node.

        A processing instruction seen before the root element is only
        parked by PullDOM (no node exists yet), so there is nothing to
        append in that case.
        """
        PullDOM.processingInstruction(self, target, data)
        if not self.document:
            # PullDOM stored the raw (target, data) in pending_events
            # instead of queueing a node; lastEvent carries no node for it
            # and the element stack is still empty, so reading either
            # would fail.
            return
        node = self.lastEvent[0][1]
        parentNode = self.elementStack[-1]
        parentNode.appendChild(node)

    def ignorableWhitespace(self, chars):
        """Queue the whitespace text node and append it to the open node."""
        PullDOM.ignorableWhitespace(self, chars)
        node = self.lastEvent[0][1]
        parentNode = self.elementStack[-1]
        parentNode.appendChild(node)

    def characters(self, chars):
        """Queue the text node and append it to the open node."""
        PullDOM.characters(self, chars)
        node = self.lastEvent[0][1]
        parentNode = self.elementStack[-1]
        parentNode.appendChild(node)
326 | \r | |
327 | \r | |
# Read size parse() falls back to when the caller passes no bufsize:
# 16384 less 20.
default_bufsize = 2 ** 14 - 20
329 | \r | |
def parse(stream_or_string, parser=None, bufsize=None):
    """Return a DOMEventStream over a file name or an open stream.

    stream_or_string -- a string (taken as a file name and opened here) or
                        any object with a ``read(size)`` method.
    parser           -- SAX parser to use; a default one is created via
                        ``xml.sax.make_parser()`` when omitted.
    bufsize          -- read size per chunk; ``default_bufsize`` when None.

    A file opened here is not closed by this function.
    """
    if bufsize is None:
        bufsize = default_bufsize
    # isinstance() also accepts subclasses of the string types, which an
    # exact type() comparison would treat as streams.
    if isinstance(stream_or_string, tuple(_StringTypes)):
        # Binary mode: hand the parser the bytes exactly as stored, without
        # platform newline translation.
        stream = open(stream_or_string, 'rb')
    else:
        stream = stream_or_string
    if not parser:
        parser = xml.sax.make_parser()
    return DOMEventStream(stream, parser, bufsize)
340 | \r | |
def parseString(string, parser=None):
    """Return a DOMEventStream over XML text held in memory.

    The whole string is wrapped in a StringIO buffer and its length is used
    as the read size.  ``parser`` defaults to ``xml.sax.make_parser()``.
    """
    # Prefer the C implementation when it is available.
    try:
        from cStringIO import StringIO
    except ImportError:
        from StringIO import StringIO

    size = len(string)
    source = StringIO(string)
    if not parser:
        parser = xml.sax.make_parser()
    return DOMEventStream(source, parser, size)