]>
Commit | Line | Data |
---|---|---|
4710c53d | 1 | # Copyright (C) 2004-2006 Python Software Foundation\r |
2 | # Authors: Baxter, Wouters and Warsaw\r | |
3 | # Contact: email-sig@python.org\r | |
4 | \r | |
5 | """FeedParser - An email feed parser.\r | |
6 | \r | |
7 | The feed parser implements an interface for incrementally parsing an email\r | |
8 | message, line by line. This has advantages for certain applications, such as\r | |
9 | those reading email messages off a socket.\r | |
10 | \r | |
11 | FeedParser.feed() is the primary interface for pushing new data into the\r | |
12 | parser. It returns when there's nothing more it can do with the available\r | |
13 | data. When you have no more data to push into the parser, call .close().\r | |
14 | This completes the parsing and returns the root message object.\r | |
15 | \r | |
16 | The other advantage of this parser is that it will never throw a parsing\r | |
17 | exception. Instead, when it finds something unexpected, it adds a 'defect' to\r | |
18 | the current message. Defects are just instances that live on the message\r | |
19 | object's .defects attribute.\r | |
20 | """\r | |
21 | \r | |
22 | __all__ = ['FeedParser']\r | |
23 | \r | |
24 | import re\r | |
25 | \r | |
26 | from email import errors\r | |
27 | from email import message\r | |
28 | \r | |
# Line-ending patterns.  Each recognizes the three conventions the parser
# accepts: CRLF, a bare CR, or a bare LF.
#
# NLCRE is used with .match() to test whether a line *starts* with a line
# ending, i.e. whether it is a blank line.
NLCRE = re.compile(r'\r\n|\r|\n')
# Same pattern with a capturing group; used with .match() to strip a leading
# line ending from the first line of an epilogue.
NLCRE_bol = re.compile(r'(\r\n|\r|\n)')
# Line ending anchored at the very end of the string; used with .search() to
# strip one trailing line ending.  This must be a raw string: '\Z' is a regex
# escape, not a string escape, and a non-raw literal relies on Python passing
# the unknown escape through untouched.
NLCRE_eol = re.compile(r'(\r\n|\r|\n)\Z')
# Capturing pattern used with .split() so that the separators are kept in the
# resulting list and can be re-attached to their lines.
NLCRE_crack = re.compile(r'(\r\n|\r|\n)')
# RFC 2822 section 3.6.8 Optional fields.  ftext is %d33-57 / %d59-126, Any
# character except controls, SP, and ":".  The octal ranges below are exactly
# those code points (041-071 == 33-57, 073-176 == 59-126).  A line counts as
# part of the header block if it is a unix-from line ("From "), a
# "name:" field, or a continuation line starting with a tab or space.
headerRE = re.compile(r'^(From |[\041-\071\073-\176]{1,}:|[\t ])')
EMPTYSTRING = ''
NL = '\n'

# Unique sentinel: returned by BufferedSubFile.readline() and yielded by the
# parsing generator when no complete line is available yet.  Always compared
# by identity ("is NeedMoreData").
NeedMoreData = object()
40 | \r | |
41 | \r | |
42 | \f\r | |
class BufferedSubFile(object):
    """A file-ish object that can have new data loaded into it.

    You can also push and pop line-matching predicates onto a stack.  When the
    current predicate matches the current line, a false EOF response
    (i.e. empty string) is returned instead.  This lets the parser adhere to a
    simple abstraction -- it parses until EOF closes the current message.
    """
    def __init__(self):
        # The last partial line pushed into this object.
        self._partial = ''
        # The list of full, pushed lines, in reverse order (the next line to
        # be read is the last element).
        self._lines = []
        # The stack of false-EOF checking predicates.
        self._eofstack = []
        # A flag indicating whether the file has been closed or not.
        self._closed = False

    def push_eof_matcher(self, pred):
        """Push a predicate; a line it accepts is reported as a false EOF."""
        self._eofstack.append(pred)

    def pop_eof_matcher(self):
        """Remove and return the most recently pushed false-EOF predicate."""
        return self._eofstack.pop()

    def close(self):
        """Mark the file closed, queueing any trailing partial line.

        The partial line is the last text that was pushed, so it has to be
        read *after* every line still waiting in the buffer.  Because _lines
        is kept in reverse order, that means inserting it at the front (which
        is what pushlines() does) rather than appending it.  An empty partial
        is not queued at all: an empty string in the buffer would be read
        back as an EOF marker ahead of any pending lines.
        """
        if self._partial:
            self.pushlines([self._partial])
        self._partial = ''
        self._closed = True

    def readline(self):
        """Return the next buffered line.

        Returns NeedMoreData when no complete line is buffered and the file
        is still open, and '' when the buffer is empty and the file is
        closed.  '' is also returned (a false EOF) when any pushed predicate
        matches the line; in that case the line stays in the buffer.
        """
        if not self._lines:
            if self._closed:
                return ''
            return NeedMoreData
        # Pop the line off the stack and see if it matches the current
        # false-EOF predicate.
        line = self._lines.pop()
        # RFC 2046, section 5.1.2 requires us to recognize outer level
        # boundaries at any level of inner nesting.  Do this, but be sure it's
        # in the order of most to least nested.
        for ateof in reversed(self._eofstack):
            if ateof(line):
                # We're at the false EOF.  But push the last line back first.
                self._lines.append(line)
                return ''
        return line

    def unreadline(self, line):
        """Push a line back so it is the next one returned by readline()."""
        # Let the consumer push a line back into the buffer.
        assert line is not NeedMoreData
        self._lines.append(line)

    def push(self, data):
        """Push some new data into this object."""
        # Handle any previous leftovers
        data, self._partial = self._partial + data, ''
        # Crack into lines, but preserve the newlines on the end of each
        parts = NLCRE_crack.split(data)
        # The *ahem* interesting behaviour of re.split when supplied grouping
        # parentheses is that the last element of the resulting list is the
        # data after the final RE.  In the case of a NL/CR terminated string,
        # this is the empty string.
        self._partial = parts.pop()
        #GAN 29Mar09  bugs 1555570, 1721862  Confusion at 8K boundary ending
        # with \r: is there a \n to follow later?  If the data ends in a bare
        # CR, hold that whole last line back as the partial so that a LF
        # arriving in the next push is joined to it as one CRLF.
        if not self._partial and parts and parts[-1].endswith('\r'):
            self._partial = parts.pop(-2)+parts.pop()
        # parts is a list of strings, alternating between the line contents
        # and the eol character(s).  Gather up a list of lines after
        # re-attaching the newlines.
        lines = [parts[i] + parts[i + 1]
                 for i in range(0, len(parts) - 1, 2)]
        self.pushlines(lines)

    def pushlines(self, lines):
        """Queue complete lines so they are read after all buffered ones."""
        # Reverse and insert at the front of the lines.
        self._lines[:0] = lines[::-1]

    def is_closed(self):
        """Return True once close() has been called."""
        return self._closed

    def __iter__(self):
        return self

    def next(self):
        """Iterator step: like readline(), but '' ends the iteration.

        NeedMoreData is passed through to the caller as an ordinary item.
        """
        line = self.readline()
        if line == '':
            raise StopIteration
        return line
134 | \r | |
135 | \r | |
136 | \f\r | |
class FeedParser:
    """A feed-style parser of email."""

    def __init__(self, _factory=message.Message):
        """_factory is called with no arguments to create a new message obj"""
        self._factory = _factory
        # Line buffer that feed() pushes into and the parser reads from.
        self._input = BufferedSubFile()
        # Stack of messages currently being parsed; the root message is at
        # the bottom and is what close() finally pops and returns.
        self._msgstack = []
        # Bound next() of the top-level parsing generator.  Creating the
        # generator does not run any of its code yet, so the attributes
        # below are initialized before the first parsing step.
        self._parse = self._parsegen().next
        # Message on top of the stack (None when the stack is empty).
        self._cur = None
        # Most recently created message; the multipart code uses it to trim
        # the newline that belongs to the following boundary.
        self._last = None
        self._headersonly = False

    # Non-public interface for supporting Parser's headersonly flag
    def _set_headersonly(self):
        """Make the parser store everything after the headers as the payload
        of the message being parsed, without interpreting its structure."""
        self._headersonly = True

    def feed(self, data):
        """Push more data into the parser."""
        self._input.push(data)
        self._call_parse()

    def _call_parse(self):
        """Resume the parsing generator until it next yields or finishes."""
        try:
            self._parse()
        except StopIteration:
            # The generator has run to completion; nothing more to do.
            pass

    def close(self):
        """Parse all remaining data and return the root message object."""
        self._input.close()
        self._call_parse()
        root = self._pop_message()
        assert not self._msgstack
        # Look for final set of defects
        if root.get_content_maintype() == 'multipart' \
               and not root.is_multipart():
            root.defects.append(errors.MultipartInvariantViolationDefect())
        return root

    def _new_message(self):
        """Create a message with the factory and make it the current one.

        The new message is attached to the message on top of the stack (if
        there is one), pushed onto the stack, and recorded as both _cur and
        _last.  If the current message is a multipart/digest, the new
        message's default type is set to message/rfc822.
        """
        msg = self._factory()
        if self._cur and self._cur.get_content_type() == 'multipart/digest':
            msg.set_default_type('message/rfc822')
        if self._msgstack:
            self._msgstack[-1].attach(msg)
        self._msgstack.append(msg)
        self._cur = msg
        self._last = msg

    def _pop_message(self):
        """Pop and return the top message; _cur becomes the new top of the
        stack, or None when the stack is now empty."""
        retval = self._msgstack.pop()
        if self._msgstack:
            self._cur = self._msgstack[-1]
        else:
            self._cur = None
        return retval

    def _parsegen(self):
        """Generator that parses one message (headers, then body).

        It pushes a new message, reads lines from self._input and yields
        NeedMoreData every time the input has no complete line available.
        It recurses into itself for nested messages and multipart subparts,
        re-yielding their NeedMoreData.  The message it created is left on
        the stack; the caller (a parent _parsegen or close()) pops it.
        """
        # Create a new message and start by parsing headers.
        self._new_message()
        headers = []
        # Collect the headers, searching for a line that doesn't match the RFC
        # 2822 header or continuation pattern (including an empty line).
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            if not headerRE.match(line):
                # If we saw the RFC defined header/body separator
                # (i.e. newline), just throw it away. Otherwise the line is
                # part of the body so push it back.
                if not NLCRE.match(line):
                    self._input.unreadline(line)
                break
            headers.append(line)
        # Done with the headers, so parse them and figure out what we're
        # supposed to see in the body of the message.
        self._parse_headers(headers)
        # Headers-only parsing is a backwards compatibility hack, which was
        # necessary in the older parser, which could throw errors.  All
        # remaining lines in the input are thrown into the message body.
        if self._headersonly:
            lines = []
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                lines.append(line)
            self._cur.set_payload(EMPTYSTRING.join(lines))
            return
        if self._cur.get_content_type() == 'message/delivery-status':
            # message/delivery-status contains blocks of headers separated by
            # a blank line.  We'll represent each header block as a separate
            # nested message object, but the processing is a bit different
            # than standard message/* types because there is no body for the
            # nested messages.  A blank line separates the subparts.
            while True:
                # A blank line acts as a false EOF for the nested parse.
                self._input.push_eof_matcher(NLCRE.match)
                for retval in self._parsegen():
                    if retval is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                msg = self._pop_message()
                # We need to pop the EOF matcher in order to tell if we're at
                # the end of the current file, not the end of the last block
                # of message headers.
                self._input.pop_eof_matcher()
                # The input stream must be sitting at the newline or at the
                # EOF.  We want to see if we're at the end of this subpart, so
                # first consume the blank line, then test the next line to see
                # if we're at this subpart's EOF.
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                while True:
                    line = self._input.readline()
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    break
                if line == '':
                    break
                # Not at EOF so this is a line we're going to need.
                self._input.unreadline(line)
            return
        if self._cur.get_content_maintype() == 'message':
            # The message claims to be a message/* type, then what follows is
            # another RFC 2822 message.
            for retval in self._parsegen():
                if retval is NeedMoreData:
                    yield NeedMoreData
                    continue
                break
            self._pop_message()
            return
        if self._cur.get_content_maintype() == 'multipart':
            boundary = self._cur.get_boundary()
            if boundary is None:
                # The message /claims/ to be a multipart but it has not
                # defined a boundary.  That's a problem which we'll handle by
                # reading everything until the EOF and marking the message as
                # defective.
                self._cur.defects.append(errors.NoBoundaryInMultipartDefect())
                lines = []
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                    lines.append(line)
                self._cur.set_payload(EMPTYSTRING.join(lines))
                return
            # Create a line match predicate which matches the inter-part
            # boundary as well as the end-of-multipart boundary.  Don't push
            # this onto the input stream until we've scanned past the
            # preamble.
            separator = '--' + boundary
            boundaryre = re.compile(
                '(?P<sep>' + re.escape(separator) +
                r')(?P<end>--)?(?P<ws>[ \t]*)(?P<linesep>\r\n|\r|\n)?$')
            capturing_preamble = True
            preamble = []
            # Becomes the line ending of the end boundary (or None if it had
            # none); stays False if no end boundary is ever seen.
            linesep = False
            while True:
                line = self._input.readline()
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                if line == '':
                    break
                mo = boundaryre.match(line)
                if mo:
                    # If we're looking at the end boundary, we're done with
                    # this multipart.  If there was a newline at the end of
                    # the closing boundary, then we need to initialize the
                    # epilogue with the empty string (see below).
                    if mo.group('end'):
                        linesep = mo.group('linesep')
                        break
                    # We saw an inter-part boundary.  Were we in the preamble?
                    if capturing_preamble:
                        if preamble:
                            # According to RFC 2046, the last newline belongs
                            # to the boundary.
                            lastline = preamble[-1]
                            eolmo = NLCRE_eol.search(lastline)
                            if eolmo:
                                preamble[-1] = lastline[:-len(eolmo.group(0))]
                            self._cur.preamble = EMPTYSTRING.join(preamble)
                        capturing_preamble = False
                        # Re-read the same boundary line on the next pass,
                        # this time as the start of the first subpart.
                        self._input.unreadline(line)
                        continue
                    # We saw a boundary separating two parts.  Consume any
                    # multiple boundary lines that may be following.  Our
                    # interpretation of RFC 2046 BNF grammar does not produce
                    # body parts within such double boundaries.
                    while True:
                        line = self._input.readline()
                        if line is NeedMoreData:
                            yield NeedMoreData
                            continue
                        mo = boundaryre.match(line)
                        if not mo:
                            self._input.unreadline(line)
                            break
                    # Recurse to parse this subpart; the input stream points
                    # at the subpart's first line.
                    self._input.push_eof_matcher(boundaryre.match)
                    for retval in self._parsegen():
                        if retval is NeedMoreData:
                            yield NeedMoreData
                            continue
                        break
                    # Because of RFC 2046, the newline preceding the boundary
                    # separator actually belongs to the boundary, not the
                    # previous subpart's payload (or epilogue if the previous
                    # part is a multipart).
                    if self._last.get_content_maintype() == 'multipart':
                        epilogue = self._last.epilogue
                        if epilogue == '':
                            self._last.epilogue = None
                        elif epilogue is not None:
                            mo = NLCRE_eol.search(epilogue)
                            if mo:
                                end = len(mo.group(0))
                                self._last.epilogue = epilogue[:-end]
                    else:
                        payload = self._last.get_payload()
                        if isinstance(payload, basestring):
                            mo = NLCRE_eol.search(payload)
                            if mo:
                                payload = payload[:-len(mo.group(0))]
                                self._last.set_payload(payload)
                    self._input.pop_eof_matcher()
                    self._pop_message()
                    # Set the multipart up for newline cleansing, which will
                    # happen if we're in a nested multipart.
                    self._last = self._cur
                else:
                    # I think we must be in the preamble
                    assert capturing_preamble
                    preamble.append(line)
            # We've seen either the EOF or the end boundary.  If we're still
            # capturing the preamble, we never saw the start boundary.  Note
            # that as a defect and store the captured text as the payload.
            # Everything from here to the EOF is epilogue.
            if capturing_preamble:
                self._cur.defects.append(errors.StartBoundaryNotFoundDefect())
                self._cur.set_payload(EMPTYSTRING.join(preamble))
                epilogue = []
                # NOTE(review): this loop drains the remaining input without
                # storing it, so the epilogue set below is always ''.
                for line in self._input:
                    if line is NeedMoreData:
                        yield NeedMoreData
                        continue
                self._cur.epilogue = EMPTYSTRING.join(epilogue)
                return
            # If the end boundary ended in a newline, we'll need to make sure
            # the epilogue isn't None
            if linesep:
                epilogue = ['']
            else:
                epilogue = []
            for line in self._input:
                if line is NeedMoreData:
                    yield NeedMoreData
                    continue
                epilogue.append(line)
            # Any CRLF at the front of the epilogue is not technically part of
            # the epilogue.  Also, watch out for an empty string epilogue,
            # which means a single newline.
            if epilogue:
                firstline = epilogue[0]
                bolmo = NLCRE_bol.match(firstline)
                if bolmo:
                    epilogue[0] = firstline[len(bolmo.group(0)):]
            self._cur.epilogue = EMPTYSTRING.join(epilogue)
            return
        # Otherwise, it's some non-multipart type, so the entire rest of the
        # file contents becomes the payload.
        lines = []
        for line in self._input:
            if line is NeedMoreData:
                yield NeedMoreData
                continue
            lines.append(line)
        self._cur.set_payload(EMPTYSTRING.join(lines))

    def _parse_headers(self, lines):
        """Store the given header lines on the current message.

        Folded (continuation) lines are joined to the header they continue.
        A unix-from line is recorded with set_unixfrom() when it is the first
        line, pushed back to the input when it is the last line, and noted as
        a defect anywhere else.  A leading continuation line and a line
        without a colon are also recorded as defects and skipped.
        """
        # Passed a list of lines that make up the headers for the current msg
        lastheader = ''
        lastvalue = []
        for lineno, line in enumerate(lines):
            # Check for continuation
            if line[0] in ' \t':
                if not lastheader:
                    # The first line of the headers was a continuation.  This
                    # is illegal, so let's note the defect, store the illegal
                    # line, and ignore it for purposes of headers.
                    defect = errors.FirstHeaderLineIsContinuationDefect(line)
                    self._cur.defects.append(defect)
                    continue
                lastvalue.append(line)
                continue
            # A non-continuation line ends the previous header, so store it.
            if lastheader:
                # XXX reconsider the joining of folded lines
                lhdr = EMPTYSTRING.join(lastvalue)[:-1].rstrip('\r\n')
                self._cur[lastheader] = lhdr
                lastheader, lastvalue = '', []
            # Check for envelope header, i.e. unix-from
            if line.startswith('From '):
                if lineno == 0:
                    # Strip off the trailing newline
                    mo = NLCRE_eol.search(line)
                    if mo:
                        line = line[:-len(mo.group(0))]
                    self._cur.set_unixfrom(line)
                    continue
                elif lineno == len(lines) - 1:
                    # Something looking like a unix-from at the end - it's
                    # probably the first line of the body, so push back the
                    # line and stop.
                    self._input.unreadline(line)
                    return
                else:
                    # Weirdly placed unix-from line.  Note this as a defect
                    # and ignore it.
                    defect = errors.MisplacedEnvelopeHeaderDefect(line)
                    self._cur.defects.append(defect)
                    continue
            # Split the line on the colon separating field name from value.
            i = line.find(':')
            if i < 0:
                defect = errors.MalformedHeaderDefect(line)
                self._cur.defects.append(defect)
                continue
            lastheader = line[:i]
            lastvalue = [line[i+1:].lstrip()]
        # Done with all the lines, so handle the last header.
        if lastheader:
            # XXX reconsider the joining of folded lines
            self._cur[lastheader] = EMPTYSTRING.join(lastvalue).rstrip('\r\n')