]>
Commit | Line | Data |
---|---|---|
3257aa99 DM |
1 | """ codecs -- Python Codec Registry, API and helpers.\r |
2 | \r | |
3 | \r | |
4 | Written by Marc-Andre Lemburg (mal@lemburg.com).\r | |
5 | \r | |
6 | (c) Copyright CNRI, All Rights Reserved. NO WARRANTY.\r | |
7 | \r | |
8 | """#"\r | |
9 | \r | |
10 | import __builtin__, sys\r | |
11 | \r | |
12 | ### Registry and builtin stateless codec functions\r | |
13 | \r | |
14 | try:\r | |
15 | from _codecs import *\r | |
16 | except ImportError, why:\r | |
17 | raise SystemError('Failed to load the builtin codecs: %s' % why)\r | |
18 | \r | |
# Public API re-exported by "from codecs import *": the registry helpers
# (many of which come from the C module _codecs), the codec base classes
# and the BOM constants defined below.  Note: some names listed here
# (e.g. EncodedFile) are defined further down in this file.
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "CodecInfo", "Codec", "IncrementalEncoder", "IncrementalDecoder",
           "StreamReader", "StreamWriter",
           "StreamReaderWriter", "StreamRecoder",
           "getencoder", "getdecoder", "getincrementalencoder",
           "getincrementaldecoder", "getreader", "getwriter",
           "encode", "decode", "iterencode", "iterdecode",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors", "backslashreplace_errors",
           "register_error", "lookup_error"]
32 | \r | |
### Constants

#
# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
# and its possible byte string values
# for UTF8/UTF16/UTF32 output and little/big endian machines
#

# UTF-8
BOM_UTF8 = '\xef\xbb\xbf'

# UTF-16, little endian
BOM_LE = BOM_UTF16_LE = '\xff\xfe'

# UTF-16, big endian
BOM_BE = BOM_UTF16_BE = '\xfe\xff'

# UTF-32, little endian
BOM_UTF32_LE = '\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = '\x00\x00\xfe\xff'

# Pick the native-endianness aliases according to the host byte order.
if sys.byteorder == 'little':

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_LE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_LE

else:

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_BE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_BE

# Old broken names (don't use in new code); kept only for backward
# compatibility — the "32"/"64" refer to bits, not the UTF forms.
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
77 | \r | |
78 | \r | |
79 | ### Codec base classes (defining the API)\r | |
80 | \r | |
class CodecInfo(tuple):

    """ Codec details as returned by the codec registry.

        Behaves like the legacy 4-tuple (encode, decode, streamreader,
        streamwriter) for backward compatibility, while additionally
        exposing the incremental codec factories and the codec name as
        attributes.
    """

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
                incrementalencoder=None, incrementaldecoder=None, name=None):
        # The tuple payload keeps old-style callers working that unpack
        # the (encode, decode, reader, writer) quadruple directly.
        info = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        info.name = name
        info.encode = encode
        info.decode = decode
        info.incrementalencoder = incrementalencoder
        info.incrementaldecoder = incrementaldecoder
        info.streamwriter = streamwriter
        info.streamreader = streamreader
        return info

    def __repr__(self):
        klass = self.__class__
        return "<%s.%s object for encoding %s at 0x%x>" % (
            klass.__module__, klass.__name__, self.name, id(self))
97 | \r | |
class Codec:

    """ Interface definition for stateless encoders/decoders.

        Both .encode() and .decode() accept an errors argument that
        selects the error handling scheme.  The following string
        values are predefined:

         'strict' - raise a ValueError (or a subclass)
         'ignore' - drop the offending character and continue with
                    the next one
         'replace' - substitute a suitable replacement character;
                     the builtin Unicode codecs use the official
                     U+FFFD REPLACEMENT CHARACTER on decoding and
                     '?' on encoding
         'xmlcharrefreplace' - substitute the appropriate XML
                               character reference (encoding only)
         'backslashreplace' - substitute backslashed escape
                              sequences (encoding only)

        Additional values may be made available via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encodes the object input and returns a tuple (output
            object, length consumed).

            errors selects the error handling scheme; it defaults
            to 'strict'.

            The method must not store state in the Codec instance;
            codecs that need to keep state for efficient
            encoding/decoding should use a StreamCodec instead.

            Zero length input must be accepted and must yield an
            empty object of the output type.

        """
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decodes the object input and returns a tuple (output
            object, length consumed).

            input must be an object providing the bf_getreadbuf
            buffer slot, e.g. a Python string, a buffer object or a
            memory mapped file.

            errors selects the error handling scheme; it defaults
            to 'strict'.

            The method must not store state in the Codec instance;
            codecs that need to keep state for efficient
            encoding/decoding should use a StreamCodec instead.

            Zero length input must be accepted and must yield an
            empty object of the output type.

        """
        raise NotImplementedError
161 | \r | |
class IncrementalEncoder(object):
    """
    An IncrementalEncoder encodes its input in several steps: the input
    is fed piece by piece to the encode() method, and the instance keeps
    the state of the encoding process between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalEncoder instance.

        errors selects the error handling scheme; see the module
        docstring for the predefined values.
        """
        self.errors = errors
        self.buffer = ""

    def encode(self, input, final=False):
        """
        Encode input and return the resulting object.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the encoder to its initial state.
        """

    def getstate(self):
        """
        Return the current state of the encoder.
        """
        return 0

    def setstate(self, state):
        """
        Set the current state of the encoder.  state must have been
        returned by getstate().
        """
201 | \r | |
class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    Base class for incremental encoders that have to retain some of
    the (unencoded) input between calls to encode().
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # pending input from earlier encode() calls, not yet encoded
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Subclasses must implement this: encode input and return an
        # (output object, length consumed) pair.
        raise NotImplementedError

    def encode(self, input, final=False):
        # Prepend whatever was left over from the previous call.
        data = self.buffer + input
        (output, consumed) = self._buffer_encode(data, self.errors, final)
        # Anything not consumed is kept for the next call.
        self.buffer = data[consumed:]
        return output

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        return self.buffer or 0

    def setstate(self, state):
        self.buffer = state or ""
234 | \r | |
class IncrementalDecoder(object):
    """
    An IncrementalDecoder decodes its input in several steps: the input
    is fed piece by piece to the decode() method, and the instance keeps
    the state of the decoding process between those calls.
    """
    def __init__(self, errors='strict'):
        """
        Create an IncrementalDecoder instance.

        errors selects the error handling scheme; see the module
        docstring for the predefined values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decode input and return the resulting object.
        """
        raise NotImplementedError

    def reset(self):
        """
        Reset the decoder to its initial state.
        """

    def getstate(self):
        """
        Return the current state of the decoder.

        This must be a (buffered_input, additional_state_info) tuple.
        buffered_input must be a bytes object containing bytes that
        were passed to decode() that have not yet been converted.
        additional_state_info must be a non-negative integer
        representing the state of the decoder WITHOUT yet having
        processed the contents of buffered_input.  In the initial
        state and after reset(), getstate() must return (b"", 0).
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Set the current state of the decoder.

        state must have been returned by getstate().  The effect of
        setstate((b"", 0)) must be equivalent to reset().
        """
283 | \r | |
class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    Base class for incremental decoders that have to cope with
    incomplete byte sequences: undecoded tail bytes are buffered
    between calls to decode().
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # bytes already seen by decode() but not yet converted
        self.buffer = ""

    def _buffer_decode(self, input, errors, final):
        # Subclasses must implement this: decode input and return an
        # (output object, length consumed) pair.
        raise NotImplementedError

    def decode(self, input, final=False):
        # Prepend the undecoded remainder of the previous call.
        data = self.buffer + input
        (output, consumed) = self._buffer_decode(data, self.errors, final)
        # Anything not consumed waits for more data to arrive.
        self.buffer = data[consumed:]
        return output

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # additional state info is always 0
        return (self.buffer, 0)

    def setstate(self, state):
        # any additional state info is ignored
        self.buffer = state[0]
318 | \r | |
319 | #\r | |
320 | # The StreamWriter and StreamReader class provide generic working\r | |
321 | # interfaces which can be used to implement new encoding submodules\r | |
322 | # very easily. See encodings/utf_8.py for an example on how this is\r | |
323 | # done.\r | |
324 | #\r | |
325 | \r | |
class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            The StreamWriter may use different error handling schemes
            by providing the errors keyword argument.  Predefined
            values are:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - drop the offending character and continue
                        with the next one
             'replace' - substitute a suitable replacement character
             'xmlcharrefreplace' - substitute the appropriate XML
                                   character reference
             'backslashreplace' - substitute backslashed escape
                                  sequences (only for encoding)

            Further values can be made available via register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Writes the object's contents encoded to self.stream.
        """
        encoded, consumed = self.encode(object, self.errors)
        self.stream.write(encoded)

    def writelines(self, list):

        """ Writes the concatenated list of strings to the stream
            using .write().
        """
        joined = ''.join(list)
        self.write(joined)

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping
            state.

            After calling this method the output data should be in a
            clean state that allows appending fresh data without
            having to rescan the whole stream to recover state.

        """
        pass

    def seek(self, offset, whence=0):
        self.stream.seek(offset, whence)
        # Rewinding to the start invalidates any pending codec state.
        if offset == 0 and whence == 0:
            self.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
396 | \r | |
397 | ###\r | |
398 | \r | |
class StreamReader(Codec):

    """ Generic stream reader: reads (binary) data from a wrapped
        stream and decodes it via the codec's .decode() method.
    """

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character;

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        # bytes read from the stream but not yet decoded
        self.bytebuffer = ""
        # For str->str decoding this will stay a str
        # For str->unicode decoding the first read will promote it to unicode
        self.charbuffer = ""
        # list of already-split lines cached by readline(); mutually
        # exclusive with charbuffer (charbuffer is None while this is set)
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        # Must be overridden by the concrete codec's reader class.
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of characters to read from the
            stream. read() will never return more than chars
            characters, but it might return less, if there are not enough
            characters available.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible. size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input only the first line
            will be returned, the rest of the input will be kept until the
            next call to read().

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g. if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = "".join(self.linebuffer)
            self.linebuffer = None

        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars >= 0:
                if len(self.charbuffer) >= chars:
                    break
            elif size >= 0:
                if len(self.charbuffer) >= size:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError, exc:
                if firstline:
                    # retry decoding only the part before the error; if
                    # that still holds no complete line, re-raise
                    newchars, decodedbytes = self.decode(data[:exc.start], self.errors)
                    lines = newchars.splitlines(True)
                    if len(lines)<=1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = ""
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(False)[0]
            return line

        readsize = size or 72
        line = ""
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if data.endswith("\r"):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = "".join(lines[1:]) + self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(False)[0]
                break
            # grow the read chunk size (capped) and try again
            if readsize<8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as list of lines.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries.

            sizehint, if given, is ignored since there is no efficient
            way to finding the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = ""
        self.charbuffer = u""
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.stream.seek(offset, whence)
        self.reset()

    def next(self):

        """ Return the next decoded line from the input stream."""
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
638 | \r | |
639 | ###\r | |
640 | \r | |
class StreamReaderWriter:

    """ Wraps a stream that works in both read and write modes,
        delegating decoding to a StreamReader and encoding to a
        StreamWriter built around the same underlying stream.

        The factory functions returned by codecs.lookup() can be
        passed directly as the Reader/Writer arguments.

    """
    # Optional attribute set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader and Writer must be factory functions or classes
            providing the StreamReader resp. StreamWriter interface.

            Error handling follows the conventions of the
            StreamWriter/Reader classes.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):
        return self.reader.read(size)

    def readline(self, size=None):
        return self.reader.readline(size)

    def readlines(self, sizehint=None):
        return self.reader.readlines(sizehint)

    def next(self):
        """ Return the next decoded line from the input stream."""
        return self.reader.next()

    def __iter__(self):
        return self

    def write(self, data):
        return self.writer.write(data)

    def writelines(self, list):
        return self.writer.writelines(list)

    def reset(self):
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):
        # A seek always invalidates the reading state; the writer only
        # needs a reset when rewinding to the very beginning.
        self.stream.seek(offset, whence)
        self.reader.reset()
        if whence == 0 and offset == 0:
            self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    # these are needed to make "with codecs.open(...)" work properly

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
725 | \r | |
726 | ###\r | |
727 | \r | |
class StreamRecoder:

    """ StreamRecoder instances translate data between a frontend and
        a backend encoding.

        They use the complete set of APIs returned by the
        codecs.lookup() function to do their work.

        Data written to the stream is first decoded into an
        intermediate format (dependent on the given codec combination)
        and then written to the stream via an instance of the provided
        Writer class.

        In the other direction, data is read from the stream via a
        Reader instance and re-encoded before being handed back to the
        caller.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            input to .read() and output of .write()) while Reader and
            Writer work on the backend (reading and writing to the
            stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode and decode must adhere to the Codec interface;
            Reader and Writer must be factory functions or classes
            providing the StreamReader resp. StreamWriter interface.

            encode and decode handle the frontend translation, Reader
            and Writer the backend translation.  Unicode is used as
            the intermediate encoding.

            Error handling follows the conventions of the
            StreamWriter/Reader classes.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):
        data = self.reader.read(size)
        recoded, consumed = self.encode(data, self.errors)
        return recoded

    def readline(self, size=None):
        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        recoded, consumed = self.encode(data, self.errors)
        return recoded

    def readlines(self, sizehint=None):
        data = self.reader.read()
        recoded, consumed = self.encode(data, self.errors)
        return recoded.splitlines(1)

    def next(self):

        """ Return the next decoded line from the input stream."""
        data = self.reader.next()
        recoded, consumed = self.encode(data, self.errors)
        return recoded

    def __iter__(self):
        return self

    def write(self, data):
        recoded, consumed = self.decode(data, self.errors)
        return self.writer.write(recoded)

    def writelines(self, list):
        joined = ''.join(list)
        recoded, consumed = self.decode(joined, self.errors)
        return self.writer.write(recoded)

    def reset(self):
        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
841 | \r | |
842 | ### Shortcuts\r | |
843 | \r | |
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: the wrapped version only accepts the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs.  Output is codec dependent and will usually be
        Unicode as well.

        Files are always opened in binary mode, even if no binary
        mode was specified.  This avoids data loss with encodings
        that use 8-bit values.  The default file mode is 'rb',
        i.e. binary read mode.

        encoding specifies the encoding to be used for the file.

        errors defines the error handling; it defaults to 'strict',
        which raises a ValueError in case an encoding error occurs.

        buffering has the same meaning as for the builtin open() API;
        it defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding.  This
        attribute is only available if an encoding was specified as
        parameter.

    """
    if encoding is not None:
        if 'U' in mode:
            # No automatic conversion of '\n' is done on reading and
            # writing; the codec layer must see the raw line endings.
            mode = mode.strip().replace('U', '')
            if mode[:1] not in set('rwa'):
                mode = 'r' + mode
        if 'b' not in mode:
            # Force opening of the file in binary mode
            mode = mode + 'b'
    stream = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        return stream
    info = lookup(encoding)
    wrapped = StreamReaderWriter(stream, info.streamreader,
                                 info.streamwriter, errors)
    # Add attributes to simplify introspection
    wrapped.encoding = encoding
    return wrapped
892 | \r | |
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Wrap *file* in a StreamRecoder providing transparent encoding
        translation.

        Strings written to the wrapper are interpreted according to
        data_encoding, recoded, and stored in the original file using
        file_encoding. Strings read back travel the opposite way. The
        intermediate form is usually Unicode but depends on the codecs
        involved.

        file_encoding defaults to data_encoding when not given.

        errors defines the error handling; the default 'strict' causes
        ValueErrors to be raised on encoding errors.

        The wrapper exposes two extra attributes, .data_encoding and
        .file_encoding, reflecting the parameters of the same name for
        introspection purposes.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    data_codec = lookup(data_encoding)
    file_codec = lookup(file_encoding)
    recoder = StreamRecoder(file,
                            data_codec.encode, data_codec.decode,
                            file_codec.streamreader, file_codec.streamwriter,
                            errors)
    # Expose the chosen encodings for introspection.
    recoder.data_encoding = data_encoding
    recoder.file_encoding = file_encoding
    return recoder
928 | \r | |
929 | ### Helpers for codec lookup\r | |
930 | \r | |
def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec = lookup(encoding)
    return codec.encode
940 | \r | |
def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    codec = lookup(encoding)
    return codec.decode
950 | \r | |
def getincrementalencoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.

    """
    factory = lookup(encoding).incrementalencoder
    if factory is None:
        # The codec exists but has no incremental interface.
        raise LookupError(encoding)
    return factory
964 | \r | |
def getincrementaldecoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.

    """
    factory = lookup(encoding).incrementaldecoder
    if factory is None:
        # The codec exists but has no incremental interface.
        raise LookupError(encoding)
    return factory
978 | \r | |
def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).streamreader
988 | \r | |
def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    return lookup(encoding).streamwriter
998 | \r | |
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Lazily encodes the strings produced by iterator with an
    IncrementalEncoder, yielding only the non-empty chunks.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    encoder = getincrementalencoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        encoded = encoder.encode(chunk)
        if encoded:
            yield encoded
    # Flush any pending state held by the encoder.
    tail = encoder.encode("", True)
    if tail:
        yield tail
1016 | \r | |
def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Lazily decodes the inputs produced by iterator with an
    IncrementalDecoder, yielding only the non-empty chunks.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    decoder = getincrementaldecoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        decoded = decoder.decode(chunk)
        if decoded:
            yield decoded
    # Flush any pending state held by the decoder.
    tail = decoder.decode("", True)
    if tail:
        yield tail
1034 | \r | |
1035 | ### Helpers for charmap-based codecs\r | |
1036 | \r | |
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Return a dictionary where elements of the rng sequence are
        mapped to themselves.

    """
    return {elem: elem for elem in rng}
1049 | \r | |
def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map.

        If a target mapping in the decoding map occurs multiple
        times, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple character to \\u001a.

    """
    encoding_map = {}
    for byte_val, char_val in decoding_map.items():
        if char_val in encoding_map:
            # Ambiguous reverse mapping: mark as undefined.
            encoding_map[char_val] = None
        else:
            encoding_map[char_val] = byte_val
    return encoding_map
1070 | \r | |
1071 | ### error handlers\r | |
1072 | \r | |
# Pre-bind the standard error handler callbacks so they can be passed
# directly as the errors argument of codec calls.
try:
    strict_errors = lookup_error("strict")
    ignore_errors = lookup_error("ignore")
    replace_errors = lookup_error("replace")
    xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
    backslashreplace_errors = lookup_error("backslashreplace")
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    strict_errors = None
    ignore_errors = None
    replace_errors = None
    xmlcharrefreplace_errors = None
    backslashreplace_errors = None
1086 | \r | |
# Tell modulefinder that using codecs probably needs the encodings
# package
# NOTE: this branch never runs; the statically visible import is only
# there so freeze tools (modulefinder & friends) bundle 'encodings'.
_false = 0
if _false:
    import encodings
1092 | \r | |
1093 | ### Tests\r | |
1094 | \r | |
if __name__ == '__main__':

    # Small smoke test / demo: recode stdio via EncodedFile.

    # Make stdout translate Latin-1 output into UTF-8 output
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')