]> git.proxmox.com Git - mirror_edk2.git/blame - AppPkg/Applications/Python/Python-2.7.10/Lib/codecs.py
AppPkg/Applications/Python/Python-2.7.10: Initial Checkin part 4/5.
[mirror_edk2.git] / AppPkg / Applications / Python / Python-2.7.10 / Lib / codecs.py
CommitLineData
3257aa99
DM
1""" codecs -- Python Codec Registry, API and helpers.\r
2\r
3\r
4Written by Marc-Andre Lemburg (mal@lemburg.com).\r
5\r
6(c) Copyright CNRI, All Rights Reserved. NO WARRANTY.\r
7\r
8"""#"\r
9\r
10import __builtin__, sys\r
11\r
12### Registry and builtin stateless codec functions\r
13\r
14try:\r
15 from _codecs import *\r
16except ImportError, why:\r
17 raise SystemError('Failed to load the builtin codecs: %s' % why)\r
18\r
# Public API of this module; star-imports and documentation tools use this.
__all__ = ["register", "lookup", "open", "EncodedFile", "BOM", "BOM_BE",
           "BOM_LE", "BOM32_BE", "BOM32_LE", "BOM64_BE", "BOM64_LE",
           "BOM_UTF8", "BOM_UTF16", "BOM_UTF16_LE", "BOM_UTF16_BE",
           "BOM_UTF32", "BOM_UTF32_LE", "BOM_UTF32_BE",
           "CodecInfo", "Codec", "IncrementalEncoder", "IncrementalDecoder",
           "StreamReader", "StreamWriter",
           "StreamReaderWriter", "StreamRecoder",
           "getencoder", "getdecoder", "getincrementalencoder",
           "getincrementaldecoder", "getreader", "getwriter",
           "encode", "decode", "iterencode", "iterdecode",
           "strict_errors", "ignore_errors", "replace_errors",
           "xmlcharrefreplace_errors", "backslashreplace_errors",
           "register_error", "lookup_error"]
32\r
33### Constants\r
34\r
#
# Byte Order Mark (BOM = ZERO WIDTH NO-BREAK SPACE = U+FEFF)
# and its possible byte string values
# for UTF8/UTF16/UTF32 output and little/big endian machines
#

# UTF-8
BOM_UTF8 = '\xef\xbb\xbf'

# UTF-16, little endian
BOM_LE = BOM_UTF16_LE = '\xff\xfe'

# UTF-16, big endian
BOM_BE = BOM_UTF16_BE = '\xfe\xff'

# UTF-32, little endian
BOM_UTF32_LE = '\xff\xfe\x00\x00'

# UTF-32, big endian
BOM_UTF32_BE = '\x00\x00\xfe\xff'

if sys.byteorder == 'little':

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_LE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_LE

else:

    # UTF-16, native endianness
    BOM = BOM_UTF16 = BOM_UTF16_BE

    # UTF-32, native endianness
    BOM_UTF32 = BOM_UTF32_BE

# Old broken names (don't use in new code)
# (the "32"/"64" prefixes are misleading: these are aliases for the
# UTF-16 and UTF-32 marks defined above, not separate encodings)
BOM32_LE = BOM_UTF16_LE
BOM32_BE = BOM_UTF16_BE
BOM64_LE = BOM_UTF32_LE
BOM64_BE = BOM_UTF32_BE
77\r
78\r
79### Codec base classes (defining the API)\r
80\r
class CodecInfo(tuple):
    """ Codec details as returned by the codec registry.

        Instances behave as the classic 4-tuple
        (encode, decode, streamreader, streamwriter) for backward
        compatibility, while also exposing every entry — including the
        incremental codecs and the codec name — as named attributes.
    """

    def __new__(cls, encode, decode, streamreader=None, streamwriter=None,
        incrementalencoder=None, incrementaldecoder=None, name=None):
        # Only the four classic entries live in the tuple part; the
        # incremental factories and the name are attribute-only.
        info = tuple.__new__(cls, (encode, decode, streamreader, streamwriter))
        info.name = name
        info.encode = encode
        info.decode = decode
        info.incrementalencoder = incrementalencoder
        info.incrementaldecoder = incrementaldecoder
        info.streamwriter = streamwriter
        info.streamreader = streamreader
        return info

    def __repr__(self):
        klass = self.__class__
        return "<%s.%s object for encoding %s at 0x%x>" % (
            klass.__module__, klass.__name__, self.name, id(self))
97\r
class Codec:

    """ Defines the interface for stateless encoders/decoders.

        The .encode()/.decode() methods may use different error
        handling schemes by providing the errors argument. These
        string values are predefined:

         'strict' - raise a ValueError error (or a subclass)
         'ignore' - ignore the character and continue with the next
         'replace' - replace with a suitable replacement character;
                     Python will use the official U+FFFD REPLACEMENT
                     CHARACTER for the builtin Unicode codecs on
                     decoding and '?' on encoding.
         'xmlcharrefreplace' - Replace with the appropriate XML
                               character reference (only for encoding).
         'backslashreplace' - Replace with backslashed escape sequences
                              (only for encoding).

        The set of allowed values can be extended via register_error.

    """
    def encode(self, input, errors='strict'):

        """ Encodes the object input and returns a tuple (output
            object, length consumed).

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamCodec for codecs which have to keep state in order to
            make encoding/decoding efficient.

            The encoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        # Abstract: concrete codecs must override this.
        raise NotImplementedError

    def decode(self, input, errors='strict'):

        """ Decodes the object input and returns a tuple (output
            object, length consumed).

            input must be an object which provides the bf_getreadbuf
            buffer slot. Python strings, buffer objects and memory
            mapped files are examples of objects providing this slot.

            errors defines the error handling to apply. It defaults to
            'strict' handling.

            The method may not store state in the Codec instance. Use
            StreamCodec for codecs which have to keep state in order to
            make encoding/decoding efficient.

            The decoder must be able to handle zero length input and
            return an empty object of the output object type in this
            situation.

        """
        # Abstract: concrete codecs must override this.
        raise NotImplementedError
161\r
class IncrementalEncoder(object):
    """
    An IncrementalEncoder encodes an input in multiple steps. The input can be
    passed piece by piece to the encode() method. The IncrementalEncoder remembers
    the state of the Encoding process between calls to encode().
    """
    def __init__(self, errors='strict'):
        """
        Creates an IncrementalEncoder instance.

        The IncrementalEncoder may use different error handling schemes by
        providing the errors keyword argument. See the module docstring
        for a list of possible values.
        """
        self.errors = errors
        # NOTE(review): this base class never reads `buffer` itself;
        # BufferedIncrementalEncoder below reassigns it in its __init__.
        self.buffer = ""

    def encode(self, input, final=False):
        """
        Encodes input and returns the resulting object.
        """
        # Abstract: concrete codecs must override this.
        raise NotImplementedError

    def reset(self):
        """
        Resets the encoder to the initial state.
        """

    def getstate(self):
        """
        Return the current state of the encoder.
        """
        return 0

    def setstate(self, state):
        """
        Set the current state of the encoder. state must have been
        returned by getstate().
        """
201\r
class BufferedIncrementalEncoder(IncrementalEncoder):
    """
    Base class for incremental encoders that must retain some unencoded
    input between calls to encode().
    """
    def __init__(self, errors='strict'):
        IncrementalEncoder.__init__(self, errors)
        # Input that could not be encoded on a previous call.
        self.buffer = ""

    def _buffer_encode(self, input, errors, final):
        # Subclass hook: encode *input* and return an
        # (output object, length consumed) tuple.
        raise NotImplementedError

    def encode(self, input, final=False):
        """Encode input, prepending anything left over from earlier calls."""
        pending = self.buffer + input
        result, consumed = self._buffer_encode(pending, self.errors, final)
        # Carry the unconsumed tail over to the next call.
        self.buffer = pending[consumed:]
        return result

    def reset(self):
        IncrementalEncoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # An empty buffer is reported as the conventional "no state" 0.
        return self.buffer or 0

    def setstate(self, state):
        self.buffer = state or ""
234\r
class IncrementalDecoder(object):
    """
    An IncrementalDecoder decodes an input in multiple steps. The input can be
    passed piece by piece to the decode() method. The IncrementalDecoder
    remembers the state of the decoding process between calls to decode().
    """
    def __init__(self, errors='strict'):
        """
        Creates an IncrementalDecoder instance.

        The IncrementalDecoder may use different error handling schemes by
        providing the errors keyword argument. See the module docstring
        for a list of possible values.
        """
        self.errors = errors

    def decode(self, input, final=False):
        """
        Decodes input and returns the resulting object.
        """
        # Abstract: concrete codecs must override this.
        raise NotImplementedError

    def reset(self):
        """
        Resets the decoder to the initial state.
        """

    def getstate(self):
        """
        Return the current state of the decoder.

        This must be a (buffered_input, additional_state_info) tuple.
        buffered_input must be a bytes object containing bytes that
        were passed to decode() that have not yet been converted.
        additional_state_info must be a non-negative integer
        representing the state of the decoder WITHOUT yet having
        processed the contents of buffered_input. In the initial state
        and after reset(), getstate() must return (b"", 0).
        """
        return (b"", 0)

    def setstate(self, state):
        """
        Set the current state of the decoder.

        state must have been returned by getstate(). The effect of
        setstate((b"", 0)) must be equivalent to reset().
        """
class BufferedIncrementalDecoder(IncrementalDecoder):
    """
    Base class for incremental decoders that must be able to handle
    incomplete byte sequences spanning calls to decode().
    """
    def __init__(self, errors='strict'):
        IncrementalDecoder.__init__(self, errors)
        # Input that could not be decoded on a previous call.
        self.buffer = ""

    def _buffer_decode(self, input, errors, final):
        # Subclass hook: decode *input* and return an
        # (output object, length consumed) tuple.
        raise NotImplementedError

    def decode(self, input, final=False):
        """Decode input, prepending anything left over from earlier calls."""
        pending = self.buffer + input
        result, consumed = self._buffer_decode(pending, self.errors, final)
        # Carry the unconsumed tail over to the next call.
        self.buffer = pending[consumed:]
        return result

    def reset(self):
        IncrementalDecoder.reset(self)
        self.buffer = ""

    def getstate(self):
        # additional state info is always 0 for buffered decoders
        return (self.buffer, 0)

    def setstate(self, state):
        # only the buffered input matters; additional state info is ignored
        self.buffer = state[0]
318\r
319#\r
320# The StreamWriter and StreamReader class provide generic working\r
321# interfaces which can be used to implement new encoding submodules\r
322# very easily. See encodings/utf_8.py for an example on how this is\r
323# done.\r
324#\r
325\r
class StreamWriter(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamWriter instance.

            stream must be a file-like object open for writing
            (binary) data.

            The StreamWriter may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character
             'xmlcharrefreplace' - Replace with the appropriate XML
                                   character reference.
             'backslashreplace' - Replace with backslashed escape
                                  sequences (only for encoding).

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors

    def write(self, object):

        """ Encodes object and writes the result to self.stream.
        """
        encoded, _consumed = self.encode(object, self.errors)
        self.stream.write(encoded)

    def writelines(self, list):

        """ Writes the concatenation of the given list of strings
            to the stream via .write().
        """
        self.write(''.join(list))

    def reset(self):

        """ Flushes and resets the codec buffers used for keeping state.

            Afterwards fresh data can be appended without having to
            rescan the whole stream to recover state; the base
            implementation keeps no state, so this is a no-op.
        """
        pass

    def seek(self, offset, whence=0):
        self.stream.seek(offset, whence)
        # Rewinding to the very beginning also discards codec state.
        if whence == 0 and offset == 0:
            self.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Delegate all other attribute lookups to the wrapped stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
396\r
397###\r
398\r
class StreamReader(Codec):

    def __init__(self, stream, errors='strict'):

        """ Creates a StreamReader instance.

            stream must be a file-like object open for reading
            (binary) data.

            The StreamReader may use different error handling
            schemes by providing the errors keyword argument. These
            parameters are predefined:

             'strict' - raise a ValueError (or a subclass)
             'ignore' - ignore the character and continue with the next
             'replace'- replace with a suitable replacement character;

            The set of allowed parameter values can be extended via
            register_error.
        """
        self.stream = stream
        self.errors = errors
        # Raw bytes read from the stream but not yet decoded.
        self.bytebuffer = ""
        # For str->str decoding this will stay a str
        # For str->unicode decoding the first read will promote it to unicode
        self.charbuffer = ""
        # Cache of already-split lines used by readline(); when set,
        # charbuffer is None and vice versa.
        self.linebuffer = None

    def decode(self, input, errors='strict'):
        # Abstract: concrete codecs must override this and return an
        # (output, length consumed) tuple.
        raise NotImplementedError

    def read(self, size=-1, chars=-1, firstline=False):

        """ Decodes data from the stream self.stream and returns the
            resulting object.

            chars indicates the number of characters to read from the
            stream. read() will never return more than chars
            characters, but it might return less, if there are not enough
            characters available.

            size indicates the approximate maximum number of bytes to
            read from the stream for decoding purposes. The decoder
            can modify this setting as appropriate. The default value
            -1 indicates to read and decode as much as possible. size
            is intended to prevent having to decode huge files in one
            step.

            If firstline is true, and a UnicodeDecodeError happens
            after the first line terminator in the input only the first line
            will be returned, the rest of the input will be kept until the
            next call to read().

            The method should use a greedy read strategy meaning that
            it should read as much data as is allowed within the
            definition of the encoding and the given size, e.g. if
            optional encoding endings or state markers are available
            on the stream, these should be read too.
        """
        # If we have lines cached, first merge them back into characters
        if self.linebuffer:
            self.charbuffer = "".join(self.linebuffer)
            self.linebuffer = None

        # read until we get the required number of characters (if available)
        while True:
            # can the request be satisfied from the character buffer?
            if chars >= 0:
                if len(self.charbuffer) >= chars:
                    break
            elif size >= 0:
                if len(self.charbuffer) >= size:
                    break
            # we need more data
            if size < 0:
                newdata = self.stream.read()
            else:
                newdata = self.stream.read(size)
            # decode bytes (those remaining from the last call included)
            data = self.bytebuffer + newdata
            try:
                newchars, decodedbytes = self.decode(data, self.errors)
            except UnicodeDecodeError, exc:
                if firstline:
                    # Retry with only the bytes before the error; if that
                    # still yields no complete first line, re-raise.
                    newchars, decodedbytes = self.decode(data[:exc.start], self.errors)
                    lines = newchars.splitlines(True)
                    if len(lines)<=1:
                        raise
                else:
                    raise
            # keep undecoded bytes until the next call
            self.bytebuffer = data[decodedbytes:]
            # put new characters in the character buffer
            self.charbuffer += newchars
            # there was no data available
            if not newdata:
                break
        if chars < 0:
            # Return everything we've got
            result = self.charbuffer
            self.charbuffer = ""
        else:
            # Return the first chars characters
            result = self.charbuffer[:chars]
            self.charbuffer = self.charbuffer[chars:]
        return result

    def readline(self, size=None, keepends=True):

        """ Read one line from the input stream and return the
            decoded data.

            size, if given, is passed as size argument to the
            read() method.

        """
        # If we have lines cached from an earlier read, return
        # them unconditionally
        if self.linebuffer:
            line = self.linebuffer[0]
            del self.linebuffer[0]
            if len(self.linebuffer) == 1:
                # revert to charbuffer mode; we might need more data
                # next time
                self.charbuffer = self.linebuffer[0]
                self.linebuffer = None
            if not keepends:
                line = line.splitlines(False)[0]
            return line

        # 72 is the initial chunk size; it is doubled below (up to 8000)
        # until a complete line has been accumulated.
        readsize = size or 72
        line = ""
        # If size is given, we call read() only once
        while True:
            data = self.read(readsize, firstline=True)
            if data:
                # If we're at a "\r" read one extra character (which might
                # be a "\n") to get a proper line ending. If the stream is
                # temporarily exhausted we return the wrong line ending.
                if data.endswith("\r"):
                    data += self.read(size=1, chars=1)

            line += data
            lines = line.splitlines(True)
            if lines:
                if len(lines) > 1:
                    # More than one line result; the first line is a full line
                    # to return
                    line = lines[0]
                    del lines[0]
                    if len(lines) > 1:
                        # cache the remaining lines
                        lines[-1] += self.charbuffer
                        self.linebuffer = lines
                        self.charbuffer = None
                    else:
                        # only one remaining line, put it back into charbuffer
                        self.charbuffer = lines[0] + self.charbuffer
                    if not keepends:
                        line = line.splitlines(False)[0]
                    break
                line0withend = lines[0]
                line0withoutend = lines[0].splitlines(False)[0]
                if line0withend != line0withoutend: # We really have a line end
                    # Put the rest back together and keep it until the next call
                    self.charbuffer = "".join(lines[1:]) + self.charbuffer
                    if keepends:
                        line = line0withend
                    else:
                        line = line0withoutend
                    break
            # we didn't get anything or this was our only try
            if not data or size is not None:
                if line and not keepends:
                    line = line.splitlines(False)[0]
                break
            if readsize<8000:
                readsize *= 2
        return line

    def readlines(self, sizehint=None, keepends=True):

        """ Read all lines available on the input stream
            and return them as list of lines.

            Line breaks are implemented using the codec's decoder
            method and are included in the list entries.

            sizehint, if given, is ignored since there is no efficient
            way to finding the true end-of-line.

        """
        data = self.read()
        return data.splitlines(keepends)

    def reset(self):

        """ Resets the codec buffers used for keeping state.

            Note that no stream repositioning should take place.
            This method is primarily intended to be able to recover
            from decoding errors.

        """
        self.bytebuffer = ""
        self.charbuffer = u""
        self.linebuffer = None

    def seek(self, offset, whence=0):
        """ Set the input stream's current position.

            Resets the codec buffers used for keeping state.
        """
        self.stream.seek(offset, whence)
        self.reset()

    def next(self):

        """ Return the next decoded line from the input stream."""
        line = self.readline()
        if line:
            return line
        raise StopIteration

    def __iter__(self):
        return self

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
638\r
639###\r
640\r
class StreamReaderWriter:

    """ A wrapper coupling a StreamReader and a StreamWriter around a
        single stream, so it can be used in both read and write modes
        with transparent decoding/encoding.

        The design is such that one can use the factory functions
        returned by the codec.lookup() function to construct the
        instance.

    """
    # Optional attributes set by the file wrappers below
    encoding = 'unknown'

    def __init__(self, stream, Reader, Writer, errors='strict'):

        """ Creates a StreamReaderWriter instance.

            stream must be a Stream-like object.

            Reader and Writer must be factory functions or classes
            providing the StreamReader resp. StreamWriter interface.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):
        # Decoding is delegated to the reader.
        return self.reader.read(size)

    def readline(self, size=None):
        return self.reader.readline(size)

    def readlines(self, sizehint=None):
        return self.reader.readlines(sizehint)

    def next(self):

        """ Return the next decoded line from the input stream."""
        return self.reader.next()

    def __iter__(self):
        return self

    def write(self, data):
        # Encoding is delegated to the writer.
        return self.writer.write(data)

    def writelines(self, list):
        return self.writer.writelines(list)

    def reset(self):
        self.reader.reset()
        self.writer.reset()

    def seek(self, offset, whence=0):
        self.stream.seek(offset, whence)
        self.reader.reset()
        # Only a rewind to the very beginning resets the writer as well.
        if whence == 0 and offset == 0:
            self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    # these are needed to make "with codecs.open(...)" work properly

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
725\r
726###\r
727\r
class StreamRecoder:

    """ StreamRecoder instances provide a frontend - backend
        view of encoding data.

        They use the complete set of APIs returned by the
        codecs.lookup() function to implement their task.

        Data written to the stream is first decoded into an
        intermediate format (which is dependent on the given codec
        combination) and then written to the stream using an instance
        of the provided Writer class.

        In the other direction, data is read from the stream using a
        Reader instance and then return encoded data to the caller.

    """
    # Optional attributes set by the file wrappers below
    data_encoding = 'unknown'
    file_encoding = 'unknown'

    def __init__(self, stream, encode, decode, Reader, Writer,
                 errors='strict'):

        """ Creates a StreamRecoder instance which implements a two-way
            conversion: encode and decode work on the frontend (the
            input to .read() and output of .write()) while
            Reader and Writer work on the backend (reading and
            writing to the stream).

            You can use these objects to do transparent direct
            recodings from e.g. latin-1 to utf-8 and back.

            stream must be a file-like object.

            encode, decode must adhere to the Codec interface, Reader,
            Writer must be factory functions or classes providing the
            StreamReader, StreamWriter interface resp.

            encode and decode are needed for the frontend translation,
            Reader and Writer for the backend translation. Unicode is
            used as intermediate encoding.

            Error handling is done in the same way as defined for the
            StreamWriter/Readers.

        """
        self.stream = stream
        self.encode = encode
        self.decode = decode
        self.reader = Reader(stream, errors)
        self.writer = Writer(stream, errors)
        self.errors = errors

    def read(self, size=-1):

        # Decode from the stream via the Reader, then re-encode the
        # result to the frontend data encoding.
        data = self.reader.read(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readline(self, size=None):

        if size is None:
            data = self.reader.readline()
        else:
            data = self.reader.readline(size)
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def readlines(self, sizehint=None):

        # sizehint is ignored: the remaining stream is read and recoded
        # in one step before splitting into lines.
        data = self.reader.read()
        data, bytesencoded = self.encode(data, self.errors)
        return data.splitlines(1)

    def next(self):

        """ Return the next decoded line from the input stream."""
        data = self.reader.next()
        data, bytesencoded = self.encode(data, self.errors)
        return data

    def __iter__(self):
        return self

    def write(self, data):

        # Decode frontend data to the intermediate format, then let the
        # Writer encode it onto the stream.
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def writelines(self, list):

        data = ''.join(list)
        data, bytesdecoded = self.decode(data, self.errors)
        return self.writer.write(data)

    def reset(self):

        self.reader.reset()
        self.writer.reset()

    def __getattr__(self, name,
                    getattr=getattr):

        """ Inherit all other methods from the underlying stream.
        """
        return getattr(self.stream, name)

    def __enter__(self):
        return self

    def __exit__(self, type, value, tb):
        self.stream.close()
841\r
842### Shortcuts\r
843\r
def open(filename, mode='rb', encoding=None, errors='strict', buffering=1):

    """ Open an encoded file using the given mode and return
        a wrapped version providing transparent encoding/decoding.

        Note: The wrapped version will only accept the object format
        defined by the codecs, i.e. Unicode objects for most builtin
        codecs. Output is also codec dependent and will usually be
        Unicode as well.

        Files are always opened in binary mode, even if no binary mode
        was specified. This is done to avoid data loss due to encodings
        using 8-bit values. The default file mode is 'rb' meaning to
        open the file in binary read mode.

        encoding specifies the encoding which is to be used for the
        file.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        buffering has the same meaning as for the builtin open() API.
        It defaults to line buffered.

        The returned wrapped file object provides an extra attribute
        .encoding which allows querying the used encoding. This
        attribute is only available if an encoding was specified as
        parameter.

    """
    if encoding is not None:
        if 'U' in mode:
            # No automatic conversion of '\n' is done on reading and writing
            mode = mode.strip().replace('U', '')
            if mode[:1] not in set('rwa'):
                mode = 'r' + mode
        if 'b' not in mode:
            # Force opening of the file in binary mode
            mode = mode + 'b'
    file = __builtin__.open(filename, mode, buffering)
    if encoding is None:
        # No codec requested: hand back the plain (binary) file object.
        return file
    info = lookup(encoding)
    srw = StreamReaderWriter(file, info.streamreader, info.streamwriter, errors)
    # Add attributes to simplify introspection
    srw.encoding = encoding
    return srw
892\r
def EncodedFile(file, data_encoding, file_encoding=None, errors='strict'):

    """ Return a wrapped version of file which provides transparent
        encoding translation.

        Strings written to the wrapped file are interpreted according
        to the given data_encoding and then written to the original
        file as string using file_encoding. The intermediate encoding
        will usually be Unicode but depends on the specified codecs.

        Strings are read from the file using file_encoding and then
        passed back to the caller as string using data_encoding.

        If file_encoding is not given, it defaults to data_encoding.

        errors may be given to define the error handling. It defaults
        to 'strict' which causes ValueErrors to be raised in case an
        encoding error occurs.

        The returned wrapped file object provides two extra attributes
        .data_encoding and .file_encoding which reflect the given
        parameters of the same name. The attributes can be used for
        introspection by Python programs.

    """
    if file_encoding is None:
        file_encoding = data_encoding
    # The frontend codec recodes caller data; the backend Reader/Writer
    # handle the on-disk encoding.
    data_info = lookup(data_encoding)
    file_info = lookup(file_encoding)
    sr = StreamRecoder(file, data_info.encode, data_info.decode,
                       file_info.streamreader, file_info.streamwriter, errors)
    # Add attributes to simplify introspection
    sr.data_encoding = data_encoding
    sr.file_encoding = file_encoding
    return sr
928\r
929### Helpers for codec lookup\r
930\r
def getencoder(encoding):

    """ Look up the codec for the given encoding and return
        its encoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    info = lookup(encoding)
    return info.encode
940\r
def getdecoder(encoding):

    """ Look up the codec for the given encoding and return
        its decoder function.

        Raises a LookupError in case the encoding cannot be found.

    """
    info = lookup(encoding)
    return info.decode
950\r
def getincrementalencoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalEncoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental encoder.

    """
    factory = lookup(encoding).incrementalencoder
    if factory is None:
        # Codec exists but has no incremental encoder registered.
        raise LookupError(encoding)
    return factory
964\r
def getincrementaldecoder(encoding):

    """ Look up the codec for the given encoding and return
        its IncrementalDecoder class or factory function.

        Raises a LookupError in case the encoding cannot be found
        or the codec doesn't provide an incremental decoder.

    """
    factory = lookup(encoding).incrementaldecoder
    if factory is None:
        # Codec exists but has no incremental decoder registered.
        raise LookupError(encoding)
    return factory
978\r
def getreader(encoding):

    """ Look up the codec for the given encoding and return
        its StreamReader class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    info = lookup(encoding)
    return info.streamreader
988\r
def getwriter(encoding):

    """ Look up the codec for the given encoding and return
        its StreamWriter class or factory function.

        Raises a LookupError in case the encoding cannot be found.

    """
    info = lookup(encoding)
    return info.streamwriter
998\r
def iterencode(iterator, encoding, errors='strict', **kwargs):
    """
    Encoding iterator.

    Lazily encodes the strings produced by iterator using an
    IncrementalEncoder, yielding each non-empty encoded chunk.

    errors and kwargs are passed through to the IncrementalEncoder
    constructor.
    """
    encoder = getincrementalencoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        encoded = encoder.encode(chunk)
        if encoded:
            yield encoded
    # Flush whatever the encoder still buffers.
    tail = encoder.encode("", True)
    if tail:
        yield tail
1016\r
def iterdecode(iterator, encoding, errors='strict', **kwargs):
    """
    Decoding iterator.

    Lazily decodes the chunks produced by iterator using an
    IncrementalDecoder, yielding each non-empty decoded chunk.

    errors and kwargs are passed through to the IncrementalDecoder
    constructor.
    """
    decoder = getincrementaldecoder(encoding)(errors, **kwargs)
    for chunk in iterator:
        decoded = decoder.decode(chunk)
        if decoded:
            yield decoded
    # Flush whatever the decoder still buffers.
    tail = decoder.decode("", True)
    if tail:
        yield tail
1034\r
1035### Helpers for charmap-based codecs\r
1036\r
def make_identity_dict(rng):

    """ make_identity_dict(rng) -> dict

        Return a dictionary where elements of the rng sequence are
        mapped to themselves.

    """
    # Dict comprehension replaces the original manual loop; the result
    # is identical for any iterable of hashable elements.
    return {i: i for i in rng}
1049\r
def make_encoding_map(decoding_map):

    """ Creates an encoding map from a decoding map.

        If a target mapping in the decoding map occurs multiple
        times, then that target is mapped to None (undefined mapping),
        causing an exception when encountered by the charmap codec
        during translation.

        One example where this happens is cp875.py which decodes
        multiple character to \\u001a.

    """
    m = {}
    for k, v in decoding_map.items():
        # Invert the mapping; targets seen more than once are poisoned
        # with None so encoding them fails loudly.
        if v not in m:
            m[v] = k
        else:
            m[v] = None
    return m
1070\r
1071### error handlers\r
1072\r
try:
    # Bind the builtin error handlers to module-level names;
    # lookup_error comes from the "from _codecs import *" above.
    strict_errors = lookup_error("strict")
    ignore_errors = lookup_error("ignore")
    replace_errors = lookup_error("replace")
    xmlcharrefreplace_errors = lookup_error("xmlcharrefreplace")
    backslashreplace_errors = lookup_error("backslashreplace")
except LookupError:
    # In --disable-unicode builds, these error handlers are missing
    strict_errors = None
    ignore_errors = None
    replace_errors = None
    xmlcharrefreplace_errors = None
    backslashreplace_errors = None

# Tell modulefinder that using codecs probably needs the encodings
# package (the guarded import below is never executed at runtime)
_false = 0
if _false:
    import encodings
1092\r
1093### Tests\r
1094\r
if __name__ == '__main__':

    # Demo: recode Latin-1 <-> UTF-8 transparently on the std streams.

    # Make stdout translate Latin-1 output into UTF-8 output
    sys.stdout = EncodedFile(sys.stdout, 'latin-1', 'utf-8')

    # Have stdin translate Latin-1 input into UTF-8 input
    sys.stdin = EncodedFile(sys.stdin, 'utf-8', 'latin-1')