--- /dev/null
+""" Python 'utf-8-sig' Codec\r
+This works similarly to UTF-8, with the following changes:\r
+\r
+* On encoding/writing a UTF-8 encoded BOM will be prepended/written as the\r
+ first three bytes.\r
+\r
+* On decoding/reading if the first three bytes are a UTF-8 encoded BOM, these\r
+ bytes will be skipped.\r
+"""\r
+import codecs\r
+\r
+### Codec APIs\r
+\r
def encode(input, errors='strict'):
    """Encode *input* as UTF-8 with a UTF-8 BOM prepended.

    Returns a ``(data, length)`` tuple: ``data`` is ``codecs.BOM_UTF8``
    followed by the UTF-8 encoding of *input*, and ``length`` is
    ``len(input)``. *errors* is passed through to ``codecs.utf_8_encode``.
    """
    payload, _ = codecs.utf_8_encode(input, errors)
    return (codecs.BOM_UTF8 + payload, len(input))
+\r
def decode(input, errors='strict'):
    """Decode UTF-8 *input*, skipping a UTF-8 BOM if it is the first 3 bytes.

    The input is always decoded as complete data (``final=True``). Returns
    a ``(text, consumed)`` tuple; ``consumed`` includes the 3 BOM bytes
    when a BOM was skipped.
    """
    has_bom = input[:3] == codecs.BOM_UTF8
    skipped = 3 if has_bom else 0
    if has_bom:
        input = input[3:]
    text, used = codecs.utf_8_decode(input, errors, True)
    return (text, used + skipped)
+\r
class IncrementalEncoder(codecs.IncrementalEncoder):
    """Incremental UTF-8 encoder that emits a BOM before the first output.

    ``self.first`` is 1 while the BOM still has to be written and 0
    afterwards; it is also the value exposed by ``getstate``/``setstate``.
    """

    def __init__(self, errors='strict'):
        codecs.IncrementalEncoder.__init__(self, errors)
        self.first = 1

    def encode(self, input, final=False):
        """Return *input* encoded as UTF-8, BOM-prefixed on the first call."""
        bom_pending = self.first
        if bom_pending:
            # Clear the flag before encoding, so it is cleared even if the
            # encode call below raises.
            self.first = 0
        encoded = codecs.utf_8_encode(input, self.errors)[0]
        if bom_pending:
            return codecs.BOM_UTF8 + encoded
        return encoded

    def reset(self):
        """Reset the encoder so that the next ``encode`` emits a BOM again."""
        codecs.IncrementalEncoder.reset(self)
        self.first = 1

    def getstate(self):
        """Return the BOM-pending flag."""
        return self.first

    def setstate(self, state):
        """Restore the BOM-pending flag from *state*."""
        self.first = state
+\r
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental UTF-8 decoder that skips a single leading UTF-8 BOM.

    ``self.first`` is true until enough input has been seen to decide
    whether the data starts with a BOM; after that it is ``None`` and all
    input is decoded as plain UTF-8.
    """

    def __init__(self, errors='strict'):
        codecs.BufferedIncrementalDecoder.__init__(self, errors)
        self.first = True

    def _buffer_decode(self, input, errors, final):
        """Decode *input*, dropping a leading BOM the first time through.

        Returns a ``(text, consumed)`` tuple. While fewer than 3 bytes are
        available and they could still be the start of a BOM, nothing is
        consumed. If *final* is true, a non-empty partial BOM is truncated
        UTF-8, so it is passed to the UTF-8 decoder and handled according
        to *errors* rather than being silently dropped.
        """
        if self.first:
            if len(input) < 3:
                could_be_bom = codecs.BOM_UTF8.startswith(input)
                if could_be_bom and not (final and input):
                    # not enough data to decide if this really is a BOM
                    # => try again on the next call
                    return (u"", 0)
                # Either these bytes cannot start a BOM, or no more data
                # will follow: fall through to plain UTF-8 decoding.
                self.first = None
            else:
                self.first = None
                if input[:3] == codecs.BOM_UTF8:
                    (output, consumed) = codecs.utf_8_decode(
                        input[3:], errors, final)
                    # Account for the 3 skipped BOM bytes.
                    return (output, consumed + 3)
        return codecs.utf_8_decode(input, errors, final)

    def reset(self):
        """Reset the decoder so that a leading BOM is looked for again."""
        codecs.BufferedIncrementalDecoder.reset(self)
        self.first = True
+\r
class StreamWriter(codecs.StreamWriter):
    """Stream writer that prefixes the first encoded chunk with a BOM.

    The first call to ``encode`` installs ``codecs.utf_8_encode`` as an
    instance attribute, so later calls encode without a BOM; ``reset``
    removes that attribute again.
    """

    def reset(self):
        """Reset the writer so that the next ``encode`` emits a BOM again."""
        codecs.StreamWriter.reset(self)
        # Remove the instance-level override, if any, so the class-level
        # BOM-writing ``encode`` becomes visible again.
        vars(self).pop('encode', None)

    def encode(self, input, errors='strict'):
        """Encode *input* with a leading BOM; later calls skip the BOM."""
        # Shadow this method on the instance with the plain UTF-8 encoder.
        self.encode = codecs.utf_8_encode
        return encode(input, errors)
+\r
class StreamReader(codecs.StreamReader):
    """Stream reader that skips a UTF-8 BOM at the start of the data.

    Once it is known whether the data starts with a BOM, ``decode`` is
    replaced on the instance by ``codecs.utf_8_decode``; ``reset`` removes
    that replacement again.
    """

    def reset(self):
        """Reset the reader so that a leading BOM is looked for again."""
        codecs.StreamReader.reset(self)
        # Remove the instance-level override, if any, so the class-level
        # BOM-aware ``decode`` becomes visible again.
        vars(self).pop('decode', None)

    def decode(self, input, errors='strict'):
        """Decode *input*, skipping a leading BOM if one is present.

        Returns a ``(text, consumed)`` tuple. With fewer than 3 bytes that
        could still be the start of a BOM, nothing is consumed.
        """
        if len(input) < 3:
            if codecs.BOM_UTF8.startswith(input):
                # not enough data to decide if this is a BOM
                # => try again on the next call
                return (u"", 0)
            bom_len = 0
        else:
            bom_len = 3 if input[:3] == codecs.BOM_UTF8 else 0
        # The BOM question is settled: later calls go straight to the
        # plain UTF-8 decoder.
        self.decode = codecs.utf_8_decode
        if not bom_len:
            return codecs.utf_8_decode(input, errors)
        text, used = codecs.utf_8_decode(input[bom_len:], errors)
        return (text, used + bom_len)
+\r
+### encodings module API\r
+\r
def getregentry():
    """Return the ``codecs.CodecInfo`` describing the 'utf-8-sig' codec.

    The entry bundles this module's stateless ``encode``/``decode``
    functions with its incremental and stream codec classes.
    """
    entry = codecs.CodecInfo(
        encode=encode,
        decode=decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamwriter=StreamWriter,
        streamreader=StreamReader,
        name='utf-8-sig',
    )
    return entry