--- /dev/null
+# This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep)\r
+\r
+import stringprep, re, codecs\r
+from unicodedata import ucd_3_2_0 as unicodedata\r
+\r
+# IDNA section 3.1\r
+dots = re.compile(u"[\u002E\u3002\uFF0E\uFF61]")\r
+\r
+# IDNA section 5\r
+ace_prefix = "xn--"\r
+uace_prefix = unicode(ace_prefix, "ascii")\r
+\r
+# This assumes query strings, so AllowUnassigned is true\r
def nameprep(label):
    """Apply the Nameprep profile (RFC 3491) of stringprep to *label*.

    The label is treated as a query string, so unassigned code points
    are allowed (AllowUnassigned is true).

    The steps run in this order: map (drop table B.1 characters, pass
    the rest through the table B.2 mapping), normalize to NFKC with the
    Unicode 3.2.0 database, reject prohibited characters, and enforce
    the bidirectional requirements.

    Returns the prepared Unicode label.

    Raises UnicodeError if the label contains a prohibited character or
    violates one of the bidi requirements.
    """
    # Map: characters in table B.1 are mapped to nothing; everything
    # else goes through the table B.2 mapping.
    label = u"".join([stringprep.map_table_b2(c)
                      for c in label
                      if not stringprep.in_table_b1(c)])

    # Normalize
    label = unicodedata.normalize("NFKC", label)

    # Prohibit
    prohibited_tables = (
        stringprep.in_table_c12,
        stringprep.in_table_c22,
        stringprep.in_table_c3,
        stringprep.in_table_c4,
        stringprep.in_table_c5,
        stringprep.in_table_c6,
        stringprep.in_table_c7,
        stringprep.in_table_c8,
        stringprep.in_table_c9,
    )
    for c in label:
        for in_table in prohibited_tables:
            if in_table(c):
                raise UnicodeError("Invalid character %r" % c)

    # Check bidi.  The tests below look at the whole label, so they are
    # run once when any RandALCat character is present rather than once
    # per RandALCat character.
    RandAL = [stringprep.in_table_d1(c) for c in label]
    if any(RandAL):
        # There is a RandAL char in the string. Must perform further
        # tests:
        # 1) The characters in section 5.8 MUST be prohibited.
        #    This is table C.8, which was already checked
        # 2) If a string contains any RandALCat character, the string
        #    MUST NOT contain any LCat character.
        if any(stringprep.in_table_d2(c) for c in label):
            raise UnicodeError("Violation of BIDI requirement 2")

        # 3) If a string contains any RandALCat character, a
        #    RandALCat character MUST be the first character of the
        #    string, and a RandALCat character MUST be the last
        #    character of the string.
        if not RandAL[0] or not RandAL[-1]:
            raise UnicodeError("Violation of BIDI requirement 3")

    return label
+\r
def ToASCII(label):
    """Convert a single label to its ASCII form (IDNA ToASCII).

    UseSTD3ASCIIRules is treated as false.  A label that is already
    ASCII is returned as-is; otherwise it is nameprepped and, if still
    not ASCII, punycode-encoded and given the ACE prefix.

    Returns the encoded label as a byte string.

    Raises UnicodeError if the final label is empty or 64 or more
    characters long, if the nameprepped label already starts with the
    ACE prefix, or if nameprep rejects the label.
    """
    # Step 1: try ASCII
    try:
        encoded = label.encode("ascii")
    except UnicodeError:
        encoded = None

    if encoded is None:
        # Step 2: nameprep
        label = nameprep(label)

        # Step 3: UseSTD3ASCIIRules is false
        # Step 4: try ASCII
        try:
            encoded = label.encode("ascii")
        except UnicodeError:
            encoded = None

    if encoded is None:
        # Step 5: Check ACE prefix
        if label.startswith(uace_prefix):
            raise UnicodeError("Label starts with ACE prefix")

        # Steps 6 and 7: Encode with PUNYCODE and prepend ACE prefix
        encoded = ace_prefix + label.encode("punycode")

    # Step 8: Check size (the single exit shared by every path above)
    if not 0 < len(encoded) < 64:
        raise UnicodeError("label empty or too long")
    return encoded
+\r
def ToUnicode(label):
    """Convert a single label to its Unicode form (IDNA ToUnicode).

    A label without the ACE prefix is returned unchanged as a Unicode
    string.  A label with the prefix is punycode-decoded, and the result
    is accepted only if ToASCII turns it back into the same label.

    Raises UnicodeError if the label is not ASCII even after nameprep,
    or if the decoded label does not round-trip through ToASCII.
    """
    # Step 1: Check for ASCII.  Byte strings are taken to be ASCII
    # already; Unicode strings are converted.
    if not isinstance(label, str):
        try:
            label = label.encode("ascii")
        except UnicodeError:
            # Step 2: Perform nameprep
            label = nameprep(label)
            # It doesn't say this, but apparently, it should be ASCII now
            try:
                label = label.encode("ascii")
            except UnicodeError:
                raise UnicodeError("Invalid character in IDN label")

    # Step 3: Check for ACE prefix
    if not label.startswith(ace_prefix):
        return unicode(label, "ascii")

    # Steps 4 and 5: Remove ACE prefix, then decode using PUNYCODE
    decoded = label[len(ace_prefix):].decode("punycode")

    # Step 6: Apply ToASCII
    roundtrip = ToASCII(decoded)

    # Step 7: Compare the result of step 6 with the one of step 3;
    # roundtrip will already be in lower case.
    if label.lower() != roundtrip:
        raise UnicodeError("IDNA does not round-trip", label, roundtrip)

    # Step 8: return the result of step 5
    return decoded
+\r
+### Codec APIs\r
+\r
class Codec(codecs.Codec):
    """Stateless IDNA codec that converts whole dotted names at once."""

    def encode(self, input, errors='strict'):
        """Encode a dotted name label by label with ToASCII.

        Returns ``(encoded, len(input))``.  A single trailing dot is
        preserved.  Raises UnicodeError for any *errors* value other
        than 'strict'.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return "", 0

        labels = dots.split(input)
        trailing_dot = ''
        if labels and not labels[-1]:
            # The name ended with a separator: remember it and drop the
            # empty label it produced.
            trailing_dot = '.'
            labels.pop()

        encoded = [ToASCII(label) for label in labels]
        # Join with U+002E
        return ".".join(encoded) + trailing_dot, len(input)

    def decode(self, input, errors='strict'):
        """Decode a dotted name label by label with ToUnicode.

        Returns ``(decoded, len(input))``.  A single trailing dot is
        preserved.  Raises UnicodeError for any *errors* value other
        than 'strict'.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return u"", 0

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, unicode):
            labels = dots.split(input)
        else:
            # Must be ASCII string; the conversion is done only for the
            # error it raises on non-ASCII data.
            input = str(input)
            unicode(input, "ascii")
            labels = input.split(".")

        trailing_dot = u''
        if labels and not labels[-1]:
            trailing_dot = u'.'
            labels.pop()

        decoded = [ToUnicode(label) for label in labels]
        return u".".join(decoded) + trailing_dot, len(input)
+\r
class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
    """Incremental IDNA encoder that emits only completed labels."""

    def _buffer_encode(self, input, errors, final):
        """Encode the complete labels found in *input*.

        Unless *final* is true, the last label is treated as possibly
        unfinished and is left unconsumed for the next call.

        Returns ``(encoded, consumed)`` where *encoded* is a byte string
        and *consumed* is the number of input characters it covers.
        Raises UnicodeError for any *errors* value other than 'strict'.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return ("", 0)

        labels = dots.split(input)
        # Must be a byte string like the rest of the encoded output:
        # starting from u'' would turn the joined result into a unicode
        # object whenever no dot ends up being appended.
        trailing_dot = ''
        if labels:
            if not labels[-1]:
                trailing_dot = '.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    trailing_dot = '.'

        result = []
        size = 0
        for label in labels:
            result.append(ToASCII(label))
            if size:
                # Account for the separator in front of this label.
                size += 1
            size += len(label)

        # Join with U+002E
        result = ".".join(result) + trailing_dot
        size += len(trailing_dot)
        return (result, size)
+\r
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental IDNA decoder that emits only completed labels."""

    def _buffer_decode(self, input, errors, final):
        """Decode the complete labels found in *input*.

        Unless *final* is true, the last label is treated as possibly
        unfinished and is left unconsumed for the next call.

        Returns ``(decoded, consumed)`` where *decoded* is a Unicode
        string and *consumed* is the number of input characters it
        covers.  Raises UnicodeError for any *errors* value other than
        'strict'.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return (u"", 0)

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, unicode):
            labels = dots.split(input)
        else:
            # Must be ASCII string; the conversion is done only for the
            # error it raises on non-ASCII data.
            input = str(input)
            unicode(input, "ascii")
            labels = input.split(".")

        trailing_dot = u''
        if labels:
            if not labels[-1]:
                trailing_dot = u'.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    trailing_dot = u'.'

        result = [ToUnicode(label) for label in labels]

        # Consumed input = every label, one separator between each pair
        # of labels, and the trailing dot if there is one.  Counting the
        # separators from the number of labels (rather than from whether
        # anything has been counted so far) keeps the total right when a
        # label is empty, e.g. for a leading dot.
        size = sum([len(label) for label in labels])
        if labels:
            size += len(labels) - 1
        size += len(trailing_dot)

        result = u".".join(result) + trailing_dot
        return (result, size)
+\r
class StreamWriter(Codec, codecs.StreamWriter):
    """Stream writer whose encoding comes from the IDNA Codec class."""
+\r
class StreamReader(Codec, codecs.StreamReader):
    """Stream reader whose decoding comes from the IDNA Codec class."""
+\r
+### encodings module API\r
+\r
def getregentry():
    """Return the codecs.CodecInfo entry that describes the 'idna' codec."""
    # The stateless functions are bound methods of two separate Codec
    # instances, created in this order.
    stateless_encode = Codec().encode
    stateless_decode = Codec().decode
    return codecs.CodecInfo(
        name='idna',
        encode=stateless_encode,
        decode=stateless_decode,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamwriter=StreamWriter,
        streamreader=StreamReader,
    )