]>
Commit | Line | Data |
---|---|---|
3257aa99 DM |
1 | # This module implements the RFCs 3490 (IDNA) and 3491 (Nameprep)\r |
2 | \r | |
3 | import stringprep, re, codecs\r | |
4 | from unicodedata import ucd_3_2_0 as unicodedata\r | |
5 | \r | |
# IDNA section 3.1
# Label separators: U+002E plus the three other code points listed in the
# character class below.
dots = re.compile(u"[\u002E\u3002\uFF0E\uFF61]")

# IDNA section 5
# The ACE prefix is spelled once as text; the byte-string form used when
# building encoded labels is derived from it.
uace_prefix = u"xn--"
ace_prefix = uace_prefix.encode("ascii")
12 | \r | |
# This assumes query strings, so AllowUnassigned is true
def nameprep(label):
    """Prepare one label with the Nameprep profile (RFC 3491).

    The label is mapped (characters in stringprep table B.1 are dropped,
    every other character goes through the B.2 mapping), normalized with
    NFKC, scanned for prohibited characters and finally checked against
    the bidirectional requirements.  Unassigned code points are not
    rejected.

    Returns the prepared label.  Raises UnicodeError when the label holds
    a prohibited character or violates one of the bidi requirements.
    """
    # Map: drop the "map to nothing" characters, map everything else.
    label = u"".join([stringprep.map_table_b2(c)
                      for c in label
                      if not stringprep.in_table_b1(c)])

    # Normalize
    label = unicodedata.normalize("NFKC", label)

    # Prohibit
    for c in label:
        if (stringprep.in_table_c12(c) or
                stringprep.in_table_c22(c) or
                stringprep.in_table_c3(c) or
                stringprep.in_table_c4(c) or
                stringprep.in_table_c5(c) or
                stringprep.in_table_c6(c) or
                stringprep.in_table_c7(c) or
                stringprep.in_table_c8(c) or
                stringprep.in_table_c9(c)):
            raise UnicodeError("Invalid character %r" % c)

    # Check bidi
    RandAL = [stringprep.in_table_d1(c) for c in label]
    # The requirements below concern the label as a whole, so they are
    # evaluated once if any RandAL character is present rather than once
    # per RandAL character.
    if any(RandAL):
        # There is a RandAL char in the string. Must perform further
        # tests:
        # 1) The characters in section 5.8 MUST be prohibited.
        # This is table C.8, which was already checked
        # 2) If a string contains any RandALCat character, the string
        # MUST NOT contain any LCat character.
        if any(stringprep.in_table_d2(c) for c in label):
            raise UnicodeError("Violation of BIDI requirement 2")

        # 3) If a string contains any RandALCat character, a
        # RandALCat character MUST be the first character of the
        # string, and a RandALCat character MUST be the last
        # character of the string.
        if not RandAL[0] or not RandAL[-1]:
            raise UnicodeError("Violation of BIDI requirement 3")

    return label
61 | \r | |
def ToASCII(label):
    """Convert a single label to its ASCII form (RFC 3490 ToASCII).

    A label that already encodes as ASCII is returned as is.  Otherwise it
    is run through nameprep and, if still not ASCII, punycode-encoded and
    given the ACE prefix.  UseSTD3ASCIIRules is treated as false.

    Raises UnicodeError if the resulting label is empty or longer than 63
    characters, or if a non-ASCII label already starts with the ACE prefix.
    """
    # Step 1: try ASCII
    try:
        encoded = label.encode("ascii")
    except UnicodeError:
        encoded = None
    if encoded is not None:
        # Skip to step 3: UseSTD3ASCIIRules is false, so
        # Skip to step 8.
        if not 0 < len(encoded) < 64:
            raise UnicodeError("label empty or too long")
        return encoded

    # Step 2: nameprep
    prepared = nameprep(label)

    # Step 3: UseSTD3ASCIIRules is false
    # Step 4: try ASCII
    try:
        encoded = prepared.encode("ascii")
    except UnicodeError:
        encoded = None
    if encoded is not None:
        # Skip to step 8.
        if not 0 < len(encoded) < 64:
            raise UnicodeError("label empty or too long")
        return encoded

    # Step 5: Check ACE prefix
    if prepared.startswith(uace_prefix):
        raise UnicodeError("Label starts with ACE prefix")

    # Step 6: Encode with PUNYCODE
    # Step 7: Prepend ACE prefix
    encoded = ace_prefix + prepared.encode("punycode")

    # Step 8: Check size
    if not 0 < len(encoded) < 64:
        raise UnicodeError("label empty or too long")
    return encoded
104 | \r | |
def ToUnicode(label):
    """Convert a single label to its Unicode form (RFC 3490 ToUnicode).

    Labels without the ACE prefix are returned as unicode text unchanged.
    Labels with the prefix are punycode-decoded, and the decoded text must
    convert back through ToASCII to the (lower-cased) input.

    Raises UnicodeError if the label is not ASCII even after nameprep, or
    if the round trip through ToASCII does not reproduce the input.
    """
    # Step 1: Check for ASCII
    needs_prep = False
    if not isinstance(label, str):
        try:
            label = label.encode("ascii")
        except UnicodeError:
            needs_prep = True

    if needs_prep:
        # Step 2: Perform nameprep
        label = nameprep(label)
        # It doesn't say this, but apparently, it should be ASCII now
        try:
            label = label.encode("ascii")
        except UnicodeError:
            raise UnicodeError("Invalid character in IDN label")

    # Step 3: Check for ACE prefix
    if not label.startswith(ace_prefix):
        return unicode(label, "ascii")

    # Step 4: Remove ACE prefix
    punycode_part = label[len(ace_prefix):]

    # Step 5: Decode using PUNYCODE
    decoded = punycode_part.decode("punycode")

    # Step 6: Apply ToASCII
    round_trip = ToASCII(decoded)

    # Step 7: Compare the result of step 6 with the one of step 3
    # round_trip will already be in lower case.
    if label.lower() != round_trip:
        raise UnicodeError("IDNA does not round-trip", label, round_trip)

    # Step 8: return the result of step 5
    return decoded
143 | \r | |
144 | ### Codec APIs\r | |
145 | \r | |
class Codec(codecs.Codec):
    """Stateless IDNA codec operating on whole, dot-separated names."""

    def encode(self, input, errors='strict'):
        """Encode a name by applying ToASCII to each of its labels.

        Returns a tuple of the encoded name and len(input).  Raises
        UnicodeError for any error handling other than 'strict'.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return "", 0

        labels = dots.split(input)
        # An empty last label means the name ended with a separator; it is
        # carried over as a trailing dot rather than encoded as a label.
        trailing_dot = ''
        if labels and not labels[-1]:
            trailing_dot = '.'
            labels.pop()

        # Join with U+002E
        encoded = ".".join([ToASCII(label) for label in labels])
        return encoded + trailing_dot, len(input)

    def decode(self, input, errors='strict'):
        """Decode a name by applying ToUnicode to each of its labels.

        Returns a tuple of the decoded name and len(input).  Raises
        UnicodeError for any error handling other than 'strict'.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return u"", 0

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, unicode):
            labels = dots.split(input)
        else:
            # Must be ASCII string
            input = str(input)
            # Called only for its check: raises if input is not pure ASCII.
            unicode(input, "ascii")
            labels = input.split(".")

        trailing_dot = u''
        if labels and not labels[-1]:
            trailing_dot = u'.'
            labels.pop()

        decoded = u".".join([ToUnicode(label) for label in labels])
        return decoded + trailing_dot, len(input)
196 | \r | |
class IncrementalEncoder(codecs.BufferedIncrementalEncoder):
    """Incremental IDNA encoder that emits only completed labels."""

    def _buffer_encode(self, input, errors, final):
        """Encode the complete labels found in *input*.

        Unless *final* is true, the last label is treated as possibly
        unfinished and is left out of the result.

        Returns a tuple (encoded, size), where size is the number of input
        characters covered by the encoded labels and separators.  Raises
        UnicodeError for any error handling other than 'strict'.
        """
        if errors != 'strict':
            # IDNA is quite clear that implementations must be strict
            raise UnicodeError("unsupported error handling "+errors)

        if not input:
            return ("", 0)

        labels = dots.split(input)
        # Must be a byte string like the rest of the output: starting from
        # u'' would turn the whole result into unicode whenever no trailing
        # dot is emitted.
        trailing_dot = ''
        if labels:
            if not labels[-1]:
                trailing_dot = '.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    trailing_dot = '.'

        result = []
        size = 0
        for label in labels:
            result.append(ToASCII(label))
            if size:
                # Account for the separator preceding this label.
                size += 1
            size += len(label)

        # Join with U+002E
        result = ".".join(result) + trailing_dot
        size += len(trailing_dot)
        return (result, size)
230 | \r | |
class IncrementalDecoder(codecs.BufferedIncrementalDecoder):
    """Incremental IDNA decoder that emits only completed labels."""

    def _buffer_decode(self, input, errors, final):
        """Decode the complete labels found in *input*.

        Unless *final* is true, the last label is treated as possibly
        unfinished and is left out of the result.

        Returns a tuple (decoded, size), where size is the number of input
        characters covered by the decoded labels and separators.  Raises
        UnicodeError for any error handling other than 'strict'.
        """
        if errors != 'strict':
            raise UnicodeError("Unsupported error handling "+errors)

        if not input:
            return (u"", 0)

        # IDNA allows decoding to operate on Unicode strings, too.
        if isinstance(input, unicode):
            labels = dots.split(input)
        else:
            # Must be ASCII string
            input = str(input)
            # Called only for its check: raises if input is not pure ASCII.
            unicode(input, "ascii")
            labels = input.split(".")

        trailing_dot = u''
        if labels:
            if not labels[-1]:
                trailing_dot = u'.'
                del labels[-1]
            elif not final:
                # Keep potentially unfinished label until the next call
                del labels[-1]
                if labels:
                    trailing_dot = u'.'

        result = [ToUnicode(label) for label in labels]

        # Count one separator between each pair of adjacent labels.  Empty
        # labels decode successfully, so the running total cannot be used
        # to tell whether a label has already been seen: a leading empty
        # label would leave it at zero and its separator would be missed.
        size = sum([len(label) for label in labels])
        if labels:
            size += len(labels) - 1

        result = u".".join(result) + trailing_dot
        size += len(trailing_dot)
        return (result, size)
270 | \r | |
class StreamWriter(Codec, codecs.StreamWriter):
    """Stream writer that takes its encode/decode methods from Codec."""
273 | \r | |
class StreamReader(Codec, codecs.StreamReader):
    """Stream reader that takes its encode/decode methods from Codec."""
276 | \r | |
277 | ### encodings module API\r | |
278 | \r | |
def getregentry():
    """Return the codecs.CodecInfo entry describing the 'idna' codec."""
    # Each bound method comes from its own Codec instance.
    encoder = Codec().encode
    decoder = Codec().decode
    return codecs.CodecInfo(
        name='idna',
        encode=encoder,
        decode=decoder,
        incrementalencoder=IncrementalEncoder,
        incrementaldecoder=IncrementalDecoder,
        streamwriter=StreamWriter,
        streamreader=StreamReader,
    )