# test_codecs.py -- CPython 2.7.2 standard-library codecs test suite, bundled
# as AppPkg/Applications/Python/Python-2.7.2/Lib/test/test_codecs.py in the
# mirror_edk2 tree.  (Stray git-web listing header removed.)
1 from test import test_support
2 import unittest
3 import codecs
4 import sys, StringIO, _testcapi
5
class Queue(object):
    """A minimal FIFO byte queue backed by a plain string.

    Bytes are appended at one end with write() and consumed from the
    other end with read().
    """
    def __init__(self):
        # Pending, not-yet-consumed bytes.
        self._buffer = ""

    def write(self, chars):
        """Append *chars* at the tail of the queue."""
        self._buffer = self._buffer + chars

    def read(self, size=-1):
        """Remove and return up to *size* bytes from the head.

        A negative *size* (the default) drains the whole queue.
        """
        if size < 0:
            size = len(self._buffer)
        taken, self._buffer = self._buffer[:size], self._buffer[size:]
        return taken
25
class ReadTest(unittest.TestCase):
    """Shared tests for stream readers and incremental decoders.

    This is a mixin: concrete subclasses set the ``encoding`` class
    attribute and inherit the generic tests below.
    """

    def check_partial(self, input, partialresults):
        """Feed the encoded form of *input* one byte at a time.

        *partialresults* lists, for every byte fed, the complete text the
        decoder must have produced so far.  The same expectation is checked
        four ways: through a StreamReader, through an incremental decoder,
        through the same decoder after reset(), and via iterdecode().
        """
        # get a StreamReader for the encoding and feed the bytestring version
        # of input to the reader byte by byte. Read everything available from
        # the StreamReader and check that the results equal the appropriate
        # entries from partialresults.
        q = Queue()
        r = codecs.getreader(self.encoding)(q)
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            q.write(c)
            result += r.read()
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(r.read(), u"")
        self.assertEqual(r.bytebuffer, "")
        self.assertEqual(r.charbuffer, u"")

        # do the check again, this time using an incremental decoder
        d = codecs.getincrementaldecoder(self.encoding)()
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # Check whether the reset method works properly
        d.reset()
        result = u""
        for (c, partialresult) in zip(input.encode(self.encoding), partialresults):
            result += d.decode(c)
            self.assertEqual(result, partialresult)
        # check that there's nothing left in the buffers
        self.assertEqual(d.decode("", True), u"")
        self.assertEqual(d.buffer, "")

        # check iterdecode()
        encoded = input.encode(self.encoding)
        self.assertEqual(
            input,
            u"".join(codecs.iterdecode(encoded, self.encoding))
        )

    def test_readline(self):
        """readline() must honour every line end, keepends and read sizes."""
        def getreader(input):
            # Wrap the encoded text in a StreamReader over a StringIO.
            stream = StringIO.StringIO(input.encode(self.encoding))
            return codecs.getreader(self.encoding)(stream)

        def readalllines(input, keepends=True, size=None):
            # Join all lines with "|" so missing/extra line ends show up.
            reader = getreader(input)
            lines = []
            while True:
                line = reader.readline(size=size, keepends=keepends)
                if not line:
                    break
                lines.append(line)
            return "|".join(lines)

        s = u"foo\nbar\r\nbaz\rspam\u2028eggs"
        sexpected = u"foo\n|bar\r\n|baz\r|spam\u2028|eggs"
        sexpectednoends = u"foo|bar|baz|spam|eggs"
        self.assertEqual(readalllines(s, True), sexpected)
        self.assertEqual(readalllines(s, False), sexpectednoends)
        self.assertEqual(readalllines(s, True, 10), sexpected)
        self.assertEqual(readalllines(s, False, 10), sexpectednoends)

        # Test long lines (multiple calls to read() in readline())
        vw = []
        vwo = []
        for (i, lineend) in enumerate(u"\n \r\n \r \u2028".split()):
            # Bug fix: upstream wrote u"\3042", which is the octal escape
            # \304 followed by the literal digit "2"; the intended character
            # is U+3042 (HIRAGANA LETTER A), as later fixed in CPython.
            vw.append((i*200)*u"\u3042" + lineend)
            vwo.append((i*200)*u"\u3042")
        self.assertEqual(readalllines("".join(vw), True), "".join(vw))
        self.assertEqual(readalllines("".join(vw), False),"".join(vwo))

        # Test lines where the first read might end with \r, so the
        # reader has to look ahead whether this is a lone \r or a \r\n
        for size in xrange(80):
            for lineend in u"\n \r\n \r \u2028".split():
                s = 10*(size*u"a" + lineend + u"xxx\n")
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=True),
                        size*u"a" + lineend,
                    )
                reader = getreader(s)
                for i in xrange(10):
                    self.assertEqual(
                        reader.readline(keepends=False),
                        size*u"a",
                    )

    def test_bug1175396(self):
        """Iterating a StreamReader must yield exactly the original lines."""
        s = [
            '<%!--===================================================\r\n',
            ' BLOG index page: show recent articles,\r\n',
            ' today\'s articles, or articles of a specific date.\r\n',
            '========================================================--%>\r\n',
            '<%@inputencoding="ISO-8859-1"%>\r\n',
            '<%@pagetemplate=TEMPLATE.y%>\r\n',
            '<%@import=import frog.util, frog%>\r\n',
            '<%@import=import frog.objects%>\r\n',
            '<%@import=from frog.storageerrors import StorageError%>\r\n',
            '<%\r\n',
            '\r\n',
            'import logging\r\n',
            'log=logging.getLogger("Snakelets.logger")\r\n',
            '\r\n',
            '\r\n',
            'user=self.SessionCtx.user\r\n',
            'storageEngine=self.SessionCtx.storageEngine\r\n',
            '\r\n',
            '\r\n',
            'def readArticlesFromDate(date, count=None):\r\n',
            ' entryids=storageEngine.listBlogEntries(date)\r\n',
            ' entryids.reverse() # descending\r\n',
            ' if count:\r\n',
            ' entryids=entryids[:count]\r\n',
            ' try:\r\n',
            ' return [ frog.objects.BlogEntry.load(storageEngine, date, Id) for Id in entryids ]\r\n',
            ' except StorageError,x:\r\n',
            ' log.error("Error loading articles: "+str(x))\r\n',
            ' self.abort("cannot load articles")\r\n',
            '\r\n',
            'showdate=None\r\n',
            '\r\n',
            'arg=self.Request.getArg()\r\n',
            'if arg=="today":\r\n',
            ' #-------------------- TODAY\'S ARTICLES\r\n',
            ' self.write("<h2>Today\'s articles</h2>")\r\n',
            ' showdate = frog.util.isodatestr() \r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'elif arg=="active":\r\n',
            ' #-------------------- ACTIVE ARTICLES redirect\r\n',
            ' self.Yredirect("active.y")\r\n',
            'elif arg=="login":\r\n',
            ' #-------------------- LOGIN PAGE redirect\r\n',
            ' self.Yredirect("login.y")\r\n',
            'elif arg=="date":\r\n',
            ' #-------------------- ARTICLES OF A SPECIFIC DATE\r\n',
            ' showdate = self.Request.getParameter("date")\r\n',
            ' self.write("<h2>Articles written on %s</h2>"% frog.util.mediumdatestr(showdate))\r\n',
            ' entries = readArticlesFromDate(showdate)\r\n',
            'else:\r\n',
            ' #-------------------- RECENT ARTICLES\r\n',
            ' self.write("<h2>Recent articles</h2>")\r\n',
            ' dates=storageEngine.listBlogEntryDates()\r\n',
            ' if dates:\r\n',
            ' entries=[]\r\n',
            ' SHOWAMOUNT=10\r\n',
            ' for showdate in dates:\r\n',
            ' entries.extend( readArticlesFromDate(showdate, SHOWAMOUNT-len(entries)) )\r\n',
            ' if len(entries)>=SHOWAMOUNT:\r\n',
            ' break\r\n',
            ' \r\n',
        ]
        stream = StringIO.StringIO("".join(s).encode(self.encoding))
        reader = codecs.getreader(self.encoding)(stream)
        for (i, line) in enumerate(reader):
            self.assertEqual(line, s[i])

    def test_readlinequeue(self):
        """readline() must cope with data arriving in partial writes."""
        q = Queue()
        writer = codecs.getwriter(self.encoding)(q)
        reader = codecs.getreader(self.encoding)(q)

        # No lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=False), u"foo")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=False), u"")
        self.assertEqual(reader.readline(keepends=False), u"bar")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=False), u"baz")
        self.assertEqual(reader.readline(keepends=False), u"")

        # Lineends
        writer.write(u"foo\r")
        self.assertEqual(reader.readline(keepends=True), u"foo\r")
        writer.write(u"\nbar\r")
        self.assertEqual(reader.readline(keepends=True), u"\n")
        self.assertEqual(reader.readline(keepends=True), u"bar\r")
        writer.write(u"baz")
        self.assertEqual(reader.readline(keepends=True), u"baz")
        self.assertEqual(reader.readline(keepends=True), u"")
        writer.write(u"foo\r\n")
        self.assertEqual(reader.readline(keepends=True), u"foo\r\n")

    def test_bug1098990_a(self):
        """A long line followed by short ones must not confuse readline()."""
        s1 = u"xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy\r\n"
        s2 = u"offending line: ladfj askldfj klasdj fskla dfzaskdj fasklfj laskd fjasklfzzzzaa%whereisthis!!!\r\n"
        s3 = u"next line.\r\n"

        s = (s1+s2+s3).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), u"")

    def test_bug1098990_b(self):
        """readline() must keep working after a break near the buffer edge."""
        s1 = u"aaaaaaaaaaaaaaaaaaaaaaaa\r\n"
        s2 = u"bbbbbbbbbbbbbbbbbbbbbbbb\r\n"
        s3 = u"stillokay:bbbbxx\r\n"
        s4 = u"broken!!!!badbad\r\n"
        s5 = u"againokay.\r\n"

        s = (s1+s2+s3+s4+s5).encode(self.encoding)
        stream = StringIO.StringIO(s)
        reader = codecs.getreader(self.encoding)(stream)
        self.assertEqual(reader.readline(), s1)
        self.assertEqual(reader.readline(), s2)
        self.assertEqual(reader.readline(), s3)
        self.assertEqual(reader.readline(), s4)
        self.assertEqual(reader.readline(), s5)
        self.assertEqual(reader.readline(), u"")
246
class UTF32Test(ReadTest):
    # Tests for the byte-order-autodetecting "utf-32" codec: BOM handling,
    # byte-at-a-time decoding and error handling.
    encoding = "utf-32"

    # u"spamspam" encoded as UTF-32 with exactly one leading little-endian /
    # big-endian BOM.
    spamle = ('\xff\xfe\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00'
              's\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m\x00\x00\x00')
    spambe = ('\x00\x00\xfe\xff'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m'
              '\x00\x00\x00s\x00\x00\x00p\x00\x00\x00a\x00\x00\x00m')

    def test_only_one_bom(self):
        """The StreamWriter must emit the BOM only on the first write()."""
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(d == self.spamle or d == self.spambe)
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEqual(f.read(), u"spamspam")

    def test_badbom(self):
        """Input that is only an invalid BOM must raise UnicodeError."""
        s = StringIO.StringIO(4*"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO(8*"\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        """Each character appears only after its fourth byte (BOM first)."""
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read
                u"", # third byte of BOM read
                u"", # fourth byte of BOM read => byteorder known
                u"",
                u"",
                u"",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_handlers(self):
        """A lone byte is replaced or dropped depending on the error handler."""
        self.assertEqual((u'\ufffd', 1),
                         codecs.utf_32_decode('\x01', 'replace', True))
        self.assertEqual((u'', 1),
                         codecs.utf_32_decode('\x01', 'ignore', True))

    def test_errors(self):
        """A truncated code unit must raise UnicodeDecodeError when strict."""
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        encoded_le = '\xff\xfe\x00\x00' + '\x00\x00\x01\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_le)[0])
        encoded_be = '\x00\x00\xfe\xff' + '\x00\x01\x00\x00' * 1024
        self.assertEqual(u'\U00010000' * 1024,
                         codecs.utf_32_decode(encoded_be)[0])
327
class UTF32LETest(ReadTest):
    """Tests for the fixed-endianness "utf-32-le" codec (no BOM)."""
    encoding = "utf-32-le"

    def test_partial(self):
        # Every character occupies four bytes: the decoder produces no new
        # output for the first three bytes and the completed character
        # after the fourth.
        text = u"\x00\xff\u0100\uffff"
        expected = []
        decoded = u""
        for ch in text:
            expected.extend([decoded] * 3)
            decoded += ch
            expected.append(decoded)
        self.check_partial(text, expected)

    def test_simple(self):
        # Little endian: least significant byte first.
        self.assertEqual(u"\U00010203".encode(self.encoding), "\x03\x02\x01\x00")

    def test_errors(self):
        # A single stray byte can never form a valid four-byte code unit.
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_le_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        data = '\x00\x00\x01\x00' * 1024
        decoded = codecs.utf_32_le_decode(data)[0]
        self.assertEqual(decoded, u'\U00010000' * 1024)
367
class UTF32BETest(ReadTest):
    """Tests for the fixed-endianness "utf-32-be" codec (no BOM)."""
    encoding = "utf-32-be"

    def test_partial(self):
        text = u"\x00\xff\u0100\uffff"
        # After nbytes bytes the reader has decoded exactly the first
        # nbytes // 4 characters of the text.
        partials = [text[:nbytes // 4]
                    for nbytes in xrange(1, 4 * len(text) + 1)]
        self.check_partial(text, partials)

    def test_simple(self):
        # Big endian: most significant byte first.
        self.assertEqual(u"\U00010203".encode(self.encoding), "\x00\x01\x02\x03")

    def test_errors(self):
        # A single stray byte can never form a valid four-byte code unit.
        self.assertRaises(UnicodeDecodeError, codecs.utf_32_be_decode,
                          "\xff", "strict", True)

    def test_issue8941(self):
        # Issue #8941: insufficient result allocation when decoding into
        # surrogate pairs on UCS-2 builds.
        data = '\x00\x01\x00\x00' * 1024
        decoded = codecs.utf_32_be_decode(data)[0]
        self.assertEqual(decoded, u'\U00010000' * 1024)
407
408
class UTF16Test(ReadTest):
    # Tests for the byte-order-autodetecting "utf-16" codec.
    encoding = "utf-16"

    # u"spamspam" encoded with exactly one little-endian / big-endian BOM.
    spamle = '\xff\xfes\x00p\x00a\x00m\x00s\x00p\x00a\x00m\x00'
    spambe = '\xfe\xff\x00s\x00p\x00a\x00m\x00s\x00p\x00a\x00m'

    def test_only_one_bom(self):
        """The StreamWriter must emit the BOM only on the first write()."""
        _,_,reader,writer = codecs.lookup(self.encoding)
        # encode some stream
        s = StringIO.StringIO()
        f = writer(s)
        f.write(u"spam")
        f.write(u"spam")
        d = s.getvalue()
        # check whether there is exactly one BOM in it
        self.assertTrue(d == self.spamle or d == self.spambe)
        # try to read it back
        s = StringIO.StringIO(d)
        f = reader(s)
        self.assertEqual(f.read(), u"spamspam")

    def test_badbom(self):
        """Input that is only an invalid BOM must raise UnicodeError."""
        s = StringIO.StringIO("\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

        s = StringIO.StringIO("\xff\xff\xff\xff")
        f = codecs.getreader(self.encoding)(s)
        self.assertRaises(UnicodeError, f.read)

    def test_partial(self):
        """The BOM bytes yield no output; each char needs two more bytes."""
        self.check_partial(
            u"\x00\xff\u0100\uffff",
            [
                u"", # first byte of BOM read
                u"", # second byte of BOM read => byteorder known
                u"",
                u"\x00",
                u"\x00",
                u"\x00\xff",
                u"\x00\xff",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100",
                u"\x00\xff\u0100\uffff",
            ]
        )

    def test_handlers(self):
        """A lone byte is replaced or dropped depending on the error handler."""
        self.assertEqual((u'\ufffd', 1),
                         codecs.utf_16_decode('\x01', 'replace', True))
        self.assertEqual((u'', 1),
                         codecs.utf_16_decode('\x01', 'ignore', True))

    def test_errors(self):
        """A truncated code unit must raise UnicodeDecodeError when strict."""
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_decode, "\xff", "strict", True)

    def test_bug691291(self):
        # Files are always opened in binary mode, even if no binary mode was
        # specified. This means that no automatic conversion of '\n' is done
        # on reading and writing.
        s1 = u'Hello\r\nworld\r\n'

        s = s1.encode(self.encoding)
        self.addCleanup(test_support.unlink, test_support.TESTFN)
        with open(test_support.TESTFN, 'wb') as fp:
            fp.write(s)
        with codecs.open(test_support.TESTFN, 'U', encoding=self.encoding) as reader:
            self.assertEqual(reader.read(), s1)
477
class UTF16LETest(ReadTest):
    """Tests for the fixed-endianness "utf-16-le" codec (no BOM)."""
    encoding = "utf-16-le"

    def test_partial(self):
        # Each character occupies two bytes: nothing new is produced after
        # the first byte, the full character after the second.
        text = u"\x00\xff\u0100\uffff"
        expected = []
        decoded = u""
        for ch in text:
            expected.append(decoded)
            decoded += ch
            expected.append(decoded)
        self.check_partial(text, expected)

    def test_errors(self):
        # A single stray byte can never form a valid code unit.
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_le_decode, "\xff", "strict", True)
498
class UTF16BETest(ReadTest):
    """Tests for the fixed-endianness "utf-16-be" codec (no BOM)."""
    encoding = "utf-16-be"

    def test_partial(self):
        text = u"\x00\xff\u0100\uffff"
        # After nbytes bytes the reader has decoded the first nbytes // 2
        # characters of the text.
        partials = [text[:nbytes // 2]
                    for nbytes in xrange(1, 2 * len(text) + 1)]
        self.check_partial(text, partials)

    def test_errors(self):
        # A single stray byte can never form a valid code unit.
        self.assertRaises(UnicodeDecodeError, codecs.utf_16_be_decode, "\xff", "strict", True)
519
class UTF8Test(ReadTest):
    """Byte-at-a-time decoding tests for the "utf-8" codec."""
    encoding = "utf-8"

    def test_partial(self):
        # For every byte fed, record the text decoded so far: a character
        # only appears once its final UTF-8 byte has arrived.
        text = u"\x00\xff\u07ff\u0800\uffff"
        expected = []
        decoded = u""
        for ch in text:
            nbytes = len(ch.encode(self.encoding))
            expected.extend([decoded] * (nbytes - 1))
            decoded += ch
            expected.append(decoded)
        self.check_partial(text, expected)
540
class UTF7Test(ReadTest):
    """Byte-at-a-time decoding tests for the "utf-7" codec."""
    encoding = "utf-7"

    def test_partial(self):
        # u"a+-b" encodes to the five bytes "a+--b": the "+" character only
        # becomes visible once the terminating "-" of its "+-" escape has
        # been seen by the decoder.
        self.check_partial(
            u"a+-b",
            [u"a", u"a", u"a+", u"a+-", u"a+-b"],
        )
555
556 class UTF16ExTest(unittest.TestCase):
557
558 def test_errors(self):
559 self.assertRaises(UnicodeDecodeError, codecs.utf_16_ex_decode, "\xff", "strict", 0, True)
560
561 def test_bad_args(self):
562 self.assertRaises(TypeError, codecs.utf_16_ex_decode)
563
class ReadBufferTest(unittest.TestCase):
    """Tests for the low-level codecs.readbuffer_encode helper."""

    def test_array(self):
        import array
        # Objects exposing the read-buffer interface are accepted too.
        data = array.array("c", "spam")
        self.assertEqual(codecs.readbuffer_encode(data), ("spam", 4))

    def test_empty(self):
        self.assertEqual(codecs.readbuffer_encode(""), ("", 0))

    def test_bad_args(self):
        # A missing argument and objects without a buffer interface are
        # both rejected.
        with self.assertRaises(TypeError):
            codecs.readbuffer_encode()
        with self.assertRaises(TypeError):
            codecs.readbuffer_encode(42)
579
class CharBufferTest(unittest.TestCase):
    """Tests for the low-level codecs.charbuffer_encode helper."""

    def test_string(self):
        self.assertEqual(codecs.charbuffer_encode("spam"), ("spam", 4))

    def test_empty(self):
        self.assertEqual(codecs.charbuffer_encode(""), ("", 0))

    def test_bad_args(self):
        # A missing argument and objects without a char-buffer interface
        # are both rejected.
        with self.assertRaises(TypeError):
            codecs.charbuffer_encode()
        with self.assertRaises(TypeError):
            codecs.charbuffer_encode(42)
591
class UTF8SigTest(ReadTest):
    """Tests for "utf-8-sig": UTF-8 that strips exactly one leading BOM."""
    encoding = "utf-8-sig"

    def test_partial(self):
        # Feed byte by byte: the first (and only the first) encoded BOM is
        # skipped; a second encoded BOM is a real U+FEFF and is emitted.
        self.check_partial(
            u"\ufeff\x00\xff\u07ff\u0800\uffff",
            [
                u"",
                u"",
                u"", # First BOM has been read and skipped
                u"",
                u"",
                u"\ufeff", # Second BOM has been read and emitted
                u"\ufeff\x00", # "\x00" read and emitted
                u"\ufeff\x00", # First byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # Second byte of encoded u"\xff" read
                u"\ufeff\x00\xff", # First byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff", # Second byte of encoded u"\u07ff" read
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800",
                u"\ufeff\x00\xff\u07ff\u0800\uffff",
            ]
        )

    def test_bug1601501(self):
        # SF bug #1601501: check that the codec works with a buffer
        unicode("\xef\xbb\xbf", "utf-8-sig")

    def test_bom(self):
        """The incremental decoder must strip the BOM it encoded itself."""
        d = codecs.getincrementaldecoder("utf-8-sig")()
        s = u"spam"
        self.assertEqual(d.decode(s.encode("utf-8-sig")), s)

    def _check_stream_roundtrip(self, bytestring, unistring):
        # Shared helper for test_stream_bom/test_stream_bare: read
        # bytestring through a utf-8-sig StreamReader with a variety of
        # read sizes and check that the decoded result equals unistring.
        reader = codecs.getreader("utf-8-sig")
        for sizehint in [None] + range(1, 11) + \
                        [64, 128, 256, 512, 1024]:
            istream = reader(StringIO.StringIO(bytestring))
            ostream = StringIO.StringIO()
            while 1:
                if sizehint is not None:
                    data = istream.read(sizehint)
                else:
                    data = istream.read()

                if not data:
                    break
                ostream.write(data)

            got = ostream.getvalue()
            self.assertEqual(got, unistring)

    def test_stream_bom(self):
        # A leading BOM must be skipped regardless of the read chunking.
        self._check_stream_roundtrip(
            codecs.BOM_UTF8 + "ABC\xC2\xA1\xE2\x88\x80XYZ",
            u"ABC\u00A1\u2200XYZ")

    def test_stream_bare(self):
        # Without a BOM the data must decode unchanged.
        self._check_stream_roundtrip(
            "ABC\xC2\xA1\xE2\x88\x80XYZ",
            u"ABC\u00A1\u2200XYZ")
671
class EscapeDecodeTest(unittest.TestCase):
    def test_empty(self):
        """escape_decode of "" yields empty output with 0 bytes consumed."""
        decoded, consumed = codecs.escape_decode("")
        self.assertEqual(decoded, "")
        self.assertEqual(consumed, 0)
675
class RecodingTest(unittest.TestCase):
    def test_recoding(self):
        """Write through an EncodedFile and close it without crashing.

        Python used to crash on this at exit because of a refcount
        bug in _codecsmodule.c.
        """
        underlying = StringIO.StringIO()
        recoder = codecs.EncodedFile(underlying, "unicode_internal", "utf-8")
        recoder.write(u"a")
        recoder.close()
684
# From RFC 3492 (the Punycode specification): the sample strings of
# section 7.1.  Each entry is a (unicode, punycode) pair consumed by
# PunycodeTest below.
punycode_testcases = [
    # A Arabic (Egyptian):
    (u"\u0644\u064A\u0647\u0645\u0627\u0628\u062A\u0643\u0644"
     u"\u0645\u0648\u0634\u0639\u0631\u0628\u064A\u061F",
     "egbpdaj6bu4bxfgehfvwxn"),
    # B Chinese (simplified):
    (u"\u4ED6\u4EEC\u4E3A\u4EC0\u4E48\u4E0D\u8BF4\u4E2D\u6587",
     "ihqwcrb4cv8a8dqg056pqjye"),
    # C Chinese (traditional):
    (u"\u4ED6\u5011\u7232\u4EC0\u9EBD\u4E0D\u8AAA\u4E2D\u6587",
     "ihqwctvzc91f659drss3x8bo0yb"),
    # D Czech: Pro<ccaron>prost<ecaron>nemluv<iacute><ccaron>esky
    (u"\u0050\u0072\u006F\u010D\u0070\u0072\u006F\u0073\u0074"
     u"\u011B\u006E\u0065\u006D\u006C\u0075\u0076\u00ED\u010D"
     u"\u0065\u0073\u006B\u0079",
     "Proprostnemluvesky-uyb24dma41a"),
    # E Hebrew:
    (u"\u05DC\u05DE\u05D4\u05D4\u05DD\u05E4\u05E9\u05D5\u05D8"
     u"\u05DC\u05D0\u05DE\u05D3\u05D1\u05E8\u05D9\u05DD\u05E2"
     u"\u05D1\u05E8\u05D9\u05EA",
     "4dbcagdahymbxekheh6e0a7fei0b"),
    # F Hindi (Devanagari):
    (u"\u092F\u0939\u0932\u094B\u0917\u0939\u093F\u0928\u094D"
     u"\u0926\u0940\u0915\u094D\u092F\u094B\u0902\u0928\u0939"
     u"\u0940\u0902\u092C\u094B\u0932\u0938\u0915\u0924\u0947"
     u"\u0939\u0948\u0902",
     "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"),

    #(G) Japanese (kanji and hiragana):
    (u"\u306A\u305C\u307F\u3093\u306A\u65E5\u672C\u8A9E\u3092"
     u"\u8A71\u3057\u3066\u304F\u308C\u306A\u3044\u306E\u304B",
     "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"),

    # (H) Korean (Hangul syllables):
    (u"\uC138\uACC4\uC758\uBAA8\uB4E0\uC0AC\uB78C\uB4E4\uC774"
     u"\uD55C\uAD6D\uC5B4\uB97C\uC774\uD574\uD55C\uB2E4\uBA74"
     u"\uC5BC\uB9C8\uB098\uC88B\uC744\uAE4C",
     "989aomsvi5e83db1d2a355cv1e0vak1dwrv93d5xbh15a0dt30a5j"
     "psd879ccm6fea98c"),

    # (I) Russian (Cyrillic):
    (u"\u043F\u043E\u0447\u0435\u043C\u0443\u0436\u0435\u043E"
     u"\u043D\u0438\u043D\u0435\u0433\u043E\u0432\u043E\u0440"
     u"\u044F\u0442\u043F\u043E\u0440\u0443\u0441\u0441\u043A"
     u"\u0438",
     "b1abfaaepdrnnbgefbaDotcwatmq2g4l"),

    # (J) Spanish: Porqu<eacute>nopuedensimplementehablarenEspa<ntilde>ol
    (u"\u0050\u006F\u0072\u0071\u0075\u00E9\u006E\u006F\u0070"
     u"\u0075\u0065\u0064\u0065\u006E\u0073\u0069\u006D\u0070"
     u"\u006C\u0065\u006D\u0065\u006E\u0074\u0065\u0068\u0061"
     u"\u0062\u006C\u0061\u0072\u0065\u006E\u0045\u0073\u0070"
     u"\u0061\u00F1\u006F\u006C",
     "PorqunopuedensimplementehablarenEspaol-fmd56a"),

    # (K) Vietnamese:
    # T<adotbelow>isaoh<odotbelow>kh<ocirc>ngth<ecirchookabove>ch\
    # <ihookabove>n<oacute>iti<ecircacute>ngVi<ecircdotbelow>t
    (u"\u0054\u1EA1\u0069\u0073\u0061\u006F\u0068\u1ECD\u006B"
     u"\u0068\u00F4\u006E\u0067\u0074\u0068\u1EC3\u0063\u0068"
     u"\u1EC9\u006E\u00F3\u0069\u0074\u0069\u1EBF\u006E\u0067"
     u"\u0056\u0069\u1EC7\u0074",
     "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"),

    #(L) 3<nen>B<gumi><kinpachi><sensei>
    (u"\u0033\u5E74\u0042\u7D44\u91D1\u516B\u5148\u751F",
     "3B-ww4c5e180e575a65lsy2b"),

    # (M) <amuro><namie>-with-SUPER-MONKEYS
    (u"\u5B89\u5BA4\u5948\u7F8E\u6075\u002D\u0077\u0069\u0074"
     u"\u0068\u002D\u0053\u0055\u0050\u0045\u0052\u002D\u004D"
     u"\u004F\u004E\u004B\u0045\u0059\u0053",
     "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"),

    # (N) Hello-Another-Way-<sorezore><no><basho>
    (u"\u0048\u0065\u006C\u006C\u006F\u002D\u0041\u006E\u006F"
     u"\u0074\u0068\u0065\u0072\u002D\u0057\u0061\u0079\u002D"
     u"\u305D\u308C\u305E\u308C\u306E\u5834\u6240",
     "Hello-Another-Way--fc4qua05auwb3674vfr0b"),

    # (O) <hitotsu><yane><no><shita>2
    (u"\u3072\u3068\u3064\u5C4B\u6839\u306E\u4E0B\u0032",
     "2-u9tlzr9756bt3uc0v"),

    # (P) Maji<de>Koi<suru>5<byou><mae>
    (u"\u004D\u0061\u006A\u0069\u3067\u004B\u006F\u0069\u3059"
     u"\u308B\u0035\u79D2\u524D",
     "MajiKoi5-783gue6qz075azm5e"),

    # (Q) <pafii>de<runba>
    (u"\u30D1\u30D5\u30A3\u30FC\u0064\u0065\u30EB\u30F3\u30D0",
     "de-jg4avhby1noc0d"),

    # (R) <sono><supiido><de>
    (u"\u305D\u306E\u30B9\u30D4\u30FC\u30C9\u3067",
     "d9juau41awczczp"),

    # (S) -> $1.00 <-
    (u"\u002D\u003E\u0020\u0024\u0031\u002E\u0030\u0030\u0020"
     u"\u003C\u002D",
     "-> $1.00 <--")
]

# Sanity check on the table itself: every entry must be a 2-tuple; any
# malformed entry is dumped so the table can be repaired.
for i in punycode_testcases:
    if len(i)!=2:
        print repr(i)
792
class PunycodeTest(unittest.TestCase):
    """Round-trip the RFC 3492 sample strings through the punycode codec."""

    def test_encode(self):
        for plain, encoded in punycode_testcases:
            # Need to convert both strings to lower case, since
            # some of the extended encodings use upper case, but our
            # code produces only lower case. Converting just the encoded
            # side is insufficient, since some of the input characters
            # are upper case.
            self.assertEqual(plain.encode("punycode").lower(),
                             encoded.lower())

    def test_decode(self):
        for plain, encoded in punycode_testcases:
            self.assertEqual(encoded.decode("punycode"), plain)
806
class UnicodeInternalTest(unittest.TestCase):
    # Tests for the "unicode_internal" codec, which exposes the
    # interpreter's internal unicode representation.  Most tests only make
    # sense on UCS-4 builds, hence the sys.maxunicode guards.

    def test_bug1251300(self):
        # Decoding with unicode_internal used to not correctly handle "code
        # points" above 0x10ffff on UCS-4 builds.
        if sys.maxunicode > 0xffff:
            # Vectors are written most-significant-byte first and are
            # byte-swapped below on little-endian machines.
            ok = [
                ("\x00\x10\xff\xff", u"\U0010ffff"),
                ("\x00\x00\x01\x01", u"\U00000101"),
                ("", u""),
            ]
            not_ok = [
                "\x7f\xff\xff\xff",
                "\x80\x00\x00\x00",
                "\x81\x00\x00\x00",
                "\x00",
                "\x00\x00\x00\x00\x00",
            ]
            for internal, uni in ok:
                if sys.byteorder == "little":
                    internal = "".join(reversed(internal))
                self.assertEqual(uni, internal.decode("unicode_internal"))
            for internal in not_ok:
                if sys.byteorder == "little":
                    internal = "".join(reversed(internal))
                self.assertRaises(UnicodeDecodeError, internal.decode,
                                  "unicode_internal")

    def test_decode_error_attributes(self):
        # The raised UnicodeDecodeError must carry the encoding name, the
        # offending input and the exact start/end offsets of the bad
        # four-byte unit.
        if sys.maxunicode > 0xffff:
            try:
                "\x00\x00\x00\x00\x00\x11\x11\x00".decode("unicode_internal")
            except UnicodeDecodeError, ex:
                self.assertEqual("unicode_internal", ex.encoding)
                self.assertEqual("\x00\x00\x00\x00\x00\x11\x11\x00", ex.object)
                self.assertEqual(4, ex.start)
                self.assertEqual(8, ex.end)
            else:
                self.fail()

    def test_decode_callback(self):
        # A registered error handler (here: ignore) must be honoured and
        # the reported length must cover the entire input.
        if sys.maxunicode > 0xffff:
            codecs.register_error("UnicodeInternalTest", codecs.ignore_errors)
            decoder = codecs.getdecoder("unicode_internal")
            ab = u"ab".encode("unicode_internal")
            ignored = decoder("%s\x22\x22\x22\x22%s" % (ab[:4], ab[4:]),
                              "UnicodeInternalTest")
            self.assertEqual((u"ab", 12), ignored)

    def test_encode_length(self):
        # Issue 3739
        # The length reported by the encoder must be the number of input
        # characters consumed, not the number of bytes produced.
        encoder = codecs.getencoder("unicode_internal")
        self.assertEqual(encoder(u"a")[1], 1)
        self.assertEqual(encoder(u"\xe9\u0142")[1], 2)

        encoder = codecs.getencoder("string-escape")
        self.assertEqual(encoder(r'\x00')[1], 4)
863
# From http://www.gnu.org/software/libidn/draft-josefsson-idn-test-vectors.html
# Nameprep (RFC 3491) test vectors consumed by NameprepTest below.  Each
# entry is (input, expected-output) as UTF-8 byte strings; an expected
# output of None means the input must be rejected, and (None, None) marks
# a vector that is deliberately skipped.
nameprep_tests = [
    # 3.1 Map to nothing.
    ('foo\xc2\xad\xcd\x8f\xe1\xa0\x86\xe1\xa0\x8bbar'
     '\xe2\x80\x8b\xe2\x81\xa0baz\xef\xb8\x80\xef\xb8\x88\xef'
     '\xb8\x8f\xef\xbb\xbf',
     'foobarbaz'),
    # 3.2 Case folding ASCII U+0043 U+0041 U+0046 U+0045.
    ('CAFE',
     'cafe'),
    # 3.3 Case folding 8bit U+00DF (german sharp s).
    # The original test case is bogus; it says \xc3\xdf
    ('\xc3\x9f',
     'ss'),
    # 3.4 Case folding U+0130 (turkish capital I with dot).
    ('\xc4\xb0',
     'i\xcc\x87'),
    # 3.5 Case folding multibyte U+0143 U+037A.
    ('\xc5\x83\xcd\xba',
     '\xc5\x84 \xce\xb9'),
    # 3.6 Case folding U+2121 U+33C6 U+1D7BB.
    # XXX: skip this as it fails in UCS-2 mode
    #('\xe2\x84\xa1\xe3\x8f\x86\xf0\x9d\x9e\xbb',
    # 'telc\xe2\x88\x95kg\xcf\x83'),
    (None, None),
    # 3.7 Normalization of U+006a U+030c U+00A0 U+00AA.
    ('j\xcc\x8c\xc2\xa0\xc2\xaa',
     '\xc7\xb0 a'),
    # 3.8 Case folding U+1FB7 and normalization.
    ('\xe1\xbe\xb7',
     '\xe1\xbe\xb6\xce\xb9'),
    # 3.9 Self-reverting case folding U+01F0 and normalization.
    # The original test case is bogus, it says `\xc7\xf0'
    ('\xc7\xb0',
     '\xc7\xb0'),
    # 3.10 Self-reverting case folding U+0390 and normalization.
    ('\xce\x90',
     '\xce\x90'),
    # 3.11 Self-reverting case folding U+03B0 and normalization.
    ('\xce\xb0',
     '\xce\xb0'),
    # 3.12 Self-reverting case folding U+1E96 and normalization.
    ('\xe1\xba\x96',
     '\xe1\xba\x96'),
    # 3.13 Self-reverting case folding U+1F56 and normalization.
    ('\xe1\xbd\x96',
     '\xe1\xbd\x96'),
    # 3.14 ASCII space character U+0020.
    (' ',
     ' '),
    # 3.15 Non-ASCII 8bit space character U+00A0.
    ('\xc2\xa0',
     ' '),
    # 3.16 Non-ASCII multibyte space character U+1680.
    ('\xe1\x9a\x80',
     None),
    # 3.17 Non-ASCII multibyte space character U+2000.
    ('\xe2\x80\x80',
     ' '),
    # 3.18 Zero Width Space U+200b.
    ('\xe2\x80\x8b',
     ''),
    # 3.19 Non-ASCII multibyte space character U+3000.
    ('\xe3\x80\x80',
     ' '),
    # 3.20 ASCII control characters U+0010 U+007F.
    ('\x10\x7f',
     '\x10\x7f'),
    # 3.21 Non-ASCII 8bit control character U+0085.
    ('\xc2\x85',
     None),
    # 3.22 Non-ASCII multibyte control character U+180E.
    ('\xe1\xa0\x8e',
     None),
    # 3.23 Zero Width No-Break Space U+FEFF.
    ('\xef\xbb\xbf',
     ''),
    # 3.24 Non-ASCII control character U+1D175.
    ('\xf0\x9d\x85\xb5',
     None),
    # 3.25 Plane 0 private use character U+F123.
    ('\xef\x84\xa3',
     None),
    # 3.26 Plane 15 private use character U+F1234.
    ('\xf3\xb1\x88\xb4',
     None),
    # 3.27 Plane 16 private use character U+10F234.
    ('\xf4\x8f\x88\xb4',
     None),
    # 3.28 Non-character code point U+8FFFE.
    ('\xf2\x8f\xbf\xbe',
     None),
    # 3.29 Non-character code point U+10FFFF.
    ('\xf4\x8f\xbf\xbf',
     None),
    # 3.30 Surrogate code U+DF42.
    ('\xed\xbd\x82',
     None),
    # 3.31 Non-plain text character U+FFFD.
    ('\xef\xbf\xbd',
     None),
    # 3.32 Ideographic description character U+2FF5.
    ('\xe2\xbf\xb5',
     None),
    # 3.33 Display property character U+0341.
    ('\xcd\x81',
     '\xcc\x81'),
    # 3.34 Left-to-right mark U+200E.
    ('\xe2\x80\x8e',
     None),
    # 3.35 Deprecated U+202A.
    ('\xe2\x80\xaa',
     None),
    # 3.36 Language tagging character U+E0001.
    ('\xf3\xa0\x80\x81',
     None),
    # 3.37 Language tagging character U+E0042.
    ('\xf3\xa0\x81\x82',
     None),
    # 3.38 Bidi: RandALCat character U+05BE and LCat characters.
    ('foo\xd6\xbebar',
     None),
    # 3.39 Bidi: RandALCat character U+FD50 and LCat characters.
    ('foo\xef\xb5\x90bar',
     None),
    # 3.40 Bidi: RandALCat character U+FB38 and LCat characters.
    ('foo\xef\xb9\xb6bar',
     'foo \xd9\x8ebar'),
    # 3.41 Bidi: RandALCat without trailing RandALCat U+0627 U+0031.
    ('\xd8\xa71',
     None),
    # 3.42 Bidi: RandALCat character U+0627 U+0031 U+0628.
    ('\xd8\xa71\xd8\xa8',
     '\xd8\xa71\xd8\xa8'),
    # 3.43 Unassigned code point U+E0002.
    # Skip this test as we allow unassigned
    #('\xf3\xa0\x80\x82',
    # None),
    (None, None),
    # 3.44 Larger test (shrinking).
    # Original test case reads \xc3\xdf
    ('X\xc2\xad\xc3\x9f\xc4\xb0\xe2\x84\xa1j\xcc\x8c\xc2\xa0\xc2'
     '\xaa\xce\xb0\xe2\x80\x80',
     'xssi\xcc\x87tel\xc7\xb0 a\xce\xb0 '),
    # 3.45 Larger test (expanding).
    # Original test case reads \xc3\x9f
    ('X\xc3\x9f\xe3\x8c\x96\xc4\xb0\xe2\x84\xa1\xe2\x92\x9f\xe3\x8c'
     '\x80',
     'xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3'
     '\x83\x88\xe3\x83\xabi\xcc\x87tel\x28d\x29\xe3\x82'
     '\xa2\xe3\x83\x91\xe3\x83\xbc\xe3\x83\x88')
]
1016
1017
class NameprepTest(unittest.TestCase):
    """Run the RFC 3491 nameprep test vectors against encodings.idna.nameprep."""

    def test_nameprep(self):
        from encodings.idna import nameprep
        for pos, (orig, prepped) in enumerate(nameprep_tests):
            if orig is None:
                # Vector deliberately skipped (e.g. unassigned code points).
                continue
            # The test vectors are stored as UTF-8 byte strings.
            orig = unicode(orig, "utf-8")
            if prepped is None:
                # Input contains prohibited characters; nameprep must reject it.
                self.assertRaises(UnicodeError, nameprep, orig)
            else:
                prepped = unicode(prepped, "utf-8")
                try:
                    self.assertEqual(nameprep(orig), prepped)
                except Exception as e:
                    # Fix: use "except ... as e" (valid since Python 2.6)
                    # instead of the legacy "except Exception,e:" comma form,
                    # which is a syntax error on Python 3.
                    raise test_support.TestFailed("Test 3.%d: %s" % (pos+1, str(e)))
1036
class IDNACodecTest(unittest.TestCase):
    """Tests for the "idna" codec (RFC 3490 internationalized domain names)."""

    def test_builtin_decode(self):
        # ASCII-only labels pass through; ACE ("xn--") labels are decoded.
        self.assertEqual(unicode("python.org", "idna"), u"python.org")
        self.assertEqual(unicode("python.org.", "idna"), u"python.org.")
        self.assertEqual(unicode("xn--pythn-mua.org", "idna"), u"pyth\xf6n.org")
        self.assertEqual(unicode("xn--pythn-mua.org.", "idna"), u"pyth\xf6n.org.")

    def test_builtin_encode(self):
        # Both unicode and plain str inputs are accepted by the codec.
        self.assertEqual(u"python.org".encode("idna"), "python.org")
        self.assertEqual("python.org.".encode("idna"), "python.org.")
        self.assertEqual(u"pyth\xf6n.org".encode("idna"), "xn--pythn-mua.org")
        self.assertEqual(u"pyth\xf6n.org.".encode("idna"), "xn--pythn-mua.org.")

    def test_stream(self):
        # Fix: dropped the redundant function-local "import StringIO";
        # the module is already imported at the top of this file.
        r = codecs.getreader("idna")(StringIO.StringIO("abc"))
        r.read(3)
        # A partial label is buffered until more data (or EOF) arrives.
        self.assertEqual(r.read(), u"")

    def test_incremental_decode(self):
        # iterdecode() over byte strings yields the decoded labels.
        self.assertEqual(
            "".join(codecs.iterdecode("python.org", "idna")),
            u"python.org"
        )
        self.assertEqual(
            "".join(codecs.iterdecode("python.org.", "idna")),
            u"python.org."
        )
        self.assertEqual(
            "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
            u"pyth\xf6n.org."
        )
        self.assertEqual(
            "".join(codecs.iterdecode("xn--pythn-mua.org.", "idna")),
            u"pyth\xf6n.org."
        )

        # The incremental decoder only emits a label once its trailing dot
        # (or the final=True flush) has been seen.
        decoder = codecs.getincrementaldecoder("idna")()
        self.assertEqual(decoder.decode("xn--xam"), u"")
        self.assertEqual(decoder.decode("ple-9ta.o"), u"\xe4xample.")
        self.assertEqual(decoder.decode(u"rg"), u"")
        self.assertEqual(decoder.decode(u"", True), u"org")

        # reset() must discard any buffered partial label.
        decoder.reset()
        self.assertEqual(decoder.decode("xn--xam"), u"")
        self.assertEqual(decoder.decode("ple-9ta.o"), u"\xe4xample.")
        self.assertEqual(decoder.decode("rg."), u"org.")
        self.assertEqual(decoder.decode("", True), u"")

    def test_incremental_encode(self):
        self.assertEqual(
            "".join(codecs.iterencode(u"python.org", "idna")),
            "python.org"
        )
        self.assertEqual(
            "".join(codecs.iterencode(u"python.org.", "idna")),
            "python.org."
        )
        self.assertEqual(
            "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
            "xn--pythn-mua.org."
        )
        self.assertEqual(
            "".join(codecs.iterencode(u"pyth\xf6n.org.", "idna")),
            "xn--pythn-mua.org."
        )

        # The incremental encoder likewise buffers until a label is complete.
        encoder = codecs.getincrementalencoder("idna")()
        self.assertEqual(encoder.encode(u"\xe4x"), "")
        self.assertEqual(encoder.encode(u"ample.org"), "xn--xample-9ta.")
        self.assertEqual(encoder.encode(u"", True), "org")

        encoder.reset()
        self.assertEqual(encoder.encode(u"\xe4x"), "")
        self.assertEqual(encoder.encode(u"ample.org."), "xn--xample-9ta.org.")
        self.assertEqual(encoder.encode(u"", True), "")
1113
1114 class CodecsModuleTest(unittest.TestCase):
1115
1116 def test_decode(self):
1117 self.assertEqual(codecs.decode('\xe4\xf6\xfc', 'latin-1'),
1118 u'\xe4\xf6\xfc')
1119 self.assertRaises(TypeError, codecs.decode)
1120 self.assertEqual(codecs.decode('abc'), u'abc')
1121 self.assertRaises(UnicodeDecodeError, codecs.decode, '\xff', 'ascii')
1122
1123 def test_encode(self):
1124 self.assertEqual(codecs.encode(u'\xe4\xf6\xfc', 'latin-1'),
1125 '\xe4\xf6\xfc')
1126 self.assertRaises(TypeError, codecs.encode)
1127 self.assertRaises(LookupError, codecs.encode, "foo", "__spam__")
1128 self.assertEqual(codecs.encode(u'abc'), 'abc')
1129 self.assertRaises(UnicodeEncodeError, codecs.encode, u'\xffff', 'ascii')
1130
1131 def test_register(self):
1132 self.assertRaises(TypeError, codecs.register)
1133 self.assertRaises(TypeError, codecs.register, 42)
1134
1135 def test_lookup(self):
1136 self.assertRaises(TypeError, codecs.lookup)
1137 self.assertRaises(LookupError, codecs.lookup, "__spam__")
1138 self.assertRaises(LookupError, codecs.lookup, " ")
1139
1140 def test_getencoder(self):
1141 self.assertRaises(TypeError, codecs.getencoder)
1142 self.assertRaises(LookupError, codecs.getencoder, "__spam__")
1143
1144 def test_getdecoder(self):
1145 self.assertRaises(TypeError, codecs.getdecoder)
1146 self.assertRaises(LookupError, codecs.getdecoder, "__spam__")
1147
1148 def test_getreader(self):
1149 self.assertRaises(TypeError, codecs.getreader)
1150 self.assertRaises(LookupError, codecs.getreader, "__spam__")
1151
1152 def test_getwriter(self):
1153 self.assertRaises(TypeError, codecs.getwriter)
1154 self.assertRaises(LookupError, codecs.getwriter, "__spam__")
1155
class StreamReaderTest(unittest.TestCase):
    """Sanity check for StreamReader.readlines()."""

    def setUp(self):
        # Two lines of UTF-8 encoded Hangul, split by a newline.
        self.reader = codecs.getreader('utf-8')
        self.stream = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')

    def test_readlines(self):
        stream_reader = self.reader(self.stream)
        lines = stream_reader.readlines()
        self.assertEqual(lines, [u'\ud55c\n', u'\uae00'])
1165
class EncodedFileTest(unittest.TestCase):
    """EncodedFile must transcode between the file and data encodings."""

    def test_basic(self):
        # Reading: UTF-8 bytes in the underlying file come out as UTF-16-LE.
        underlying = StringIO.StringIO('\xed\x95\x9c\n\xea\xb8\x80')
        transcoder = codecs.EncodedFile(underlying, 'utf-16-le', 'utf-8')
        self.assertEqual(transcoder.read(), '\\\xd5\n\x00\x00\xae')

        # Writing: UTF-8 input is stored as Latin-1 in the underlying file.
        underlying = StringIO.StringIO()
        transcoder = codecs.EncodedFile(underlying, 'utf-8', 'latin1')
        transcoder.write('\xc3\xbc')
        self.assertEqual(underlying.getvalue(), '\xfc')
1177
class Str2StrTest(unittest.TestCase):
    """str-to-str codecs must yield str (not unicode) from stream readers."""

    def _check_reader_method(self, method_name):
        # Encode one byte with base64 and read it back through a StreamReader.
        encoded = "\x80".encode("base64_codec")
        reader = codecs.getreader("base64_codec")(StringIO.StringIO(encoded))
        decoded = getattr(reader, method_name)()
        self.assertEqual(decoded, "\x80")
        self.assertIsInstance(decoded, str)

    def test_read(self):
        self._check_reader_method("read")

    def test_readline(self):
        self._check_reader_method("readline")
1193
# Every unicode codec expected to round-trip u"abc123"; exercised by
# BasicUnicodeTest below.  Platform/build dependent codecs (mbcs, bz2_codec,
# zlib_codec) are appended conditionally further down.
all_unicode_encodings = [
    "ascii",
    "base64_codec",
    "big5",
    "big5hkscs",
    "charmap",
    "cp037",
    "cp1006",
    "cp1026",
    "cp1140",
    "cp1250",
    "cp1251",
    "cp1252",
    "cp1253",
    "cp1254",
    "cp1255",
    "cp1256",
    "cp1257",
    "cp1258",
    "cp424",
    "cp437",
    "cp500",
    "cp720",
    "cp737",
    "cp775",
    "cp850",
    "cp852",
    "cp855",
    "cp856",
    "cp857",
    "cp858",
    "cp860",
    "cp861",
    "cp862",
    "cp863",
    "cp864",
    "cp865",
    "cp866",
    "cp869",
    "cp874",
    "cp875",
    "cp932",
    "cp949",
    "cp950",
    "euc_jis_2004",
    "euc_jisx0213",
    "euc_jp",
    "euc_kr",
    "gb18030",
    "gb2312",
    "gbk",
    "hex_codec",
    "hp_roman8",
    "hz",
    "idna",
    "iso2022_jp",
    "iso2022_jp_1",
    "iso2022_jp_2",
    "iso2022_jp_2004",
    "iso2022_jp_3",
    "iso2022_jp_ext",
    "iso2022_kr",
    "iso8859_1",
    "iso8859_10",
    "iso8859_11",
    "iso8859_13",
    "iso8859_14",
    "iso8859_15",
    "iso8859_16",
    "iso8859_2",
    "iso8859_3",
    "iso8859_4",
    "iso8859_5",
    "iso8859_6",
    "iso8859_7",
    "iso8859_8",
    "iso8859_9",
    "johab",
    "koi8_r",
    "koi8_u",
    "latin_1",
    "mac_cyrillic",
    "mac_greek",
    "mac_iceland",
    "mac_latin2",
    "mac_roman",
    "mac_turkish",
    "palmos",
    "ptcp154",
    "punycode",
    "raw_unicode_escape",
    "rot_13",
    "shift_jis",
    "shift_jis_2004",
    "shift_jisx0213",
    "tis_620",
    "unicode_escape",
    "unicode_internal",
    "utf_16",
    "utf_16_be",
    "utf_16_le",
    "utf_7",
    "utf_8",
]
1298
# The mbcs codec (the Windows ANSI code page) exists only on Windows builds.
if hasattr(codecs, "mbcs_encode"):
    all_unicode_encodings.append("mbcs")

# The following encodings work only with str, not unicode
all_string_encodings = [
    "quopri_codec",
    "string_escape",
    "uu_codec",
]

# The following encoding is not tested, because it's not supposed
# to work:
# "undefined"

# The following encodings don't work in stateful mode
broken_unicode_with_streams = [
    "base64_codec",
    "hex_codec",
    "punycode",
    "unicode_internal"
]
# The same codecs also lack usable incremental encoders/decoders.
broken_incremental_coders = broken_unicode_with_streams[:]

# The following encodings only support "strict" mode
only_strict_mode = [
    "idna",
    "zlib_codec",
    "bz2_codec",
]

# bz2/zlib codecs are tested only when the underlying compression modules
# are present in this build; both are stream-broken like base64/hex above.
try:
    import bz2
except ImportError:
    pass
else:
    all_unicode_encodings.append("bz2_codec")
    broken_unicode_with_streams.append("bz2_codec")

try:
    import zlib
except ImportError:
    pass
else:
    all_unicode_encodings.append("zlib_codec")
    broken_unicode_with_streams.append("zlib_codec")
1344
class BasicUnicodeTest(unittest.TestCase):
    """Generic round-trip and API checks applied to every codec in
    all_unicode_encodings: one-shot encode/decode, stream reader/writer,
    incremental coders (Python and C API), iterencode/iterdecode, seek,
    and bad-argument handling."""

    def test_basics(self):
        s = u"abc123" # all codecs should be able to encode these
        for encoding in all_unicode_encodings:
            name = codecs.lookup(encoding).name
            # lookup() strips the "_codec" suffix and aliases latin_1;
            # normalize so the comparison below still matches.
            if encoding.endswith("_codec"):
                name += "_codec"
            elif encoding == "latin_1":
                name = "latin_1"
            self.assertEqual(encoding.replace("_", "-"), name.replace("_", "-"))
            # One-shot encode must consume the whole input...
            (bytes, size) = codecs.getencoder(encoding)(s)
            self.assertEqual(size, len(s), "%r != %r (encoding=%r)" % (size, len(s), encoding))
            # ...and decoding the result must round-trip exactly.
            (chars, size) = codecs.getdecoder(encoding)(bytes)
            self.assertEqual(chars, s, "%r != %r (encoding=%r)" % (chars, s, encoding))

            if encoding not in broken_unicode_with_streams:
                # check stream reader/writer: feed one character/byte at a
                # time through a Queue to exercise partial-input buffering.
                q = Queue()
                writer = codecs.getwriter(encoding)(q)
                encodedresult = ""
                for c in s:
                    writer.write(c)
                    encodedresult += q.read()
                q = Queue()
                reader = codecs.getreader(encoding)(q)
                decodedresult = u""
                for c in encodedresult:
                    q.write(c)
                    decodedresult += reader.read()
                self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

            if encoding not in broken_incremental_coders:
                # check incremental decoder/encoder (fetched via the Python
                # and C API) and iterencode()/iterdecode()
                try:
                    encoder = codecs.getincrementalencoder(encoding)()
                    cencoder = _testcapi.codec_incrementalencoder(encoding)
                except LookupError: # no IncrementalEncoder
                    pass
                else:
                    # check incremental decoder/encoder: one unit at a time,
                    # with a final flush call (final=True) at the end.
                    encodedresult = ""
                    for c in s:
                        encodedresult += encoder.encode(c)
                    encodedresult += encoder.encode(u"", True)
                    decoder = codecs.getincrementaldecoder(encoding)()
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += decoder.decode(c)
                    decodedresult += decoder.decode("", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check C API (same protocol via _testcapi wrappers)
                    encodedresult = ""
                    for c in s:
                        encodedresult += cencoder.encode(c)
                    encodedresult += cencoder.encode(u"", True)
                    cdecoder = _testcapi.codec_incrementaldecoder(encoding)
                    decodedresult = u""
                    for c in encodedresult:
                        decodedresult += cdecoder.decode(c)
                    decodedresult += cdecoder.decode("", True)
                    self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                    # check iterencode()/iterdecode()
                    result = u"".join(codecs.iterdecode(codecs.iterencode(s, encoding), encoding))
                    self.assertEqual(result, s, "%r != %r (encoding=%r)" % (result, s, encoding))

                    # check iterencode()/iterdecode() with empty string
                    result = u"".join(codecs.iterdecode(codecs.iterencode(u"", encoding), encoding))
                    self.assertEqual(result, u"")

                if encoding not in only_strict_mode:
                    # check incremental decoder/encoder with errors argument
                    try:
                        encoder = codecs.getincrementalencoder(encoding)("ignore")
                        cencoder = _testcapi.codec_incrementalencoder(encoding, "ignore")
                    except LookupError: # no IncrementalEncoder
                        pass
                    else:
                        encodedresult = "".join(encoder.encode(c) for c in s)
                        decoder = codecs.getincrementaldecoder(encoding)("ignore")
                        decodedresult = u"".join(decoder.decode(c) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

                        encodedresult = "".join(cencoder.encode(c) for c in s)
                        cdecoder = _testcapi.codec_incrementaldecoder(encoding, "ignore")
                        decodedresult = u"".join(cdecoder.decode(c) for c in encodedresult)
                        self.assertEqual(decodedresult, s, "%r != %r (encoding=%r)" % (decodedresult, s, encoding))

    def test_seek(self):
        # all codecs should be able to encode these
        s = u"%s\n%s\n" % (100*u"abc123", 100*u"def456")
        for encoding in all_unicode_encodings:
            if encoding == "idna": # FIXME: See SF bug #1163178
                continue
            if encoding in broken_unicode_with_streams:
                continue
            reader = codecs.getreader(encoding)(StringIO.StringIO(s.encode(encoding)))
            for t in xrange(5):
                # Test that calling seek resets the internal codec state and buffers
                reader.seek(0, 0)
                line = reader.readline()
                self.assertEqual(s[:len(line)], line)

    def test_bad_decode_args(self):
        # Calling a decoder with no arguments must raise TypeError.
        for encoding in all_unicode_encodings:
            decoder = codecs.getdecoder(encoding)
            self.assertRaises(TypeError, decoder)
            # idna/punycode accept ints in 2.x, so skip them here.
            if encoding not in ("idna", "punycode"):
                self.assertRaises(TypeError, decoder, 42)

    def test_bad_encode_args(self):
        # Calling an encoder with no arguments must raise TypeError.
        for encoding in all_unicode_encodings:
            encoder = codecs.getencoder(encoding)
            self.assertRaises(TypeError, encoder)

    def test_encoding_map_type_initialized(self):
        from encodings import cp1140
        # This used to crash, we are only verifying there's no crash.
        table_type = type(cp1140.encoding_table)
        self.assertEqual(table_type, table_type)
1467
class BasicStrTest(unittest.TestCase):
    """Round-trip check for the str-only codecs in all_string_encodings."""

    def test_basics(self):
        payload = "abc123"
        for encoding in all_string_encodings:
            # Encoding must consume the whole input string.
            encoded, consumed = codecs.getencoder(encoding)(payload)
            self.assertEqual(consumed, len(payload))
            # Decoding the result must reproduce the original payload.
            decoded, consumed = codecs.getdecoder(encoding)(encoded)
            self.assertEqual(decoded, payload, "%r != %r (encoding=%r)" % (decoded, payload, encoding))
1476
class CharmapTest(unittest.TestCase):
    """charmap_decode with a unicode string used as the mapping table."""

    def test_decode_with_string_map(self):
        decode = codecs.charmap_decode

        # Complete map: every input byte has a mapping.
        self.assertEqual(decode("\x00\x01\x02", "strict", u"abc"),
                         (u"abc", 3))

        # Byte 0x02 falls off the end of the map -> replaced with U+FFFD.
        self.assertEqual(decode("\x00\x01\x02", "replace", u"ab"),
                         (u"ab\ufffd", 3))

        # U+FFFE in the map marks an undefined entry -> also replaced.
        self.assertEqual(decode("\x00\x01\x02", "replace", u"ab\ufffe"),
                         (u"ab\ufffd", 3))

        # The same two failure modes with "ignore": the bad byte is dropped.
        self.assertEqual(decode("\x00\x01\x02", "ignore", u"ab"),
                         (u"ab", 3))
        self.assertEqual(decode("\x00\x01\x02", "ignore", u"ab\ufffe"),
                         (u"ab", 3))

        # "ignore" with an empty map drops every byte but consumes all input.
        allbytes = "".join(chr(i) for i in xrange(256))
        self.assertEqual(decode(allbytes, "ignore", u""),
                         (u"", len(allbytes)))
1509
class WithStmtTest(unittest.TestCase):
    """The codec file wrappers must work as context managers."""

    def test_encodedfile(self):
        raw = StringIO.StringIO("\xc3\xbc")
        with codecs.EncodedFile(raw, "latin-1", "utf-8") as encoded:
            self.assertEqual(encoded.read(), "\xfc")

    def test_streamreaderwriter(self):
        raw = StringIO.StringIO("\xc3\xbc")
        codec_info = codecs.lookup("utf-8")
        with codecs.StreamReaderWriter(raw, codec_info.streamreader,
                                       codec_info.streamwriter, 'strict') as srw:
            self.assertEqual(srw.read(), u"\xfc")
1522
1523
class BomTest(unittest.TestCase):
    """Verify BOM handling of codecs.open() files: the BOM must be written
    exactly once, re-written after seek(0), and never written after a seek
    to a non-zero position."""

    def test_seek0(self):
        data = u"1234567890"
        # All BOM-prefixed encodings (the *-le/*-be variants write no BOM
        # bytes themselves but share the same seek bookkeeping).
        tests = ("utf-16",
                 "utf-16-le",
                 "utf-16-be",
                 "utf-32",
                 "utf-32-le",
                 "utf-32-be")
        self.addCleanup(test_support.unlink, test_support.TESTFN)
        for encoding in tests:
            # Check if the BOM is written only once
            with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

            # Check that the BOM is written after a seek(0)
            with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data[0])
                self.assertNotEqual(f.tell(), 0)
                f.seek(0)
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # (StreamWriter) Check that the BOM is written after a seek(0)
            with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(data[0])
                self.assertNotEqual(f.writer.tell(), 0)
                f.writer.seek(0)
                f.writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data)

            # Check that the BOM is not written after a seek() at a position
            # different than the start
            with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
                f.write(data)
                f.seek(f.tell())
                f.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)

            # (StreamWriter) Check that the BOM is not written after a seek()
            # at a position different than the start
            with codecs.open(test_support.TESTFN, 'w+', encoding=encoding) as f:
                f.writer.write(data)
                f.writer.seek(f.writer.tell())
                f.writer.write(data)
                f.seek(0)
                self.assertEqual(f.read(), data * 2)
1579
1580
def test_main():
    """Entry point for regrtest: run every codec TestCase in this module."""
    test_classes = (
        UTF32Test,
        UTF32LETest,
        UTF32BETest,
        UTF16Test,
        UTF16LETest,
        UTF16BETest,
        UTF8Test,
        UTF8SigTest,
        UTF7Test,
        UTF16ExTest,
        ReadBufferTest,
        CharBufferTest,
        EscapeDecodeTest,
        RecodingTest,
        PunycodeTest,
        UnicodeInternalTest,
        NameprepTest,
        IDNACodecTest,
        CodecsModuleTest,
        StreamReaderTest,
        EncodedFileTest,
        Str2StrTest,
        BasicUnicodeTest,
        BasicStrTest,
        CharmapTest,
        WithStmtTest,
        BomTest,
    )
    test_support.run_unittest(*test_classes)
1611
1612
# Allow running this file directly, outside of the regression-test harness.
if __name__ == "__main__":
    test_main()