]>
Commit | Line | Data |
---|---|---|
4710c53d | 1 | #!/usr/bin/env python\r |
2 | #\r | |
3 | # test_multibytecodec.py\r | |
4 | # Unit test for multibytecodec itself\r | |
5 | #\r | |
6 | \r | |
7 | from test import test_support\r | |
8 | from test.test_support import TESTFN\r | |
9 | import unittest, StringIO, codecs, sys, os\r | |
10 | import _multibytecodec\r | |
11 | \r | |
# Every CJK codec name the tests below iterate over.  Each group is
# preceded by the `_codecs_*` name it was filed under.
ALL_CJKENCODINGS = (
    # _codecs_cn
    ['gb2312', 'gbk', 'gb18030', 'hz']
    # _codecs_hk
    + ['big5hkscs']
    # _codecs_jp
    + ['cp932', 'shift_jis', 'euc_jp', 'euc_jisx0213', 'shift_jisx0213']
    + ['euc_jis_2004', 'shift_jis_2004']
    # _codecs_kr
    + ['cp949', 'euc_kr', 'johab']
    # _codecs_tw
    + ['big5', 'cp950']
    # _codecs_iso2022
    + ['iso2022_jp', 'iso2022_jp_1', 'iso2022_jp_2', 'iso2022_jp_2004']
    + ['iso2022_jp_3', 'iso2022_jp_ext', 'iso2022_kr']
)
28 | \r | |
class Test_MultibyteCodec(unittest.TestCase):
    """Checks run against every codec in ALL_CJKENCODINGS, plus regressions."""

    def test_nullcoding(self):
        # Empty input must map to empty output in both directions.
        empty_bytes, empty_text = '', u''
        for encoding in ALL_CJKENCODINGS:
            self.assertEqual(empty_bytes.decode(encoding), empty_text)
            self.assertEqual(unicode(empty_bytes, encoding), empty_text)
            self.assertEqual(empty_text.encode(encoding), empty_bytes)

    def test_str_decode(self):
        # Calling .encode() on a plain ASCII byte string must hand back
        # the same bytes for every codec.
        ascii_bytes = 'abcd'
        for encoding in ALL_CJKENCODINGS:
            self.assertEqual(ascii_bytes.encode(encoding), ascii_bytes)

    def test_errorcallback_longindex(self):
        # The handler reports a resume position of sys.maxint+1; the
        # decoder is expected to answer with IndexError.
        def resume_out_of_range(exc):
            return (u'', sys.maxint + 1)

        codecs.register_error('test.cjktest', resume_out_of_range)
        decode = codecs.getdecoder('euc-kr')
        self.assertRaises(IndexError, decode,
                          'apple\x92ham\x93spam', 'test.cjktest')

    def test_codingspec(self):
        # Executing source that consists only of a coding declaration
        # must not raise for any codec.
        for encoding in ALL_CJKENCODINGS:
            source = '# coding: {}\n'.format(encoding)
            exec(source)

    def test_init_segfault(self):
        # bug #3305: this used to segfault
        stream_classes = (_multibytecodec.MultibyteStreamReader,
                          _multibytecodec.MultibyteStreamWriter)
        for stream_class in stream_classes:
            self.assertRaises(AttributeError, stream_class, None)
59 | \r | |
60 | \r | |
class Test_IncrementalEncoder(unittest.TestCase):
    """Incremental encoder checks for a stateless and a stateful codec."""

    def test_stateless(self):
        # cp949 encoder isn't stateful at all.
        enc = codecs.getincrementalencoder('cp949')()

        hangul = u'\ud30c\uc774\uc36c \ub9c8\uc744'
        self.assertEqual(enc.encode(hangul),
                         b'\xc6\xc4\xc0\xcc\xbd\xe3 \xb8\xb6\xc0\xbb')
        self.assertEqual(enc.reset(), None)

        symbols = u'\u2606\u223c\u2606'
        self.assertEqual(enc.encode(symbols, True),
                         b'\xa1\xd9\xa1\xad\xa1\xd9')
        self.assertEqual(enc.reset(), None)

        # Empty input yields empty output whether or not it is final.
        for final in (True, False):
            self.assertEqual(enc.encode(u'', final), b'')
        self.assertEqual(enc.reset(), None)

    def test_stateful(self):
        # jisx0213 encoder is stateful for a few codepoints. eg)
        #   U+00E6        => A9DC
        #   U+00E6 U+0300 => ABC4
        #   U+0300        => ABDC
        ae, grave = u'\u00e6', u'\u0300'
        enc = codecs.getincrementalencoder('jisx0213')()

        self.assertEqual(enc.encode(ae + grave), b'\xab\xc4')
        # U+00E6 alone is buffered until the next character is seen.
        self.assertEqual(enc.encode(ae), b'')
        self.assertEqual(enc.encode(grave), b'\xab\xc4')
        self.assertEqual(enc.encode(ae, True), b'\xa9\xdc')

        self.assertEqual(enc.reset(), None)
        self.assertEqual(enc.encode(grave), b'\xab\xdc')

        self.assertEqual(enc.encode(ae), b'')
        # A final empty write flushes the buffered U+00E6, once.
        self.assertEqual(enc.encode('', True), b'\xa9\xdc')
        self.assertEqual(enc.encode('', True), b'')

    def test_stateful_keep_buffer(self):
        # A failed encode() call must leave the buffered U+00E6 in place.
        ae, grave, unencodable = u'\u00e6', u'\u0300', u'\u0123'
        enc = codecs.getincrementalencoder('jisx0213')()

        self.assertEqual(enc.encode(ae), b'')
        self.assertRaises(UnicodeEncodeError, enc.encode, unencodable)
        self.assertEqual(enc.encode(grave + ae), b'\xab\xc4')
        self.assertRaises(UnicodeEncodeError, enc.encode, unencodable)
        self.assertEqual(enc.reset(), None)
        self.assertEqual(enc.encode(grave), b'\xab\xdc')
        self.assertEqual(enc.encode(ae), b'')
        self.assertRaises(UnicodeEncodeError, enc.encode, unencodable)
        self.assertEqual(enc.encode(u'', True), b'\xa9\xdc')

    def test_issue5640(self):
        enc = codecs.getincrementalencoder('shift-jis')('backslashreplace')
        expectations = (
            (u'\xff', b'\\xff'),
            (u'\n', b'\n'),
        )
        for text, encoded in expectations:
            self.assertEqual(enc.encode(text), encoded)
111 | \r | |
class Test_IncrementalDecoder(unittest.TestCase):
    """Incremental decoder checks with input split inside a sequence."""

    def test_dbcs(self):
        # cp949 decoder is simple with only 1 or 2 bytes sequences.
        dec = codecs.getincrementaldecoder('cp949')()
        steps = (
            # The trailing \xbd of the first chunk is completed by the
            # leading \xe3 of the second.
            (b'\xc6\xc4\xc0\xcc\xbd', u'\ud30c\uc774'),
            (b'\xe3 \xb8\xb6\xc0\xbb', u'\uc36c \ub9c8\uc744'),
            (b'', u''),
        )
        for raw, text in steps:
            self.assertEqual(dec.decode(raw), text)

    def test_dbcs_keep_buffer(self):
        # A failed final decode() must leave the pending lead byte \xc0
        # buffered, so the following \xcc still completes it.
        dec = codecs.getincrementaldecoder('cp949')()
        for failing_tail in (b'', b'\xcc\xbd'):
            self.assertEqual(dec.decode(b'\xc6\xc4\xc0'), u'\ud30c')
            self.assertRaises(UnicodeDecodeError,
                              dec.decode, failing_tail, True)
            self.assertEqual(dec.decode(b'\xcc'), u'\uc774')

    def test_iso2022(self):
        dec = codecs.getincrementaldecoder('iso2022-jp')()
        ESC = b'\x1b'
        kanji = u'\u4e16'

        # Escape sequences split across calls produce no text.
        self.assertEqual(dec.decode(ESC + b'('), u'')
        self.assertEqual(dec.decode(b'B', True), u'')
        self.assertEqual(dec.decode(ESC + b'$'), u'')
        # Each completed "@$" pair decodes to U+4E16.
        self.assertEqual(dec.decode(b'B@$'), kanji)
        self.assertEqual(dec.decode(b'@$@'), kanji)
        self.assertEqual(dec.decode(b'$', True), kanji)
        self.assertEqual(dec.reset(), None)
        # After reset() the same bytes are read as plain ASCII.
        self.assertEqual(dec.decode(b'@$'), u'@$')
        self.assertEqual(dec.decode(ESC + b'$'), u'')
        self.assertRaises(UnicodeDecodeError, dec.decode, b'', True)
        self.assertEqual(dec.decode(b'B@$'), kanji)
147 | \r | |
class Test_StreamReader(unittest.TestCase):
    """Regression check for the cp949 stream reader."""

    def test_bug1728403(self):
        # Reading a file that holds only the lone lead byte \xa1 through
        # the cp949 reader must raise UnicodeDecodeError.
        try:
            # Write in binary mode and close the handle deterministically
            # instead of relying on garbage collection.
            with open(TESTFN, 'wb') as raw:
                raw.write(b'\xa1')
            f = codecs.open(TESTFN, encoding='cp949')
            try:
                self.assertRaises(UnicodeDecodeError, f.read, 2)
            finally:
                # f is always bound here, so no blanket except is needed
                # and a genuine close() failure is no longer hidden.
                f.close()
        finally:
            # test_support.unlink tolerates a missing file, so a failure
            # before the file was created is not masked by the cleanup.
            test_support.unlink(TESTFN)
158 | \r | |
class Test_StreamWriter(unittest.TestCase):
    """Stream writer checks; two of them are defined only on UCS2 builds."""

    if len(u'\U00012345') == 2: # UCS2
        def test_gb18030(self):
            sink = StringIO.StringIO()
            writer = codecs.getwriter('gb18030')(sink)
            astral = u'\U00012345'

            writer.write(u'123')
            self.assertEqual(sink.getvalue(), '123')
            writer.write(astral)
            self.assertEqual(sink.getvalue(), '123\x907\x959')
            # Writing only the first half of the pair adds no output.
            writer.write(astral[0])
            self.assertEqual(sink.getvalue(), '123\x907\x959')
            writer.write(astral[1] + astral + u'\uac00\u00ac')
            complete = ('123\x907\x959\x907\x959\x907\x959'
                        '\x827\xcf5\x810\x851')
            self.assertEqual(sink.getvalue(), complete)
            writer.write(astral[0])
            self.assertEqual(sink.getvalue(), complete)
            # reset() with a half pair pending raises and writes nothing.
            self.assertRaises(UnicodeError, writer.reset)
            self.assertEqual(sink.getvalue(), complete)

        def test_utf_8(self):
            sink = StringIO.StringIO()
            writer = codecs.getwriter('utf-8')(sink)
            astral = u'\U00012345'

            writer.write(u'123')
            self.assertEqual(sink.getvalue(), '123')
            writer.write(astral)
            self.assertEqual(sink.getvalue(), '123\xf0\x92\x8d\x85')

            # Python utf-8 codec can't buffer surrogate pairs yet.
            if 0:
                body = ('123\xf0\x92\x8d\x85\xf0\x92\x8d\x85\xf0\x92\x8d\x85'
                        '\xea\xb0\x80\xc2\xac')
                writer.write(astral[0])
                self.assertEqual(sink.getvalue(), '123\xf0\x92\x8d\x85')
                writer.write(astral[1] + astral + u'\uac00\u00ac')
                self.assertEqual(sink.getvalue(), body)
                writer.write(astral[0])
                self.assertEqual(sink.getvalue(), body)
                writer.reset()
                self.assertEqual(sink.getvalue(), body + '\xed\xa0\x88')
                writer.write(astral[1])
                self.assertEqual(sink.getvalue(),
                                 body + '\xed\xa0\x88\xed\xbd\x85')

    else: # UCS4
        pass

    def test_streamwriter_strwrite(self):
        # A plain byte string written through the gb18030 writer comes
        # out unchanged.
        sink = StringIO.StringIO()
        writer = codecs.getwriter('gb18030')(sink)
        writer.write('abcd')
        self.assertEqual(sink.getvalue(), 'abcd')
217 | \r | |
class Test_ISO2022(unittest.TestCase):
    """Checks specific to the ISO 2022 family of codecs."""

    def test_g2(self):
        encoded = '\x1b(B:hu4:unit\x1b.A\x1bNi de famille'
        expected = u':hu4:unit\xe9 de famille'
        self.assertEqual(encoded.decode('iso2022-jp-2'), expected)

    def test_iso2022_jp_g0(self):
        # The encoded soft hyphen must not contain the byte \x0e.
        soft_hyphen = u'\N{SOFT HYPHEN}'.encode('iso-2022-jp-2')
        self.assertNotIn('\x0e', soft_hyphen)
        # U+3406 must encode without any byte at or above \x80.
        for codec_name in ('iso-2022-jp-2004', 'iso-2022-jp-3'):
            encoded = u'\u3406'.encode(codec_name)
            high_bytes = filter(lambda ch: ch >= '\x80', encoded)
            self.assertFalse(high_bytes)

    def test_bug1572832(self):
        if sys.maxunicode < 0x10000:
            # unichr() cannot take these values on this build, so spell
            # each code point as a surrogate pair instead.
            def as_text(code):
                lead = unichr(0xD7C0 + (code >> 10))
                trail = unichr(0xDC00 + (code & 0x3FF))
                return lead + trail
        else:
            as_text = unichr

        for code in xrange(0x10000, 0x110000):
            # Any ISO 2022 codec will cause the segfault
            as_text(code).encode('iso_2022_jp', 'ignore')
239 | \r | |
class TestStateful(unittest.TestCase):
    """Compare one-shot and incremental encoding for a stateful codec.

    `expected` is the output without a final flush; `expected_reset` is
    the output once encoding has been finalised.
    """
    text = u'\u4E16\u4E16'
    encoding = 'iso-2022-jp'
    expected = b'\x1b$B@$@$'
    expected_reset = b'\x1b$B@$@$\x1b(B'

    def test_encode(self):
        # One-shot encoding produces the finalised form.
        encoded = self.text.encode(self.encoding)
        self.assertEqual(encoded, self.expected_reset)

    def test_incrementalencoder(self):
        # Feeding characters one by one, never marking any as final.
        encoder = codecs.getincrementalencoder(self.encoding)()
        chunks = [encoder.encode(ch) for ch in self.text]
        self.assertEqual(b''.join(chunks), self.expected)

    def test_incrementalencoder_final(self):
        # Same as above, but the last character is flagged as final.
        encoder = codecs.getincrementalencoder(self.encoding)()
        final_pos = len(self.text) - 1
        chunks = []
        for pos, ch in enumerate(self.text):
            chunks.append(encoder.encode(ch, pos == final_pos))
        self.assertEqual(b''.join(chunks), self.expected_reset)
263 | \r | |
class TestHZStateful(TestStateful):
    """Run the inherited checks against the 'hz' codec."""
    encoding = 'hz'
    text = u'\u804a' * 2
    expected = b'~{ADAD'
    # Finalised output is the unfinalised output plus the closing "~}".
    expected_reset = expected + b'~}'
269 | \r | |
def test_main():
    """Pass this module's name to test_support.run_unittest()."""
    this_module = __name__
    test_support.run_unittest(this_module)


# Run the tests when the file is executed as a script.
if __name__ == "__main__":
    test_main()