]>
Commit | Line | Data |
---|---|---|
7eb75bcc DM |
1 | /*\r |
2 | * _codecs_cn.c: Codecs collection for Mainland Chinese encodings\r | |
3 | *\r | |
4 | * Written by Hye-Shik Chang <perky@FreeBSD.org>\r | |
5 | */\r | |
6 | \r | |
7 | #include "cjkcodecs.h"\r | |
8 | #include "mappings_cn.h"\r | |
9 | \r | |
/*
 * AIX predefines `hz` as the constant 100.  Drop that definition here so it
 * cannot collide with the identifiers used by the hz codec below.
 */
#if defined(_AIX)
#  undef hz
#endif
17 | \r | |
18 | /* GBK and GB2312 map differently in a few code points, listed below:\r | |
19 | *\r | |
20 | * gb2312 gbk\r | |
21 | * A1A4 U+30FB KATAKANA MIDDLE DOT U+00B7 MIDDLE DOT\r | |
22 | * A1AA U+2015 HORIZONTAL BAR U+2014 EM DASH\r | |
23 | * A844 undefined U+2015 HORIZONTAL BAR\r | |
24 | */\r | |
25 | \r | |
/*
 * GBK_DECODE(dc1, dc2, assi): decode the two-byte sequence (dc1, dc2) and
 * store the result in `assi`.
 *
 * The three code points on which GBK and GB2312 disagree are handled first.
 * Anything else is tried against the gb2312 mapping (with bit 7 of both
 * bytes flipped) and then against the gbkext mapping, both via TRYMAP_DEC.
 *
 * The expansion is an open if/else-if chain whose last branch is an empty
 * statement, so the caller supplies the trailing `else` that runs when no
 * mapping was found.
 *
 * The byte arguments are parenthesized before the XOR so that an argument
 * containing an operator of lower precedence than `^` (for example `a | b`)
 * is masked as a whole rather than only its last operand.
 */
#define GBK_DECODE(dc1, dc2, assi) \
    if ((dc1) == 0xa1 && (dc2) == 0xaa) (assi) = 0x2014; \
    else if ((dc1) == 0xa8 && (dc2) == 0x44) (assi) = 0x2015; \
    else if ((dc1) == 0xa1 && (dc2) == 0xa4) (assi) = 0x00b7; \
    else TRYMAP_DEC(gb2312, assi, (dc1) ^ 0x80, (dc2) ^ 0x80); \
    else TRYMAP_DEC(gbkext, assi, dc1, dc2);
32 | \r | |
/*
 * GBK_ENCODE(code, assi): map the Unicode value `code` to its GBK double-byte
 * code and store it in `assi`.
 *
 * The three code points that GBK assigns differently from GB2312 are
 * special-cased; U+30FB is explicitly excluded from the table lookup; every
 * other value goes through the gbcommon encode map.
 *
 * The expansion is an open if/else-if chain ending in an empty statement,
 * so the caller provides the final `else` for the "no mapping" case.
 */
#define GBK_ENCODE(code, assi) \
    if ((code) == 0x2014) { (assi) = 0xa1aa; } \
    else if ((code) == 0x2015) { (assi) = 0xa844; } \
    else if ((code) == 0x00b7) { (assi) = 0xa1a4; } \
    else if (!((code) == 0x30fb) && TRYMAP_ENC_COND(gbcommon, assi, code));
38 | \r | |
39 | /*\r | |
40 | * GB2312 codec\r | |
41 | */\r | |
42 | \r | |
43 | ENCODER(gb2312)\r | |
44 | {\r | |
45 | while (inleft > 0) {\r | |
46 | Py_UNICODE c = IN1;\r | |
47 | DBCHAR code;\r | |
48 | \r | |
49 | if (c < 0x80) {\r | |
50 | WRITE1((unsigned char)c)\r | |
51 | NEXT(1, 1)\r | |
52 | continue;\r | |
53 | }\r | |
54 | UCS4INVALID(c)\r | |
55 | \r | |
56 | REQUIRE_OUTBUF(2)\r | |
57 | TRYMAP_ENC(gbcommon, code, c);\r | |
58 | else return 1;\r | |
59 | \r | |
60 | if (code & 0x8000) /* MSB set: GBK */\r | |
61 | return 1;\r | |
62 | \r | |
63 | OUT1((code >> 8) | 0x80)\r | |
64 | OUT2((code & 0xFF) | 0x80)\r | |
65 | NEXT(1, 2)\r | |
66 | }\r | |
67 | \r | |
68 | return 0;\r | |
69 | }\r | |
70 | \r | |
71 | DECODER(gb2312)\r | |
72 | {\r | |
73 | while (inleft > 0) {\r | |
74 | unsigned char c = **inbuf;\r | |
75 | \r | |
76 | REQUIRE_OUTBUF(1)\r | |
77 | \r | |
78 | if (c < 0x80) {\r | |
79 | OUT1(c)\r | |
80 | NEXT(1, 1)\r | |
81 | continue;\r | |
82 | }\r | |
83 | \r | |
84 | REQUIRE_INBUF(2)\r | |
85 | TRYMAP_DEC(gb2312, **outbuf, c ^ 0x80, IN2 ^ 0x80) {\r | |
86 | NEXT(2, 1)\r | |
87 | }\r | |
88 | else return 2;\r | |
89 | }\r | |
90 | \r | |
91 | return 0;\r | |
92 | }\r | |
93 | \r | |
94 | \r | |
95 | /*\r | |
96 | * GBK codec\r | |
97 | */\r | |
98 | \r | |
99 | ENCODER(gbk)\r | |
100 | {\r | |
101 | while (inleft > 0) {\r | |
102 | Py_UNICODE c = IN1;\r | |
103 | DBCHAR code;\r | |
104 | \r | |
105 | if (c < 0x80) {\r | |
106 | WRITE1((unsigned char)c)\r | |
107 | NEXT(1, 1)\r | |
108 | continue;\r | |
109 | }\r | |
110 | UCS4INVALID(c)\r | |
111 | \r | |
112 | REQUIRE_OUTBUF(2)\r | |
113 | \r | |
114 | GBK_ENCODE(c, code)\r | |
115 | else return 1;\r | |
116 | \r | |
117 | OUT1((code >> 8) | 0x80)\r | |
118 | if (code & 0x8000)\r | |
119 | OUT2((code & 0xFF)) /* MSB set: GBK */\r | |
120 | else\r | |
121 | OUT2((code & 0xFF) | 0x80) /* MSB unset: GB2312 */\r | |
122 | NEXT(1, 2)\r | |
123 | }\r | |
124 | \r | |
125 | return 0;\r | |
126 | }\r | |
127 | \r | |
128 | DECODER(gbk)\r | |
129 | {\r | |
130 | while (inleft > 0) {\r | |
131 | unsigned char c = IN1;\r | |
132 | \r | |
133 | REQUIRE_OUTBUF(1)\r | |
134 | \r | |
135 | if (c < 0x80) {\r | |
136 | OUT1(c)\r | |
137 | NEXT(1, 1)\r | |
138 | continue;\r | |
139 | }\r | |
140 | \r | |
141 | REQUIRE_INBUF(2)\r | |
142 | \r | |
143 | GBK_DECODE(c, IN2, **outbuf)\r | |
144 | else return 2;\r | |
145 | \r | |
146 | NEXT(2, 1)\r | |
147 | }\r | |
148 | \r | |
149 | return 0;\r | |
150 | }\r | |
151 | \r | |
152 | \r | |
153 | /*\r | |
154 | * GB18030 codec\r | |
155 | */\r | |
156 | \r | |
157 | ENCODER(gb18030)\r | |
158 | {\r | |
159 | while (inleft > 0) {\r | |
160 | ucs4_t c = IN1;\r | |
161 | DBCHAR code;\r | |
162 | \r | |
163 | if (c < 0x80) {\r | |
164 | WRITE1(c)\r | |
165 | NEXT(1, 1)\r | |
166 | continue;\r | |
167 | }\r | |
168 | \r | |
169 | DECODE_SURROGATE(c)\r | |
170 | if (c > 0x10FFFF)\r | |
171 | #if Py_UNICODE_SIZE == 2\r | |
172 | return 2; /* surrogates pair */\r | |
173 | #else\r | |
174 | return 1;\r | |
175 | #endif\r | |
176 | else if (c >= 0x10000) {\r | |
177 | ucs4_t tc = c - 0x10000;\r | |
178 | \r | |
179 | REQUIRE_OUTBUF(4)\r | |
180 | \r | |
181 | OUT4((unsigned char)(tc % 10) + 0x30)\r | |
182 | tc /= 10;\r | |
183 | OUT3((unsigned char)(tc % 126) + 0x81)\r | |
184 | tc /= 126;\r | |
185 | OUT2((unsigned char)(tc % 10) + 0x30)\r | |
186 | tc /= 10;\r | |
187 | OUT1((unsigned char)(tc + 0x90))\r | |
188 | \r | |
189 | #if Py_UNICODE_SIZE == 2\r | |
190 | NEXT(2, 4) /* surrogates pair */\r | |
191 | #else\r | |
192 | NEXT(1, 4)\r | |
193 | #endif\r | |
194 | continue;\r | |
195 | }\r | |
196 | \r | |
197 | REQUIRE_OUTBUF(2)\r | |
198 | \r | |
199 | GBK_ENCODE(c, code)\r | |
200 | else TRYMAP_ENC(gb18030ext, code, c);\r | |
201 | else {\r | |
202 | const struct _gb18030_to_unibmp_ranges *utrrange;\r | |
203 | \r | |
204 | REQUIRE_OUTBUF(4)\r | |
205 | \r | |
206 | for (utrrange = gb18030_to_unibmp_ranges;\r | |
207 | utrrange->first != 0;\r | |
208 | utrrange++)\r | |
209 | if (utrrange->first <= c &&\r | |
210 | c <= utrrange->last) {\r | |
211 | Py_UNICODE tc;\r | |
212 | \r | |
213 | tc = c - utrrange->first +\r | |
214 | utrrange->base;\r | |
215 | \r | |
216 | OUT4((unsigned char)(tc % 10) + 0x30)\r | |
217 | tc /= 10;\r | |
218 | OUT3((unsigned char)(tc % 126) + 0x81)\r | |
219 | tc /= 126;\r | |
220 | OUT2((unsigned char)(tc % 10) + 0x30)\r | |
221 | tc /= 10;\r | |
222 | OUT1((unsigned char)tc + 0x81)\r | |
223 | \r | |
224 | NEXT(1, 4)\r | |
225 | break;\r | |
226 | }\r | |
227 | \r | |
228 | if (utrrange->first == 0)\r | |
229 | return 1;\r | |
230 | continue;\r | |
231 | }\r | |
232 | \r | |
233 | OUT1((code >> 8) | 0x80)\r | |
234 | if (code & 0x8000)\r | |
235 | OUT2((code & 0xFF)) /* MSB set: GBK or GB18030ext */\r | |
236 | else\r | |
237 | OUT2((code & 0xFF) | 0x80) /* MSB unset: GB2312 */\r | |
238 | \r | |
239 | NEXT(1, 2)\r | |
240 | }\r | |
241 | \r | |
242 | return 0;\r | |
243 | }\r | |
244 | \r | |
245 | DECODER(gb18030)\r | |
246 | {\r | |
247 | while (inleft > 0) {\r | |
248 | unsigned char c = IN1, c2;\r | |
249 | \r | |
250 | REQUIRE_OUTBUF(1)\r | |
251 | \r | |
252 | if (c < 0x80) {\r | |
253 | OUT1(c)\r | |
254 | NEXT(1, 1)\r | |
255 | continue;\r | |
256 | }\r | |
257 | \r | |
258 | REQUIRE_INBUF(2)\r | |
259 | \r | |
260 | c2 = IN2;\r | |
261 | if (c2 >= 0x30 && c2 <= 0x39) { /* 4 bytes seq */\r | |
262 | const struct _gb18030_to_unibmp_ranges *utr;\r | |
263 | unsigned char c3, c4;\r | |
264 | ucs4_t lseq;\r | |
265 | \r | |
266 | REQUIRE_INBUF(4)\r | |
267 | c3 = IN3;\r | |
268 | c4 = IN4;\r | |
269 | if (c < 0x81 || c3 < 0x81 || c4 < 0x30 || c4 > 0x39)\r | |
270 | return 4;\r | |
271 | c -= 0x81; c2 -= 0x30;\r | |
272 | c3 -= 0x81; c4 -= 0x30;\r | |
273 | \r | |
274 | if (c < 4) { /* U+0080 - U+FFFF */\r | |
275 | lseq = ((ucs4_t)c * 10 + c2) * 1260 +\r | |
276 | (ucs4_t)c3 * 10 + c4;\r | |
277 | if (lseq < 39420) {\r | |
278 | for (utr = gb18030_to_unibmp_ranges;\r | |
279 | lseq >= (utr + 1)->base;\r | |
280 | utr++) ;\r | |
281 | OUT1(utr->first - utr->base + lseq)\r | |
282 | NEXT(4, 1)\r | |
283 | continue;\r | |
284 | }\r | |
285 | }\r | |
286 | else if (c >= 15) { /* U+10000 - U+10FFFF */\r | |
287 | lseq = 0x10000 + (((ucs4_t)c-15) * 10 + c2)\r | |
288 | * 1260 + (ucs4_t)c3 * 10 + c4;\r | |
289 | if (lseq <= 0x10FFFF) {\r | |
290 | WRITEUCS4(lseq);\r | |
291 | NEXT_IN(4)\r | |
292 | continue;\r | |
293 | }\r | |
294 | }\r | |
295 | return 4;\r | |
296 | }\r | |
297 | \r | |
298 | GBK_DECODE(c, c2, **outbuf)\r | |
299 | else TRYMAP_DEC(gb18030ext, **outbuf, c, c2);\r | |
300 | else return 2;\r | |
301 | \r | |
302 | NEXT(2, 1)\r | |
303 | }\r | |
304 | \r | |
305 | return 0;\r | |
306 | }\r | |
307 | \r | |
308 | \r | |
309 | /*\r | |
310 | * HZ codec\r | |
311 | */\r | |
312 | \r | |
313 | ENCODER_INIT(hz)\r | |
314 | {\r | |
315 | state->i = 0;\r | |
316 | return 0;\r | |
317 | }\r | |
318 | \r | |
319 | ENCODER_RESET(hz)\r | |
320 | {\r | |
321 | if (state->i != 0) {\r | |
322 | WRITE2('~', '}')\r | |
323 | state->i = 0;\r | |
324 | NEXT_OUT(2)\r | |
325 | }\r | |
326 | return 0;\r | |
327 | }\r | |
328 | \r | |
329 | ENCODER(hz)\r | |
330 | {\r | |
331 | while (inleft > 0) {\r | |
332 | Py_UNICODE c = IN1;\r | |
333 | DBCHAR code;\r | |
334 | \r | |
335 | if (c < 0x80) {\r | |
336 | if (state->i == 0) {\r | |
337 | WRITE1((unsigned char)c)\r | |
338 | NEXT(1, 1)\r | |
339 | }\r | |
340 | else {\r | |
341 | WRITE3('~', '}', (unsigned char)c)\r | |
342 | NEXT(1, 3)\r | |
343 | state->i = 0;\r | |
344 | }\r | |
345 | continue;\r | |
346 | }\r | |
347 | \r | |
348 | UCS4INVALID(c)\r | |
349 | \r | |
350 | TRYMAP_ENC(gbcommon, code, c);\r | |
351 | else return 1;\r | |
352 | \r | |
353 | if (code & 0x8000) /* MSB set: GBK */\r | |
354 | return 1;\r | |
355 | \r | |
356 | if (state->i == 0) {\r | |
357 | WRITE4('~', '{', code >> 8, code & 0xff)\r | |
358 | NEXT(1, 4)\r | |
359 | state->i = 1;\r | |
360 | }\r | |
361 | else {\r | |
362 | WRITE2(code >> 8, code & 0xff)\r | |
363 | NEXT(1, 2)\r | |
364 | }\r | |
365 | }\r | |
366 | \r | |
367 | return 0;\r | |
368 | }\r | |
369 | \r | |
370 | DECODER_INIT(hz)\r | |
371 | {\r | |
372 | state->i = 0;\r | |
373 | return 0;\r | |
374 | }\r | |
375 | \r | |
376 | DECODER_RESET(hz)\r | |
377 | {\r | |
378 | state->i = 0;\r | |
379 | return 0;\r | |
380 | }\r | |
381 | \r | |
382 | DECODER(hz)\r | |
383 | {\r | |
384 | while (inleft > 0) {\r | |
385 | unsigned char c = IN1;\r | |
386 | \r | |
387 | if (c == '~') {\r | |
388 | unsigned char c2 = IN2;\r | |
389 | \r | |
390 | REQUIRE_INBUF(2)\r | |
391 | if (c2 == '~') {\r | |
392 | WRITE1('~')\r | |
393 | NEXT(2, 1)\r | |
394 | continue;\r | |
395 | }\r | |
396 | else if (c2 == '{' && state->i == 0)\r | |
397 | state->i = 1; /* set GB */\r | |
398 | else if (c2 == '}' && state->i == 1)\r | |
399 | state->i = 0; /* set ASCII */\r | |
400 | else if (c2 == '\n')\r | |
401 | ; /* line-continuation */\r | |
402 | else\r | |
403 | return 2;\r | |
404 | NEXT(2, 0);\r | |
405 | continue;\r | |
406 | }\r | |
407 | \r | |
408 | if (c & 0x80)\r | |
409 | return 1;\r | |
410 | \r | |
411 | if (state->i == 0) { /* ASCII mode */\r | |
412 | WRITE1(c)\r | |
413 | NEXT(1, 1)\r | |
414 | }\r | |
415 | else { /* GB mode */\r | |
416 | REQUIRE_INBUF(2)\r | |
417 | REQUIRE_OUTBUF(1)\r | |
418 | TRYMAP_DEC(gb2312, **outbuf, c, IN2) {\r | |
419 | NEXT(2, 1)\r | |
420 | }\r | |
421 | else\r | |
422 | return 2;\r | |
423 | }\r | |
424 | }\r | |
425 | \r | |
426 | return 0;\r | |
427 | }\r | |
428 | \r | |
429 | \r | |
430 | BEGIN_MAPPINGS_LIST\r | |
431 | MAPPING_DECONLY(gb2312)\r | |
432 | MAPPING_DECONLY(gbkext)\r | |
433 | MAPPING_ENCONLY(gbcommon)\r | |
434 | MAPPING_ENCDEC(gb18030ext)\r | |
435 | END_MAPPINGS_LIST\r | |
436 | \r | |
437 | BEGIN_CODECS_LIST\r | |
438 | CODEC_STATELESS(gb2312)\r | |
439 | CODEC_STATELESS(gbk)\r | |
440 | CODEC_STATELESS(gb18030)\r | |
441 | CODEC_STATEFUL(hz)\r | |
442 | END_CODECS_LIST\r | |
443 | \r | |
444 | I_AM_A_MODULE_FOR(cn)\r |