2 * _codecs_cn.c: Codecs collection for Mainland Chinese encodings
4 * Written by Hye-Shik Chang <perky@FreeBSD.org>
8 #include "mappings_cn.h"
11 * hz is predefined as 100 on AIX, so we undefine it to avoid
12 * conflicting with the hz codec.
18 /* GBK and GB2312 map differently in a few code points, which are listed below:
21 * A1A4 U+30FB KATAKANA MIDDLE DOT U+00B7 MIDDLE DOT
22 * A1AA U+2015 HORIZONTAL BAR U+2014 EM DASH
23 * A844 undefined U+2015 HORIZONTAL BAR
/*
 * GBK_DECODE(dc1, dc2, assi): map the byte pair (dc1, dc2) to a code point
 * stored in assi.
 *
 * Three byte pairs are assigned directly instead of being looked up:
 *   0xA1 0xAA -> U+2014
 *   0xA8 0x44 -> U+2015
 *   0xA1 0xA4 -> U+00B7
 * Any other pair is first tried against the gb2312 map with both bytes
 * XORed with 0x80, and then against the gbkext map with the bytes unchanged.
 *
 * The expansion is a bare if / else-if chain, not a single statement, so a
 * caller may append its own `else` branch after the invocation.
 * dc1 and dc2 are expanded several times and appear unparenthesized in the
 * `^ 0x80` expressions; pass only simple, side-effect-free operands.
 *
 * NOTE(review): TRYMAP_DEC is defined outside this view; the
 * `else TRYMAP_DEC(...);` form only parses if it expands to an `if (...)`
 * header -- confirm against its definition.
 */
26 #define GBK_DECODE(dc1, dc2, assi) \
27 if ((dc1) == 0xa1 && (dc2) == 0xaa) (assi) = 0x2014; \
28 else if ((dc1) == 0xa8 && (dc2) == 0x44) (assi) = 0x2015; \
29 else if ((dc1) == 0xa1 && (dc2) == 0xa4) (assi) = 0x00b7; \
30 else TRYMAP_DEC(gb2312, assi, dc1 ^ 0x80, dc2 ^ 0x80); \
31 else TRYMAP_DEC(gbkext, assi, dc1, dc2);
/*
 * GBK_ENCODE(code, assi): map the code point `code` to a two-byte value
 * stored in assi.
 *
 * Three code points are assigned directly instead of being looked up:
 *   U+2014 -> 0xA1AA
 *   U+2015 -> 0xA844
 *   U+00B7 -> 0xA1A4
 * U+30FB is explicitly excluded from the table lookup; every other code
 * point is tried against the gbcommon map via TRYMAP_ENC_COND.
 *
 * The expansion is a bare if / else-if chain whose last branch has an empty
 * body, so a caller may append its own `else` branch after the invocation.
 * `code` is expanded several times; pass a side-effect-free operand.
 *
 * NOTE(review): TRYMAP_ENC_COND is defined outside this view; presumably it
 * yields a truth value and fills assi on success -- confirm.
 */
33 #define GBK_ENCODE(code, assi) \
34 if ((code) == 0x2014) (assi) = 0xa1aa; \
35 else if ((code) == 0x2015) (assi) = 0xa844; \
36 else if ((code) == 0x00b7) (assi) = 0xa1a4; \
37 else if ((code) != 0x30fb && TRYMAP_ENC_COND(gbcommon, assi, code));
50 WRITE1((unsigned char)c
)
57 TRYMAP_ENC(gbcommon
, code
, c
);
60 if (code
& 0x8000) /* MSB set: GBK */
63 OUT1((code
>> 8) | 0x80)
64 OUT2((code
& 0xFF) | 0x80)
74 unsigned char c
= **inbuf
;
85 TRYMAP_DEC(gb2312
, **outbuf
, c
^ 0x80, IN2
^ 0x80) {
106 WRITE1((unsigned char)c
)
117 OUT1((code
>> 8) | 0x80)
119 OUT2((code
& 0xFF)) /* MSB set: GBK */
121 OUT2((code
& 0xFF) | 0x80) /* MSB unset: GB2312 */
131 unsigned char c
= IN1
;
143 GBK_DECODE(c
, IN2
, **outbuf
)
171 #if Py_UNICODE_SIZE == 2
172 return 2; /* surrogates pair */
176 else if (c
>= 0x10000) {
177 ucs4_t tc
= c
- 0x10000;
181 OUT4((unsigned char)(tc
% 10) + 0x30)
183 OUT3((unsigned char)(tc
% 126) + 0x81)
185 OUT2((unsigned char)(tc
% 10) + 0x30)
187 OUT1((unsigned char)(tc
+ 0x90))
189 #if Py_UNICODE_SIZE == 2
190 NEXT(2, 4) /* surrogates pair */
200 else TRYMAP_ENC(gb18030ext
, code
, c
);
202 const struct _gb18030_to_unibmp_ranges
*utrrange
;
206 for (utrrange
= gb18030_to_unibmp_ranges
;
207 utrrange
->first
!= 0;
209 if (utrrange
->first
<= c
&&
210 c
<= utrrange
->last
) {
213 tc
= c
- utrrange
->first
+
216 OUT4((unsigned char)(tc
% 10) + 0x30)
218 OUT3((unsigned char)(tc
% 126) + 0x81)
220 OUT2((unsigned char)(tc
% 10) + 0x30)
222 OUT1((unsigned char)tc
+ 0x81)
228 if (utrrange
->first
== 0)
233 OUT1((code
>> 8) | 0x80)
235 OUT2((code
& 0xFF)) /* MSB set: GBK or GB18030ext */
237 OUT2((code
& 0xFF) | 0x80) /* MSB unset: GB2312 */
248 unsigned char c
= IN1
, c2
;
261 if (c2
>= 0x30 && c2
<= 0x39) { /* 4 bytes seq */
262 const struct _gb18030_to_unibmp_ranges
*utr
;
263 unsigned char c3
, c4
;
269 if (c
< 0x81 || c3
< 0x81 || c4
< 0x30 || c4
> 0x39)
271 c
-= 0x81; c2
-= 0x30;
272 c3
-= 0x81; c4
-= 0x30;
274 if (c
< 4) { /* U+0080 - U+FFFF */
275 lseq
= ((ucs4_t
)c
* 10 + c2
) * 1260 +
276 (ucs4_t
)c3
* 10 + c4
;
278 for (utr
= gb18030_to_unibmp_ranges
;
279 lseq
>= (utr
+ 1)->base
;
281 OUT1(utr
->first
- utr
->base
+ lseq
)
286 else if (c
>= 15) { /* U+10000 - U+10FFFF */
287 lseq
= 0x10000 + (((ucs4_t
)c
-15) * 10 + c2
)
288 * 1260 + (ucs4_t
)c3
* 10 + c4
;
289 if (lseq
<= 0x10FFFF) {
298 GBK_DECODE(c
, c2
, **outbuf
)
299 else TRYMAP_DEC(gb18030ext
, **outbuf
, c
, c2
);
337 WRITE1((unsigned char)c
)
341 WRITE3('~', '}', (unsigned char)c
)
350 TRYMAP_ENC(gbcommon
, code
, c
);
353 if (code
& 0x8000) /* MSB set: GBK */
357 WRITE4('~', '{', code
>> 8, code
& 0xff)
362 WRITE2(code
>> 8, code
& 0xff)
385 unsigned char c
= IN1
;
388 unsigned char c2
= IN2
;
396 else if (c2
== '{' && state
->i
== 0)
397 state
->i
= 1; /* set GB */
398 else if (c2
== '}' && state
->i
== 1)
399 state
->i
= 0; /* set ASCII */
401 ; /* line-continuation */
411 if (state
->i
== 0) { /* ASCII mode */
418 TRYMAP_DEC(gb2312
, **outbuf
, c
, IN2
) {
431 MAPPING_DECONLY(gb2312
)
432 MAPPING_DECONLY(gbkext
)
433 MAPPING_ENCONLY(gbcommon
)
434 MAPPING_ENCDEC(gb18030ext
)
438 CODEC_STATELESS(gb2312
)
440 CODEC_STATELESS(gb18030
)
444 I_AM_A_MODULE_FOR(cn
)