]>
Commit | Line | Data |
---|---|---|
14b0e578 CS |
1 | /**********************************************************************\r |
2 | utf16_le.c - Oniguruma (regular expression library)\r | |
3 | **********************************************************************/\r | |
4 | /*-\r | |
5 | * Copyright (c) 2002-2008 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>\r | |
6 | * All rights reserved.\r | |
7 | *\r | |
8 | * Redistribution and use in source and binary forms, with or without\r | |
9 | * modification, are permitted provided that the following conditions\r | |
10 | * are met:\r | |
11 | * 1. Redistributions of source code must retain the above copyright\r | |
12 | * notice, this list of conditions and the following disclaimer.\r | |
13 | * 2. Redistributions in binary form must reproduce the above copyright\r | |
14 | * notice, this list of conditions and the following disclaimer in the\r | |
15 | * documentation and/or other materials provided with the distribution.\r | |
16 | *\r | |
17 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND\r | |
18 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE\r | |
19 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE\r | |
20 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE\r | |
21 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL\r | |
22 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS\r | |
23 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)\r | |
24 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT\r | |
25 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY\r | |
26 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF\r | |
27 | * SUCH DAMAGE.\r | |
28 | */\r | |
29 | \r | |
30 | #include "regenc.h"\r | |
31 | \r | |
/*
 * Byte-length lookup table with 256 entries, indexed by a single byte value.
 * utf16le_mbc_enc_len() indexes it with the second byte of a character.
 * Every entry is 2 except indices 0xd8..0xdb, which are 4.
 */
#define ENCLEN_UTF16_ROW_ALL_2 \
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2

static const int EncLen_UTF16[] = {
  /* 0x00 - 0x0f */ ENCLEN_UTF16_ROW_ALL_2,
  /* 0x10 - 0x1f */ ENCLEN_UTF16_ROW_ALL_2,
  /* 0x20 - 0x2f */ ENCLEN_UTF16_ROW_ALL_2,
  /* 0x30 - 0x3f */ ENCLEN_UTF16_ROW_ALL_2,
  /* 0x40 - 0x4f */ ENCLEN_UTF16_ROW_ALL_2,
  /* 0x50 - 0x5f */ ENCLEN_UTF16_ROW_ALL_2,
  /* 0x60 - 0x6f */ ENCLEN_UTF16_ROW_ALL_2,
  /* 0x70 - 0x7f */ ENCLEN_UTF16_ROW_ALL_2,
  /* 0x80 - 0x8f */ ENCLEN_UTF16_ROW_ALL_2,
  /* 0x90 - 0x9f */ ENCLEN_UTF16_ROW_ALL_2,
  /* 0xa0 - 0xaf */ ENCLEN_UTF16_ROW_ALL_2,
  /* 0xb0 - 0xbf */ ENCLEN_UTF16_ROW_ALL_2,
  /* 0xc0 - 0xcf */ ENCLEN_UTF16_ROW_ALL_2,
  /* 0xd0 - 0xd7 */ 2, 2, 2, 2, 2, 2, 2, 2,
  /* 0xd8 - 0xdb */ 4, 4, 4, 4,
  /* 0xdc - 0xdf */ 2, 2, 2, 2,
  /* 0xe0 - 0xef */ ENCLEN_UTF16_ROW_ALL_2,
  /* 0xf0 - 0xff */ ENCLEN_UTF16_ROW_ALL_2
};

#undef ENCLEN_UTF16_ROW_ALL_2
50 | \r | |
51 | static int\r | |
52 | utf16le_code_to_mbclen(OnigCodePoint code)\r | |
53 | {\r | |
54 | return (code > 0xffff ? 4 : 2);\r | |
55 | }\r | |
56 | \r | |
57 | static int\r | |
58 | utf16le_mbc_enc_len(const UChar* p)\r | |
59 | {\r | |
60 | return EncLen_UTF16[*(p+1)];\r | |
61 | }\r | |
62 | \r | |
63 | static int\r | |
64 | utf16le_is_mbc_newline(const UChar* p, const UChar* end)\r | |
65 | {\r | |
66 | if (p + 1 < end) {\r | |
67 | if (*p == 0x0a && *(p+1) == 0x00)\r | |
68 | return 1;\r | |
69 | #ifdef USE_UNICODE_ALL_LINE_TERMINATORS\r | |
70 | if ((\r | |
71 | #ifndef USE_CRNL_AS_LINE_TERMINATOR\r | |
72 | *p == 0x0d ||\r | |
73 | #endif\r | |
74 | *p == 0x85) && *(p+1) == 0x00)\r | |
75 | return 1;\r | |
76 | if (*(p+1) == 0x20 && (*p == 0x29 || *p == 0x28))\r | |
77 | return 1;\r | |
78 | #endif\r | |
79 | }\r | |
80 | return 0;\r | |
81 | }\r | |
82 | \r | |
83 | static OnigCodePoint\r | |
84 | utf16le_mbc_to_code(const UChar* p, const UChar* end ARG_UNUSED)\r | |
85 | {\r | |
86 | OnigCodePoint code;\r | |
87 | UChar c0 = *p;\r | |
88 | UChar c1 = *(p+1);\r | |
89 | \r | |
90 | if (UTF16_IS_SURROGATE_FIRST(c1)) {\r | |
91 | code = ((((c1 - 0xd8) << 2) + ((c0 & 0xc0) >> 6) + 1) << 16)\r | |
92 | + ((((c0 & 0x3f) << 2) + (p[3] - 0xdc)) << 8)\r | |
93 | + p[2];\r | |
94 | }\r | |
95 | else {\r | |
96 | code = c1 * 256 + p[0];\r | |
97 | }\r | |
98 | return code;\r | |
99 | }\r | |
100 | \r | |
101 | static int\r | |
102 | utf16le_code_to_mbc(OnigCodePoint code, UChar *buf)\r | |
103 | {\r | |
104 | UChar* p = buf;\r | |
105 | \r | |
106 | if (code > 0xffff) {\r | |
107 | unsigned int plane, high;\r | |
108 | \r | |
109 | plane = (code >> 16) - 1;\r | |
110 | high = (code & 0xff00) >> 8;\r | |
111 | \r | |
112 | *p++ = (UChar)(((plane & 0x03) << 6) + (high >> 2));\r | |
113 | *p++ = (UChar)((plane >> 2) + 0xd8);\r | |
114 | *p++ = (UChar )(code & 0xff);\r | |
115 | *p = (high & 0x03) + 0xdc;\r | |
116 | return 4;\r | |
117 | }\r | |
118 | else {\r | |
119 | *p++ = (UChar )(code & 0xff);\r | |
120 | *p++ = (UChar )((code & 0xff00) >> 8);\r | |
121 | return 2;\r | |
122 | }\r | |
123 | }\r | |
124 | \r | |
125 | static int\r | |
126 | utf16le_mbc_case_fold(OnigCaseFoldType flag,\r | |
127 | const UChar** pp, const UChar* end, UChar* fold)\r | |
128 | {\r | |
129 | const UChar* p = *pp;\r | |
130 | \r | |
131 | if (ONIGENC_IS_ASCII_CODE(*p) && *(p+1) == 0) {\r | |
132 | #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI\r | |
133 | if ((flag & ONIGENC_CASE_FOLD_TURKISH_AZERI) != 0) {\r | |
134 | if (*p == 0x49) {\r | |
135 | *fold++ = 0x31;\r | |
136 | *fold = 0x01;\r | |
137 | (*pp) += 2;\r | |
138 | return 2;\r | |
139 | }\r | |
140 | }\r | |
141 | #endif\r | |
142 | \r | |
143 | *fold++ = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p);\r | |
144 | *fold = 0;\r | |
145 | *pp += 2;\r | |
146 | return 2;\r | |
147 | }\r | |
148 | else\r | |
149 | return onigenc_unicode_mbc_case_fold(ONIG_ENCODING_UTF16_LE, flag, pp, end,\r | |
150 | fold);\r | |
151 | }\r | |
152 | \r | |
#if 0
/*
 * (Compiled out.)  Reports whether the character at *pp is case-ambiguous
 * and advances *pp past it using EncLen_UTF16.
 *
 * Only characters whose second byte is 0 are examined; all others return
 * FALSE.  0xdf returns TRUE when INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR is
 * set in `flag`.
 */
static int
utf16le_is_mbc_ambiguous(OnigCaseFoldType flag, const UChar** pp,
                         const UChar* end)
{
  const UChar* p = *pp;

  (*pp) += EncLen_UTF16[*(p+1)];

  if (*(p+1) == 0) {
    int c, v;

    if (*p == 0xdf && (flag & INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR) != 0) {
      return TRUE;
    }

    c = *p;
    v = ONIGENC_IS_UNICODE_ISO_8859_1_BIT_CTYPE(c,
                (BIT_CTYPE_UPPER | BIT_CTYPE_LOWER));
    /* Test the LOWER bit with '&'.  The previous '|' made this condition
       true whenever BIT_CTYPE_LOWER is non-zero, so the final return
       below could never be reached. */
    if ((v & BIT_CTYPE_LOWER) != 0) {
      /* 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */
      if (c >= 0xaa && c <= 0xba)
        return FALSE;
      else
        return TRUE;
    }
    return (v != 0 ? TRUE : FALSE);
  }

  return FALSE;
}
#endif
185 | \r | |
186 | static UChar*\r | |
187 | utf16le_left_adjust_char_head(const UChar* start, const UChar* s)\r | |
188 | {\r | |
189 | if (s <= start) return (UChar* )s;\r | |
190 | \r | |
191 | if ((s - start) % 2 == 1) {\r | |
192 | s--;\r | |
193 | }\r | |
194 | \r | |
195 | if (UTF16_IS_SURROGATE_SECOND(*(s+1)) && s > start + 1)\r | |
196 | s -= 2;\r | |
197 | \r | |
198 | return (UChar* )s;\r | |
199 | }\r | |
200 | \r | |
201 | static int\r | |
202 | utf16le_get_case_fold_codes_by_str(OnigCaseFoldType flag,\r | |
203 | const OnigUChar* p, const OnigUChar* end, OnigCaseFoldCodeItem items[])\r | |
204 | {\r | |
205 | return onigenc_unicode_get_case_fold_codes_by_str(ONIG_ENCODING_UTF16_LE,\r | |
206 | flag, p, end, items);\r | |
207 | }\r | |
208 | \r | |
/*
 * Encoding descriptor for UTF-16LE.
 *
 * This is a positional initializer, so the entries must stay in the order
 * of the OnigEncodingType members (declared outside this file).  The
 * utf16le_* entries are the static functions defined in this file; the
 * onigenc_* entries are shared routines defined elsewhere.
 */
OnigEncodingType OnigEncodingUTF16_LE = {
  utf16le_mbc_enc_len,
  "UTF-16LE",   /* name */
  4,            /* max byte length */
  2,            /* min byte length */
  utf16le_is_mbc_newline,
  utf16le_mbc_to_code,
  utf16le_code_to_mbclen,
  utf16le_code_to_mbc,
  utf16le_mbc_case_fold,
  onigenc_unicode_apply_all_case_fold,
  utf16le_get_case_fold_codes_by_str,
  onigenc_unicode_property_name_to_ctype,
  onigenc_unicode_is_code_ctype,
  onigenc_utf16_32_get_ctype_code_range,
  utf16le_left_adjust_char_head,
  onigenc_always_false_is_allowed_reverse_match
};