1 /**********************************************************************
2 utf16_le.c - Oniguruma (regular expression library)
3 **********************************************************************/
5 * Copyright (c) 2002-2019 K.Kosako <sndgk393 AT ybb DOT ne DOT jp>
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in the
15 * documentation and/or other materials provided with the distribution.
17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 #include "regint.h" /* for USE_CALLOUT */
/*
 * Callout registration statements for the UTF-16LE encoding.
 *
 * Every name literal below is the ASCII callout name with a \000 byte after
 * each letter and a \000\000 pair at the end, i.e. the name spelled out as
 * UTF-16LE bytes.
 *
 * NOTE(review): the enclosing function header, the declarations of
 * enc/name/args/opts and the #ifdef that pairs with the #endif at the end
 * are not visible in this listing.  BC0_P, BC_P, BC_B_O and BC_P_O are
 * macros defined elsewhere; presumably they register a builtin callout
 * under `name` for `enc` -- confirm against regint.h.
 */
42 enc
= ONIG_ENCODING_UTF16_LE
;
/* FAIL and MISMATCH: passed to BC0_P with a name only (no args, no opts). */
44 name
= "F\000A\000I\000L\000\000\000"; BC0_P(name
, fail
);
45 name
= "M\000I\000S\000M\000A\000T\000C\000H\000\000\000"; BC0_P(name
, mismatch
);
/*
 * MAX: two argument types (TAG|LONG, then CHAR) and one option slot.
 * NOTE(review): no assignment to opts[0] is visible before this
 * registration in this listing -- confirm it is initialised here.
 */
47 name
= "M\000A\000X\000\000\000";
48 args
[0] = ONIG_TYPE_TAG
| ONIG_TYPE_LONG
;
49 args
[1] = ONIG_TYPE_CHAR
;
51 BC_B_O(name
, max
, 2, args
, 1, opts
);
/* ERROR: one LONG argument; opts[0].l is set to ONIG_ABORT. */
53 name
= "E\000R\000R\000O\000R\000\000\000";
54 args
[0] = ONIG_TYPE_LONG
; opts
[0].l
= ONIG_ABORT
;
55 BC_P_O(name
, error
, 1, args
, 1, opts
);
/* COUNT: one CHAR argument; opts[0].c is set to '>'. */
57 name
= "C\000O\000U\000N\000T\000\000\000";
58 args
[0] = ONIG_TYPE_CHAR
; opts
[0].c
= '>';
59 BC_B_O(name
, count
, 1, args
, 1, opts
);
/* TOTAL_COUNT: same argument and option setup as COUNT. */
61 name
= "T\000O\000T\000A\000L\000_\000C\000O\000U\000N\000T\000\000\000";
62 args
[0] = ONIG_TYPE_CHAR
; opts
[0].c
= '>';
63 BC_B_O(name
, total_count
, 1, args
, 1, opts
);
/* CMP: three argument types (TAG|LONG, STRING, TAG|LONG); no opts passed. */
65 name
= "C\000M\000P\000\000\000";
66 args
[0] = ONIG_TYPE_TAG
| ONIG_TYPE_LONG
;
67 args
[1] = ONIG_TYPE_STRING
;
68 args
[2] = ONIG_TYPE_TAG
| ONIG_TYPE_LONG
;
69 BC_P(name
, cmp
, 3, args
);
71 #endif /* USE_CALLOUT */
/*
 * Length table with 256 entries (16 rows of 16), indexed by a byte value.
 * Every entry is 2 except indices 0xD8..0xDB (row 0xD0, columns 8..11),
 * which are 4.  Elsewhere in this file it is indexed with *(p+1), the
 * second byte at the current position.
 *
 * NOTE(review): 0xD8..0xDB are presumably the high bytes of leading
 * surrogate code units -- confirm against UTF16_IS_SURROGATE_FIRST.
 * The closing brace of the initializer is not visible in this listing.
 */
76 static const int EncLen_UTF16
[] = {
77 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
78 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
79 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
80 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
81 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
82 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
83 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
84 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
85 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
86 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
87 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
88 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
89 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
90 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2,
91 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
92 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2
/*
 * Takes a code point and, on at least one path, returns
 * ONIGERR_INVALID_CODE_POINT_VALUE.
 *
 * NOTE(review): the return type, the range checks that select this error
 * and the non-error return values are not visible in this listing;
 * presumably the function yields the encoded byte length of `code`.
 */
96 utf16le_code_to_mbclen(OnigCodePoint code
)
100 return ONIGERR_INVALID_CODE_POINT_VALUE
;
/*
 * Returns the EncLen_UTF16 entry selected by the byte at p+1, i.e. the
 * second byte at the current position decides the result.
 * p[1] is read unconditionally; the caller must make sure it is readable.
 *
 * NOTE(review): the return type and braces are not visible in this listing.
 */
110 utf16le_mbc_enc_len(const UChar
* p
)
112 return EncLen_UTF16
[*(p
+1)];
/*
 * Checks the byte range [p, end) using utf16le_mbc_enc_len() and the
 * UTF16_IS_SURROGATE_SECOND test on the bytes at p+3 and p+1.
 *
 * NOTE(review): the loop header, the branch on `len`, the pointer advance
 * and every return statement are not visible in this listing, so the exact
 * accept/reject rules cannot be stated from here.
 */
116 is_valid_mbc_string(const UChar
* p
, const UChar
* end
)
/* end1 is one byte before end. */
118 const UChar
* end1
= end
- 1;
/* Length of the character at p, taken from the byte at p+1. */
121 int len
= utf16le_mbc_enc_len(p
);
/*
 * Fires when p+3 is inside the range and the byte there is NOT a
 * SURROGATE_SECOND byte.  When p+3 is at or past end the test is skipped.
 */
123 if (p
+ 3 < end
&& ! UTF16_IS_SURROGATE_SECOND(*(p
+ 3)))
/* Fires when the byte at p+1 IS a SURROGATE_SECOND byte. */
127 if (UTF16_IS_SURROGATE_SECOND(*(p
+ 1)))
/*
 * Tests the two bytes at p against line-terminator byte pairs.  The pairs
 * are written low byte first, matching the byte order utf16le_code_to_mbc()
 * emits.
 *
 * NOTE(review): the bound check against `end`, the return statements and
 * the closing #endif lines are not visible in this listing.
 */
140 utf16le_is_mbc_newline(const UChar
* p
, const UChar
* end
)
/* Bytes 0x0a 0x00: U+000A. */
143 if (*p
== 0x0a && *(p
+1) == 0x00)
145 #ifdef USE_UNICODE_ALL_LINE_TERMINATORS
/*
 * NOTE(review): the alternative guarded by this #ifndef is not visible;
 * presumably it also accepts 0x0d here when CR-LF is not the terminator.
 */
147 #ifndef USE_CRNL_AS_LINE_TERMINATOR
/* Bytes 0x85 0x00: U+0085. */
150 *p
== 0x85) && *(p
+1) == 0x00)
/* High byte 0x20 with low byte 0x29 or 0x28: U+2029 or U+2028. */
153 if (*(p
+1) == 0x20 && (*p
== 0x29 || *p
== 0x28))
/*
 * Decodes the character at p into `code`.  `end` is marked ARG_UNUSED, so
 * no bound check is made against it; the first branch reads p[3].
 *
 * NOTE(review): the declarations of code, c0 and c1 are not visible in
 * this listing; presumably c0 is p[0] and c1 is p[1].  The last term of
 * the first expression and the return statement are also not visible.
 */
161 utf16le_mbc_to_code(const UChar
* p
, const UChar
* end ARG_UNUSED
)
/*
 * Branch taken when c1 passes UTF16_IS_SURROGATE_FIRST.
 * Bits 16 and up: ((c1 - 0xd8) << 2) plus the top two bits of c0, plus 1.
 * Bits 8..15: the low six bits of c0 shifted left by 2, plus (p[3] - 0xdc).
 */
167 if (UTF16_IS_SURROGATE_FIRST(c1
)) {
168 code
= ((((c1
- 0xd8) << 2) + ((c0
& 0xc0) >> 6) + 1) << 16)
169 + ((((c0
& 0x3f) << 2) + (p
[3] - 0xdc)) << 8)
/* Other branch: c1 is the high byte and p[0] the low byte. */
173 code
= c1
* 256 + p
[0];
/*
 * Writes the encoding of `code` through the cursor p: four bytes on one
 * path, two bytes on the other.
 *
 * NOTE(review): the declaration of p (presumably initialised from buf),
 * the condition that selects the four-byte path and the return statements
 * are not visible in this listing.
 */
179 utf16le_code_to_mbc(OnigCodePoint code
, UChar
*buf
)
184 unsigned int plane
, high
;
/* plane: the bits of code above bit 15, minus 1.  high: bits 8..15 of code. */
186 plane
= (code
>> 16) - 1;
187 high
= (code
& 0xff00) >> 8;
/* Byte 0: low two bits of plane in bits 6..7, plus the top six bits of high. */
189 *p
++ = ((plane
& 0x03) << 6) + (high
>> 2);
/* Byte 1: the remaining plane bits plus 0xd8. */
190 *p
++ = (plane
>> 2) + 0xd8;
/* Byte 2: low eight bits of code. */
191 *p
++ = (UChar
)(code
& 0xff);
/* Byte 3: low two bits of high plus 0xdc. */
192 *p
= (high
& 0x03) + 0xdc;
/* Two-byte path: low byte of code first, then bits 8..15. */
196 *p
++ = (UChar
)(code
& 0xff);
197 *p
++ = (UChar
)((code
& 0xff00) >> 8);
/*
 * Case-folds the character at *pp into `fold`.
 * ASCII characters (low byte passes ONIGENC_IS_ASCII_CODE, high byte is 0)
 * are handled inline; on the other visible path the work is handed to
 * onigenc_unicode_mbc_case_fold() for ONIG_ENCODING_UTF16_LE.
 *
 * NOTE(review): the body of the Turkish/Azeri branch, the second output
 * byte, the update of *pp, the return values and the tail of the final
 * call are not visible in this listing.
 */
203 utf16le_mbc_case_fold(OnigCaseFoldType flag
,
204 const UChar
** pp
, const UChar
* end
, UChar
* fold
)
206 const UChar
* p
= *pp
;
/* Inline path: low byte is ASCII and the high byte is zero. */
208 if (ONIGENC_IS_ASCII_CODE(*p
) && *(p
+1) == 0) {
209 #ifdef USE_UNICODE_CASE_FOLD_TURKISH_AZERI
210 if ((flag
& ONIGENC_CASE_FOLD_TURKISH_AZERI
) != 0) {
/* First output byte: the ASCII lower-case form of the low byte. */
220 *fold
++ = ONIGENC_ASCII_CODE_TO_LOWER_CASE(*p
);
226 return onigenc_unicode_mbc_case_fold(ONIG_ENCODING_UTF16_LE
, flag
, pp
, end
,
/*
 * Advances *pp past the character at p, using the EncLen_UTF16 entry for
 * the byte at p+1, then classifies that character.
 *
 * NOTE(review): the rest of the parameter list, the declarations of c and
 * v, the test that guards the classification and most return statements
 * are not visible in this listing.
 */
232 utf16le_is_mbc_ambiguous(OnigCaseFoldType flag
, const UChar
** pp
,
235 const UChar
* p
= *pp
;
/* Step the caller's cursor by the table length for this character. */
237 (*pp
) += EncLen_UTF16
[*(p
+1)];
/* Low byte 0xdf together with the multi-char fold flag is singled out. */
242 if (*p
== 0xdf && (flag
& INTERNAL_ONIGENC_CASE_FOLD_MULTI_CHAR
) != 0) {
/* v: ISO-8859-1 ctype bits of c restricted to UPPER and LOWER. */
247 v
= ONIGENC_IS_UNICODE_ISO_8859_1_BIT_CTYPE(c
,
248 (BIT_CTYPE_UPPER
| BIT_CTYPE_LOWER
));
/*
 * NOTE(review): `v | BIT_CTYPE_LOWER` is non-zero whenever BIT_CTYPE_LOWER
 * is non-zero, so this condition looks always true; `&` may have been
 * intended -- confirm before relying on this branch.
 */
249 if ((v
| BIT_CTYPE_LOWER
) != 0) {
250 /* 0xaa, 0xb5, 0xba are lower case letter, but can't convert. */
/* The range test covers every value from 0xaa to 0xba, not only those three. */
251 if (c
>= 0xaa && c
<= 0xba)
256 return (v
!= 0 ? TRUE
: FALSE
);
/*
 * Adjusts s relative to `start` and returns it as a non-const UChar pointer.
 * When s is at or before start it is returned unchanged.
 *
 * NOTE(review): the statements executed inside the two conditions below
 * and the final return are not visible in this listing; presumably s is
 * moved back to a code-unit boundary and then to the start of a surrogate
 * pair -- confirm.
 */
264 utf16le_left_adjust_char_head(const UChar
* start
, const UChar
* s
)
266 if (s
<= start
) return (UChar
* )s
;
/* s lies an odd number of bytes past start. */
268 if ((s
- start
) % 2 == 1) {
/*
 * The byte at s+1 is a SURROGATE_SECOND byte, s is more than one byte past
 * start, and the byte at s-1 is a SURROGATE_FIRST byte.  s+1 is read before
 * the `s > start + 1` check; s-1 is read only after it.
 */
272 if (UTF16_IS_SURROGATE_SECOND(*(s
+1)) && s
> start
+ 1 &&
273 UTF16_IS_SURROGATE_FIRST(*(s
-1)))
/*
 * Thin wrapper: forwards flag, p, end and items unchanged to
 * onigenc_unicode_get_case_fold_codes_by_str() with ONIG_ENCODING_UTF16_LE
 * as the encoding, and returns that call's result.
 *
 * NOTE(review): the return type and braces are not visible in this listing.
 */
280 utf16le_get_case_fold_codes_by_str(OnigCaseFoldType flag
,
281 const OnigUChar
* p
, const OnigUChar
* end
, OnigCaseFoldCodeItem items
[])
283 return onigenc_unicode_get_case_fold_codes_by_str(ONIG_ENCODING_UTF16_LE
,
284 flag
, p
, end
, items
);
287 OnigEncodingType OnigEncodingUTF16_LE
= {
289 "UTF-16LE", /* name */
290 4, /* max enc length */
291 2, /* min enc length */
292 utf16le_is_mbc_newline
,
294 utf16le_code_to_mbclen
,
296 utf16le_mbc_case_fold
,
297 onigenc_unicode_apply_all_case_fold
,
298 utf16le_get_case_fold_codes_by_str
,
299 onigenc_unicode_property_name_to_ctype
,
300 onigenc_unicode_is_code_ctype
,
301 onigenc_utf16_32_get_ctype_code_range
,
302 utf16le_left_adjust_char_head
,
303 onigenc_always_false_is_allowed_reverse_match
,
305 0, /* is_initialized */
307 ENC_FLAG_UNICODE
|ENC_FLAG_SKIP_OFFSET_1
,