]>
git.proxmox.com Git - mirror_ovs.git/blob - lib/unicode.c
/*
 * Copyright (c) 2009, 2010 Nicira, Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at:
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>

#include "openvswitch/dynamic-string.h"
/* Returns the Unicode code point corresponding to leading surrogate 'leading'
 * and trailing surrogate 'trailing'.  The return value will not make any
 * sense if 'leading' or 'trailing' are not in the correct ranges for leading
 * or trailing surrogates. */
int
utf16_decode_surrogate_pair(int leading, int trailing)
{
    /*
     *  Leading surrogate:         110110wwwwxxxxxx
     * Trailing surrogate:         110111xxxxxxxxxx
     *         Code point: 000uuuuuxxxxxxxxxxxxxxxx
     *
     * The four 'w' bits encode the plane number minus one, so the five 'u'
     * bits of the code point are recovered as w + 1.
     */
    int w = (leading >> 6) & 0xf;
    int u = w + 1;
    int x0 = leading & 0x3f;        /* Low 6 bits of the leading surrogate. */
    int x1 = trailing & 0x3ff;      /* Low 10 bits of the trailing surrogate. */

    return (u << 16) | (x0 << 10) | x1;
}
/* Returns the number of Unicode characters in UTF-8 string 's'.
 *
 * Every byte that is not a continuation byte is counted as the start of one
 * character.  The string is not checked for well-formedness. */
size_t
utf8_length(const char *s_)
{
    const uint8_t *s;
    size_t length;

    length = 0;
    for (s = (const uint8_t *) s_; *s != '\0'; s++) {
        /* The most-significant bits of the first byte in a character are one
         * of 2#01, 2#00, or 2#11.  2#10 is a continuation byte. */
        length += (*s & 0xc0) != 0x80;
    }
    return length;
}
62 invalid_utf8_sequence(const uint8_t *s
, int n
, size_t *lengthp
)
72 ds_put_cstr(&msg
, "invalid UTF-8 sequence");
73 for (i
= 0; i
< n
; i
++) {
74 ds_put_format(&msg
, " 0x%02"PRIx8
, s
[i
]);
76 return ds_steal_cstr(&msg
);
/* One class of well-formed UTF-8 byte sequences.
 *
 * 'octets[k]' is the inclusive { low, high } range permitted for byte 'k' of
 * the sequence.  A pair whose low bound is 0 marks the end of the sequence,
 * which is how callers iterating over the continuation bytes know to stop. */
struct utf8_sequence {
    uint8_t octets[5][2];
};

/* Returns the sequence class whose first-byte range contains 'c', or a null
 * pointer if 'c' cannot start a well-formed UTF-8 sequence (for example a
 * continuation byte 0x80...0xbf, the overlong leads 0xc0 and 0xc1, or
 * anything above 0xf4). */
static const struct utf8_sequence *
lookup_utf8_sequence(uint8_t c)
{
    static const struct utf8_sequence seqs[] = {
        /* NOTE(review): the lower bound of this first row was lost in
         * extraction and is reconstructed as 0x01 -- confirm. */
        { { { 0x01, 0x7f },
            { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 } } },

        { { { 0xc2, 0xdf }, { 0x80, 0xbf },
            { 0, 0 }, { 0, 0 }, { 0, 0 } } },

        { { { 0xe0, 0xe0 }, { 0xa0, 0xbf }, { 0x80, 0xbf },
            { 0, 0 }, { 0, 0 } } },

        { { { 0xe1, 0xec }, { 0x80, 0xbf }, { 0x80, 0xbf },
            { 0, 0 }, { 0, 0 } } },

        { { { 0xed, 0xed }, { 0x80, 0x9f }, { 0x80, 0xbf },
            { 0, 0 }, { 0, 0 } } },

        { { { 0xee, 0xef }, { 0x80, 0xbf }, { 0x80, 0xbf },
            { 0, 0 }, { 0, 0 } } },

        { { { 0xf0, 0xf0 }, { 0x90, 0xbf }, { 0x80, 0xbf }, { 0x80, 0xbf },
            { 0, 0 } } },

        { { { 0xf1, 0xf3 }, { 0x80, 0xbf }, { 0x80, 0xbf }, { 0x80, 0xbf },
            { 0, 0 } } },

        { { { 0xf4, 0xf4 }, { 0x80, 0x8f }, { 0x80, 0xbf }, { 0x80, 0xbf },
            { 0, 0 } } },
    };

    size_t i;

    for (i = 0; i < sizeof seqs / sizeof seqs[0]; i++) {
        /* Only the first-byte range decides which class applies. */
        const uint8_t *o = seqs[i].octets[0];
        if (c >= o[0] && c <= o[1]) {
            return &seqs[i];
        }
    }
    return NULL;
}
/* Checks that 's' is a valid, null-terminated UTF-8 string.  If so, returns a
 * null pointer and sets '*lengthp' to the number of Unicode characters in
 * 's'.  If not, returns an error message that the caller must free and sets
 * '*lengthp' to 0.
 *
 * 'lengthp' may be NULL if the length is not needed. */
133 utf8_validate(const char *s_
, size_t *lengthp
)
138 for (s
= (const uint8_t *) s_
; *s
!= '\0'; ) {
143 const struct utf8_sequence
*seq
;
146 seq
= lookup_utf8_sequence(s
[0]);
148 return invalid_utf8_sequence(s
, 1, lengthp
);
151 for (i
= 1; seq
->octets
[i
][0]; i
++) {
152 const uint8_t *o
= seq
->octets
[i
];
153 if (s
[i
] < o
[0] || s
[i
] > o
[1]) {
154 return invalid_utf8_sequence(s
, i
+ 1, lengthp
);