]>
Commit | Line | Data |
---|---|---|
f38b84ea | 1 | /* |
e0edde6f | 2 | * Copyright (c) 2009, 2010 Nicira, Inc. |
f38b84ea BP |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); | |
5 | * you may not use this file except in compliance with the License. | |
6 | * You may obtain a copy of the License at: | |
7 | * | |
8 | * http://www.apache.org/licenses/LICENSE-2.0 | |
9 | * | |
10 | * Unless required by applicable law or agreed to in writing, software | |
11 | * distributed under the License is distributed on an "AS IS" BASIS, | |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
13 | * See the License for the specific language governing permissions and | |
14 | * limitations under the License. | |
15 | */ | |
16 | ||
17 | #include <config.h> | |
18 | ||
19 | #include "unicode.h" | |
20 | ||
bd76d25d BP |
21 | #include <inttypes.h> |
22 | ||
3e8a2ad1 | 23 | #include "openvswitch/dynamic-string.h" |
bd76d25d BP |
24 | #include "util.h" |
25 | ||
f38b84ea BP |
/* Combines the UTF-16 leading surrogate 'leading' and trailing surrogate
 * 'trailing' into the Unicode code point that the pair encodes and returns
 * it.
 *
 * No range checking is done: if either argument is not a surrogate of the
 * expected kind, the result is meaningless. */
int
utf16_decode_surrogate_pair(int leading, int trailing)
{
    /*
     *   Leading surrogate:         110110wwwwxxxxxx
     *   Trailing surrogate:        110111xxxxxxxxxx
     *   Code point:        000uuuuuxxxxxxxxxxxxxxxx
     *
     * where uuuuu = wwww + 1.  The low 10 bits of each surrogate carry the
     * payload: the leading surrogate supplies wwww and the top six 'x' bits,
     * the trailing surrogate supplies the bottom ten 'x' bits.  Adding
     * 0x10000 to the 20-bit payload is the same as adding 1 to wwww.
     */
    int payload = ((leading & 0x3ff) << 10) | (trailing & 0x3ff);

    return payload + 0x10000;
}
bd76d25d BP |
44 | |
/* Returns the number of Unicode characters in the null-terminated UTF-8
 * string 's_'.
 *
 * The string is not validated.  Every byte that is not a continuation byte
 * is counted as the start of one character. */
size_t
utf8_length(const char *s_)
{
    const uint8_t *p = (const uint8_t *) s_;
    size_t n_chars = 0;

    while (*p != '\0') {
        /* A continuation byte has 2#10 in its two most-significant bits.
         * Anything else (2#00, 2#01, or 2#11) begins a character. */
        if ((*p & 0xc0) != 0x80) {
            n_chars++;
        }
        p++;
    }
    return n_chars;
}
60 | ||
61 | static char * | |
62 | invalid_utf8_sequence(const uint8_t *s, int n, size_t *lengthp) | |
63 | { | |
64 | struct ds msg; | |
65 | int i; | |
66 | ||
67 | if (lengthp) { | |
68 | *lengthp = 0; | |
69 | } | |
70 | ||
71 | ds_init(&msg); | |
72 | ds_put_cstr(&msg, "invalid UTF-8 sequence"); | |
73 | for (i = 0; i < n; i++) { | |
74 | ds_put_format(&msg, " 0x%02"PRIx8, s[i]); | |
75 | } | |
76 | return ds_steal_cstr(&msg); | |
77 | } | |
78 | ||
/* One class of well-formed UTF-8 byte sequences.
 *
 * octets[i][0] and octets[i][1] are the inclusive lower and upper bounds for
 * byte 'i' of the sequence.  An entry whose lower bound is 0 marks the end
 * of the sequence.  There are five slots and no sequence in the table below
 * is longer than four bytes, so the last slot is always a terminator. */
struct utf8_sequence {
    uint8_t octets[5][2];
};

/* Returns the entry describing the sequences that may begin with byte 'c',
 * or a null pointer if 'c' cannot begin a sequence listed in the table
 * (this includes 0x00, continuation bytes 0x80...0xbf, 0xc0, 0xc1, and
 * 0xf5...0xff). */
static const struct utf8_sequence *
lookup_utf8_sequence(uint8_t c)
{
    static const struct utf8_sequence seqs[] = {
        /* One byte. */
        { { { 0x01, 0x7f },
            { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 } } },

        /* Two bytes.  Lead bytes 0xc0 and 0xc1 are absent. */
        { { { 0xc2, 0xdf }, { 0x80, 0xbf },
            { 0, 0 }, { 0, 0 }, { 0, 0 } } },

        /* Three bytes.  The second byte is restricted after 0xe0 and after
         * 0xed. */
        { { { 0xe0, 0xe0 }, { 0xa0, 0xbf }, { 0x80, 0xbf },
            { 0, 0 }, { 0, 0 } } },

        { { { 0xe1, 0xec }, { 0x80, 0xbf }, { 0x80, 0xbf },
            { 0, 0 }, { 0, 0 } } },

        { { { 0xed, 0xed }, { 0x80, 0x9f }, { 0x80, 0xbf },
            { 0, 0 }, { 0, 0 } } },

        { { { 0xee, 0xef }, { 0x80, 0xbf }, { 0x80, 0xbf },
            { 0, 0 }, { 0, 0 } } },

        /* Four bytes.  The second byte is restricted after 0xf0 and after
         * 0xf4. */
        { { { 0xf0, 0xf0 }, { 0x90, 0xbf }, { 0x80, 0xbf }, { 0x80, 0xbf },
            { 0, 0 } } },

        { { { 0xf1, 0xf3 }, { 0x80, 0xbf }, { 0x80, 0xbf }, { 0x80, 0xbf },
            { 0, 0 } } },

        { { { 0xf4, 0xf4 }, { 0x80, 0x8f }, { 0x80, 0xbf }, { 0x80, 0xbf },
            { 0, 0 } } },
    };

    const struct utf8_sequence *const end
        = seqs + sizeof seqs / sizeof seqs[0];
    const struct utf8_sequence *seq;

    /* The first-byte ranges do not overlap, so at most one entry matches. */
    for (seq = seqs; seq != end; seq++) {
        if (seq->octets[0][0] <= c && c <= seq->octets[0][1]) {
            return seq;
        }
    }
    return NULL;
}
125 | ||
126 | /* Checks that 's' is a valid, null-terminated UTF-8 string. If so, returns a | |
127 | * null pointer and sets '*lengthp' to the number of Unicode characters in | |
128 | * 's'. If not, returns an error message that the caller must free and sets | |
129 | * '*lengthp' to 0. | |
130 | * | |
131 | * 'lengthp' may be NULL if the length is not needed. */ | |
132 | char * | |
133 | utf8_validate(const char *s_, size_t *lengthp) | |
134 | { | |
135 | size_t length = 0; | |
136 | const uint8_t *s; | |
137 | ||
138 | for (s = (const uint8_t *) s_; *s != '\0'; ) { | |
139 | length++; | |
140 | if (s[0] < 0x80) { | |
141 | s++; | |
142 | } else { | |
143 | const struct utf8_sequence *seq; | |
144 | int i; | |
145 | ||
146 | seq = lookup_utf8_sequence(s[0]); | |
147 | if (!seq) { | |
148 | return invalid_utf8_sequence(s, 1, lengthp); | |
149 | } | |
150 | ||
151 | for (i = 1; seq->octets[i][0]; i++) { | |
152 | const uint8_t *o = seq->octets[i]; | |
153 | if (s[i] < o[0] || s[i] > o[1]) { | |
154 | return invalid_utf8_sequence(s, i + 1, lengthp); | |
155 | } | |
156 | } | |
157 | s += i; | |
158 | } | |
159 | } | |
160 | if (lengthp) { | |
161 | *lengthp = length; | |
162 | } | |
163 | return NULL; | |
164 | } |