]> git.proxmox.com Git - mirror_ovs.git/blob - lib/unicode.c
lldp: increase statsTLVsUnrecognizedTotal on unknown TLV
[mirror_ovs.git] / lib / unicode.c
1 /*
2 * Copyright (c) 2009, 2010 Nicira, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at:
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <config.h>
18
19 #include "unicode.h"
20
21 #include <inttypes.h>
22
23 #include "openvswitch/dynamic-string.h"
24 #include "util.h"
25
/* Combines a UTF-16 surrogate pair into the Unicode code point it encodes.
 *
 * 'leading' is expected to be a leading surrogate and 'trailing' a trailing
 * surrogate.  Neither argument is range-checked, so the result is meaningless
 * if either one lies outside its surrogate range. */
int
utf16_decode_surrogate_pair(int leading, int trailing)
{
    /*
     *  Leading surrogate:   110110wwwwxxxxxx
     *  Trailing surrogate:  110111xxxxxxxxxx
     *  Code point:          000uuuuuxxxxxxxxxxxxxxxx    (u = w + 1)
     */
    int plane = ((leading >> 6) & 0xf) + 1;     /* 'u': 'w' plus one. */
    int high_bits = leading & 0x3f;             /* Top 6 bits of 'x'. */
    int low_bits = trailing & 0x3ff;            /* Bottom 10 bits of 'x'. */

    return (plane << 16) | (high_bits << 10) | low_bits;
}
44
/* Counts the Unicode characters in the null-terminated UTF-8 string 's_'.
 *
 * The string is not validated: every byte that is not a continuation byte is
 * counted as the start of one character. */
size_t
utf8_length(const char *s_)
{
    const uint8_t *p = (const uint8_t *) s_;
    size_t n_chars = 0;

    while (*p != '\0') {
        /* Continuation bytes look like 2#10xxxxxx.  Any other top two bits
         * (2#00, 2#01, or 2#11) mark the first byte of a character. */
        if ((*p & 0xc0) != 0x80) {
            n_chars++;
        }
        p++;
    }
    return n_chars;
}
60
61 static char *
62 invalid_utf8_sequence(const uint8_t *s, int n, size_t *lengthp)
63 {
64 struct ds msg;
65 int i;
66
67 if (lengthp) {
68 *lengthp = 0;
69 }
70
71 ds_init(&msg);
72 ds_put_cstr(&msg, "invalid UTF-8 sequence");
73 for (i = 0; i < n; i++) {
74 ds_put_format(&msg, " 0x%02"PRIx8, s[i]);
75 }
76 return ds_steal_cstr(&msg);
77 }
78
/* One row of the table of well-formed UTF-8 byte sequences.
 *
 * octets[i] holds the inclusive range { min, max } permitted for byte 'i' of
 * a sequence.  An entry whose minimum is 0 ends the row; no row uses more
 * than four bytes, so the fifth entry is always that terminator. */
struct utf8_sequence {
    uint8_t octets[5][2];
};

/* Returns the table row whose first-byte range contains 'c', or NULL if 'c'
 * cannot begin a well-formed UTF-8 sequence (a null byte, a continuation
 * byte, 0xc0, 0xc1, or anything above 0xf4). */
static const struct utf8_sequence *
lookup_utf8_sequence(uint8_t c)
{
    /* Entries left out of an initializer are zero-filled, which supplies the
     * { 0, 0 } terminators. */
    static const struct utf8_sequence table[] = {
        /* U+0001...U+007F. */
        { { { 0x01, 0x7f } } },

        /* U+0080...U+07FF. */
        { { { 0xc2, 0xdf }, { 0x80, 0xbf } } },

        /* U+0800...U+0FFF (second byte restricted to reject overlong
         * forms). */
        { { { 0xe0, 0xe0 }, { 0xa0, 0xbf }, { 0x80, 0xbf } } },

        /* U+1000...U+CFFF. */
        { { { 0xe1, 0xec }, { 0x80, 0xbf }, { 0x80, 0xbf } } },

        /* U+D000...U+D7FF (second byte restricted to reject surrogates). */
        { { { 0xed, 0xed }, { 0x80, 0x9f }, { 0x80, 0xbf } } },

        /* U+E000...U+FFFF. */
        { { { 0xee, 0xef }, { 0x80, 0xbf }, { 0x80, 0xbf } } },

        /* U+10000...U+3FFFF (second byte restricted to reject overlong
         * forms). */
        { { { 0xf0, 0xf0 }, { 0x90, 0xbf }, { 0x80, 0xbf }, { 0x80, 0xbf } } },

        /* U+40000...U+FFFFF. */
        { { { 0xf1, 0xf3 }, { 0x80, 0xbf }, { 0x80, 0xbf }, { 0x80, 0xbf } } },

        /* U+100000...U+10FFFF (second byte restricted to stay within the
         * Unicode range). */
        { { { 0xf4, 0xf4 }, { 0x80, 0x8f }, { 0x80, 0xbf }, { 0x80, 0xbf } } },
    };
    const struct utf8_sequence *const end
        = table + sizeof table / sizeof table[0];
    const struct utf8_sequence *row;

    for (row = table; row < end; row++) {
        uint8_t first_min = row->octets[0][0];
        uint8_t first_max = row->octets[0][1];

        if (first_min <= c && c <= first_max) {
            return row;
        }
    }
    return NULL;
}
125
126 /* Checks that 's' is a valid, null-terminated UTF-8 string. If so, returns a
127 * null pointer and sets '*lengthp' to the number of Unicode characters in
128 * 's'. If not, returns an error message that the caller must free and sets
129 * '*lengthp' to 0.
130 *
131 * 'lengthp' may be NULL if the length is not needed. */
132 char *
133 utf8_validate(const char *s_, size_t *lengthp)
134 {
135 size_t length = 0;
136 const uint8_t *s;
137
138 for (s = (const uint8_t *) s_; *s != '\0'; ) {
139 length++;
140 if (s[0] < 0x80) {
141 s++;
142 } else {
143 const struct utf8_sequence *seq;
144 int i;
145
146 seq = lookup_utf8_sequence(s[0]);
147 if (!seq) {
148 return invalid_utf8_sequence(s, 1, lengthp);
149 }
150
151 for (i = 1; seq->octets[i][0]; i++) {
152 const uint8_t *o = seq->octets[i];
153 if (s[i] < o[0] || s[i] > o[1]) {
154 return invalid_utf8_sequence(s, i + 1, lengthp);
155 }
156 }
157 s += i;
158 }
159 }
160 if (lengthp) {
161 *lengthp = length;
162 }
163 return NULL;
164 }