]> git.proxmox.com Git - grub2.git/blob - include/grub/charset.h
Import grub2_2.02+dfsg1.orig.tar.xz
[grub2.git] / include / grub / charset.h
1 /*
2 * GRUB -- GRand Unified Bootloader
3 * Copyright (C) 1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009 Free Software Foundation, Inc.
4 *
5 * GRUB is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, either version 3 of the License, or
8 * (at your option) any later version.
9 *
10 * GRUB is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with GRUB. If not, see <http://www.gnu.org/licenses/>.
17 */
18
19 #ifndef GRUB_CHARSET_HEADER
20 #define GRUB_CHARSET_HEADER 1
21
22 #include <grub/types.h>
23
/* Masks covering the N most significant bits of a byte.  The inline
   decoders in this header compare (byte & mask) against the next smaller
   mask to recognise UTF-8 lead bytes (110xxxxx, 1110xxxx, 11110xxx) and
   continuation bytes (10xxxxxx).  */
24 #define GRUB_UINT8_1_LEADINGBIT 0x80
25 #define GRUB_UINT8_2_LEADINGBITS 0xc0
26 #define GRUB_UINT8_3_LEADINGBITS 0xe0
27 #define GRUB_UINT8_4_LEADINGBITS 0xf0
28 #define GRUB_UINT8_5_LEADINGBITS 0xf8
29 #define GRUB_UINT8_6_LEADINGBITS 0xfc
30 #define GRUB_UINT8_7_LEADINGBITS 0xfe
31
/* Masks covering the N least significant bits of a byte; the decoder uses
   them to extract the payload bits of a UTF-8 byte.  */
32 #define GRUB_UINT8_1_TRAILINGBIT 0x01
33 #define GRUB_UINT8_2_TRAILINGBITS 0x03
34 #define GRUB_UINT8_3_TRAILINGBITS 0x07
35 #define GRUB_UINT8_4_TRAILINGBITS 0x0f
36 #define GRUB_UINT8_5_TRAILINGBITS 0x1f
37 #define GRUB_UINT8_6_TRAILINGBITS 0x3f
38
/* Worst-case expansion factors between encodings.
   NOTE(review): presumably meant for callers sizing conversion buffers;
   no user is visible in this header -- confirm against callers.  */
39 #define GRUB_MAX_UTF8_PER_UTF16 4
40 /* You need at least one UTF-8 byte to have one UTF-16 word.
41 A surrogate pair (2 UTF-16 words) always comes from a 4-byte UTF-8
   sequence, so the output never has more UTF-16 words than the input
   has UTF-8 bytes.
42 */
43 #define GRUB_MAX_UTF16_PER_UTF8 1
44 #define GRUB_MAX_UTF8_PER_CODEPOINT 4
45
/* First code point that does not fit into a single UTF-16 word.  */
46 #define GRUB_UCS2_LIMIT 0x10000
/* High (0xD800-based) and low (0xDC00-based) halves of the surrogate pair
   for CODE.  Both subtract GRUB_UCS2_LIMIT first, so they are only
   meaningful for CODE >= GRUB_UCS2_LIMIT; each carries 10 bits of it.  */
47 #define GRUB_UTF16_UPPER_SURROGATE(code) \
48 (0xD800 | ((((code) - GRUB_UCS2_LIMIT) >> 10) & 0x3ff))
49 #define GRUB_UTF16_LOWER_SURROGATE(code) \
50 (0xDC00 | (((code) - GRUB_UCS2_LIMIT) & 0x3ff))
51
52 /* Process one character from UTF8 sequence.
53 At beginning set *code = 0, *count = 0. Returns 0 on failure and
54 1 on success. *count holds the number of trailing bytes. */
55 static inline int
56 grub_utf8_process (grub_uint8_t c, grub_uint32_t *code, int *count)
57 {
58 if (*count)
59 {
60 if ((c & GRUB_UINT8_2_LEADINGBITS) != GRUB_UINT8_1_LEADINGBIT)
61 {
62 *count = 0;
63 /* invalid */
64 return 0;
65 }
66 else
67 {
68 *code <<= 6;
69 *code |= (c & GRUB_UINT8_6_TRAILINGBITS);
70 (*count)--;
71 /* Overlong. */
72 if ((*count == 1 && *code <= 0x1f)
73 || (*count == 2 && *code <= 0xf))
74 {
75 *code = 0;
76 *count = 0;
77 return 0;
78 }
79 return 1;
80 }
81 }
82
83 if ((c & GRUB_UINT8_1_LEADINGBIT) == 0)
84 {
85 *code = c;
86 return 1;
87 }
88 if ((c & GRUB_UINT8_3_LEADINGBITS) == GRUB_UINT8_2_LEADINGBITS)
89 {
90 *count = 1;
91 *code = c & GRUB_UINT8_5_TRAILINGBITS;
92 /* Overlong */
93 if (*code <= 1)
94 {
95 *count = 0;
96 *code = 0;
97 return 0;
98 }
99 return 1;
100 }
101 if ((c & GRUB_UINT8_4_LEADINGBITS) == GRUB_UINT8_3_LEADINGBITS)
102 {
103 *count = 2;
104 *code = c & GRUB_UINT8_4_TRAILINGBITS;
105 return 1;
106 }
107 if ((c & GRUB_UINT8_5_LEADINGBITS) == GRUB_UINT8_4_LEADINGBITS)
108 {
109 *count = 3;
110 *code = c & GRUB_UINT8_3_TRAILINGBITS;
111 return 1;
112 }
113 return 0;
114 }
115
116
117 /* Convert a (possibly null-terminated) UTF-8 string of at most SRCSIZE
118 bytes (if SRCSIZE is -1, it is ignored) in length to a UTF-16 string.
119 Return the number of characters converted. DEST must be able to hold
120 at least DESTSIZE characters. If an invalid sequence is found, return -1.
121 If SRCEND is not NULL, then *SRCEND is set to the next byte after the
122 last byte used in SRC. */
123 static inline grub_size_t
124 grub_utf8_to_utf16 (grub_uint16_t *dest, grub_size_t destsize,
125 const grub_uint8_t *src, grub_size_t srcsize,
126 const grub_uint8_t **srcend)
127 {
128 grub_uint16_t *p = dest;
129 int count = 0;
130 grub_uint32_t code = 0;
131
132 if (srcend)
133 *srcend = src;
134
135 while (srcsize && destsize)
136 {
137 int was_count = count;
138 if (srcsize != (grub_size_t)-1)
139 srcsize--;
140 if (!grub_utf8_process (*src++, &code, &count))
141 {
142 code = '?';
143 count = 0;
144 /* Character c may be valid, don't eat it. */
145 if (was_count)
146 src--;
147 }
148 if (count != 0)
149 continue;
150 if (code == 0)
151 break;
152 if (destsize < 2 && code >= GRUB_UCS2_LIMIT)
153 break;
154 if (code >= GRUB_UCS2_LIMIT)
155 {
156 *p++ = GRUB_UTF16_UPPER_SURROGATE (code);
157 *p++ = GRUB_UTF16_LOWER_SURROGATE (code);
158 destsize -= 2;
159 }
160 else
161 {
162 *p++ = code;
163 destsize--;
164 }
165 }
166
167 if (srcend)
168 *srcend = src;
169 return p - dest;
170 }
171
172 /* Determine the last position where the UTF-8 string [beg, end) can
173 be safely cut. */
174 static inline grub_size_t
175 grub_getend (const char *beg, const char *end)
176 {
177 const char *ptr;
178 for (ptr = end - 1; ptr >= beg; ptr--)
179 if ((*ptr & GRUB_UINT8_2_LEADINGBITS) != GRUB_UINT8_1_LEADINGBIT)
180 break;
181 if (ptr < beg)
182 return 0;
183 if ((*ptr & GRUB_UINT8_1_LEADINGBIT) == 0)
184 return ptr + 1 - beg;
185 if ((*ptr & GRUB_UINT8_3_LEADINGBITS) == GRUB_UINT8_2_LEADINGBITS
186 && ptr + 2 <= end)
187 return ptr + 2 - beg;
188 if ((*ptr & GRUB_UINT8_4_LEADINGBITS) == GRUB_UINT8_3_LEADINGBITS
189 && ptr + 3 <= end)
190 return ptr + 3 - beg;
191 if ((*ptr & GRUB_UINT8_5_LEADINGBITS) == GRUB_UINT8_4_LEADINGBITS
192 && ptr + 4 <= end)
193 return ptr + 4 - beg;
194 /* Invalid character or incomplete. Cut before it. */
195 return ptr - beg;
196 }
197
198 /* Convert UTF-16 to UTF-8. */
199 static inline grub_uint8_t *
200 grub_utf16_to_utf8 (grub_uint8_t *dest, const grub_uint16_t *src,
201 grub_size_t size)
202 {
203 grub_uint32_t code_high = 0;
204
205 while (size--)
206 {
207 grub_uint32_t code = *src++;
208
209 if (code_high)
210 {
211 if (code >= 0xDC00 && code <= 0xDFFF)
212 {
213 /* Surrogate pair. */
214 code = ((code_high - 0xD800) << 10) + (code - 0xDC00) + 0x10000;
215
216 *dest++ = (code >> 18) | 0xF0;
217 *dest++ = ((code >> 12) & 0x3F) | 0x80;
218 *dest++ = ((code >> 6) & 0x3F) | 0x80;
219 *dest++ = (code & 0x3F) | 0x80;
220 }
221 else
222 {
223 /* Error... */
224 *dest++ = '?';
225 /* *src may be valid. Don't eat it. */
226 src--;
227 }
228
229 code_high = 0;
230 }
231 else
232 {
233 if (code <= 0x007F)
234 *dest++ = code;
235 else if (code <= 0x07FF)
236 {
237 *dest++ = (code >> 6) | 0xC0;
238 *dest++ = (code & 0x3F) | 0x80;
239 }
240 else if (code >= 0xD800 && code <= 0xDBFF)
241 {
242 code_high = code;
243 continue;
244 }
245 else if (code >= 0xDC00 && code <= 0xDFFF)
246 {
247 /* Error... */
248 *dest++ = '?';
249 }
250 else if (code < 0x10000)
251 {
252 *dest++ = (code >> 12) | 0xE0;
253 *dest++ = ((code >> 6) & 0x3F) | 0x80;
254 *dest++ = (code & 0x3F) | 0x80;
255 }
256 else
257 {
258 *dest++ = (code >> 18) | 0xF0;
259 *dest++ = ((code >> 12) & 0x3F) | 0x80;
260 *dest++ = ((code >> 6) & 0x3F) | 0x80;
261 *dest++ = (code & 0x3F) | 0x80;
262 }
263 }
264 }
265
266 return dest;
267 }
268
269 #define GRUB_MAX_UTF8_PER_LATIN1 2
270
271 /* Convert Latin1 to UTF-8. */
272 static inline grub_uint8_t *
273 grub_latin1_to_utf8 (grub_uint8_t *dest, const grub_uint8_t *src,
274 grub_size_t size)
275 {
276 while (size--)
277 {
278 if (!(*src & 0x80))
279 *dest++ = *src;
280 else
281 {
282 *dest++ = (*src >> 6) | 0xC0;
283 *dest++ = (*src & 0x3F) | 0x80;
284 }
285 src++;
286 }
287
288 return dest;
289 }
290
/* The functions below are only declared here; their implementations are
   not part of this header.  */

291 /* Convert UCS-4 to UTF-8. */
/* NOTE(review): the "_alloc" suffix and the char * return suggest the
   result is newly allocated and owned by the caller -- confirm against
   the implementation.  */
292 char *grub_ucs4_to_utf8_alloc (const grub_uint32_t *src, grub_size_t size);
293
/* NOTE(review): presumably returns non-zero when the SRCSIZE bytes at SRC
   are valid UTF-8 -- confirm against the implementation.  */
294 int
295 grub_is_valid_utf8 (const grub_uint8_t *src, grub_size_t srcsize);
296
/* NOTE(review): presumably converts MSG to a newly allocated UCS-4 string
   stored in *UNICODE_MSG; the meaning of *LAST_POSITION and of the return
   value is not visible here -- confirm against the implementation.  */
297 grub_ssize_t grub_utf8_to_ucs4_alloc (const char *msg,
298 grub_uint32_t **unicode_msg,
299 grub_uint32_t **last_position);
300
301 /* Returns the number of bytes the string src would occupy if converted
302 to UTF-8, excluding \0. */
303 grub_size_t
304 grub_get_num_of_utf8_bytes (const grub_uint32_t *src, grub_size_t size);
305
306 /* Converts UCS-4 to UTF-8. Returns the number of bytes effectively written
307 excluding the trailing \0. */
308 grub_size_t
309 grub_ucs4_to_utf8 (const grub_uint32_t *src, grub_size_t size,
310 grub_uint8_t *dest, grub_size_t destsize);
/* NOTE(review): the parameter list mirrors grub_utf8_to_utf16 in this
   header, so this is presumably its UCS-4 counterpart -- confirm against
   the implementation.  */
311 grub_size_t grub_utf8_to_ucs4 (grub_uint32_t *dest, grub_size_t destsize,
312 const grub_uint8_t *src, grub_size_t srcsize,
313 const grub_uint8_t **srcend);
314 /* Returns -2 if not enough space, -1 on invalid character. */
315 grub_ssize_t
316 grub_encode_utf8_character (grub_uint8_t *dest, grub_uint8_t *destend,
317 grub_uint32_t code);
318
/* NOTE(review): presumably returns the start of the combining sequence
   that CUR belongs to within STR -- confirm against the implementation.  */
319 const grub_uint32_t *
320 grub_unicode_get_comb_start (const grub_uint32_t *str,
321 const grub_uint32_t *cur);
322
323 #endif