2 * GRUB -- GRand Unified Bootloader
3 * Copyright (C) 1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009 Free Software Foundation, Inc.
5 * GRUB is free software: you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation, either version 3 of the License, or
8 * (at your option) any later version.
10 * GRUB is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with GRUB. If not, see <http://www.gnu.org/licenses/>.
19 #ifndef GRUB_CHARSET_HEADER
20 #define GRUB_CHARSET_HEADER 1
22 #include <grub/types.h>
/* Byte masks selecting the N most-significant bits of an 8-bit value
   (0x80 = top bit, 0xc0 = top two bits, ... 0xfe = top seven bits).
   The UTF-8 helpers in this header AND a byte with one of these masks and
   compare the result against the next-shorter mask to classify the byte
   by its prefix bits.  */
24 #define GRUB_UINT8_1_LEADINGBIT 0x80
25 #define GRUB_UINT8_2_LEADINGBITS 0xc0
26 #define GRUB_UINT8_3_LEADINGBITS 0xe0
27 #define GRUB_UINT8_4_LEADINGBITS 0xf0
28 #define GRUB_UINT8_5_LEADINGBITS 0xf8
29 #define GRUB_UINT8_6_LEADINGBITS 0xfc
30 #define GRUB_UINT8_7_LEADINGBITS 0xfe
/* Byte masks selecting the N least-significant bits of an 8-bit value
   (0x01 = lowest bit, ... 0x3f = lowest six bits).  The UTF-8 helpers in
   this header use them to extract the payload bits of a byte once its
   prefix has been recognised.  */
32 #define GRUB_UINT8_1_TRAILINGBIT 0x01
33 #define GRUB_UINT8_2_TRAILINGBITS 0x03
34 #define GRUB_UINT8_3_TRAILINGBITS 0x07
35 #define GRUB_UINT8_4_TRAILINGBITS 0x0f
36 #define GRUB_UINT8_5_TRAILINGBITS 0x1f
37 #define GRUB_UINT8_6_TRAILINGBITS 0x3f
39 #define GRUB_MAX_UTF8_PER_UTF16 4
40 /* You need at least one UTF-8 byte to have one UTF-16 word.
41 You need four UTF-8 bytes to have 2 UTF-16 words (surrogate pairs). */
43 #define GRUB_MAX_UTF16_PER_UTF8 1
44 #define GRUB_MAX_UTF8_PER_CODEPOINT 4
/* First code point that no longer fits in a single 16-bit unit; code
   points at or above this value are emitted as a surrogate pair.  */
46 #define GRUB_UCS2_LIMIT 0x10000
/* First (upper) word of the surrogate pair for CODE: 0xD800 combined with
   bits 19..10 of (CODE - GRUB_UCS2_LIMIT).  The visible caller only uses
   this when CODE >= GRUB_UCS2_LIMIT.  */
47 #define GRUB_UTF16_UPPER_SURROGATE(code) \
48 (0xD800 | ((((code) - GRUB_UCS2_LIMIT) >> 10) & 0x3ff))
/* Second (lower) word of the surrogate pair for CODE: 0xDC00 combined with
   bits 9..0 of (CODE - GRUB_UCS2_LIMIT).  */
49 #define GRUB_UTF16_LOWER_SURROGATE(code) \
50 (0xDC00 | (((code) - GRUB_UCS2_LIMIT) & 0x3ff))
52 /* Process one byte C of a UTF-8 sequence.
53 Before the first byte set *code = 0 and *count = 0. Returns 0 on failure and
54 1 on success. *count holds the number of trailing bytes. */
56 grub_utf8_process (grub_uint8_t c
, grub_uint32_t
*code
, int *count
)
60 if ((c
& GRUB_UINT8_2_LEADINGBITS
) != GRUB_UINT8_1_LEADINGBIT
)
69 *code
|= (c
& GRUB_UINT8_6_TRAILINGBITS
);
72 if ((*count
== 1 && *code
<= 0x1f)
73 || (*count
== 2 && *code
<= 0xf))
83 if ((c
& GRUB_UINT8_1_LEADINGBIT
) == 0)
88 if ((c
& GRUB_UINT8_3_LEADINGBITS
) == GRUB_UINT8_2_LEADINGBITS
)
91 *code
= c
& GRUB_UINT8_5_TRAILINGBITS
;
101 if ((c
& GRUB_UINT8_4_LEADINGBITS
) == GRUB_UINT8_3_LEADINGBITS
)
104 *code
= c
& GRUB_UINT8_4_TRAILINGBITS
;
107 if ((c
& GRUB_UINT8_5_LEADINGBITS
) == GRUB_UINT8_4_LEADINGBITS
)
110 *code
= c
& GRUB_UINT8_3_TRAILINGBITS
;
117 /* Convert a (possibly null-terminated) UTF-8 string of at most SRCSIZE
118 bytes (if SRCSIZE is -1, it is ignored) in length to a UTF-16 string.
119 Return the number of characters converted. DEST must be able to hold
120 at least DESTSIZE characters. If an invalid sequence is found, return -1.
121 If SRCEND is not NULL, then *SRCEND is set to the next byte after the
122 last byte used in SRC. */
123 static inline grub_size_t
124 grub_utf8_to_utf16 (grub_uint16_t
*dest
, grub_size_t destsize
,
125 const grub_uint8_t
*src
, grub_size_t srcsize
,
126 const grub_uint8_t
**srcend
)
128 grub_uint16_t
*p
= dest
;
130 grub_uint32_t code
= 0;
135 while (srcsize
&& destsize
)
137 int was_count
= count
;
138 if (srcsize
!= (grub_size_t
)-1)
140 if (!grub_utf8_process (*src
++, &code
, &count
))
144 /* Character c may be valid, don't eat it. */
152 if (destsize
< 2 && code
>= GRUB_UCS2_LIMIT
)
154 if (code
>= GRUB_UCS2_LIMIT
)
156 *p
++ = GRUB_UTF16_UPPER_SURROGATE (code
);
157 *p
++ = GRUB_UTF16_LOWER_SURROGATE (code
);
172 /* Determine the last position where the UTF-8 string [beg, end) can be safely cut. */
174 static inline grub_size_t
175 grub_getend (const char *beg
, const char *end
)
178 for (ptr
= end
- 1; ptr
>= beg
; ptr
--)
179 if ((*ptr
& GRUB_UINT8_2_LEADINGBITS
) != GRUB_UINT8_1_LEADINGBIT
)
183 if ((*ptr
& GRUB_UINT8_1_LEADINGBIT
) == 0)
184 return ptr
+ 1 - beg
;
185 if ((*ptr
& GRUB_UINT8_3_LEADINGBITS
) == GRUB_UINT8_2_LEADINGBITS
187 return ptr
+ 2 - beg
;
188 if ((*ptr
& GRUB_UINT8_4_LEADINGBITS
) == GRUB_UINT8_3_LEADINGBITS
190 return ptr
+ 3 - beg
;
191 if ((*ptr
& GRUB_UINT8_5_LEADINGBITS
) == GRUB_UINT8_4_LEADINGBITS
193 return ptr
+ 4 - beg
;
194 /* Invalid character or incomplete. Cut before it. */
198 /* Convert UTF-16 to UTF-8. */
199 static inline grub_uint8_t
*
200 grub_utf16_to_utf8 (grub_uint8_t
*dest
, const grub_uint16_t
*src
,
203 grub_uint32_t code_high
= 0;
207 grub_uint32_t code
= *src
++;
211 if (code
>= 0xDC00 && code
<= 0xDFFF)
213 /* Surrogate pair. */
214 code
= ((code_high
- 0xD800) << 10) + (code
- 0xDC00) + 0x10000;
216 *dest
++ = (code
>> 18) | 0xF0;
217 *dest
++ = ((code
>> 12) & 0x3F) | 0x80;
218 *dest
++ = ((code
>> 6) & 0x3F) | 0x80;
219 *dest
++ = (code
& 0x3F) | 0x80;
225 /* *src may be valid. Don't eat it. */
235 else if (code
<= 0x07FF)
237 *dest
++ = (code
>> 6) | 0xC0;
238 *dest
++ = (code
& 0x3F) | 0x80;
240 else if (code
>= 0xD800 && code
<= 0xDBFF)
245 else if (code
>= 0xDC00 && code
<= 0xDFFF)
250 else if (code
< 0x10000)
252 *dest
++ = (code
>> 12) | 0xE0;
253 *dest
++ = ((code
>> 6) & 0x3F) | 0x80;
254 *dest
++ = (code
& 0x3F) | 0x80;
258 *dest
++ = (code
>> 18) | 0xF0;
259 *dest
++ = ((code
>> 12) & 0x3F) | 0x80;
260 *dest
++ = ((code
>> 6) & 0x3F) | 0x80;
261 *dest
++ = (code
& 0x3F) | 0x80;
/* Worst-case number of UTF-8 bytes per Latin-1 input byte: the two-byte
   branch of grub_latin1_to_utf8 below writes a 0xC0-prefixed lead byte
   followed by one 0x80-prefixed continuation byte.  */
269 #define GRUB_MAX_UTF8_PER_LATIN1 2
271 /* Convert Latin1 to UTF-8. */
272 static inline grub_uint8_t
*
273 grub_latin1_to_utf8 (grub_uint8_t
*dest
, const grub_uint8_t
*src
,
282 *dest
++ = (*src
>> 6) | 0xC0;
283 *dest
++ = (*src
& 0x3F) | 0x80;
291 /* Convert UCS-4 to UTF-8.
   SRC is the UCS-4 input; presumably SIZE is its length in code points
   (not bytes) -- TODO confirm.
   NOTE(review): the "_alloc" suffix and the char * return suggest the
   result is a newly allocated string that the caller must free; verify
   against the definition, which is not in this header.  */
292 char *grub_ucs4_to_utf8_alloc (const grub_uint32_t
*src
, grub_size_t size
);
295 grub_is_valid_utf8 (const grub_uint8_t
*src
, grub_size_t srcsize
);
/* Convert the UTF-8 string MSG to UCS-4.
   NOTE(review): presumably stores a newly allocated buffer in *UNICODE_MSG
   and a pointer related to the end of the converted data in
   *LAST_POSITION; the signed return type suggests a length on success and
   a negative value on error.  None of this is visible in this header --
   confirm against the definition.  */
297 grub_ssize_t
grub_utf8_to_ucs4_alloc (const char *msg
,
298 grub_uint32_t
**unicode_msg
,
299 grub_uint32_t
**last_position
);
301 /* Returns the number of bytes the string src would occupy if converted
302 to UTF-8, excluding \0. */
304 grub_get_num_of_utf8_bytes (const grub_uint32_t
*src
, grub_size_t size
);
306 /* Converts UCS-4 to UTF-8. Returns the number of bytes actually written,
307 excluding the trailing \0. */
309 grub_ucs4_to_utf8 (const grub_uint32_t
*src
, grub_size_t size
,
310 grub_uint8_t
*dest
, grub_size_t destsize
);
/* Convert UTF-8 to UCS-4.  The parameter list matches grub_utf8_to_utf16
   in this header except that DEST holds 32-bit code points.
   NOTE(review): presumably follows the same contract (at most SRCSIZE
   source bytes, at most DESTSIZE output elements, *SRCEND set to the
   first unused source byte when SRCEND is not NULL) -- confirm against
   the definition.  */
311 grub_size_t
grub_utf8_to_ucs4 (grub_uint32_t
*dest
, grub_size_t destsize
,
312 const grub_uint8_t
*src
, grub_size_t srcsize
,
313 const grub_uint8_t
**srcend
);
314 /* Returns -2 if not enough space, -1 on invalid character. */
316 grub_encode_utf8_character (grub_uint8_t
*dest
, grub_uint8_t
*destend
,
/* NOTE(review): judging by the name and signature only, this presumably
   returns a pointer into the UCS-4 string STR marking the start of the
   combining sequence that contains CUR.  The definition is not in this
   header -- confirm before relying on it.  */
319 const grub_uint32_t
*
320 grub_unicode_get_comb_start (const grub_uint32_t
*str
,
321 const grub_uint32_t
*cur
);