]> git.proxmox.com Git - mirror_ubuntu-zesty-kernel.git/blob - ubuntu/vbox/include/iprt/uni.h
UBUNTU: ubuntu: vbox -- update to 5.1.6-dfsg-1
[mirror_ubuntu-zesty-kernel.git] / ubuntu / vbox / include / iprt / uni.h
1 /** @file
2 * IPRT - Unicode Code Points.
3 */
4
5 /*
6 * Copyright (C) 2006-2016 Oracle Corporation
7 *
8 * This file is part of VirtualBox Open Source Edition (OSE), as
9 * available from http://www.virtualbox.org. This file is free software;
10 * you can redistribute it and/or modify it under the terms of the GNU
11 * General Public License (GPL) as published by the Free Software
12 * Foundation, in version 2 as it comes in the "COPYING" file of the
13 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
14 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
15 *
16 * The contents of this file may alternatively be used under the terms
17 * of the Common Development and Distribution License Version 1.0
18 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
19 * VirtualBox OSE distribution, in which case the provisions of the
20 * CDDL are applicable instead of those of the GPL.
21 *
22 * You may elect to license modified versions of this file under the
23 * terms and conditions of either the GPL or the CDDL or both.
24 */
25
26 #ifndef ___iprt_uni_h
27 #define ___iprt_uni_h
28
29 /** @defgroup grp_rt_uni RTUniCp - Unicode Code Points
30 * @ingroup grp_rt
31 * @{
32 */
33
34 /** @def RTUNI_USE_WCTYPE
35 * Define RTUNI_USE_WCTYPE to not use the IPRT unicode data but the
36 * data which the C runtime library provides. */
37 #ifdef DOXYGEN_RUNNING
38 # define RTUNI_USE_WCTYPE
39 #endif
40
41 #include <iprt/types.h>
42 #ifdef RTUNI_USE_WCTYPE
43 # include <wctype.h>
44 #endif
45
46 RT_C_DECLS_BEGIN
47
48
49 #ifndef RTUNI_USE_WCTYPE
50
51 /**
52 * A unicode flags range.
53 * @internal
54 */
55 typedef struct RTUNIFLAGSRANGE
56 {
57 /** The first code point of the range. */
58 RTUNICP BeginCP;
59 /** The last + 1 code point of the range. */
60 RTUNICP EndCP;
61 /** Pointer to the array of case folded code points. */
62 const uint8_t *pafFlags;
63 } RTUNIFLAGSRANGE;
64 /** Pointer to a flags range.
65 * @internal */
66 typedef RTUNIFLAGSRANGE *PRTUNIFLAGSRANGE;
67 /** Pointer to a const flags range.
68 * @internal */
69 typedef const RTUNIFLAGSRANGE *PCRTUNIFLAGSRANGE;
70
71 /**
72 * A unicode case folded range.
73 * @internal
74 */
75 typedef struct RTUNICASERANGE
76 {
77 /** The first code point of the range. */
78 RTUNICP BeginCP;
79 /** The last + 1 code point of the range. */
80 RTUNICP EndCP;
81 /** Pointer to the array of case folded code points. */
82 PCRTUNICP paFoldedCPs;
83 } RTUNICASERANGE;
84 /** Pointer to a case folded range.
85 * @internal */
86 typedef RTUNICASERANGE *PRTUNICASERANGE;
87 /** Pointer to a const case folded range.
88 * @internal */
89 typedef const RTUNICASERANGE *PCRTUNICASERANGE;
90
/** @name Unicode Code Point Flags.
 * Bits of the mask returned by rtUniCpFlags().
 * @internal
 * @{ */
/** Upper case code point (see RTUniCpIsUpper). */
#define RTUNI_UPPER     RT_BIT(0)
/** Lower case code point (see RTUniCpIsLower). */
#define RTUNI_LOWER     RT_BIT(1)
/** Alphabetic code point (see RTUniCpIsAlphabetic). */
#define RTUNI_ALPHA     RT_BIT(2)
/** Hexadecimal digit (see RTUniCpIsHexDigit). */
#define RTUNI_XDIGIT    RT_BIT(3)
/** Decimal digit (see RTUniCpIsDecDigit). */
#define RTUNI_DDIGIT    RT_BIT(4)
/** White space (see RTUniCpIsSpace). */
#define RTUNI_WSPACE    RT_BIT(5)
/*#define RTUNI_BSPACE  RT_BIT(6) - later */
/** When set, the codepoint requires further checking wrt NFC and NFD
 * normalization.  I.e. set when either of QC_NFD and QC_NFC are not Y. */
#define RTUNI_QC_NFX    RT_BIT(7)
/** @} */
105
106
107 /**
108 * Array of flags ranges.
109 * @internal
110 */
111 extern RTDATADECL(const RTUNIFLAGSRANGE) g_aRTUniFlagsRanges[];
112
113 /**
114 * Gets the flags for a unicode code point.
115 *
116 * @returns The flag mask. (RTUNI_*)
117 * @param CodePoint The unicode code point.
118 * @internal
119 */
120 DECLINLINE(RTUNICP) rtUniCpFlags(RTUNICP CodePoint)
121 {
122 PCRTUNIFLAGSRANGE pCur = &g_aRTUniFlagsRanges[0];
123 do
124 {
125 if (pCur->EndCP > CodePoint)
126 {
127 if (pCur->BeginCP <= CodePoint)
128 return pCur->pafFlags[CodePoint - pCur->BeginCP];
129 break;
130 }
131 pCur++;
132 } while (pCur->EndCP != RTUNICP_MAX);
133 return 0;
134 }
135
136
137 /**
138 * Checks if a unicode code point is upper case.
139 *
140 * @returns true if it is.
141 * @returns false if it isn't.
142 * @param CodePoint The code point.
143 */
144 DECLINLINE(bool) RTUniCpIsUpper(RTUNICP CodePoint)
145 {
146 return (rtUniCpFlags(CodePoint) & RTUNI_UPPER) != 0;
147 }
148
149
150 /**
151 * Checks if a unicode code point is lower case.
152 *
153 * @returns true if it is.
154 * @returns false if it isn't.
155 * @param CodePoint The code point.
156 */
157 DECLINLINE(bool) RTUniCpIsLower(RTUNICP CodePoint)
158 {
159 return (rtUniCpFlags(CodePoint) & RTUNI_LOWER) != 0;
160 }
161
162
163 /**
164 * Checks if a unicode code point is case foldable.
165 *
166 * @returns true if it is.
167 * @returns false if it isn't.
168 * @param CodePoint The code point.
169 */
170 DECLINLINE(bool) RTUniCpIsFoldable(RTUNICP CodePoint)
171 {
172 /* Right enough. */
173 return (rtUniCpFlags(CodePoint) & (RTUNI_LOWER | RTUNI_UPPER)) != 0;
174 }
175
176
177 /**
178 * Checks if a unicode code point is alphabetic.
179 *
180 * @returns true if it is.
181 * @returns false if it isn't.
182 * @param CodePoint The code point.
183 */
184 DECLINLINE(bool) RTUniCpIsAlphabetic(RTUNICP CodePoint)
185 {
186 return (rtUniCpFlags(CodePoint) & RTUNI_ALPHA) != 0;
187 }
188
189
190 /**
191 * Checks if a unicode code point is a decimal digit.
192 *
193 * @returns true if it is.
194 * @returns false if it isn't.
195 * @param CodePoint The code point.
196 */
197 DECLINLINE(bool) RTUniCpIsDecDigit(RTUNICP CodePoint)
198 {
199 return (rtUniCpFlags(CodePoint) & RTUNI_DDIGIT) != 0;
200 }
201
202
203 /**
204 * Checks if a unicode code point is a hexadecimal digit.
205 *
206 * @returns true if it is.
207 * @returns false if it isn't.
208 * @param CodePoint The code point.
209 */
210 DECLINLINE(bool) RTUniCpIsHexDigit(RTUNICP CodePoint)
211 {
212 return (rtUniCpFlags(CodePoint) & RTUNI_XDIGIT) != 0;
213 }
214
215
216 /**
217 * Checks if a unicode code point is white space.
218 *
219 * @returns true if it is.
220 * @returns false if it isn't.
221 * @param CodePoint The code point.
222 */
223 DECLINLINE(bool) RTUniCpIsSpace(RTUNICP CodePoint)
224 {
225 return (rtUniCpFlags(CodePoint) & RTUNI_WSPACE) != 0;
226 }
227
228
229
230 /**
231 * Array of uppercase ranges.
232 * @internal
233 */
234 extern RTDATADECL(const RTUNICASERANGE) g_aRTUniUpperRanges[];
235
236 /**
237 * Array of lowercase ranges.
238 * @internal
239 */
240 extern RTDATADECL(const RTUNICASERANGE) g_aRTUniLowerRanges[];
241
242
243 /**
244 * Folds a unicode code point using the specified range array.
245 *
246 * @returns FOlded code point.
247 * @param CodePoint The unicode code point to fold.
248 * @param pCur The case folding range to use.
249 */
250 DECLINLINE(RTUNICP) rtUniCpFold(RTUNICP CodePoint, PCRTUNICASERANGE pCur)
251 {
252 do
253 {
254 if (pCur->EndCP > CodePoint)
255 {
256 if (pCur->BeginCP <= CodePoint)
257 CodePoint = pCur->paFoldedCPs[CodePoint - pCur->BeginCP];
258 break;
259 }
260 pCur++;
261 } while (pCur->EndCP != RTUNICP_MAX);
262 return CodePoint;
263 }
264
265
266 /**
267 * Folds a unicode code point to upper case.
268 *
269 * @returns Folded code point.
270 * @param CodePoint The unicode code point to fold.
271 */
272 DECLINLINE(RTUNICP) RTUniCpToUpper(RTUNICP CodePoint)
273 {
274 return rtUniCpFold(CodePoint, &g_aRTUniUpperRanges[0]);
275 }
276
277
278 /**
279 * Folds a unicode code point to lower case.
280 *
281 * @returns Folded code point.
282 * @param CodePoint The unicode code point to fold.
283 */
284 DECLINLINE(RTUNICP) RTUniCpToLower(RTUNICP CodePoint)
285 {
286 return rtUniCpFold(CodePoint, &g_aRTUniLowerRanges[0]);
287 }
288
289
290 #else /* RTUNI_USE_WCTYPE */
291
292
293 /**
294 * Checks if a unicode code point is upper case.
295 *
296 * @returns true if it is.
297 * @returns false if it isn't.
298 * @param CodePoint The code point.
299 */
300 DECLINLINE(bool) RTUniCpIsUpper(RTUNICP CodePoint)
301 {
302 return !!iswupper(CodePoint);
303 }
304
305
306 /**
307 * Checks if a unicode code point is lower case.
308 *
309 * @returns true if it is.
310 * @returns false if it isn't.
311 * @param CodePoint The code point.
312 */
313 DECLINLINE(bool) RTUniCpIsLower(RTUNICP CodePoint)
314 {
315 return !!iswlower(CodePoint);
316 }
317
318
319 /**
320 * Checks if a unicode code point is case foldable.
321 *
322 * @returns true if it is.
323 * @returns false if it isn't.
324 * @param CodePoint The code point.
325 */
326 DECLINLINE(bool) RTUniCpIsFoldable(RTUNICP CodePoint)
327 {
328 /* Right enough. */
329 return iswupper(CodePoint) || iswlower(CodePoint);
330 }
331
332
333 /**
334 * Checks if a unicode code point is alphabetic.
335 *
336 * @returns true if it is.
337 * @returns false if it isn't.
338 * @param CodePoint The code point.
339 */
340 DECLINLINE(bool) RTUniCpIsAlphabetic(RTUNICP CodePoint)
341 {
342 return !!iswalpha(CodePoint);
343 }
344
345
346 /**
347 * Checks if a unicode code point is a decimal digit.
348 *
349 * @returns true if it is.
350 * @returns false if it isn't.
351 * @param CodePoint The code point.
352 */
353 DECLINLINE(bool) RTUniCpIsDecDigit(RTUNICP CodePoint)
354 {
355 return !!iswdigit(CodePoint);
356 }
357
358
359 /**
360 * Checks if a unicode code point is a hexadecimal digit.
361 *
362 * @returns true if it is.
363 * @returns false if it isn't.
364 * @param CodePoint The code point.
365 */
366 DECLINLINE(bool) RTUniCpIsHexDigit(RTUNICP CodePoint)
367 {
368 return !!iswxdigit(CodePoint);
369 }
370
371
372 /**
373 * Checks if a unicode code point is white space.
374 *
375 * @returns true if it is.
376 * @returns false if it isn't.
377 * @param CodePoint The code point.
378 */
379 DECLINLINE(bool) RTUniCpIsSpace(RTUNICP CodePoint)
380 {
381 return !!iswspace(CodePoint);
382 }
383
384
385 /**
386 * Folds a unicode code point to upper case.
387 *
388 * @returns Folded code point.
389 * @param CodePoint The unicode code point to fold.
390 */
391 DECLINLINE(RTUNICP) RTUniCpToUpper(RTUNICP CodePoint)
392 {
393 return towupper(CodePoint);
394 }
395
396
397 /**
398 * Folds a unicode code point to lower case.
399 *
400 * @returns Folded code point.
401 * @param CodePoint The unicode code point to fold.
402 */
403 DECLINLINE(RTUNICP) RTUniCpToLower(RTUNICP CodePoint)
404 {
405 return towlower(CodePoint);
406 }
407
408
409 #endif /* RTUNI_USE_WCTYPE */
410
411
412 /**
413 * Frees a unicode string.
414 *
415 * @param pusz The string to free.
416 */
417 RTDECL(void) RTUniFree(PRTUNICP pusz);
418
419
420 /**
421 * Checks if a code point valid.
422 *
423 * Any code point (defined or not) within the 17 unicode planes (0 thru 16),
424 * except surrogates will be considered valid code points by this function.
425 *
426 * @returns true if in range, false if not.
427 * @param CodePoint The unicode code point to validate.
428 */
429 DECLINLINE(bool) RTUniCpIsValid(RTUNICP CodePoint)
430 {
431 return CodePoint <= 0x00d7ff
432 || ( CodePoint <= 0x10ffff
433 && CodePoint >= 0x00e000);
434 }
435
436
437 /**
438 * Checks if the given code point is in the BMP range.
439 *
440 * Surrogates are not considered in the BMP range by this function.
441 *
442 * @returns true if in BMP, false if not.
443 * @param CodePoint The unicode code point to consider.
444 */
445 DECLINLINE(bool) RTUniCpIsBMP(RTUNICP CodePoint)
446 {
447 return CodePoint <= 0xd7ff
448 || ( CodePoint <= 0xffff
449 && CodePoint >= 0xe000);
450 }
451
452
453 /**
454 * Folds a unicode code point to lower case.
455 *
456 * @returns Folded code point.
457 * @param CodePoint The unicode code point to fold.
458 */
459 DECLINLINE(size_t) RTUniCpCalcUtf8Len(RTUNICP CodePoint)
460 {
461 if (CodePoint < 0x80)
462 return 1;
463 return 2
464 + (CodePoint >= 0x00000800)
465 + (CodePoint >= 0x00010000)
466 + (CodePoint >= 0x00200000)
467 + (CodePoint >= 0x04000000)
468 + (CodePoint >= 0x80000000) /* illegal */;
469 }
470
471
472
473 RT_C_DECLS_END
474 /** @} */
475
476
477 #endif
478