]>
Commit | Line | Data |
---|---|---|
c25aa7cd PP |
1 | /* |
2 | * Copyright (C) the libgit2 contributors. All rights reserved. | |
3 | * | |
4 | * This file is part of libgit2, distributed under the GNU GPL v2 with | |
5 | * a Linking Exception. For full terms see the included COPYING file. | |
6 | */ | |
7 | ||
8 | #include "utf8.h" | |
9 | ||
10 | #include "common.h" | |
11 | ||
12 | /* | |
13 | * git_utf8_iterate is taken from the utf8proc project, | |
14 | * http://www.public-software-group.org/utf8proc | |
15 | * | |
16 | * Copyright (c) 2009 Public Software Group e. V., Berlin, Germany | |
17 | * | |
18 | * Permission is hereby granted, free of charge, to any person obtaining a | |
19 | * copy of this software and associated documentation files (the "Software"), | |
20 | * to deal in the Software without restriction, including without limitation | |
21 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, | |
22 | * and/or sell copies of the Software, and to permit persons to whom the | |
23 | * Software is furnished to do so, subject to the following conditions: | |
24 | * | |
25 | * The above copyright notice and this permission notice shall be included in | |
26 | * all copies or substantial portions of the Software. | |
27 | * | |
28 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
29 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
30 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
31 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
32 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | |
33 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | |
34 | * DEALINGS IN THE SOFTWARE. | |
35 | */ | |
36 | ||
/*
 * Classification of every possible byte value when it appears as the
 * first byte of a UTF-8 sequence.  The entry is the total number of
 * bytes the sequence occupies (1-4); an entry of 0 marks a byte that
 * cannot begin a sequence.
 */
static const uint8_t utf8proc_utf8class[256] = {
	/* 0x00 - 0x7F: single-byte sequences */
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

	/* 0x80 - 0xBF: continuation bytes, never a valid first byte */
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

	/* 0xC0 - 0xDF: lead byte of a two-byte sequence */
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

	/* 0xE0 - 0xEF: lead byte of a three-byte sequence */
	3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,

	/* 0xF0 - 0xF7: lead byte of a four-byte sequence; 0xF8 - 0xFF: invalid */
	4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0
};
55 | ||
56 | static int utf8_charlen(const uint8_t *str, size_t str_len) | |
57 | { | |
58 | uint8_t length; | |
59 | size_t i; | |
60 | ||
61 | length = utf8proc_utf8class[str[0]]; | |
62 | if (!length) | |
63 | return -1; | |
64 | ||
65 | if (str_len > 0 && length > str_len) | |
66 | return -1; | |
67 | ||
68 | for (i = 1; i < length; i++) { | |
69 | if ((str[i] & 0xC0) != 0x80) | |
70 | return -1; | |
71 | } | |
72 | ||
73 | return (int)length; | |
74 | } | |
75 | ||
/*
 * Decode the single UTF-8 encoded codepoint located at `_str`.
 *
 * @param out      receives the decoded codepoint on success; it is set
 *                 to 0 whenever the function returns -1
 * @param _str     first byte of the encoded codepoint
 * @param str_len  number of bytes available, forwarded unchanged to
 *                 utf8_charlen()
 * @return the number of bytes the codepoint occupies (1-4), or -1 when
 *         the bytes are malformed or decode to a rejected value.
 *
 * Rejected values are: overlong encodings (a result below 0x80, 0x800
 * or 0x10000 for two-, three- and four-byte forms respectively), the
 * surrogate range 0xD800-0xDFFF, the range 0xFDD0-0xFDEF, anything at
 * or above 0x110000, and any value whose low 16 bits are 0xFFFE or
 * 0xFFFF.
 */
int git_utf8_iterate(uint32_t *out, const char *_str, size_t str_len)
{
	const uint8_t *bytes = (const uint8_t *)_str;
	uint32_t codepoint;
	int nbytes;

	/* Failure paths below leave the output zeroed. */
	*out = 0;

	nbytes = utf8_charlen(bytes, str_len);
	if (nbytes < 0)
		return -1;

	if (nbytes == 1) {
		codepoint = bytes[0];
	} else if (nbytes == 2) {
		codepoint = ((uint32_t)(bytes[0] & 0x1F) << 6) |
			(uint32_t)(bytes[1] & 0x3F);

		if (codepoint < 0x80)
			return -1;
	} else if (nbytes == 3) {
		codepoint = ((uint32_t)(bytes[0] & 0x0F) << 12) |
			((uint32_t)(bytes[1] & 0x3F) << 6) |
			(uint32_t)(bytes[2] & 0x3F);

		if (codepoint < 0x800)
			return -1;
		if (codepoint >= 0xD800 && codepoint < 0xE000)
			return -1;
		if (codepoint >= 0xFDD0 && codepoint < 0xFDF0)
			return -1;
	} else if (nbytes == 4) {
		codepoint = ((uint32_t)(bytes[0] & 0x07) << 18) |
			((uint32_t)(bytes[1] & 0x3F) << 12) |
			((uint32_t)(bytes[2] & 0x3F) << 6) |
			(uint32_t)(bytes[3] & 0x3F);

		if (codepoint < 0x10000 || codepoint >= 0x110000)
			return -1;
	} else {
		return -1;
	}

	/* Values ending in 0xFFFE / 0xFFFF are refused in every plane. */
	if ((codepoint & 0xFFFF) >= 0xFFFE)
		return -1;

	*out = codepoint;
	return nbytes;
}
116 | ||
/*
 * Count the characters in the first `str_len` bytes of `_str`.
 *
 * Each sequence accepted by utf8_charlen() counts as one character.  A
 * byte at which utf8_charlen() fails is counted as a single character
 * of its own and scanning resumes at the following byte.
 *
 * @param _str     buffer to scan
 * @param str_len  number of bytes in the buffer
 * @return the number of characters counted; 0 when `str_len` is 0
 */
size_t git_utf8_char_length(const char *_str, size_t str_len)
{
	const uint8_t *bytes = (const uint8_t *)_str;
	size_t pos, chars;

	for (pos = 0, chars = 0; pos < str_len; chars++) {
		int step = utf8_charlen(bytes + pos, str_len - pos);

		/* Malformed input advances by exactly one byte. */
		pos += (step < 0) ? 1 : (size_t)step;
	}

	return chars;
}
134 | ||
/*
 * Measure how many leading bytes of `_str` consist solely of sequences
 * that utf8_charlen() accepts.
 *
 * Scanning stops at the first position where utf8_charlen() reports an
 * error, or once `str_len` bytes have been consumed.
 *
 * @param _str     buffer to scan
 * @param str_len  number of bytes in the buffer
 * @return the byte offset at which scanning stopped; this equals
 *         `str_len` when every sequence was accepted
 */
size_t git_utf8_valid_buf_length(const char *_str, size_t str_len)
{
	const uint8_t *bytes = (const uint8_t *)_str;
	size_t valid = 0;
	int step;

	for (; valid < str_len; valid += (size_t)step) {
		step = utf8_charlen(bytes + valid, str_len - valid);

		if (step < 0)
			return valid;
	}

	return valid;
}