]> git.proxmox.com Git - libgit2.git/blob - src/utf8.c
New upstream version 1.3.0+dfsg.1
[libgit2.git] / src / utf8.c
1 /*
2 * Copyright (C) the libgit2 contributors. All rights reserved.
3 *
4 * This file is part of libgit2, distributed under the GNU GPL v2 with
5 * a Linking Exception. For full terms see the included COPYING file.
6 */
7
8 #include "utf8.h"
9
10 #include "common.h"
11
12 /*
13 * git_utf8_iterate is taken from the utf8proc project,
14 * http://www.public-software-group.org/utf8proc
15 *
16 * Copyright (c) 2009 Public Software Group e. V., Berlin, Germany
17 *
18 * Permission is hereby granted, free of charge, to any person obtaining a
19 * copy of this software and associated documentation files (the "Software"),
20 * to deal in the Software without restriction, including without limitation
21 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
22 * and/or sell copies of the Software, and to permit persons to whom the
23 * Software is furnished to do so, subject to the following conditions:
24 *
25 * The above copyright notice and this permission notice shall be included in
26 * all copies or substantial portions of the Software.
27 *
28 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
29 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
30 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
31 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
32 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
33 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
34 * DEALINGS IN THE SOFTWARE.
35 */
36
/*
 * Lookup table indexed by the first byte of a UTF-8 sequence, giving the
 * total number of bytes in that sequence. An entry of 0 marks a byte that
 * cannot start a sequence.
 *
 *   0x00-0x7F  -> 1
 *   0x80-0xBF  -> 0
 *   0xC0-0xDF  -> 2
 *   0xE0-0xEF  -> 3
 *   0xF0-0xF7  -> 4
 *   0xF8-0xFF  -> 0
 */
#define UTF8_CLASS_X16(c) \
	c, c, c, c, c, c, c, c, c, c, c, c, c, c, c, c

static const uint8_t utf8proc_utf8class[256] = {
	/* 0x00-0x7F */
	UTF8_CLASS_X16(1), UTF8_CLASS_X16(1),
	UTF8_CLASS_X16(1), UTF8_CLASS_X16(1),
	UTF8_CLASS_X16(1), UTF8_CLASS_X16(1),
	UTF8_CLASS_X16(1), UTF8_CLASS_X16(1),
	/* 0x80-0xBF */
	UTF8_CLASS_X16(0), UTF8_CLASS_X16(0),
	UTF8_CLASS_X16(0), UTF8_CLASS_X16(0),
	/* 0xC0-0xDF */
	UTF8_CLASS_X16(2), UTF8_CLASS_X16(2),
	/* 0xE0-0xEF */
	UTF8_CLASS_X16(3),
	/* 0xF0-0xF7, then 0xF8-0xFF */
	4, 4, 4, 4, 4, 4, 4, 4,
	0, 0, 0, 0, 0, 0, 0, 0
};

#undef UTF8_CLASS_X16
55
56 static int utf8_charlen(const uint8_t *str, size_t str_len)
57 {
58 uint8_t length;
59 size_t i;
60
61 length = utf8proc_utf8class[str[0]];
62 if (!length)
63 return -1;
64
65 if (str_len > 0 && length > str_len)
66 return -1;
67
68 for (i = 1; i < length; i++) {
69 if ((str[i] & 0xC0) != 0x80)
70 return -1;
71 }
72
73 return (int)length;
74 }
75
/*
 * Decode the UTF-8 sequence at the start of `_str` into a single codepoint.
 *
 * `*out` is reset to 0 up front and only overwritten on success.
 *
 * Returns the number of bytes the sequence occupies, or -1 when the
 * sequence is structurally invalid (per utf8_charlen), is an overlong
 * encoding, falls in 0xD800-0xDFFF or 0xFDD0-0xFDEF, is 0x110000 or above,
 * or has 0xFFFE / 0xFFFF as its low 16 bits.
 */
int git_utf8_iterate(uint32_t *out, const char *_str, size_t str_len)
{
	const uint8_t *bytes = (const uint8_t *)_str;
	uint32_t codepoint = 0;
	int nbytes;

	*out = 0;

	nbytes = utf8_charlen(bytes, str_len);
	if (nbytes < 0)
		return -1;

	switch (nbytes) {
	case 1:
		codepoint = bytes[0];
		break;

	case 2:
		codepoint = ((bytes[0] & 0x1F) << 6) | (bytes[1] & 0x3F);

		/* Overlong: value fits in a single byte. */
		if (codepoint < 0x80)
			return -1;
		break;

	case 3:
		codepoint = ((bytes[0] & 0x0F) << 12) |
			((bytes[1] & 0x3F) << 6) |
			(bytes[2] & 0x3F);

		/* Overlong: value fits in two bytes. */
		if (codepoint < 0x800)
			return -1;
		/* 0xD800-0xDFFF is rejected. */
		if (codepoint >= 0xD800 && codepoint < 0xE000)
			return -1;
		/* 0xFDD0-0xFDEF is rejected. */
		if (codepoint >= 0xFDD0 && codepoint < 0xFDF0)
			return -1;
		break;

	case 4:
		codepoint = ((bytes[0] & 0x07) << 18) |
			((bytes[1] & 0x3F) << 12) |
			((bytes[2] & 0x3F) << 6) |
			(bytes[3] & 0x3F);

		/* Overlong, or at/above 0x110000. */
		if (codepoint < 0x10000 || codepoint >= 0x110000)
			return -1;
		break;

	default:
		return -1;
	}

	/* Reject any value whose low 16 bits are 0xFFFE or 0xFFFF. */
	if ((codepoint & 0xFFFF) >= 0xFFFE)
		return -1;

	*out = codepoint;
	return nbytes;
}
116
/*
 * Count the characters in the first `str_len` bytes of `_str`.
 *
 * Each sequence accepted by utf8_charlen counts as one character; a byte
 * that does not begin an acceptable sequence is counted as one character
 * on its own and scanning resumes at the next byte.
 */
size_t git_utf8_char_length(const char *_str, size_t str_len)
{
	const uint8_t *bytes = (const uint8_t *)_str;
	size_t pos, nchars = 0;

	for (pos = 0; pos < str_len; nchars++) {
		int step = utf8_charlen(bytes + pos, str_len - pos);

		/* A rejected byte still advances the scan by one. */
		pos += (step < 0) ? 1 : (size_t)step;
	}

	return nchars;
}
134
/*
 * Return the length in bytes of the leading part of `_str` (at most
 * `str_len` bytes) that consists solely of sequences accepted by
 * utf8_charlen. Scanning stops at the first rejected sequence, so the
 * result equals `str_len` when the whole buffer is accepted.
 */
size_t git_utf8_valid_buf_length(const char *_str, size_t str_len)
{
	const uint8_t *bytes = (const uint8_t *)_str;
	size_t valid = 0;
	int step;

	while (valid < str_len &&
	       (step = utf8_charlen(bytes + valid, str_len - valid)) >= 0) {
		valid += (size_t)step;
	}

	return valid;
}