]>
Commit | Line | Data |
---|---|---|
1da177e4 LT |
1 | /* |
2 | * unicode.c | |
3 | * | |
4 | * PURPOSE | |
5 | * Routines for converting between UTF-8 and OSTA Compressed Unicode. | |
6 | * Also handles filename mangling | |
7 | * | |
8 | * DESCRIPTION | |
9 | * OSTA Compressed Unicode is explained in the OSTA UDF specification. | |
10 | * http://www.osta.org/ | |
11 | * UTF-8 is explained in IETF RFC 3629. | |
12 | * https://www.rfc-editor.org/rfc/rfc3629.txt | |
13 | * | |
1da177e4 LT |
14 | * COPYRIGHT |
15 | * This file is distributed under the terms of the GNU General Public | |
16 | * License (GPL). Copies of the GPL can be obtained from: | |
17 | * ftp://prep.ai.mit.edu/pub/gnu/GPL | |
18 | * Each contributing author retains all rights to their own work. | |
19 | */ | |
20 | ||
21 | #include "udfdecl.h" | |
22 | ||
23 | #include <linux/kernel.h> | |
24 | #include <linux/string.h> /* for memset */ | |
25 | #include <linux/nls.h> | |
f845fced | 26 | #include <linux/crc-itu-t.h> |
5a0e3ad6 | 27 | #include <linux/slab.h> |
1da177e4 LT |
28 | |
29 | #include "udf_sb.h" | |
30 | ||
44f06ba8 JK |
#define SURROGATE_MASK 0xfffff800
#define SURROGATE_PAIR 0x0000d800

/*
 * Encode a single Unicode value as UTF-8.
 *
 * @uni:      character to encode
 * @out:      destination buffer
 * @boundlen: number of bytes available at @out
 *
 * Returns the number of bytes stored in @out (1, 2 or 3), -ENAMETOOLONG
 * when the encoding does not fit into @boundlen bytes, or -EINVAL when
 * @uni lies in the surrogate range 0xd800-0xdfff.
 */
static int udf_uni2char_utf8(wchar_t uni,
			     unsigned char *out,
			     int boundlen)
{
	int need;

	if (boundlen <= 0)
		return -ENAMETOOLONG;

	/* Surrogate halves are not characters on their own */
	if ((uni & SURROGATE_MASK) == SURROGATE_PAIR)
		return -EINVAL;

	/* Work out the encoded size first, then check it against the room */
	if (uni < 0x80)
		need = 1;
	else if (uni < 0x800)
		need = 2;
	else
		need = 3;

	if (boundlen < need)
		return -ENAMETOOLONG;

	switch (need) {
	case 1:
		out[0] = (unsigned char)uni;
		break;
	case 2:
		out[0] = (unsigned char)(0xc0 | (uni >> 6));
		out[1] = (unsigned char)(0x80 | (uni & 0x3f));
		break;
	default:
		out[0] = (unsigned char)(0xe0 | (uni >> 12));
		out[1] = (unsigned char)(0x80 | ((uni >> 6) & 0x3f));
		out[2] = (unsigned char)(0x80 | (uni & 0x3f));
		break;
	}
	return need;
}
62 | ||
3e7fc205 AG |
/*
 * Decode one UTF-8 sequence from the start of @in.
 *
 * @in:       input bytes
 * @boundlen: number of bytes available at @in
 * @uni:      where the decoded character is stored
 *
 * Returns the number of input bytes consumed.  On a malformed sequence
 * (invalid lead byte, a continuation byte that is not of the form
 * 10xxxxxx, or input that ends in the middle of a sequence) '?' is stored
 * in *@uni and -EINVAL is returned.  When @boundlen is not positive,
 * 0 is returned and *@uni is left untouched.
 *
 * A well-formed sequence that decodes to a value above 0xffff is consumed
 * in full but reported as '?': the CS0 form written by udf_name_to_CS0()
 * holds at most 16 bits per character, so such a value could only be
 * stored truncated.
 */
static int udf_char2uni_utf8(const unsigned char *in,
			     int boundlen,
			     wchar_t *uni)
{
	unsigned int utf_char;
	unsigned char c;
	int utf_cnt, u_len;

	utf_char = 0;
	utf_cnt = 0;	/* continuation bytes still expected; -1 on error */
	for (u_len = 0; u_len < boundlen;) {
		c = in[u_len++];

		/* Complete a multi-byte UTF-8 character */
		if (utf_cnt) {
			/* Anything but 10xxxxxx cannot continue a sequence */
			if ((c & 0xc0) != 0x80) {
				utf_cnt = -1;
				break;
			}
			utf_char = (utf_char << 6) | (c & 0x3f);
			if (--utf_cnt)
				continue;
		} else {
			/* Check for a multi-byte UTF-8 character */
			if (c & 0x80) {
				/* Start a multi-byte UTF-8 character */
				if ((c & 0xe0) == 0xc0) {
					utf_char = c & 0x1f;
					utf_cnt = 1;
				} else if ((c & 0xf0) == 0xe0) {
					utf_char = c & 0x0f;
					utf_cnt = 2;
				} else if ((c & 0xf8) == 0xf0) {
					utf_char = c & 0x07;
					utf_cnt = 3;
				} else if ((c & 0xfc) == 0xf8) {
					utf_char = c & 0x03;
					utf_cnt = 4;
				} else if ((c & 0xfe) == 0xfc) {
					utf_char = c & 0x01;
					utf_cnt = 5;
				} else {
					/* Stray continuation byte or 0xfe/0xff */
					utf_cnt = -1;
					break;
				}
				continue;
			} else {
				/* Single byte UTF-8 character (most common) */
				utf_char = c;
			}
		}

		/* Not representable in 16 bits: substitute instead of truncating */
		if (utf_char > 0xffff)
			utf_char = '?';
		*uni = utf_char;
		break;
	}
	/* Non-zero here means a bad byte or a sequence cut short */
	if (utf_cnt) {
		*uni = '?';
		return -EINVAL;
	}
	return u_len;
}
119 | ||
484a10f4 AG |
120 | #define ILLEGAL_CHAR_MARK '_' |
121 | #define EXT_MARK '.' | |
122 | #define CRC_MARK '#' | |
123 | #define EXT_SIZE 5 | |
124 | /* Number of chars we need to store generated CRC to make filename unique */ | |
125 | #define CRC_LEN 5 | |
126 | ||
127 | static int udf_name_conv_char(uint8_t *str_o, int str_o_max_len, | |
128 | int *str_o_idx, | |
129 | const uint8_t *str_i, int str_i_max_len, | |
130 | int *str_i_idx, | |
131 | int u_ch, int *needsCRC, | |
132 | int (*conv_f)(wchar_t, unsigned char *, int), | |
133 | int translate) | |
134 | { | |
135 | uint32_t c; | |
136 | int illChar = 0; | |
137 | int len, gotch = 0; | |
138 | ||
139 | for (; (!gotch) && (*str_i_idx < str_i_max_len); *str_i_idx += u_ch) { | |
140 | if (*str_o_idx >= str_o_max_len) { | |
141 | *needsCRC = 1; | |
142 | return gotch; | |
143 | } | |
144 | ||
145 | /* Expand OSTA compressed Unicode to Unicode */ | |
146 | c = str_i[*str_i_idx]; | |
147 | if (u_ch > 1) | |
148 | c = (c << 8) | str_i[*str_i_idx + 1]; | |
149 | ||
150 | if (translate && (c == '/' || c == 0)) | |
151 | illChar = 1; | |
152 | else if (illChar) | |
153 | break; | |
154 | else | |
155 | gotch = 1; | |
156 | } | |
157 | if (illChar) { | |
158 | *needsCRC = 1; | |
159 | c = ILLEGAL_CHAR_MARK; | |
160 | gotch = 1; | |
161 | } | |
162 | if (gotch) { | |
163 | len = conv_f(c, &str_o[*str_o_idx], str_o_max_len - *str_o_idx); | |
164 | /* Valid character? */ | |
165 | if (len >= 0) | |
166 | *str_o_idx += len; | |
167 | else if (len == -ENAMETOOLONG) { | |
168 | *needsCRC = 1; | |
169 | gotch = 0; | |
170 | } else { | |
171 | str_o[(*str_o_idx)++] = '?'; | |
172 | *needsCRC = 1; | |
173 | } | |
174 | } | |
175 | return gotch; | |
176 | } | |
177 | ||
9293fcfb AG |
178 | static int udf_name_from_CS0(uint8_t *str_o, int str_max_len, |
179 | const uint8_t *ocu, int ocu_len, | |
484a10f4 AG |
180 | int (*conv_f)(wchar_t, unsigned char *, int), |
181 | int translate) | |
1da177e4 | 182 | { |
484a10f4 | 183 | uint32_t c; |
9293fcfb | 184 | uint8_t cmp_id; |
484a10f4 AG |
185 | int idx, len; |
186 | int u_ch; | |
187 | int needsCRC = 0; | |
188 | int ext_i_len, ext_max_len; | |
189 | int str_o_len = 0; /* Length of resulting output */ | |
190 | int ext_o_len = 0; /* Extension output length */ | |
191 | int ext_crc_len = 0; /* Extension output length if used with CRC */ | |
192 | int i_ext = -1; /* Extension position in input buffer */ | |
193 | int o_crc = 0; /* Rightmost possible output pos for CRC+ext */ | |
194 | unsigned short valueCRC; | |
195 | uint8_t ext[EXT_SIZE * NLS_MAX_CHARSET_SIZE + 1]; | |
196 | uint8_t crc[CRC_LEN]; | |
1da177e4 | 197 | |
9293fcfb AG |
198 | if (str_max_len <= 0) |
199 | return 0; | |
1da177e4 | 200 | |
cb00ea35 | 201 | if (ocu_len == 0) { |
9293fcfb | 202 | memset(str_o, 0, str_max_len); |
1da177e4 LT |
203 | return 0; |
204 | } | |
205 | ||
9293fcfb | 206 | cmp_id = ocu[0]; |
34f953dd | 207 | if (cmp_id != 8 && cmp_id != 16) { |
9293fcfb | 208 | memset(str_o, 0, str_max_len); |
fcbf7637 | 209 | pr_err("unknown compression code (%u)\n", cmp_id); |
78fc2e69 | 210 | return -EINVAL; |
1da177e4 | 211 | } |
484a10f4 | 212 | u_ch = cmp_id >> 3; |
1da177e4 | 213 | |
484a10f4 AG |
214 | ocu++; |
215 | ocu_len--; | |
1da177e4 | 216 | |
484a10f4 AG |
217 | if (ocu_len % u_ch) { |
218 | pr_err("incorrect filename length (%d)\n", ocu_len + 1); | |
219 | return -EINVAL; | |
220 | } | |
221 | ||
222 | if (translate) { | |
223 | /* Look for extension */ | |
224 | for (idx = ocu_len - u_ch, ext_i_len = 0; | |
225 | (idx >= 0) && (ext_i_len < EXT_SIZE); | |
226 | idx -= u_ch, ext_i_len++) { | |
227 | c = ocu[idx]; | |
228 | if (u_ch > 1) | |
229 | c = (c << 8) | ocu[idx + 1]; | |
230 | ||
231 | if (c == EXT_MARK) { | |
232 | if (ext_i_len) | |
233 | i_ext = idx; | |
234 | break; | |
235 | } | |
236 | } | |
237 | if (i_ext >= 0) { | |
238 | /* Convert extension */ | |
239 | ext_max_len = min_t(int, sizeof(ext), str_max_len); | |
240 | ext[ext_o_len++] = EXT_MARK; | |
241 | idx = i_ext + u_ch; | |
242 | while (udf_name_conv_char(ext, ext_max_len, &ext_o_len, | |
243 | ocu, ocu_len, &idx, | |
244 | u_ch, &needsCRC, | |
245 | conv_f, translate)) { | |
246 | if ((ext_o_len + CRC_LEN) < str_max_len) | |
247 | ext_crc_len = ext_o_len; | |
248 | } | |
249 | } | |
250 | } | |
251 | ||
252 | idx = 0; | |
253 | while (1) { | |
254 | if (translate && (idx == i_ext)) { | |
255 | if (str_o_len > (str_max_len - ext_o_len)) | |
256 | needsCRC = 1; | |
3e7fc205 | 257 | break; |
484a10f4 AG |
258 | } |
259 | ||
260 | if (!udf_name_conv_char(str_o, str_max_len, &str_o_len, | |
261 | ocu, ocu_len, &idx, | |
262 | u_ch, &needsCRC, conv_f, translate)) | |
263 | break; | |
264 | ||
265 | if (translate && | |
266 | (str_o_len <= (str_max_len - ext_o_len - CRC_LEN))) | |
267 | o_crc = str_o_len; | |
268 | } | |
269 | ||
270 | if (translate) { | |
271 | if (str_o_len <= 2 && str_o[0] == '.' && | |
272 | (str_o_len == 1 || str_o[1] == '.')) | |
273 | needsCRC = 1; | |
274 | if (needsCRC) { | |
275 | str_o_len = o_crc; | |
276 | valueCRC = crc_itu_t(0, ocu, ocu_len); | |
277 | crc[0] = CRC_MARK; | |
278 | crc[1] = hex_asc_upper_hi(valueCRC >> 8); | |
279 | crc[2] = hex_asc_upper_lo(valueCRC >> 8); | |
280 | crc[3] = hex_asc_upper_hi(valueCRC); | |
281 | crc[4] = hex_asc_upper_lo(valueCRC); | |
282 | len = min_t(int, CRC_LEN, str_max_len - str_o_len); | |
283 | memcpy(&str_o[str_o_len], crc, len); | |
284 | str_o_len += len; | |
285 | ext_o_len = ext_crc_len; | |
286 | } | |
287 | if (ext_o_len > 0) { | |
288 | memcpy(&str_o[str_o_len], ext, ext_o_len); | |
289 | str_o_len += ext_o_len; | |
290 | } | |
1da177e4 | 291 | } |
1da177e4 | 292 | |
9293fcfb | 293 | return str_o_len; |
1da177e4 LT |
294 | } |
295 | ||
9293fcfb AG |
296 | static int udf_name_to_CS0(uint8_t *ocu, int ocu_max_len, |
297 | const uint8_t *str_i, int str_len, | |
3e7fc205 | 298 | int (*conv_f)(const unsigned char *, int, wchar_t *)) |
1da177e4 | 299 | { |
3e7fc205 AG |
300 | int i, len; |
301 | unsigned int max_val; | |
302 | wchar_t uni_char; | |
bb00c898 | 303 | int u_len, u_ch; |
1da177e4 | 304 | |
9293fcfb AG |
305 | if (ocu_max_len <= 0) |
306 | return 0; | |
307 | ||
308 | memset(ocu, 0, ocu_max_len); | |
1da177e4 | 309 | ocu[0] = 8; |
3e7fc205 | 310 | max_val = 0xff; |
bb00c898 | 311 | u_ch = 1; |
1da177e4 | 312 | |
28de7948 | 313 | try_again: |
9293fcfb AG |
314 | u_len = 1; |
315 | for (i = 0; i < str_len; i++) { | |
bb00c898 | 316 | /* Name didn't fit? */ |
9293fcfb | 317 | if (u_len + u_ch > ocu_max_len) |
bb00c898 | 318 | return 0; |
9293fcfb | 319 | len = conv_f(&str_i[i], str_len - i, &uni_char); |
59285c28 | 320 | if (!len) |
1da177e4 | 321 | continue; |
59285c28 JK |
322 | /* Invalid character, deal with it */ |
323 | if (len < 0) { | |
324 | len = 1; | |
325 | uni_char = '?'; | |
326 | } | |
1da177e4 | 327 | |
cb00ea35 | 328 | if (uni_char > max_val) { |
3e7fc205 AG |
329 | max_val = 0xffff; |
330 | ocu[0] = 0x10; | |
bb00c898 | 331 | u_ch = 2; |
1da177e4 LT |
332 | goto try_again; |
333 | } | |
cb00ea35 | 334 | |
3e7fc205 | 335 | if (max_val == 0xffff) |
9293fcfb AG |
336 | ocu[u_len++] = (uint8_t)(uni_char >> 8); |
337 | ocu[u_len++] = (uint8_t)(uni_char & 0xff); | |
1da177e4 LT |
338 | i += len - 1; |
339 | } | |
340 | ||
9293fcfb | 341 | return u_len; |
1da177e4 LT |
342 | } |
343 | ||
c26f6c61 AG |
344 | int udf_dstrCS0toUTF8(uint8_t *utf_o, int o_len, |
345 | const uint8_t *ocu_i, int i_len) | |
3e7fc205 | 346 | { |
c26f6c61 AG |
347 | int s_len = 0; |
348 | ||
349 | if (i_len > 0) { | |
350 | s_len = ocu_i[i_len - 1]; | |
351 | if (s_len >= i_len) { | |
352 | pr_err("incorrect dstring lengths (%d/%d)\n", | |
353 | s_len, i_len); | |
354 | return -EINVAL; | |
355 | } | |
356 | } | |
357 | ||
358 | return udf_name_from_CS0(utf_o, o_len, ocu_i, s_len, | |
484a10f4 | 359 | udf_uni2char_utf8, 0); |
3e7fc205 AG |
360 | } |
361 | ||
9293fcfb | 362 | int udf_get_filename(struct super_block *sb, const uint8_t *sname, int slen, |
0e5cc9a4 | 363 | uint8_t *dname, int dlen) |
1da177e4 | 364 | { |
3e7fc205 | 365 | int (*conv_f)(wchar_t, unsigned char *, int); |
6ce63836 | 366 | int ret; |
1da177e4 | 367 | |
31f2566f FF |
368 | if (!slen) |
369 | return -EIO; | |
370 | ||
9293fcfb AG |
371 | if (dlen <= 0) |
372 | return 0; | |
373 | ||
cb00ea35 | 374 | if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) { |
3e7fc205 | 375 | conv_f = udf_uni2char_utf8; |
cb00ea35 | 376 | } else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) { |
3e7fc205 | 377 | conv_f = UDF_SB(sb)->s_nls_map->uni2char; |
4b11111a | 378 | } else |
5dce54b7 | 379 | BUG(); |
530f1a5e | 380 | |
484a10f4 | 381 | ret = udf_name_from_CS0(dname, dlen, sname, slen, conv_f, 1); |
6ce63836 FF |
382 | /* Zero length filename isn't valid... */ |
383 | if (ret == 0) | |
384 | ret = -EINVAL; | |
5ceb8b55 | 385 | return ret; |
1da177e4 LT |
386 | } |
387 | ||
525e2c56 AG |
388 | int udf_put_filename(struct super_block *sb, const uint8_t *sname, int slen, |
389 | uint8_t *dname, int dlen) | |
1da177e4 | 390 | { |
3e7fc205 | 391 | int (*conv_f)(const unsigned char *, int, wchar_t *); |
1da177e4 | 392 | |
cb00ea35 | 393 | if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) { |
3e7fc205 | 394 | conv_f = udf_char2uni_utf8; |
cb00ea35 | 395 | } else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) { |
3e7fc205 | 396 | conv_f = UDF_SB(sb)->s_nls_map->char2uni; |
4b11111a | 397 | } else |
3e7fc205 | 398 | BUG(); |
1da177e4 | 399 | |
9293fcfb | 400 | return udf_name_to_CS0(dname, dlen, sname, slen, conv_f); |
1da177e4 LT |
401 | } |
402 |