]>
git.proxmox.com Git - ceph.git/blob - ceph/src/common/utf8.c
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2011 New Dream Network
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 */
14 #include "common/utf8.h"
18 static int high_bits_set(int c
)
22 if ((c
& 0x80) != 0x080)
/* Encode a code point of up to 31 bits as UTF-8 into 'buf'.
 * Assumes buf is of size MAX_UTF8_SZ
 * Returns -1 on failure; number of bytes in the encoded value otherwise.
 */
34 int encode_utf8(unsigned long u
, unsigned char *buf
)
36 static const unsigned long max_val
[MAX_UTF8_SZ
] = {
37 0x0000007ful
, 0x000007fful
, 0x0000fffful
,
38 0x001ffffful
, 0x03fffffful
, 0x7ffffffful
40 static const int MAX_VAL_SZ
= sizeof(max_val
)/sizeof(max_val
[0]);
43 for (i
= 0; i
< MAX_VAL_SZ
; ++i
) {
47 if (i
== MAX_VAL_SZ
) {
48 // This code point is too big to encode.
57 for (j
= i
; j
> 0; --j
) {
58 buf
[j
] = 0x80 | (u
& 0x3f);
62 unsigned char mask
= ~(0xFF >> (i
+ 1));
/*
 * Decode a UTF8 character from an array of bytes. Return character code.
 * Upon error, return INVALID_UTF8_CHAR.
 */
73 unsigned long decode_utf8(unsigned char *buf
, int nbytes
)
79 return INVALID_UTF8_CHAR
;
83 return INVALID_UTF8_CHAR
;
87 i
= high_bits_set(buf
[0]);
89 return INVALID_UTF8_CHAR
;
90 code
= buf
[0] & (0xff >> i
);
91 for (j
= 1; j
< nbytes
; ++j
) {
92 if ((buf
[j
] & 0xc0) != 0x80)
93 return INVALID_UTF8_CHAR
;
94 code
= (code
<< 6) | (buf
[j
] & 0x3f);
97 // Check for invalid code points
99 return INVALID_UTF8_CHAR
;
101 return INVALID_UTF8_CHAR
;
102 if (code
>= 0xD800 && code
<= 0xDFFF)
103 return INVALID_UTF8_CHAR
;
108 int check_utf8(const char *buf
, int len
)
110 unsigned char u
[MAX_UTF8_SZ
];
114 unsigned int c
= buf
[i
];
115 if (i
>= len
|| c
< 0x80 || (c
& 0xC0) != 0x80) {
116 // the start of a new character. Process what we have
120 unsigned char re_encoded
[MAX_UTF8_SZ
];
121 unsigned long code
= decode_utf8(u
, enc_len
);
122 if (code
== INVALID_UTF8_CHAR
) {
123 //printf("decoded to invalid utf8");
126 re_encoded_len
= encode_utf8(code
, re_encoded
);
127 if (enc_len
!= re_encoded_len
) {
128 //printf("originally encoded as %d bytes, "
129 // "but was re-encoded to %d!\n",
130 // enc_len, re_encoded_len);
133 if (memcmp(u
, re_encoded
, enc_len
) != 0) {
134 //printf("re-encoded to a different "
138 //printf("code_point %lu\n", code);
143 // start collecting again?
147 if (enc_len
== MAX_UTF8_SZ
) {
148 //printf("too many enc_len in utf character!\n");
151 //printf("continuation byte...\n");
/*
 * Validate a NUL-terminated string with check_utf8().
 *
 * The length handed to check_utf8() is strlen(buf), so the terminating
 * NUL is not part of the checked range. The result of check_utf8() is
 * returned unchanged.
 *
 * 'buf' must not be NULL: strlen() dereferences it unconditionally.
 */
int check_utf8_cstr(const char *buf)
{
	/* check_utf8() takes an int length; make the narrowing explicit. */
	return check_utf8(buf, (int)strlen(buf));
}
/*
 * Report whether 'c' is an ASCII control character.
 *
 * Returns 1 for the codes 0x01..0x1f and for DEL (0x7f), 0 otherwise.
 * NUL (0) is not counted as a control character.
 *
 * Negative values also yield 0. A plain 'char' holding a byte >= 0x80
 * may be sign-extended to a negative int when passed here, and such a
 * byte is not a control character.
 */
int is_control_character(int c)
{
	/* 'c > 0' rejects NUL and every negative value in one comparison. */
	return ((c > 0 && c < 0x20) || c == 0x7f);
}
169 int check_for_control_characters(const char *buf
, int len
)
172 for (i
= 0; i
< len
; ++i
) {
173 if (is_control_character((int)(unsigned char)buf
[i
])) {
/*
 * Scan a NUL-terminated string with check_for_control_characters().
 *
 * The length handed to check_for_control_characters() is strlen(buf),
 * so the terminating NUL is not examined. The result of that call is
 * returned unchanged.
 *
 * 'buf' must not be NULL: strlen() dereferences it unconditionally.
 */
int check_for_control_characters_cstr(const char *buf)
{
	/* The callee takes an int length; make the narrowing explicit. */
	return check_for_control_characters(buf, (int)strlen(buf));
}