]>
Commit | Line | Data |
---|---|---|
4710c53d | 1 | /*\r |
2 | \r | |
3 | Unicode implementation based on original code by Fredrik Lundh,\r | |
4 | modified by Marc-Andre Lemburg <mal@lemburg.com> according to the\r | |
5 | Unicode Integration Proposal (see file Misc/unicode.txt).\r | |
6 | \r | |
7 | Major speed upgrades to the method implementations at the Reykjavik\r | |
8 | NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.\r | |
9 | \r | |
10 | Copyright (c) Corporation for National Research Initiatives.\r | |
11 | \r | |
12 | --------------------------------------------------------------------\r | |
13 | The original string type implementation is:\r | |
14 | \r | |
15 | Copyright (c) 1999 by Secret Labs AB\r | |
16 | Copyright (c) 1999 by Fredrik Lundh\r | |
17 | \r | |
18 | By obtaining, using, and/or copying this software and/or its\r | |
19 | associated documentation, you agree that you have read, understood,\r | |
20 | and will comply with the following terms and conditions:\r | |
21 | \r | |
22 | Permission to use, copy, modify, and distribute this software and its\r | |
23 | associated documentation for any purpose and without fee is hereby\r | |
24 | granted, provided that the above copyright notice appears in all\r | |
25 | copies, and that both that copyright notice and this permission notice\r | |
26 | appear in supporting documentation, and that the name of Secret Labs\r | |
27 | AB or the author not be used in advertising or publicity pertaining to\r | |
28 | distribution of the software without specific, written prior\r | |
29 | permission.\r | |
30 | \r | |
31 | SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO\r | |
32 | THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND\r | |
33 | FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR\r | |
34 | ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES\r | |
35 | WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN\r | |
36 | ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT\r | |
37 | OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.\r | |
38 | --------------------------------------------------------------------\r | |
39 | \r | |
40 | */\r | |
41 | \r | |
42 | #define PY_SSIZE_T_CLEAN\r | |
43 | #include "Python.h"\r | |
44 | \r | |
45 | #include "unicodeobject.h"\r | |
46 | #include "ucnhash.h"\r | |
47 | \r | |
48 | #ifdef MS_WINDOWS\r | |
49 | #include <windows.h>\r | |
50 | #endif\r | |
51 | \r | |
/* Upper bound on the number of Unicode objects kept on the free list. */
#define PyUnicode_MAXFREELIST 1024

/* Threshold for the free list "keep alive" optimization.

   A Unicode object that is put on the free list keeps its character
   buffer allocated as long as its length is below this limit, which
   saves malloc() calls for small Unicode objects.

   In the worst case this leaves PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage around.

   A limit of 0 effectively switches the optimization off.

   Note: this is an experimental feature!  If you get core dumps when
   using Unicode objects, turn it off.
*/
#define KEEPALIVE_SIZE_LIMIT 9

/* Byte order selection; little endian is assumed unless
   WORDS_BIGENDIAN is defined. */
#if !defined(WORDS_BIGENDIAN)
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
82 | \r | |
83 | /* --- Globals ------------------------------------------------------------\r | |
84 | \r | |
85 | The globals are initialized by the _PyUnicode_Init() API and should\r | |
86 | not be used before calling that API.\r | |
87 | \r | |
88 | */\r | |
89 | \r | |
90 | \r | |
91 | #ifdef __cplusplus\r | |
92 | extern "C" {\r | |
93 | #endif\r | |
94 | \r | |
95 | /* Free list for Unicode objects */\r | |
96 | static PyUnicodeObject *free_list;\r | |
97 | static int numfree;\r | |
98 | \r | |
99 | /* The empty Unicode object is shared to improve performance. */\r | |
100 | static PyUnicodeObject *unicode_empty;\r | |
101 | \r | |
102 | /* Single character Unicode strings in the Latin-1 range are being\r | |
103 | shared as well. */\r | |
104 | static PyUnicodeObject *unicode_latin1[256];\r | |
105 | \r | |
106 | /* Default encoding to use and assume when NULL is passed as encoding\r | |
107 | parameter; it is initialized by _PyUnicode_Init().\r | |
108 | \r | |
109 | Always use the PyUnicode_SetDefaultEncoding() and\r | |
110 | PyUnicode_GetDefaultEncoding() APIs to access this global.\r | |
111 | \r | |
112 | */\r | |
113 | static char unicode_default_encoding[100];\r | |
114 | \r | |
/* Fast detection of the most frequent whitespace characters: one flag
   per ASCII code point, 1 meaning "whitespace".  Sixteen entries per
   row; the comment above each row names the flagged code points. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x7F: nothing flagged */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
145 | \r | |
/* Same kind of table for line breaks: one flag per ASCII code point,
   1 meaning "line break".  Sixteen entries per row. */
static unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x7F: nothing flagged */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
173 | \r | |
174 | \r | |
175 | Py_UNICODE\r | |
176 | PyUnicode_GetMax(void)\r | |
177 | {\r | |
178 | #ifdef Py_UNICODE_WIDE\r | |
179 | return 0x10FFFF;\r | |
180 | #else\r | |
181 | /* This is actually an illegal character, so it should\r | |
182 | not be passed to unichr. */\r | |
183 | return 0xFFFF;\r | |
184 | #endif\r | |
185 | }\r | |
186 | \r | |
187 | /* --- Bloom Filters ----------------------------------------------------- */\r | |
188 | \r | |
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple a single bitmask is used; the bit index for a
   character is its value modulo BLOOM_WIDTH, i.e. its low-order bits
   (5, 6 or 7 of them depending on the width chosen below). */

/* the linebreak mask is set up by Unicode_Init below */

/* BLOOM_WIDTH is the number of mask bits in use: the largest of
   128, 64 and 32 that does not exceed LONG_BIT. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD sets the bit for ch in mask (mask must be an lvalue);
   BLOOM yields non-zero when the bit for ch is set in mask.  Both
   arguments are parenthesized so that arbitrary expressions such as
   BLOOM(a | b, ch) expand with the intended precedence. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Line break test: table lookup for ch < 128, otherwise the bloom mask
   is consulted before calling Py_UNICODE_ISLINEBREAK().  Note that ch
   is evaluated more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
215 | \r | |
216 | Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)\r | |
217 | {\r | |
218 | /* calculate simple bloom-style bitmask for a given unicode string */\r | |
219 | \r | |
220 | BLOOM_MASK mask;\r | |
221 | Py_ssize_t i;\r | |
222 | \r | |
223 | mask = 0;\r | |
224 | for (i = 0; i < len; i++)\r | |
225 | BLOOM_ADD(mask, ptr[i]);\r | |
226 | \r | |
227 | return mask;\r | |
228 | }\r | |
229 | \r | |
230 | Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)\r | |
231 | {\r | |
232 | Py_ssize_t i;\r | |
233 | \r | |
234 | for (i = 0; i < setlen; i++)\r | |
235 | if (set[i] == chr)\r | |
236 | return 1;\r | |
237 | \r | |
238 | return 0;\r | |
239 | }\r | |
240 | \r | |
/* Membership test for chr in set: the cheap BLOOM() check runs first
   and unicode_member() only scans the set when the mask bit is set.
   The whole expansion is parenthesized so the macro can safely be
   negated or combined with other operators.  chr is evaluated more
   than once. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
243 | \r | |
244 | /* --- Unicode Object ----------------------------------------------------- */\r | |
245 | \r | |
246 | static\r | |
247 | int unicode_resize(register PyUnicodeObject *unicode,\r | |
248 | Py_ssize_t length)\r | |
249 | {\r | |
250 | void *oldstr;\r | |
251 | \r | |
252 | /* Shortcut if there's nothing much to do. */\r | |
253 | if (unicode->length == length)\r | |
254 | goto reset;\r | |
255 | \r | |
256 | /* Resizing shared object (unicode_empty or single character\r | |
257 | objects) in-place is not allowed. Use PyUnicode_Resize()\r | |
258 | instead ! */\r | |
259 | \r | |
260 | if (unicode == unicode_empty ||\r | |
261 | (unicode->length == 1 &&\r | |
262 | unicode->str[0] < 256U &&\r | |
263 | unicode_latin1[unicode->str[0]] == unicode)) {\r | |
264 | PyErr_SetString(PyExc_SystemError,\r | |
265 | "can't resize shared unicode objects");\r | |
266 | return -1;\r | |
267 | }\r | |
268 | \r | |
269 | /* We allocate one more byte to make sure the string is Ux0000 terminated.\r | |
270 | The overallocation is also used by fastsearch, which assumes that it's\r | |
271 | safe to look at str[length] (without making any assumptions about what\r | |
272 | it contains). */\r | |
273 | \r | |
274 | oldstr = unicode->str;\r | |
275 | unicode->str = PyObject_REALLOC(unicode->str,\r | |
276 | sizeof(Py_UNICODE) * (length + 1));\r | |
277 | if (!unicode->str) {\r | |
278 | unicode->str = (Py_UNICODE *)oldstr;\r | |
279 | PyErr_NoMemory();\r | |
280 | return -1;\r | |
281 | }\r | |
282 | unicode->str[length] = 0;\r | |
283 | unicode->length = length;\r | |
284 | \r | |
285 | reset:\r | |
286 | /* Reset the object caches */\r | |
287 | if (unicode->defenc) {\r | |
288 | Py_CLEAR(unicode->defenc);\r | |
289 | }\r | |
290 | unicode->hash = -1;\r | |
291 | \r | |
292 | return 0;\r | |
293 | }\r | |
294 | \r | |
295 | /* We allocate one more byte to make sure the string is\r | |
296 | Ux0000 terminated; some code relies on that.\r | |
297 | \r | |
298 | XXX This allocator could further be enhanced by assuring that the\r | |
299 | free list never reduces its size below 1.\r | |
300 | \r | |
301 | */\r | |
302 | \r | |
303 | static\r | |
304 | PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)\r | |
305 | {\r | |
306 | register PyUnicodeObject *unicode;\r | |
307 | \r | |
308 | /* Optimization for empty strings */\r | |
309 | if (length == 0 && unicode_empty != NULL) {\r | |
310 | Py_INCREF(unicode_empty);\r | |
311 | return unicode_empty;\r | |
312 | }\r | |
313 | \r | |
314 | /* Ensure we won't overflow the size. */\r | |
315 | if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {\r | |
316 | return (PyUnicodeObject *)PyErr_NoMemory();\r | |
317 | }\r | |
318 | \r | |
319 | /* Unicode freelist & memory allocation */\r | |
320 | if (free_list) {\r | |
321 | unicode = free_list;\r | |
322 | free_list = *(PyUnicodeObject **)unicode;\r | |
323 | numfree--;\r | |
324 | if (unicode->str) {\r | |
325 | /* Keep-Alive optimization: we only upsize the buffer,\r | |
326 | never downsize it. */\r | |
327 | if ((unicode->length < length) &&\r | |
328 | unicode_resize(unicode, length) < 0) {\r | |
329 | PyObject_DEL(unicode->str);\r | |
330 | unicode->str = NULL;\r | |
331 | }\r | |
332 | }\r | |
333 | else {\r | |
334 | size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);\r | |
335 | unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);\r | |
336 | }\r | |
337 | PyObject_INIT(unicode, &PyUnicode_Type);\r | |
338 | }\r | |
339 | else {\r | |
340 | size_t new_size;\r | |
341 | unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);\r | |
342 | if (unicode == NULL)\r | |
343 | return NULL;\r | |
344 | new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);\r | |
345 | unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);\r | |
346 | }\r | |
347 | \r | |
348 | if (!unicode->str) {\r | |
349 | PyErr_NoMemory();\r | |
350 | goto onError;\r | |
351 | }\r | |
352 | /* Initialize the first element to guard against cases where\r | |
353 | * the caller fails before initializing str -- unicode_resize()\r | |
354 | * reads str[0], and the Keep-Alive optimization can keep memory\r | |
355 | * allocated for str alive across a call to unicode_dealloc(unicode).\r | |
356 | * We don't want unicode_resize to read uninitialized memory in\r | |
357 | * that case.\r | |
358 | */\r | |
359 | unicode->str[0] = 0;\r | |
360 | unicode->str[length] = 0;\r | |
361 | unicode->length = length;\r | |
362 | unicode->hash = -1;\r | |
363 | unicode->defenc = NULL;\r | |
364 | return unicode;\r | |
365 | \r | |
366 | onError:\r | |
367 | /* XXX UNREF/NEWREF interface should be more symmetrical */\r | |
368 | _Py_DEC_REFTOTAL;\r | |
369 | _Py_ForgetReference((PyObject *)unicode);\r | |
370 | PyObject_Del(unicode);\r | |
371 | return NULL;\r | |
372 | }\r | |
373 | \r | |
374 | static\r | |
375 | void unicode_dealloc(register PyUnicodeObject *unicode)\r | |
376 | {\r | |
377 | if (PyUnicode_CheckExact(unicode) &&\r | |
378 | numfree < PyUnicode_MAXFREELIST) {\r | |
379 | /* Keep-Alive optimization */\r | |
380 | if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {\r | |
381 | PyObject_DEL(unicode->str);\r | |
382 | unicode->str = NULL;\r | |
383 | unicode->length = 0;\r | |
384 | }\r | |
385 | if (unicode->defenc) {\r | |
386 | Py_CLEAR(unicode->defenc);\r | |
387 | }\r | |
388 | /* Add to free list */\r | |
389 | *(PyUnicodeObject **)unicode = free_list;\r | |
390 | free_list = unicode;\r | |
391 | numfree++;\r | |
392 | }\r | |
393 | else {\r | |
394 | PyObject_DEL(unicode->str);\r | |
395 | Py_XDECREF(unicode->defenc);\r | |
396 | Py_TYPE(unicode)->tp_free((PyObject *)unicode);\r | |
397 | }\r | |
398 | }\r | |
399 | \r | |
400 | static\r | |
401 | int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)\r | |
402 | {\r | |
403 | register PyUnicodeObject *v;\r | |
404 | \r | |
405 | /* Argument checks */\r | |
406 | if (unicode == NULL) {\r | |
407 | PyErr_BadInternalCall();\r | |
408 | return -1;\r | |
409 | }\r | |
410 | v = *unicode;\r | |
411 | if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {\r | |
412 | PyErr_BadInternalCall();\r | |
413 | return -1;\r | |
414 | }\r | |
415 | \r | |
416 | /* Resizing unicode_empty and single character objects is not\r | |
417 | possible since these are being shared. We simply return a fresh\r | |
418 | copy with the same Unicode content. */\r | |
419 | if (v->length != length &&\r | |
420 | (v == unicode_empty || v->length == 1)) {\r | |
421 | PyUnicodeObject *w = _PyUnicode_New(length);\r | |
422 | if (w == NULL)\r | |
423 | return -1;\r | |
424 | Py_UNICODE_COPY(w->str, v->str,\r | |
425 | length < v->length ? length : v->length);\r | |
426 | Py_DECREF(*unicode);\r | |
427 | *unicode = w;\r | |
428 | return 0;\r | |
429 | }\r | |
430 | \r | |
431 | /* Note that we don't have to modify *unicode for unshared Unicode\r | |
432 | objects, since we can modify them in-place. */\r | |
433 | return unicode_resize(v, length);\r | |
434 | }\r | |
435 | \r | |
436 | int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)\r | |
437 | {\r | |
438 | return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);\r | |
439 | }\r | |
440 | \r | |
441 | PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,\r | |
442 | Py_ssize_t size)\r | |
443 | {\r | |
444 | PyUnicodeObject *unicode;\r | |
445 | \r | |
446 | /* If the Unicode data is known at construction time, we can apply\r | |
447 | some optimizations which share commonly used objects. */\r | |
448 | if (u != NULL) {\r | |
449 | \r | |
450 | /* Optimization for empty strings */\r | |
451 | if (size == 0 && unicode_empty != NULL) {\r | |
452 | Py_INCREF(unicode_empty);\r | |
453 | return (PyObject *)unicode_empty;\r | |
454 | }\r | |
455 | \r | |
456 | /* Single character Unicode objects in the Latin-1 range are\r | |
457 | shared when using this constructor */\r | |
458 | if (size == 1 && *u < 256) {\r | |
459 | unicode = unicode_latin1[*u];\r | |
460 | if (!unicode) {\r | |
461 | unicode = _PyUnicode_New(1);\r | |
462 | if (!unicode)\r | |
463 | return NULL;\r | |
464 | unicode->str[0] = *u;\r | |
465 | unicode_latin1[*u] = unicode;\r | |
466 | }\r | |
467 | Py_INCREF(unicode);\r | |
468 | return (PyObject *)unicode;\r | |
469 | }\r | |
470 | }\r | |
471 | \r | |
472 | unicode = _PyUnicode_New(size);\r | |
473 | if (!unicode)\r | |
474 | return NULL;\r | |
475 | \r | |
476 | /* Copy the Unicode data into the new object */\r | |
477 | if (u != NULL)\r | |
478 | Py_UNICODE_COPY(unicode->str, u, size);\r | |
479 | \r | |
480 | return (PyObject *)unicode;\r | |
481 | }\r | |
482 | \r | |
483 | PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)\r | |
484 | {\r | |
485 | PyUnicodeObject *unicode;\r | |
486 | \r | |
487 | if (size < 0) {\r | |
488 | PyErr_SetString(PyExc_SystemError,\r | |
489 | "Negative size passed to PyUnicode_FromStringAndSize");\r | |
490 | return NULL;\r | |
491 | }\r | |
492 | \r | |
493 | /* If the Unicode data is known at construction time, we can apply\r | |
494 | some optimizations which share commonly used objects.\r | |
495 | Also, this means the input must be UTF-8, so fall back to the\r | |
496 | UTF-8 decoder at the end. */\r | |
497 | if (u != NULL) {\r | |
498 | \r | |
499 | /* Optimization for empty strings */\r | |
500 | if (size == 0 && unicode_empty != NULL) {\r | |
501 | Py_INCREF(unicode_empty);\r | |
502 | return (PyObject *)unicode_empty;\r | |
503 | }\r | |
504 | \r | |
505 | /* Single characters are shared when using this constructor.\r | |
506 | Restrict to ASCII, since the input must be UTF-8. */\r | |
507 | if (size == 1 && Py_CHARMASK(*u) < 128) {\r | |
508 | unicode = unicode_latin1[Py_CHARMASK(*u)];\r | |
509 | if (!unicode) {\r | |
510 | unicode = _PyUnicode_New(1);\r | |
511 | if (!unicode)\r | |
512 | return NULL;\r | |
513 | unicode->str[0] = Py_CHARMASK(*u);\r | |
514 | unicode_latin1[Py_CHARMASK(*u)] = unicode;\r | |
515 | }\r | |
516 | Py_INCREF(unicode);\r | |
517 | return (PyObject *)unicode;\r | |
518 | }\r | |
519 | \r | |
520 | return PyUnicode_DecodeUTF8(u, size, NULL);\r | |
521 | }\r | |
522 | \r | |
523 | unicode = _PyUnicode_New(size);\r | |
524 | if (!unicode)\r | |
525 | return NULL;\r | |
526 | \r | |
527 | return (PyObject *)unicode;\r | |
528 | }\r | |
529 | \r | |
530 | PyObject *PyUnicode_FromString(const char *u)\r | |
531 | {\r | |
532 | size_t size = strlen(u);\r | |
533 | if (size > PY_SSIZE_T_MAX) {\r | |
534 | PyErr_SetString(PyExc_OverflowError, "input too long");\r | |
535 | return NULL;\r | |
536 | }\r | |
537 | \r | |
538 | return PyUnicode_FromStringAndSize(u, size);\r | |
539 | }\r | |
540 | \r | |
541 | #ifdef HAVE_WCHAR_H\r | |
542 | \r | |
543 | #if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)\r | |
544 | # define CONVERT_WCHAR_TO_SURROGATES\r | |
545 | #endif\r | |
546 | \r | |
547 | #ifdef CONVERT_WCHAR_TO_SURROGATES\r | |
548 | \r | |
549 | /* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need\r | |
550 | to convert from UTF32 to UTF16. */\r | |
551 | \r | |
552 | PyObject *PyUnicode_FromWideChar(register const wchar_t *w,\r | |
553 | Py_ssize_t size)\r | |
554 | {\r | |
555 | PyUnicodeObject *unicode;\r | |
556 | register Py_ssize_t i;\r | |
557 | Py_ssize_t alloc;\r | |
558 | const wchar_t *orig_w;\r | |
559 | \r | |
560 | if (w == NULL) {\r | |
561 | PyErr_BadInternalCall();\r | |
562 | return NULL;\r | |
563 | }\r | |
564 | \r | |
565 | alloc = size;\r | |
566 | orig_w = w;\r | |
567 | for (i = size; i > 0; i--) {\r | |
568 | if (*w > 0xFFFF)\r | |
569 | alloc++;\r | |
570 | w++;\r | |
571 | }\r | |
572 | w = orig_w;\r | |
573 | unicode = _PyUnicode_New(alloc);\r | |
574 | if (!unicode)\r | |
575 | return NULL;\r | |
576 | \r | |
577 | /* Copy the wchar_t data into the new object */\r | |
578 | {\r | |
579 | register Py_UNICODE *u;\r | |
580 | u = PyUnicode_AS_UNICODE(unicode);\r | |
581 | for (i = size; i > 0; i--) {\r | |
582 | if (*w > 0xFFFF) {\r | |
583 | wchar_t ordinal = *w++;\r | |
584 | ordinal -= 0x10000;\r | |
585 | *u++ = 0xD800 | (ordinal >> 10);\r | |
586 | *u++ = 0xDC00 | (ordinal & 0x3FF);\r | |
587 | }\r | |
588 | else\r | |
589 | *u++ = *w++;\r | |
590 | }\r | |
591 | }\r | |
592 | return (PyObject *)unicode;\r | |
593 | }\r | |
594 | \r | |
595 | #else\r | |
596 | \r | |
597 | PyObject *PyUnicode_FromWideChar(register const wchar_t *w,\r | |
598 | Py_ssize_t size)\r | |
599 | {\r | |
600 | PyUnicodeObject *unicode;\r | |
601 | \r | |
602 | if (w == NULL) {\r | |
603 | PyErr_BadInternalCall();\r | |
604 | return NULL;\r | |
605 | }\r | |
606 | \r | |
607 | unicode = _PyUnicode_New(size);\r | |
608 | if (!unicode)\r | |
609 | return NULL;\r | |
610 | \r | |
611 | /* Copy the wchar_t data into the new object */\r | |
612 | #ifdef HAVE_USABLE_WCHAR_T\r | |
613 | memcpy(unicode->str, w, size * sizeof(wchar_t));\r | |
614 | #else\r | |
615 | {\r | |
616 | register Py_UNICODE *u;\r | |
617 | register Py_ssize_t i;\r | |
618 | u = PyUnicode_AS_UNICODE(unicode);\r | |
619 | for (i = size; i > 0; i--)\r | |
620 | *u++ = *w++;\r | |
621 | }\r | |
622 | #endif\r | |
623 | \r | |
624 | return (PyObject *)unicode;\r | |
625 | }\r | |
626 | \r | |
627 | #endif /* CONVERT_WCHAR_TO_SURROGATES */\r | |
628 | \r | |
629 | #undef CONVERT_WCHAR_TO_SURROGATES\r | |
630 | \r | |
/* Assemble a printf-style conversion specification into fmt:
   '%', then (only for a non-zero width) an optional '0' flag and the
   width, then ".precision" for a non-zero precision, then the length
   modifier ('l' if longflag is set, else PY_FORMAT_SIZE_T if
   size_tflag is set), then the conversion character c and a
   terminating NUL.  The caller provides a large enough buffer. */
static void
makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
{
    char *out = fmt;

    *out++ = '%';
    if (width != 0) {
        if (zeropad)
            *out++ = '0';
        out += sprintf(out, "%d", width);
    }
    if (precision != 0)
        out += sprintf(out, ".%d", precision);
    if (longflag) {
        *out++ = 'l';
    }
    else if (size_tflag) {
        const char *mod;

        for (mod = PY_FORMAT_SIZE_T; *mod != '\0'; mod++)
            *out++ = *mod;
    }
    *out++ = c;
    *out = '\0';
}
652 | \r | |
/* Copy the NUL-terminated char string `string` to the output cursor,
   one element per char, without writing a terminator.  Relies on two
   variables in the expanding scope: `copy` (scratch read pointer) and
   `s` (output cursor, advanced past the copied characters). */
#define appendstring(string) {copy = string; while (*copy) *s++ = *copy++;}
654 | \r | |
655 | PyObject *\r | |
656 | PyUnicode_FromFormatV(const char *format, va_list vargs)\r | |
657 | {\r | |
658 | va_list count;\r | |
659 | Py_ssize_t callcount = 0;\r | |
660 | PyObject **callresults = NULL;\r | |
661 | PyObject **callresult = NULL;\r | |
662 | Py_ssize_t n = 0;\r | |
663 | int width = 0;\r | |
664 | int precision = 0;\r | |
665 | int zeropad;\r | |
666 | const char* f;\r | |
667 | Py_UNICODE *s;\r | |
668 | PyObject *string;\r | |
669 | /* used by sprintf */\r | |
670 | char buffer[21];\r | |
671 | /* use abuffer instead of buffer, if we need more space\r | |
672 | * (which can happen if there's a format specifier with width). */\r | |
673 | char *abuffer = NULL;\r | |
674 | char *realbuffer;\r | |
675 | Py_ssize_t abuffersize = 0;\r | |
676 | char fmt[60]; /* should be enough for %0width.precisionld */\r | |
677 | const char *copy;\r | |
678 | \r | |
679 | #ifdef VA_LIST_IS_ARRAY\r | |
680 | Py_MEMCPY(count, vargs, sizeof(va_list));\r | |
681 | #else\r | |
682 | #ifdef __va_copy\r | |
683 | __va_copy(count, vargs);\r | |
684 | #else\r | |
685 | count = vargs;\r | |
686 | #endif\r | |
687 | #endif\r | |
688 | /* step 1: count the number of %S/%R/%s format specifications\r | |
689 | * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these\r | |
690 | * objects once during step 3 and put the result in an array) */\r | |
691 | for (f = format; *f; f++) {\r | |
692 | if (*f == '%') {\r | |
693 | if (*(f+1)=='%')\r | |
694 | continue;\r | |
695 | if (*(f+1)=='S' || *(f+1)=='R')\r | |
696 | ++callcount;\r | |
697 | while (isdigit((unsigned)*f))\r | |
698 | width = (width*10) + *f++ - '0';\r | |
699 | while (*++f && *f != '%' && !isalpha((unsigned)*f))\r | |
700 | ;\r | |
701 | if (*f == 's')\r | |
702 | ++callcount;\r | |
703 | }\r | |
704 | }\r | |
705 | /* step 2: allocate memory for the results of\r | |
706 | * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */\r | |
707 | if (callcount) {\r | |
708 | callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);\r | |
709 | if (!callresults) {\r | |
710 | PyErr_NoMemory();\r | |
711 | return NULL;\r | |
712 | }\r | |
713 | callresult = callresults;\r | |
714 | }\r | |
715 | /* step 3: figure out how large a buffer we need */\r | |
716 | for (f = format; *f; f++) {\r | |
717 | if (*f == '%') {\r | |
718 | const char* p = f;\r | |
719 | width = 0;\r | |
720 | while (isdigit((unsigned)*f))\r | |
721 | width = (width*10) + *f++ - '0';\r | |
722 | while (*++f && *f != '%' && !isalpha((unsigned)*f))\r | |
723 | ;\r | |
724 | \r | |
725 | /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since\r | |
726 | * they don't affect the amount of space we reserve.\r | |
727 | */\r | |
728 | if ((*f == 'l' || *f == 'z') &&\r | |
729 | (f[1] == 'd' || f[1] == 'u'))\r | |
730 | ++f;\r | |
731 | \r | |
732 | switch (*f) {\r | |
733 | case 'c':\r | |
734 | (void)va_arg(count, int);\r | |
735 | /* fall through... */\r | |
736 | case '%':\r | |
737 | n++;\r | |
738 | break;\r | |
739 | case 'd': case 'u': case 'i': case 'x':\r | |
740 | (void) va_arg(count, int);\r | |
741 | /* 20 bytes is enough to hold a 64-bit\r | |
742 | integer. Decimal takes the most space.\r | |
743 | This isn't enough for octal.\r | |
744 | If a width is specified we need more\r | |
745 | (which we allocate later). */\r | |
746 | if (width < 20)\r | |
747 | width = 20;\r | |
748 | n += width;\r | |
749 | if (abuffersize < width)\r | |
750 | abuffersize = width;\r | |
751 | break;\r | |
752 | case 's':\r | |
753 | {\r | |
754 | /* UTF-8 */\r | |
755 | const char *s = va_arg(count, const char*);\r | |
756 | PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");\r | |
757 | if (!str)\r | |
758 | goto fail;\r | |
759 | n += PyUnicode_GET_SIZE(str);\r | |
760 | /* Remember the str and switch to the next slot */\r | |
761 | *callresult++ = str;\r | |
762 | break;\r | |
763 | }\r | |
764 | case 'U':\r | |
765 | {\r | |
766 | PyObject *obj = va_arg(count, PyObject *);\r | |
767 | assert(obj && PyUnicode_Check(obj));\r | |
768 | n += PyUnicode_GET_SIZE(obj);\r | |
769 | break;\r | |
770 | }\r | |
771 | case 'V':\r | |
772 | {\r | |
773 | PyObject *obj = va_arg(count, PyObject *);\r | |
774 | const char *str = va_arg(count, const char *);\r | |
775 | assert(obj || str);\r | |
776 | assert(!obj || PyUnicode_Check(obj));\r | |
777 | if (obj)\r | |
778 | n += PyUnicode_GET_SIZE(obj);\r | |
779 | else\r | |
780 | n += strlen(str);\r | |
781 | break;\r | |
782 | }\r | |
783 | case 'S':\r | |
784 | {\r | |
785 | PyObject *obj = va_arg(count, PyObject *);\r | |
786 | PyObject *str;\r | |
787 | assert(obj);\r | |
788 | str = PyObject_Str(obj);\r | |
789 | if (!str)\r | |
790 | goto fail;\r | |
791 | n += PyUnicode_GET_SIZE(str);\r | |
792 | /* Remember the str and switch to the next slot */\r | |
793 | *callresult++ = str;\r | |
794 | break;\r | |
795 | }\r | |
796 | case 'R':\r | |
797 | {\r | |
798 | PyObject *obj = va_arg(count, PyObject *);\r | |
799 | PyObject *repr;\r | |
800 | assert(obj);\r | |
801 | repr = PyObject_Repr(obj);\r | |
802 | if (!repr)\r | |
803 | goto fail;\r | |
804 | n += PyUnicode_GET_SIZE(repr);\r | |
805 | /* Remember the repr and switch to the next slot */\r | |
806 | *callresult++ = repr;\r | |
807 | break;\r | |
808 | }\r | |
809 | case 'p':\r | |
810 | (void) va_arg(count, int);\r | |
811 | /* maximum 64-bit pointer representation:\r | |
812 | * 0xffffffffffffffff\r | |
813 | * so 19 characters is enough.\r | |
814 | * XXX I count 18 -- what's the extra for?\r | |
815 | */\r | |
816 | n += 19;\r | |
817 | break;\r | |
818 | default:\r | |
819 | /* if we stumble upon an unknown\r | |
820 | formatting code, copy the rest of\r | |
821 | the format string to the output\r | |
822 | string. (we cannot just skip the\r | |
823 | code, since there's no way to know\r | |
824 | what's in the argument list) */\r | |
825 | n += strlen(p);\r | |
826 | goto expand;\r | |
827 | }\r | |
828 | } else\r | |
829 | n++;\r | |
830 | }\r | |
831 | expand:\r | |
832 | if (abuffersize > 20) {\r | |
833 | abuffer = PyObject_Malloc(abuffersize);\r | |
834 | if (!abuffer) {\r | |
835 | PyErr_NoMemory();\r | |
836 | goto fail;\r | |
837 | }\r | |
838 | realbuffer = abuffer;\r | |
839 | }\r | |
840 | else\r | |
841 | realbuffer = buffer;\r | |
842 | /* step 4: fill the buffer */\r | |
843 | /* Since we've analyzed how much space we need for the worst case,\r | |
844 | we don't have to resize the string.\r | |
845 | There can be no errors beyond this point. */\r | |
846 | string = PyUnicode_FromUnicode(NULL, n);\r | |
847 | if (!string)\r | |
848 | goto fail;\r | |
849 | \r | |
850 | s = PyUnicode_AS_UNICODE(string);\r | |
851 | callresult = callresults;\r | |
852 | \r | |
853 | for (f = format; *f; f++) {\r | |
854 | if (*f == '%') {\r | |
855 | const char* p = f++;\r | |
856 | int longflag = 0;\r | |
857 | int size_tflag = 0;\r | |
858 | zeropad = (*f == '0');\r | |
859 | /* parse the width.precision part */\r | |
860 | width = 0;\r | |
861 | while (isdigit((unsigned)*f))\r | |
862 | width = (width*10) + *f++ - '0';\r | |
863 | precision = 0;\r | |
864 | if (*f == '.') {\r | |
865 | f++;\r | |
866 | while (isdigit((unsigned)*f))\r | |
867 | precision = (precision*10) + *f++ - '0';\r | |
868 | }\r | |
869 | /* handle the long flag, but only for %ld and %lu.\r | |
870 | others can be added when necessary. */\r | |
871 | if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {\r | |
872 | longflag = 1;\r | |
873 | ++f;\r | |
874 | }\r | |
875 | /* handle the size_t flag. */\r | |
876 | if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {\r | |
877 | size_tflag = 1;\r | |
878 | ++f;\r | |
879 | }\r | |
880 | \r | |
881 | switch (*f) {\r | |
882 | case 'c':\r | |
883 | *s++ = va_arg(vargs, int);\r | |
884 | break;\r | |
885 | case 'd':\r | |
886 | makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');\r | |
887 | if (longflag)\r | |
888 | sprintf(realbuffer, fmt, va_arg(vargs, long));\r | |
889 | else if (size_tflag)\r | |
890 | sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));\r | |
891 | else\r | |
892 | sprintf(realbuffer, fmt, va_arg(vargs, int));\r | |
893 | appendstring(realbuffer);\r | |
894 | break;\r | |
895 | case 'u':\r | |
896 | makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');\r | |
897 | if (longflag)\r | |
898 | sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));\r | |
899 | else if (size_tflag)\r | |
900 | sprintf(realbuffer, fmt, va_arg(vargs, size_t));\r | |
901 | else\r | |
902 | sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));\r | |
903 | appendstring(realbuffer);\r | |
904 | break;\r | |
905 | case 'i':\r | |
906 | makefmt(fmt, 0, 0, zeropad, width, precision, 'i');\r | |
907 | sprintf(realbuffer, fmt, va_arg(vargs, int));\r | |
908 | appendstring(realbuffer);\r | |
909 | break;\r | |
910 | case 'x':\r | |
911 | makefmt(fmt, 0, 0, zeropad, width, precision, 'x');\r | |
912 | sprintf(realbuffer, fmt, va_arg(vargs, int));\r | |
913 | appendstring(realbuffer);\r | |
914 | break;\r | |
915 | case 's':\r | |
916 | {\r | |
917 | /* unused, since we already have the result */\r | |
918 | (void) va_arg(vargs, char *);\r | |
919 | Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),\r | |
920 | PyUnicode_GET_SIZE(*callresult));\r | |
921 | s += PyUnicode_GET_SIZE(*callresult);\r | |
922 | /* We're done with the unicode()/repr() => forget it */\r | |
923 | Py_DECREF(*callresult);\r | |
924 | /* switch to next unicode()/repr() result */\r | |
925 | ++callresult;\r | |
926 | break;\r | |
927 | }\r | |
928 | case 'U':\r | |
929 | {\r | |
930 | PyObject *obj = va_arg(vargs, PyObject *);\r | |
931 | Py_ssize_t size = PyUnicode_GET_SIZE(obj);\r | |
932 | Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);\r | |
933 | s += size;\r | |
934 | break;\r | |
935 | }\r | |
936 | case 'V':\r | |
937 | {\r | |
938 | PyObject *obj = va_arg(vargs, PyObject *);\r | |
939 | const char *str = va_arg(vargs, const char *);\r | |
940 | if (obj) {\r | |
941 | Py_ssize_t size = PyUnicode_GET_SIZE(obj);\r | |
942 | Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);\r | |
943 | s += size;\r | |
944 | } else {\r | |
945 | appendstring(str);\r | |
946 | }\r | |
947 | break;\r | |
948 | }\r | |
949 | case 'S':\r | |
950 | case 'R':\r | |
951 | {\r | |
952 | Py_UNICODE *ucopy;\r | |
953 | Py_ssize_t usize;\r | |
954 | Py_ssize_t upos;\r | |
955 | /* unused, since we already have the result */\r | |
956 | (void) va_arg(vargs, PyObject *);\r | |
957 | ucopy = PyUnicode_AS_UNICODE(*callresult);\r | |
958 | usize = PyUnicode_GET_SIZE(*callresult);\r | |
959 | for (upos = 0; upos<usize;)\r | |
960 | *s++ = ucopy[upos++];\r | |
961 | /* We're done with the unicode()/repr() => forget it */\r | |
962 | Py_DECREF(*callresult);\r | |
963 | /* switch to next unicode()/repr() result */\r | |
964 | ++callresult;\r | |
965 | break;\r | |
966 | }\r | |
967 | case 'p':\r | |
968 | sprintf(buffer, "%p", va_arg(vargs, void*));\r | |
969 | /* %p is ill-defined: ensure leading 0x. */\r | |
970 | if (buffer[1] == 'X')\r | |
971 | buffer[1] = 'x';\r | |
972 | else if (buffer[1] != 'x') {\r | |
973 | memmove(buffer+2, buffer, strlen(buffer)+1);\r | |
974 | buffer[0] = '0';\r | |
975 | buffer[1] = 'x';\r | |
976 | }\r | |
977 | appendstring(buffer);\r | |
978 | break;\r | |
979 | case '%':\r | |
980 | *s++ = '%';\r | |
981 | break;\r | |
982 | default:\r | |
983 | appendstring(p);\r | |
984 | goto end;\r | |
985 | }\r | |
986 | } else\r | |
987 | *s++ = *f;\r | |
988 | }\r | |
989 | \r | |
990 | end:\r | |
991 | if (callresults)\r | |
992 | PyObject_Free(callresults);\r | |
993 | if (abuffer)\r | |
994 | PyObject_Free(abuffer);\r | |
995 | PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));\r | |
996 | return string;\r | |
997 | fail:\r | |
998 | if (callresults) {\r | |
999 | PyObject **callresult2 = callresults;\r | |
1000 | while (callresult2 < callresult) {\r | |
1001 | Py_DECREF(*callresult2);\r | |
1002 | ++callresult2;\r | |
1003 | }\r | |
1004 | PyObject_Free(callresults);\r | |
1005 | }\r | |
1006 | if (abuffer)\r | |
1007 | PyObject_Free(abuffer);\r | |
1008 | return NULL;\r | |
1009 | }\r | |
1010 | \r | |
1011 | #undef appendstring\r | |
1012 | \r | |
1013 | PyObject *\r | |
1014 | PyUnicode_FromFormat(const char *format, ...)\r | |
1015 | {\r | |
1016 | PyObject* ret;\r | |
1017 | va_list vargs;\r | |
1018 | \r | |
1019 | #ifdef HAVE_STDARG_PROTOTYPES\r | |
1020 | va_start(vargs, format);\r | |
1021 | #else\r | |
1022 | va_start(vargs);\r | |
1023 | #endif\r | |
1024 | ret = PyUnicode_FromFormatV(format, vargs);\r | |
1025 | va_end(vargs);\r | |
1026 | return ret;\r | |
1027 | }\r | |
1028 | \r | |
1029 | Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,\r | |
1030 | wchar_t *w,\r | |
1031 | Py_ssize_t size)\r | |
1032 | {\r | |
1033 | if (unicode == NULL) {\r | |
1034 | PyErr_BadInternalCall();\r | |
1035 | return -1;\r | |
1036 | }\r | |
1037 | \r | |
1038 | /* If possible, try to copy the 0-termination as well */\r | |
1039 | if (size > PyUnicode_GET_SIZE(unicode))\r | |
1040 | size = PyUnicode_GET_SIZE(unicode) + 1;\r | |
1041 | \r | |
1042 | #ifdef HAVE_USABLE_WCHAR_T\r | |
1043 | memcpy(w, unicode->str, size * sizeof(wchar_t));\r | |
1044 | #else\r | |
1045 | {\r | |
1046 | register Py_UNICODE *u;\r | |
1047 | register Py_ssize_t i;\r | |
1048 | u = PyUnicode_AS_UNICODE(unicode);\r | |
1049 | for (i = size; i > 0; i--)\r | |
1050 | *w++ = *u++;\r | |
1051 | }\r | |
1052 | #endif\r | |
1053 | \r | |
1054 | if (size > PyUnicode_GET_SIZE(unicode))\r | |
1055 | return PyUnicode_GET_SIZE(unicode);\r | |
1056 | else\r | |
1057 | return size;\r | |
1058 | }\r | |
1059 | \r | |
1060 | #endif\r | |
1061 | \r | |
1062 | PyObject *PyUnicode_FromOrdinal(int ordinal)\r | |
1063 | {\r | |
1064 | Py_UNICODE s[1];\r | |
1065 | \r | |
1066 | #ifdef Py_UNICODE_WIDE\r | |
1067 | if (ordinal < 0 || ordinal > 0x10ffff) {\r | |
1068 | PyErr_SetString(PyExc_ValueError,\r | |
1069 | "unichr() arg not in range(0x110000) "\r | |
1070 | "(wide Python build)");\r | |
1071 | return NULL;\r | |
1072 | }\r | |
1073 | #else\r | |
1074 | if (ordinal < 0 || ordinal > 0xffff) {\r | |
1075 | PyErr_SetString(PyExc_ValueError,\r | |
1076 | "unichr() arg not in range(0x10000) "\r | |
1077 | "(narrow Python build)");\r | |
1078 | return NULL;\r | |
1079 | }\r | |
1080 | #endif\r | |
1081 | \r | |
1082 | s[0] = (Py_UNICODE)ordinal;\r | |
1083 | return PyUnicode_FromUnicode(s, 1);\r | |
1084 | }\r | |
1085 | \r | |
1086 | PyObject *PyUnicode_FromObject(register PyObject *obj)\r | |
1087 | {\r | |
1088 | /* XXX Perhaps we should make this API an alias of\r | |
1089 | PyObject_Unicode() instead ?! */\r | |
1090 | if (PyUnicode_CheckExact(obj)) {\r | |
1091 | Py_INCREF(obj);\r | |
1092 | return obj;\r | |
1093 | }\r | |
1094 | if (PyUnicode_Check(obj)) {\r | |
1095 | /* For a Unicode subtype that's not a Unicode object,\r | |
1096 | return a true Unicode object with the same data. */\r | |
1097 | return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),\r | |
1098 | PyUnicode_GET_SIZE(obj));\r | |
1099 | }\r | |
1100 | return PyUnicode_FromEncodedObject(obj, NULL, "strict");\r | |
1101 | }\r | |
1102 | \r | |
1103 | PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,\r | |
1104 | const char *encoding,\r | |
1105 | const char *errors)\r | |
1106 | {\r | |
1107 | const char *s = NULL;\r | |
1108 | Py_ssize_t len;\r | |
1109 | PyObject *v;\r | |
1110 | \r | |
1111 | if (obj == NULL) {\r | |
1112 | PyErr_BadInternalCall();\r | |
1113 | return NULL;\r | |
1114 | }\r | |
1115 | \r | |
1116 | #if 0\r | |
1117 | /* For b/w compatibility we also accept Unicode objects provided\r | |
1118 | that no encodings is given and then redirect to\r | |
1119 | PyObject_Unicode() which then applies the additional logic for\r | |
1120 | Unicode subclasses.\r | |
1121 | \r | |
1122 | NOTE: This API should really only be used for object which\r | |
1123 | represent *encoded* Unicode !\r | |
1124 | \r | |
1125 | */\r | |
1126 | if (PyUnicode_Check(obj)) {\r | |
1127 | if (encoding) {\r | |
1128 | PyErr_SetString(PyExc_TypeError,\r | |
1129 | "decoding Unicode is not supported");\r | |
1130 | return NULL;\r | |
1131 | }\r | |
1132 | return PyObject_Unicode(obj);\r | |
1133 | }\r | |
1134 | #else\r | |
1135 | if (PyUnicode_Check(obj)) {\r | |
1136 | PyErr_SetString(PyExc_TypeError,\r | |
1137 | "decoding Unicode is not supported");\r | |
1138 | return NULL;\r | |
1139 | }\r | |
1140 | #endif\r | |
1141 | \r | |
1142 | /* Coerce object */\r | |
1143 | if (PyString_Check(obj)) {\r | |
1144 | s = PyString_AS_STRING(obj);\r | |
1145 | len = PyString_GET_SIZE(obj);\r | |
1146 | }\r | |
1147 | else if (PyByteArray_Check(obj)) {\r | |
1148 | /* Python 2.x specific */\r | |
1149 | PyErr_Format(PyExc_TypeError,\r | |
1150 | "decoding bytearray is not supported");\r | |
1151 | return NULL;\r | |
1152 | }\r | |
1153 | else if (PyObject_AsCharBuffer(obj, &s, &len)) {\r | |
1154 | /* Overwrite the error message with something more useful in\r | |
1155 | case of a TypeError. */\r | |
1156 | if (PyErr_ExceptionMatches(PyExc_TypeError))\r | |
1157 | PyErr_Format(PyExc_TypeError,\r | |
1158 | "coercing to Unicode: need string or buffer, "\r | |
1159 | "%.80s found",\r | |
1160 | Py_TYPE(obj)->tp_name);\r | |
1161 | goto onError;\r | |
1162 | }\r | |
1163 | \r | |
1164 | /* Convert to Unicode */\r | |
1165 | if (len == 0) {\r | |
1166 | Py_INCREF(unicode_empty);\r | |
1167 | v = (PyObject *)unicode_empty;\r | |
1168 | }\r | |
1169 | else\r | |
1170 | v = PyUnicode_Decode(s, len, encoding, errors);\r | |
1171 | \r | |
1172 | return v;\r | |
1173 | \r | |
1174 | onError:\r | |
1175 | return NULL;\r | |
1176 | }\r | |
1177 | \r | |
1178 | PyObject *PyUnicode_Decode(const char *s,\r | |
1179 | Py_ssize_t size,\r | |
1180 | const char *encoding,\r | |
1181 | const char *errors)\r | |
1182 | {\r | |
1183 | PyObject *buffer = NULL, *unicode;\r | |
1184 | \r | |
1185 | if (encoding == NULL)\r | |
1186 | encoding = PyUnicode_GetDefaultEncoding();\r | |
1187 | \r | |
1188 | /* Shortcuts for common default encodings */\r | |
1189 | if (strcmp(encoding, "utf-8") == 0)\r | |
1190 | return PyUnicode_DecodeUTF8(s, size, errors);\r | |
1191 | else if (strcmp(encoding, "latin-1") == 0)\r | |
1192 | return PyUnicode_DecodeLatin1(s, size, errors);\r | |
1193 | #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)\r | |
1194 | else if (strcmp(encoding, "mbcs") == 0)\r | |
1195 | return PyUnicode_DecodeMBCS(s, size, errors);\r | |
1196 | #endif\r | |
1197 | else if (strcmp(encoding, "ascii") == 0)\r | |
1198 | return PyUnicode_DecodeASCII(s, size, errors);\r | |
1199 | \r | |
1200 | /* Decode via the codec registry */\r | |
1201 | buffer = PyBuffer_FromMemory((void *)s, size);\r | |
1202 | if (buffer == NULL)\r | |
1203 | goto onError;\r | |
1204 | unicode = PyCodec_Decode(buffer, encoding, errors);\r | |
1205 | if (unicode == NULL)\r | |
1206 | goto onError;\r | |
1207 | if (!PyUnicode_Check(unicode)) {\r | |
1208 | PyErr_Format(PyExc_TypeError,\r | |
1209 | "decoder did not return an unicode object (type=%.400s)",\r | |
1210 | Py_TYPE(unicode)->tp_name);\r | |
1211 | Py_DECREF(unicode);\r | |
1212 | goto onError;\r | |
1213 | }\r | |
1214 | Py_DECREF(buffer);\r | |
1215 | return unicode;\r | |
1216 | \r | |
1217 | onError:\r | |
1218 | Py_XDECREF(buffer);\r | |
1219 | return NULL;\r | |
1220 | }\r | |
1221 | \r | |
1222 | PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,\r | |
1223 | const char *encoding,\r | |
1224 | const char *errors)\r | |
1225 | {\r | |
1226 | PyObject *v;\r | |
1227 | \r | |
1228 | if (!PyUnicode_Check(unicode)) {\r | |
1229 | PyErr_BadArgument();\r | |
1230 | goto onError;\r | |
1231 | }\r | |
1232 | \r | |
1233 | if (encoding == NULL)\r | |
1234 | encoding = PyUnicode_GetDefaultEncoding();\r | |
1235 | \r | |
1236 | /* Decode via the codec registry */\r | |
1237 | v = PyCodec_Decode(unicode, encoding, errors);\r | |
1238 | if (v == NULL)\r | |
1239 | goto onError;\r | |
1240 | return v;\r | |
1241 | \r | |
1242 | onError:\r | |
1243 | return NULL;\r | |
1244 | }\r | |
1245 | \r | |
1246 | PyObject *PyUnicode_Encode(const Py_UNICODE *s,\r | |
1247 | Py_ssize_t size,\r | |
1248 | const char *encoding,\r | |
1249 | const char *errors)\r | |
1250 | {\r | |
1251 | PyObject *v, *unicode;\r | |
1252 | \r | |
1253 | unicode = PyUnicode_FromUnicode(s, size);\r | |
1254 | if (unicode == NULL)\r | |
1255 | return NULL;\r | |
1256 | v = PyUnicode_AsEncodedString(unicode, encoding, errors);\r | |
1257 | Py_DECREF(unicode);\r | |
1258 | return v;\r | |
1259 | }\r | |
1260 | \r | |
1261 | PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,\r | |
1262 | const char *encoding,\r | |
1263 | const char *errors)\r | |
1264 | {\r | |
1265 | PyObject *v;\r | |
1266 | \r | |
1267 | if (!PyUnicode_Check(unicode)) {\r | |
1268 | PyErr_BadArgument();\r | |
1269 | goto onError;\r | |
1270 | }\r | |
1271 | \r | |
1272 | if (encoding == NULL)\r | |
1273 | encoding = PyUnicode_GetDefaultEncoding();\r | |
1274 | \r | |
1275 | /* Encode via the codec registry */\r | |
1276 | v = PyCodec_Encode(unicode, encoding, errors);\r | |
1277 | if (v == NULL)\r | |
1278 | goto onError;\r | |
1279 | return v;\r | |
1280 | \r | |
1281 | onError:\r | |
1282 | return NULL;\r | |
1283 | }\r | |
1284 | \r | |
1285 | PyObject *PyUnicode_AsEncodedString(PyObject *unicode,\r | |
1286 | const char *encoding,\r | |
1287 | const char *errors)\r | |
1288 | {\r | |
1289 | PyObject *v;\r | |
1290 | \r | |
1291 | if (!PyUnicode_Check(unicode)) {\r | |
1292 | PyErr_BadArgument();\r | |
1293 | goto onError;\r | |
1294 | }\r | |
1295 | \r | |
1296 | if (encoding == NULL)\r | |
1297 | encoding = PyUnicode_GetDefaultEncoding();\r | |
1298 | \r | |
1299 | /* Shortcuts for common default encodings */\r | |
1300 | if (errors == NULL) {\r | |
1301 | if (strcmp(encoding, "utf-8") == 0)\r | |
1302 | return PyUnicode_AsUTF8String(unicode);\r | |
1303 | else if (strcmp(encoding, "latin-1") == 0)\r | |
1304 | return PyUnicode_AsLatin1String(unicode);\r | |
1305 | #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)\r | |
1306 | else if (strcmp(encoding, "mbcs") == 0)\r | |
1307 | return PyUnicode_AsMBCSString(unicode);\r | |
1308 | #endif\r | |
1309 | else if (strcmp(encoding, "ascii") == 0)\r | |
1310 | return PyUnicode_AsASCIIString(unicode);\r | |
1311 | }\r | |
1312 | \r | |
1313 | /* Encode via the codec registry */\r | |
1314 | v = PyCodec_Encode(unicode, encoding, errors);\r | |
1315 | if (v == NULL)\r | |
1316 | goto onError;\r | |
1317 | if (!PyString_Check(v)) {\r | |
1318 | PyErr_Format(PyExc_TypeError,\r | |
1319 | "encoder did not return a string object (type=%.400s)",\r | |
1320 | Py_TYPE(v)->tp_name);\r | |
1321 | Py_DECREF(v);\r | |
1322 | goto onError;\r | |
1323 | }\r | |
1324 | return v;\r | |
1325 | \r | |
1326 | onError:\r | |
1327 | return NULL;\r | |
1328 | }\r | |
1329 | \r | |
1330 | PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,\r | |
1331 | const char *errors)\r | |
1332 | {\r | |
1333 | PyObject *v = ((PyUnicodeObject *)unicode)->defenc;\r | |
1334 | \r | |
1335 | if (v)\r | |
1336 | return v;\r | |
1337 | v = PyUnicode_AsEncodedString(unicode, NULL, errors);\r | |
1338 | if (v && errors == NULL)\r | |
1339 | ((PyUnicodeObject *)unicode)->defenc = v;\r | |
1340 | return v;\r | |
1341 | }\r | |
1342 | \r | |
1343 | Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)\r | |
1344 | {\r | |
1345 | if (!PyUnicode_Check(unicode)) {\r | |
1346 | PyErr_BadArgument();\r | |
1347 | goto onError;\r | |
1348 | }\r | |
1349 | return PyUnicode_AS_UNICODE(unicode);\r | |
1350 | \r | |
1351 | onError:\r | |
1352 | return NULL;\r | |
1353 | }\r | |
1354 | \r | |
1355 | Py_ssize_t PyUnicode_GetSize(PyObject *unicode)\r | |
1356 | {\r | |
1357 | if (!PyUnicode_Check(unicode)) {\r | |
1358 | PyErr_BadArgument();\r | |
1359 | goto onError;\r | |
1360 | }\r | |
1361 | return PyUnicode_GET_SIZE(unicode);\r | |
1362 | \r | |
1363 | onError:\r | |
1364 | return -1;\r | |
1365 | }\r | |
1366 | \r | |
1367 | const char *PyUnicode_GetDefaultEncoding(void)\r | |
1368 | {\r | |
1369 | return unicode_default_encoding;\r | |
1370 | }\r | |
1371 | \r | |
1372 | int PyUnicode_SetDefaultEncoding(const char *encoding)\r | |
1373 | {\r | |
1374 | PyObject *v;\r | |
1375 | \r | |
1376 | /* Make sure the encoding is valid. As side effect, this also\r | |
1377 | loads the encoding into the codec registry cache. */\r | |
1378 | v = _PyCodec_Lookup(encoding);\r | |
1379 | if (v == NULL)\r | |
1380 | goto onError;\r | |
1381 | Py_DECREF(v);\r | |
1382 | strncpy(unicode_default_encoding,\r | |
1383 | encoding,\r | |
1384 | sizeof(unicode_default_encoding));\r | |
1385 | return 0;\r | |
1386 | \r | |
1387 | onError:\r | |
1388 | return -1;\r | |
1389 | }\r | |
1390 | \r | |
1391 | /* error handling callback helper:\r | |
1392 | build arguments, call the callback and check the arguments,\r | |
1393 | if no exception occurred, copy the replacement to the output\r | |
1394 | and adjust various state variables.\r | |
1395 | return 0 on success, -1 on error\r | |
1396 | */\r | |
1397 | \r | |
1398 | static\r | |
1399 | int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,\r | |
1400 | const char *encoding, const char *reason,\r | |
1401 | const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,\r | |
1402 | Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,\r | |
1403 | PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)\r | |
1404 | {\r | |
1405 | static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";\r | |
1406 | \r | |
1407 | PyObject *restuple = NULL;\r | |
1408 | PyObject *repunicode = NULL;\r | |
1409 | Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);\r | |
1410 | Py_ssize_t requiredsize;\r | |
1411 | Py_ssize_t newpos;\r | |
1412 | Py_UNICODE *repptr;\r | |
1413 | Py_ssize_t repsize;\r | |
1414 | int res = -1;\r | |
1415 | \r | |
1416 | if (*errorHandler == NULL) {\r | |
1417 | *errorHandler = PyCodec_LookupError(errors);\r | |
1418 | if (*errorHandler == NULL)\r | |
1419 | goto onError;\r | |
1420 | }\r | |
1421 | \r | |
1422 | if (*exceptionObject == NULL) {\r | |
1423 | *exceptionObject = PyUnicodeDecodeError_Create(\r | |
1424 | encoding, input, insize, *startinpos, *endinpos, reason);\r | |
1425 | if (*exceptionObject == NULL)\r | |
1426 | goto onError;\r | |
1427 | }\r | |
1428 | else {\r | |
1429 | if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))\r | |
1430 | goto onError;\r | |
1431 | if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))\r | |
1432 | goto onError;\r | |
1433 | if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))\r | |
1434 | goto onError;\r | |
1435 | }\r | |
1436 | \r | |
1437 | restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);\r | |
1438 | if (restuple == NULL)\r | |
1439 | goto onError;\r | |
1440 | if (!PyTuple_Check(restuple)) {\r | |
1441 | PyErr_SetString(PyExc_TypeError, &argparse[4]);\r | |
1442 | goto onError;\r | |
1443 | }\r | |
1444 | if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))\r | |
1445 | goto onError;\r | |
1446 | if (newpos<0)\r | |
1447 | newpos = insize+newpos;\r | |
1448 | if (newpos<0 || newpos>insize) {\r | |
1449 | PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);\r | |
1450 | goto onError;\r | |
1451 | }\r | |
1452 | \r | |
1453 | /* need more space? (at least enough for what we\r | |
1454 | have+the replacement+the rest of the string (starting\r | |
1455 | at the new input position), so we won't have to check space\r | |
1456 | when there are no errors in the rest of the string) */\r | |
1457 | repptr = PyUnicode_AS_UNICODE(repunicode);\r | |
1458 | repsize = PyUnicode_GET_SIZE(repunicode);\r | |
1459 | requiredsize = *outpos + repsize + insize-newpos;\r | |
1460 | if (requiredsize > outsize) {\r | |
1461 | if (requiredsize<2*outsize)\r | |
1462 | requiredsize = 2*outsize;\r | |
1463 | if (_PyUnicode_Resize(output, requiredsize) < 0)\r | |
1464 | goto onError;\r | |
1465 | *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;\r | |
1466 | }\r | |
1467 | *endinpos = newpos;\r | |
1468 | *inptr = input + newpos;\r | |
1469 | Py_UNICODE_COPY(*outptr, repptr, repsize);\r | |
1470 | *outptr += repsize;\r | |
1471 | *outpos += repsize;\r | |
1472 | /* we made it! */\r | |
1473 | res = 0;\r | |
1474 | \r | |
1475 | onError:\r | |
1476 | Py_XDECREF(restuple);\r | |
1477 | return res;\r | |
1478 | }\r | |
1479 | \r | |
1480 | /* --- UTF-7 Codec -------------------------------------------------------- */\r | |
1481 | \r | |
1482 | /* See RFC2152 for details. We encode conservatively and decode liberally. */\r | |
1483 | \r | |
1484 | /* Three simple macros defining base-64. */\r | |
1485 | \r | |
/* Is c a base-64 character?
 *
 * Written as explicit ASCII range checks instead of isalnum(): isalnum()
 * depends on the current C locale (so bytes above 127 could be reported
 * as base-64 characters) and is undefined for values that do not fit in
 * an unsigned char.  The ranges match the ones FROM_BASE64 relies on.
 *
 * The argument is evaluated more than once; pass an expression without
 * side effects. */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')
1490 | \r | |
/* Map a base-64 character c to its 6-bit value:
 *   'A'..'Z' ->  0..25
 *   'a'..'z' -> 26..51
 *   '0'..'9' -> 52..61
 *   '+'      -> 62
 *   anything else (expected to be '/') -> 63
 * The caller is responsible for checking that c is a base-64 character. */

#define FROM_BASE64(c)                                      \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A'                 \
     : ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26          \
     : ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52          \
     : (c) == '+' ? 62                                      \
     : 63)
1498 | \r | |
/* Base-64 alphabet character for the low 6 bits of n; any higher bits
 * of n are masked off before the lookup. */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 63])
1503 | \r | |
/* DECODE_DIRECT: true when a byte found in a UTF-7 string outside a
 * base64 section stands for itself.  Decoding is permissive: every
 * ASCII byte qualifies except '+', which opens a base64 section. */

#define DECODE_DIRECT(c) \
    ((c) < 128 && (c) != '+')
1511 | \r | |
1512 | /* The UTF-7 encoder treats ASCII characters differently according to\r | |
1513 | * whether they are Set D, Set O, Whitespace, or special (i.e. none of\r | |
1514 | * the above). See RFC2152. This array identifies these different\r | |
1515 | * sets:\r | |
1516 | * 0 : "Set D"\r | |
1517 | * alphanumeric and '(),-./:?\r | |
1518 | * 1 : "Set O"\r | |
1519 | * !"#$%&*;<=>@[]^_`{|}\r | |
1520 | * 2 : "whitespace"\r | |
1521 | * ht nl cr sp\r | |
1522 | * 3 : special (must be base64 encoded)\r | |
1523 | * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)\r | |
1524 | */\r | |
1525 | \r | |
/* Category of each ASCII code for the UTF-7 encoder:
 * 0 = Set D, 1 = Set O, 2 = whitespace, 3 = special (must be base64
 * encoded).  The table is only ever read, so it is declared const. */
static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [  bsl  ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c must be in 1..127 to qualify at all (the range test also guards the
 * table lookup); Set D characters always qualify, whitespace only when
 * directWS is true, Set O only when directO is true.  The argument c is
 * evaluated more than once. */

#define ENCODE_DIRECT(c, directO, directWS) \
    ((c) < 128 && (c) > 0 && \
     ((utf7_category[(c)] == 0) || \
      (directWS && (utf7_category[(c)] == 2)) || \
      (directO && (utf7_category[(c)] == 1))))
1557 | \r | |
1558 | PyObject *PyUnicode_DecodeUTF7(const char *s,\r | |
1559 | Py_ssize_t size,\r | |
1560 | const char *errors)\r | |
1561 | {\r | |
1562 | return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);\r | |
1563 | }\r | |
1564 | \r | |
1565 | /* The decoder. The only state we preserve is our read position,\r | |
1566 | * i.e. how many characters we have consumed. So if we end in the\r | |
1567 | * middle of a shift sequence we have to back off the read position\r | |
1568 | * and the output to the beginning of the sequence, otherwise we lose\r | |
1569 | * all the shift state (seen bits, number of bits seen, high\r | |
1570 | * surrogate). */\r | |
1571 | \r | |
1572 | PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,\r | |
1573 | Py_ssize_t size,\r | |
1574 | const char *errors,\r | |
1575 | Py_ssize_t *consumed)\r | |
1576 | {\r | |
1577 | const char *starts = s;\r | |
1578 | Py_ssize_t startinpos;\r | |
1579 | Py_ssize_t endinpos;\r | |
1580 | Py_ssize_t outpos;\r | |
1581 | const char *e;\r | |
1582 | PyUnicodeObject *unicode;\r | |
1583 | Py_UNICODE *p;\r | |
1584 | const char *errmsg = "";\r | |
1585 | int inShift = 0;\r | |
1586 | Py_UNICODE *shiftOutStart;\r | |
1587 | unsigned int base64bits = 0;\r | |
1588 | unsigned long base64buffer = 0;\r | |
1589 | Py_UNICODE surrogate = 0;\r | |
1590 | PyObject *errorHandler = NULL;\r | |
1591 | PyObject *exc = NULL;\r | |
1592 | \r | |
1593 | unicode = _PyUnicode_New(size);\r | |
1594 | if (!unicode)\r | |
1595 | return NULL;\r | |
1596 | if (size == 0) {\r | |
1597 | if (consumed)\r | |
1598 | *consumed = 0;\r | |
1599 | return (PyObject *)unicode;\r | |
1600 | }\r | |
1601 | \r | |
1602 | p = unicode->str;\r | |
1603 | shiftOutStart = p;\r | |
1604 | e = s + size;\r | |
1605 | \r | |
1606 | while (s < e) {\r | |
1607 | Py_UNICODE ch = (unsigned char) *s;\r | |
1608 | \r | |
1609 | if (inShift) { /* in a base-64 section */\r | |
1610 | if (IS_BASE64(ch)) { /* consume a base-64 character */\r | |
1611 | base64buffer = (base64buffer << 6) | FROM_BASE64(ch);\r | |
1612 | base64bits += 6;\r | |
1613 | s++;\r | |
1614 | if (base64bits >= 16) {\r | |
1615 | /* we have enough bits for a UTF-16 value */\r | |
1616 | Py_UNICODE outCh = (Py_UNICODE)\r | |
1617 | (base64buffer >> (base64bits-16));\r | |
1618 | base64bits -= 16;\r | |
1619 | base64buffer &= (1 << base64bits) - 1; /* clear high bits */\r | |
1620 | if (surrogate) {\r | |
1621 | /* expecting a second surrogate */\r | |
1622 | if (outCh >= 0xDC00 && outCh <= 0xDFFF) {\r | |
1623 | #ifdef Py_UNICODE_WIDE\r | |
1624 | *p++ = (((surrogate & 0x3FF)<<10)\r | |
1625 | | (outCh & 0x3FF)) + 0x10000;\r | |
1626 | #else\r | |
1627 | *p++ = surrogate;\r | |
1628 | *p++ = outCh;\r | |
1629 | #endif\r | |
1630 | surrogate = 0;\r | |
1631 | }\r | |
1632 | else {\r | |
1633 | surrogate = 0;\r | |
1634 | errmsg = "second surrogate missing";\r | |
1635 | goto utf7Error;\r | |
1636 | }\r | |
1637 | }\r | |
1638 | else if (outCh >= 0xD800 && outCh <= 0xDBFF) {\r | |
1639 | /* first surrogate */\r | |
1640 | surrogate = outCh;\r | |
1641 | }\r | |
1642 | else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {\r | |
1643 | errmsg = "unexpected second surrogate";\r | |
1644 | goto utf7Error;\r | |
1645 | }\r | |
1646 | else {\r | |
1647 | *p++ = outCh;\r | |
1648 | }\r | |
1649 | }\r | |
1650 | }\r | |
1651 | else { /* now leaving a base-64 section */\r | |
1652 | inShift = 0;\r | |
1653 | s++;\r | |
1654 | if (surrogate) {\r | |
1655 | errmsg = "second surrogate missing at end of shift sequence";\r | |
1656 | goto utf7Error;\r | |
1657 | }\r | |
1658 | if (base64bits > 0) { /* left-over bits */\r | |
1659 | if (base64bits >= 6) {\r | |
1660 | /* We've seen at least one base-64 character */\r | |
1661 | errmsg = "partial character in shift sequence";\r | |
1662 | goto utf7Error;\r | |
1663 | }\r | |
1664 | else {\r | |
1665 | /* Some bits remain; they should be zero */\r | |
1666 | if (base64buffer != 0) {\r | |
1667 | errmsg = "non-zero padding bits in shift sequence";\r | |
1668 | goto utf7Error;\r | |
1669 | }\r | |
1670 | }\r | |
1671 | }\r | |
1672 | if (ch != '-') {\r | |
1673 | /* '-' is absorbed; other terminating\r | |
1674 | characters are preserved */\r | |
1675 | *p++ = ch;\r | |
1676 | }\r | |
1677 | }\r | |
1678 | }\r | |
1679 | else if ( ch == '+' ) {\r | |
1680 | startinpos = s-starts;\r | |
1681 | s++; /* consume '+' */\r | |
1682 | if (s < e && *s == '-') { /* '+-' encodes '+' */\r | |
1683 | s++;\r | |
1684 | *p++ = '+';\r | |
1685 | }\r | |
1686 | else { /* begin base64-encoded section */\r | |
1687 | inShift = 1;\r | |
1688 | shiftOutStart = p;\r | |
1689 | base64bits = 0;\r | |
1690 | }\r | |
1691 | }\r | |
1692 | else if (DECODE_DIRECT(ch)) { /* character decodes as itself */\r | |
1693 | *p++ = ch;\r | |
1694 | s++;\r | |
1695 | }\r | |
1696 | else {\r | |
1697 | startinpos = s-starts;\r | |
1698 | s++;\r | |
1699 | errmsg = "unexpected special character";\r | |
1700 | goto utf7Error;\r | |
1701 | }\r | |
1702 | continue;\r | |
1703 | utf7Error:\r | |
1704 | outpos = p-PyUnicode_AS_UNICODE(unicode);\r | |
1705 | endinpos = s-starts;\r | |
1706 | if (unicode_decode_call_errorhandler(\r | |
1707 | errors, &errorHandler,\r | |
1708 | "utf7", errmsg,\r | |
1709 | starts, size, &startinpos, &endinpos, &exc, &s,\r | |
1710 | &unicode, &outpos, &p))\r | |
1711 | goto onError;\r | |
1712 | }\r | |
1713 | \r | |
1714 | /* end of string */\r | |
1715 | \r | |
1716 | if (inShift && !consumed) { /* in shift sequence, no more to follow */\r | |
1717 | /* if we're in an inconsistent state, that's an error */\r | |
1718 | if (surrogate ||\r | |
1719 | (base64bits >= 6) ||\r | |
1720 | (base64bits > 0 && base64buffer != 0)) {\r | |
1721 | outpos = p-PyUnicode_AS_UNICODE(unicode);\r | |
1722 | endinpos = size;\r | |
1723 | if (unicode_decode_call_errorhandler(\r | |
1724 | errors, &errorHandler,\r | |
1725 | "utf7", "unterminated shift sequence",\r | |
1726 | starts, size, &startinpos, &endinpos, &exc, &s,\r | |
1727 | &unicode, &outpos, &p))\r | |
1728 | goto onError;\r | |
1729 | }\r | |
1730 | }\r | |
1731 | \r | |
1732 | /* return state */\r | |
1733 | if (consumed) {\r | |
1734 | if (inShift) {\r | |
1735 | p = shiftOutStart; /* back off output */\r | |
1736 | *consumed = startinpos;\r | |
1737 | }\r | |
1738 | else {\r | |
1739 | *consumed = s-starts;\r | |
1740 | }\r | |
1741 | }\r | |
1742 | \r | |
1743 | if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)\r | |
1744 | goto onError;\r | |
1745 | \r | |
1746 | Py_XDECREF(errorHandler);\r | |
1747 | Py_XDECREF(exc);\r | |
1748 | return (PyObject *)unicode;\r | |
1749 | \r | |
1750 | onError:\r | |
1751 | Py_XDECREF(errorHandler);\r | |
1752 | Py_XDECREF(exc);\r | |
1753 | Py_DECREF(unicode);\r | |
1754 | return NULL;\r | |
1755 | }\r | |
1756 | \r | |
1757 | \r | |
1758 | PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,\r | |
1759 | Py_ssize_t size,\r | |
1760 | int base64SetO,\r | |
1761 | int base64WhiteSpace,\r | |
1762 | const char *errors)\r | |
1763 | {\r | |
1764 | PyObject *v;\r | |
1765 | /* It might be possible to tighten this worst case */\r | |
1766 | Py_ssize_t allocated = 8 * size;\r | |
1767 | int inShift = 0;\r | |
1768 | Py_ssize_t i = 0;\r | |
1769 | unsigned int base64bits = 0;\r | |
1770 | unsigned long base64buffer = 0;\r | |
1771 | char * out;\r | |
1772 | char * start;\r | |
1773 | \r | |
1774 | if (allocated / 8 != size)\r | |
1775 | return PyErr_NoMemory();\r | |
1776 | \r | |
1777 | if (size == 0)\r | |
1778 | return PyString_FromStringAndSize(NULL, 0);\r | |
1779 | \r | |
1780 | v = PyString_FromStringAndSize(NULL, allocated);\r | |
1781 | if (v == NULL)\r | |
1782 | return NULL;\r | |
1783 | \r | |
1784 | start = out = PyString_AS_STRING(v);\r | |
1785 | for (;i < size; ++i) {\r | |
1786 | Py_UNICODE ch = s[i];\r | |
1787 | \r | |
1788 | if (inShift) {\r | |
1789 | if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {\r | |
1790 | /* shifting out */\r | |
1791 | if (base64bits) { /* output remaining bits */\r | |
1792 | *out++ = TO_BASE64(base64buffer << (6-base64bits));\r | |
1793 | base64buffer = 0;\r | |
1794 | base64bits = 0;\r | |
1795 | }\r | |
1796 | inShift = 0;\r | |
1797 | /* Characters not in the BASE64 set implicitly unshift the sequence\r | |
1798 | so no '-' is required, except if the character is itself a '-' */\r | |
1799 | if (IS_BASE64(ch) || ch == '-') {\r | |
1800 | *out++ = '-';\r | |
1801 | }\r | |
1802 | *out++ = (char) ch;\r | |
1803 | }\r | |
1804 | else {\r | |
1805 | goto encode_char;\r | |
1806 | }\r | |
1807 | }\r | |
1808 | else { /* not in a shift sequence */\r | |
1809 | if (ch == '+') {\r | |
1810 | *out++ = '+';\r | |
1811 | *out++ = '-';\r | |
1812 | }\r | |
1813 | else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {\r | |
1814 | *out++ = (char) ch;\r | |
1815 | }\r | |
1816 | else {\r | |
1817 | *out++ = '+';\r | |
1818 | inShift = 1;\r | |
1819 | goto encode_char;\r | |
1820 | }\r | |
1821 | }\r | |
1822 | continue;\r | |
1823 | encode_char:\r | |
1824 | #ifdef Py_UNICODE_WIDE\r | |
1825 | if (ch >= 0x10000) {\r | |
1826 | /* code first surrogate */\r | |
1827 | base64bits += 16;\r | |
1828 | base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);\r | |
1829 | while (base64bits >= 6) {\r | |
1830 | *out++ = TO_BASE64(base64buffer >> (base64bits-6));\r | |
1831 | base64bits -= 6;\r | |
1832 | }\r | |
1833 | /* prepare second surrogate */\r | |
1834 | ch = 0xDC00 | ((ch-0x10000) & 0x3FF);\r | |
1835 | }\r | |
1836 | #endif\r | |
1837 | base64bits += 16;\r | |
1838 | base64buffer = (base64buffer << 16) | ch;\r | |
1839 | while (base64bits >= 6) {\r | |
1840 | *out++ = TO_BASE64(base64buffer >> (base64bits-6));\r | |
1841 | base64bits -= 6;\r | |
1842 | }\r | |
1843 | }\r | |
1844 | if (base64bits)\r | |
1845 | *out++= TO_BASE64(base64buffer << (6-base64bits) );\r | |
1846 | if (inShift)\r | |
1847 | *out++ = '-';\r | |
1848 | \r | |
1849 | if (_PyString_Resize(&v, out - start))\r | |
1850 | return NULL;\r | |
1851 | return v;\r | |
1852 | }\r | |
1853 | \r | |
1854 | #undef IS_BASE64\r | |
1855 | #undef FROM_BASE64\r | |
1856 | #undef TO_BASE64\r | |
1857 | #undef DECODE_DIRECT\r | |
1858 | #undef ENCODE_DIRECT\r | |
1859 | \r | |
1860 | /* --- UTF-8 Codec -------------------------------------------------------- */\r | |
1861 | \r | |
/* Lookup table indexed by the first byte of a UTF-8 sequence; the entry is
   the total length of that sequence in bytes.  An entry of 0 marks a byte
   that may not start a sequence.  See RFC 3629 for details. */
static
char utf8_code_length[256] = {
    /* 00-7F: single-byte (ASCII) */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,   /* 00-0F */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,   /* 10-1F */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,   /* 20-2F */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,   /* 30-3F */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,   /* 40-4F */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,   /* 50-5F */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,   /* 60-6F */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,   /* 70-7F */
    /* 80-BF: continuation bytes, never a valid prefix */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,   /* 80-8F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,   /* 90-9F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,   /* A0-AF */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,   /* B0-BF */
    /* C0-C1 illegal, C2-DF: two-byte sequences */
    0,0,2,2,2,2,2,2,2,2,2,2,2,2,2,2,   /* C0-CF */
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,   /* D0-DF */
    /* E0-EF: three-byte sequences */
    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,   /* E0-EF */
    /* F0-F4: four-byte sequences, F5-FF illegal */
    4,4,4,4,4,0,0,0,0,0,0,0,0,0,0,0    /* F0-FF */
};
1883 | \r | |
1884 | PyObject *PyUnicode_DecodeUTF8(const char *s,\r | |
1885 | Py_ssize_t size,\r | |
1886 | const char *errors)\r | |
1887 | {\r | |
1888 | return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);\r | |
1889 | }\r | |
1890 | \r | |
/* Decode `size` bytes of UTF-8 starting at `s` into a new unicode object.

   s         input bytes
   size      number of input bytes
   errors    error-handling name, forwarded to
             unicode_decode_call_errorhandler()
   consumed  may be NULL.  When non-NULL, a sequence cut off by the end of
             the input is not an error: decoding stops in front of it and
             *consumed receives the number of bytes that were decoded.

   Returns the new object, or NULL after releasing it when the error
   handler call or the final resize fails. */
PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
                                       Py_ssize_t size,
                                       const char *errors,
                                       Py_ssize_t *consumed)
{
    const char *starts = s;
    int n;
    int k;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return (PyObject *)unicode;
    }

    /* Unpack UTF-8 encoded data */
    p = unicode->str;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch = (unsigned char)*s;

        /* Fast path: ASCII bytes map to themselves. */
        if (ch < 0x80) {
            *p++ = (Py_UNICODE)ch;
            s++;
            continue;
        }

        /* Sequence length announced by the lead byte (0 = illegal). */
        n = utf8_code_length[ch];

        if (s + n > e) {
            /* The sequence runs past the end of the input. */
            if (consumed)
                break;
            else {
                errmsg = "unexpected end of data";
                startinpos = s-starts;
                endinpos = startinpos+1;
                /* extend the error range over the continuation bytes
                   that are actually present */
                for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
                    endinpos++;
                goto utf8Error;
            }
        }

        switch (n) {

        case 0:
            errmsg = "invalid start byte";
            startinpos = s-starts;
            endinpos = startinpos+1;
            goto utf8Error;

        case 1:
            /* bytes below 0x80 were handled above and every table entry
               for 0x80 and up is 0, 2, 3 or 4 */
            errmsg = "internal error";
            startinpos = s-starts;
            endinpos = startinpos+1;
            goto utf8Error;

        case 2:
            if ((s[1] & 0xc0) != 0x80) {
                errmsg = "invalid continuation byte";
                startinpos = s-starts;
                endinpos = startinpos + 1;
                goto utf8Error;
            }
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *p++ = (Py_UNICODE)ch;
            break;

        case 3:
            /* XXX: surrogates shouldn't be valid UTF-8!
               see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
               Uncomment the 2 lines below to make them invalid,
               codepoints: d800-dfff; UTF-8: \xed\xa0\x80-\xed\xbf\xbf. */
            /* The E0 test rejects second bytes below A0, which would give
               a value below 0x800. */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xE0 &&
                 (unsigned char)s[1] < 0xA0)/* ||
                ((unsigned char)s[0] == 0xED &&
                 (unsigned char)s[1] > 0x9F)*/) {
                errmsg = "invalid continuation byte";
                startinpos = s-starts;
                endinpos = startinpos + 1;

                /* if s[1] first two bits are 1 and 0, then the invalid
                   continuation byte is s[2], so increment endinpos by 1,
                   if not, s[1] is invalid and endinpos doesn't need to
                   be incremented. */
                if ((s[1] & 0xC0) == 0x80)
                    endinpos++;
                goto utf8Error;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            *p++ = (Py_UNICODE)ch;
            break;

        case 4:
            /* The F0 test rejects values below 0x10000; the F4 test
               rejects values above 0x10FFFF. */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xF0 &&
                 (unsigned char)s[1] < 0x90) ||
                ((unsigned char)s[0] == 0xF4 &&
                 (unsigned char)s[1] > 0x8F)) {
                errmsg = "invalid continuation byte";
                startinpos = s-starts;
                endinpos = startinpos + 1;
                /* grow the error range over each leading byte that does
                   look like a continuation byte */
                if ((s[1] & 0xC0) == 0x80) {
                    endinpos++;
                    if ((s[2] & 0xC0) == 0x80)
                        endinpos++;
                }
                goto utf8Error;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));

#ifdef Py_UNICODE_WIDE
            *p++ = (Py_UNICODE)ch;
#else
            /*  compute and append the two surrogates: */

            /*  translate from 10000..10FFFF to 0..FFFF */
            ch -= 0x10000;

            /*  high surrogate = top 10 bits added to D800 */
            *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));

            /*  low surrogate = bottom 10 bits added to DC00 */
            *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
#endif
            break;
        }
        s += n;
        continue;

      utf8Error:
        /* Hand the bad range [startinpos, endinpos) to the error handler.
           It receives the addresses of s, unicode, outpos and p, so it can
           update them (its definition is outside this block); a nonzero
           return aborts decoding. */
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf8", errmsg,
                starts, size, &startinpos, &endinpos, &exc, &s,
                &unicode, &outpos, &p))
            goto onError;
    }
    if (consumed)
        *consumed = s-starts;

    /* Adjust length */
    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)unicode;

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}
2070 | \r | |
2071 | /* Allocation strategy: if the string is short, convert into a stack buffer\r | |
2072 | and allocate exactly as much space needed at the end. Else allocate the\r | |
2073 | maximum possible needed (4 result bytes per Unicode character), and return\r | |
2074 | the excess memory at the end.\r | |
2075 | */\r | |
2076 | PyObject *\r | |
2077 | PyUnicode_EncodeUTF8(const Py_UNICODE *s,\r | |
2078 | Py_ssize_t size,\r | |
2079 | const char *errors)\r | |
2080 | {\r | |
2081 | #define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */\r | |
2082 | \r | |
2083 | Py_ssize_t i; /* index into s of next input byte */\r | |
2084 | PyObject *v; /* result string object */\r | |
2085 | char *p; /* next free byte in output buffer */\r | |
2086 | Py_ssize_t nallocated; /* number of result bytes allocated */\r | |
2087 | Py_ssize_t nneeded; /* number of result bytes needed */\r | |
2088 | char stackbuf[MAX_SHORT_UNICHARS * 4];\r | |
2089 | \r | |
2090 | assert(s != NULL);\r | |
2091 | assert(size >= 0);\r | |
2092 | \r | |
2093 | if (size <= MAX_SHORT_UNICHARS) {\r | |
2094 | /* Write into the stack buffer; nallocated can't overflow.\r | |
2095 | * At the end, we'll allocate exactly as much heap space as it\r | |
2096 | * turns out we need.\r | |
2097 | */\r | |
2098 | nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);\r | |
2099 | v = NULL; /* will allocate after we're done */\r | |
2100 | p = stackbuf;\r | |
2101 | }\r | |
2102 | else {\r | |
2103 | /* Overallocate on the heap, and give the excess back at the end. */\r | |
2104 | nallocated = size * 4;\r | |
2105 | if (nallocated / 4 != size) /* overflow! */\r | |
2106 | return PyErr_NoMemory();\r | |
2107 | v = PyString_FromStringAndSize(NULL, nallocated);\r | |
2108 | if (v == NULL)\r | |
2109 | return NULL;\r | |
2110 | p = PyString_AS_STRING(v);\r | |
2111 | }\r | |
2112 | \r | |
2113 | for (i = 0; i < size;) {\r | |
2114 | Py_UCS4 ch = s[i++];\r | |
2115 | \r | |
2116 | if (ch < 0x80)\r | |
2117 | /* Encode ASCII */\r | |
2118 | *p++ = (char) ch;\r | |
2119 | \r | |
2120 | else if (ch < 0x0800) {\r | |
2121 | /* Encode Latin-1 */\r | |
2122 | *p++ = (char)(0xc0 | (ch >> 6));\r | |
2123 | *p++ = (char)(0x80 | (ch & 0x3f));\r | |
2124 | }\r | |
2125 | else {\r | |
2126 | /* Encode UCS2 Unicode ordinals */\r | |
2127 | if (ch < 0x10000) {\r | |
2128 | /* Special case: check for high surrogate */\r | |
2129 | if (0xD800 <= ch && ch <= 0xDBFF && i != size) {\r | |
2130 | Py_UCS4 ch2 = s[i];\r | |
2131 | /* Check for low surrogate and combine the two to\r | |
2132 | form a UCS4 value */\r | |
2133 | if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {\r | |
2134 | ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;\r | |
2135 | i++;\r | |
2136 | goto encodeUCS4;\r | |
2137 | }\r | |
2138 | /* Fall through: handles isolated high surrogates */\r | |
2139 | }\r | |
2140 | *p++ = (char)(0xe0 | (ch >> 12));\r | |
2141 | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));\r | |
2142 | *p++ = (char)(0x80 | (ch & 0x3f));\r | |
2143 | continue;\r | |
2144 | }\r | |
2145 | encodeUCS4:\r | |
2146 | /* Encode UCS4 Unicode ordinals */\r | |
2147 | *p++ = (char)(0xf0 | (ch >> 18));\r | |
2148 | *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));\r | |
2149 | *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));\r | |
2150 | *p++ = (char)(0x80 | (ch & 0x3f));\r | |
2151 | }\r | |
2152 | }\r | |
2153 | \r | |
2154 | if (v == NULL) {\r | |
2155 | /* This was stack allocated. */\r | |
2156 | nneeded = p - stackbuf;\r | |
2157 | assert(nneeded <= nallocated);\r | |
2158 | v = PyString_FromStringAndSize(stackbuf, nneeded);\r | |
2159 | }\r | |
2160 | else {\r | |
2161 | /* Cut back to size actually needed. */\r | |
2162 | nneeded = p - PyString_AS_STRING(v);\r | |
2163 | assert(nneeded <= nallocated);\r | |
2164 | if (_PyString_Resize(&v, nneeded))\r | |
2165 | return NULL;\r | |
2166 | }\r | |
2167 | return v;\r | |
2168 | \r | |
2169 | #undef MAX_SHORT_UNICHARS\r | |
2170 | }\r | |
2171 | \r | |
2172 | PyObject *PyUnicode_AsUTF8String(PyObject *unicode)\r | |
2173 | {\r | |
2174 | if (!PyUnicode_Check(unicode)) {\r | |
2175 | PyErr_BadArgument();\r | |
2176 | return NULL;\r | |
2177 | }\r | |
2178 | return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),\r | |
2179 | PyUnicode_GET_SIZE(unicode),\r | |
2180 | NULL);\r | |
2181 | }\r | |
2182 | \r | |
2183 | /* --- UTF-32 Codec ------------------------------------------------------- */\r | |
2184 | \r | |
2185 | PyObject *\r | |
2186 | PyUnicode_DecodeUTF32(const char *s,\r | |
2187 | Py_ssize_t size,\r | |
2188 | const char *errors,\r | |
2189 | int *byteorder)\r | |
2190 | {\r | |
2191 | return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);\r | |
2192 | }\r | |
2193 | \r | |
2194 | PyObject *\r | |
2195 | PyUnicode_DecodeUTF32Stateful(const char *s,\r | |
2196 | Py_ssize_t size,\r | |
2197 | const char *errors,\r | |
2198 | int *byteorder,\r | |
2199 | Py_ssize_t *consumed)\r | |
2200 | {\r | |
2201 | const char *starts = s;\r | |
2202 | Py_ssize_t startinpos;\r | |
2203 | Py_ssize_t endinpos;\r | |
2204 | Py_ssize_t outpos;\r | |
2205 | PyUnicodeObject *unicode;\r | |
2206 | Py_UNICODE *p;\r | |
2207 | #ifndef Py_UNICODE_WIDE\r | |
2208 | int pairs = 0;\r | |
2209 | const unsigned char *qq;\r | |
2210 | #else\r | |
2211 | const int pairs = 0;\r | |
2212 | #endif\r | |
2213 | const unsigned char *q, *e;\r | |
2214 | int bo = 0; /* assume native ordering by default */\r | |
2215 | const char *errmsg = "";\r | |
2216 | /* Offsets from q for retrieving bytes in the right order. */\r | |
2217 | #ifdef BYTEORDER_IS_LITTLE_ENDIAN\r | |
2218 | int iorder[] = {0, 1, 2, 3};\r | |
2219 | #else\r | |
2220 | int iorder[] = {3, 2, 1, 0};\r | |
2221 | #endif\r | |
2222 | PyObject *errorHandler = NULL;\r | |
2223 | PyObject *exc = NULL;\r | |
de08c53b | 2224 | \r |
4710c53d | 2225 | q = (unsigned char *)s;\r |
2226 | e = q + size;\r | |
2227 | \r | |
2228 | if (byteorder)\r | |
2229 | bo = *byteorder;\r | |
2230 | \r | |
2231 | /* Check for BOM marks (U+FEFF) in the input and adjust current\r | |
2232 | byte order setting accordingly. In native mode, the leading BOM\r | |
2233 | mark is skipped, in all other modes, it is copied to the output\r | |
2234 | stream as-is (giving a ZWNBSP character). */\r | |
2235 | if (bo == 0) {\r | |
2236 | if (size >= 4) {\r | |
2237 | const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |\r | |
2238 | (q[iorder[1]] << 8) | q[iorder[0]];\r | |
2239 | #ifdef BYTEORDER_IS_LITTLE_ENDIAN\r | |
2240 | if (bom == 0x0000FEFF) {\r | |
2241 | q += 4;\r | |
2242 | bo = -1;\r | |
2243 | }\r | |
2244 | else if (bom == 0xFFFE0000) {\r | |
2245 | q += 4;\r | |
2246 | bo = 1;\r | |
2247 | }\r | |
2248 | #else\r | |
2249 | if (bom == 0x0000FEFF) {\r | |
2250 | q += 4;\r | |
2251 | bo = 1;\r | |
2252 | }\r | |
2253 | else if (bom == 0xFFFE0000) {\r | |
2254 | q += 4;\r | |
2255 | bo = -1;\r | |
2256 | }\r | |
2257 | #endif\r | |
2258 | }\r | |
2259 | }\r | |
2260 | \r | |
2261 | if (bo == -1) {\r | |
2262 | /* force LE */\r | |
2263 | iorder[0] = 0;\r | |
2264 | iorder[1] = 1;\r | |
2265 | iorder[2] = 2;\r | |
2266 | iorder[3] = 3;\r | |
2267 | }\r | |
2268 | else if (bo == 1) {\r | |
2269 | /* force BE */\r | |
2270 | iorder[0] = 3;\r | |
2271 | iorder[1] = 2;\r | |
2272 | iorder[2] = 1;\r | |
2273 | iorder[3] = 0;\r | |
2274 | }\r | |
2275 | \r | |
2276 | /* On narrow builds we split characters outside the BMP into two\r | |
2277 | codepoints => count how much extra space we need. */\r | |
2278 | #ifndef Py_UNICODE_WIDE\r | |
2279 | for (qq = q; qq < e; qq += 4)\r | |
2280 | if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)\r | |
2281 | pairs++;\r | |
2282 | #endif\r | |
2283 | \r | |
2284 | /* This might be one to much, because of a BOM */\r | |
2285 | unicode = _PyUnicode_New((size+3)/4+pairs);\r | |
2286 | if (!unicode)\r | |
2287 | return NULL;\r | |
2288 | if (size == 0)\r | |
2289 | return (PyObject *)unicode;\r | |
2290 | \r | |
2291 | /* Unpack UTF-32 encoded data */\r | |
2292 | p = unicode->str;\r | |
2293 | \r | |
2294 | while (q < e) {\r | |
2295 | Py_UCS4 ch;\r | |
2296 | /* remaining bytes at the end? (size should be divisible by 4) */\r | |
2297 | if (e-q<4) {\r | |
2298 | if (consumed)\r | |
2299 | break;\r | |
2300 | errmsg = "truncated data";\r | |
2301 | startinpos = ((const char *)q)-starts;\r | |
2302 | endinpos = ((const char *)e)-starts;\r | |
2303 | goto utf32Error;\r | |
2304 | /* The remaining input chars are ignored if the callback\r | |
2305 | chooses to skip the input */\r | |
2306 | }\r | |
2307 | ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |\r | |
2308 | (q[iorder[1]] << 8) | q[iorder[0]];\r | |
2309 | \r | |
2310 | if (ch >= 0x110000)\r | |
2311 | {\r | |
2312 | errmsg = "codepoint not in range(0x110000)";\r | |
2313 | startinpos = ((const char *)q)-starts;\r | |
2314 | endinpos = startinpos+4;\r | |
2315 | goto utf32Error;\r | |
2316 | }\r | |
2317 | #ifndef Py_UNICODE_WIDE\r | |
2318 | if (ch >= 0x10000)\r | |
2319 | {\r | |
2320 | *p++ = 0xD800 | ((ch-0x10000) >> 10);\r | |
2321 | *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);\r | |
2322 | }\r | |
2323 | else\r | |
2324 | #endif\r | |
2325 | *p++ = ch;\r | |
2326 | q += 4;\r | |
2327 | continue;\r | |
2328 | utf32Error:\r | |
2329 | outpos = p-PyUnicode_AS_UNICODE(unicode);\r | |
2330 | if (unicode_decode_call_errorhandler(\r | |
2331 | errors, &errorHandler,\r | |
2332 | "utf32", errmsg,\r | |
2333 | starts, size, &startinpos, &endinpos, &exc, (const char **)&q,\r | |
2334 | &unicode, &outpos, &p))\r | |
2335 | goto onError;\r | |
2336 | }\r | |
2337 | \r | |
2338 | if (byteorder)\r | |
2339 | *byteorder = bo;\r | |
2340 | \r | |
2341 | if (consumed)\r | |
2342 | *consumed = (const char *)q-starts;\r | |
2343 | \r | |
2344 | /* Adjust length */\r | |
2345 | if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)\r | |
2346 | goto onError;\r | |
2347 | \r | |
2348 | Py_XDECREF(errorHandler);\r | |
2349 | Py_XDECREF(exc);\r | |
2350 | return (PyObject *)unicode;\r | |
2351 | \r | |
2352 | onError:\r | |
2353 | Py_DECREF(unicode);\r | |
2354 | Py_XDECREF(errorHandler);\r | |
2355 | Py_XDECREF(exc);\r | |
2356 | return NULL;\r | |
2357 | }\r | |
2358 | \r | |
2359 | PyObject *\r | |
2360 | PyUnicode_EncodeUTF32(const Py_UNICODE *s,\r | |
2361 | Py_ssize_t size,\r | |
2362 | const char *errors,\r | |
2363 | int byteorder)\r | |
2364 | {\r | |
2365 | PyObject *v;\r | |
2366 | unsigned char *p;\r | |
2367 | Py_ssize_t nsize, bytesize;\r | |
2368 | #ifndef Py_UNICODE_WIDE\r | |
2369 | Py_ssize_t i, pairs;\r | |
2370 | #else\r | |
2371 | const int pairs = 0;\r | |
2372 | #endif\r | |
2373 | /* Offsets from p for storing byte pairs in the right order. */\r | |
2374 | #ifdef BYTEORDER_IS_LITTLE_ENDIAN\r | |
2375 | int iorder[] = {0, 1, 2, 3};\r | |
2376 | #else\r | |
2377 | int iorder[] = {3, 2, 1, 0};\r | |
2378 | #endif\r | |
2379 | \r | |
2380 | #define STORECHAR(CH) \\r | |
2381 | do { \\r | |
2382 | p[iorder[3]] = ((CH) >> 24) & 0xff; \\r | |
2383 | p[iorder[2]] = ((CH) >> 16) & 0xff; \\r | |
2384 | p[iorder[1]] = ((CH) >> 8) & 0xff; \\r | |
2385 | p[iorder[0]] = (CH) & 0xff; \\r | |
2386 | p += 4; \\r | |
2387 | } while(0)\r | |
2388 | \r | |
2389 | /* In narrow builds we can output surrogate pairs as one codepoint,\r | |
2390 | so we need less space. */\r | |
2391 | #ifndef Py_UNICODE_WIDE\r | |
2392 | for (i = pairs = 0; i < size-1; i++)\r | |
2393 | if (0xD800 <= s[i] && s[i] <= 0xDBFF &&\r | |
2394 | 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)\r | |
2395 | pairs++;\r | |
2396 | #endif\r | |
2397 | nsize = (size - pairs + (byteorder == 0));\r | |
2398 | bytesize = nsize * 4;\r | |
2399 | if (bytesize / 4 != nsize)\r | |
2400 | return PyErr_NoMemory();\r | |
2401 | v = PyString_FromStringAndSize(NULL, bytesize);\r | |
2402 | if (v == NULL)\r | |
2403 | return NULL;\r | |
2404 | \r | |
2405 | p = (unsigned char *)PyString_AS_STRING(v);\r | |
2406 | if (byteorder == 0)\r | |
2407 | STORECHAR(0xFEFF);\r | |
2408 | if (size == 0)\r | |
2409 | return v;\r | |
2410 | \r | |
2411 | if (byteorder == -1) {\r | |
2412 | /* force LE */\r | |
2413 | iorder[0] = 0;\r | |
2414 | iorder[1] = 1;\r | |
2415 | iorder[2] = 2;\r | |
2416 | iorder[3] = 3;\r | |
2417 | }\r | |
2418 | else if (byteorder == 1) {\r | |
2419 | /* force BE */\r | |
2420 | iorder[0] = 3;\r | |
2421 | iorder[1] = 2;\r | |
2422 | iorder[2] = 1;\r | |
2423 | iorder[3] = 0;\r | |
2424 | }\r | |
2425 | \r | |
2426 | while (size-- > 0) {\r | |
2427 | Py_UCS4 ch = *s++;\r | |
2428 | #ifndef Py_UNICODE_WIDE\r | |
2429 | if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {\r | |
2430 | Py_UCS4 ch2 = *s;\r | |
2431 | if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {\r | |
2432 | ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;\r | |
2433 | s++;\r | |
2434 | size--;\r | |
2435 | }\r | |
2436 | }\r | |
2437 | #endif\r | |
2438 | STORECHAR(ch);\r | |
2439 | }\r | |
2440 | return v;\r | |
2441 | #undef STORECHAR\r | |
2442 | }\r | |
2443 | \r | |
2444 | PyObject *PyUnicode_AsUTF32String(PyObject *unicode)\r | |
2445 | {\r | |
2446 | if (!PyUnicode_Check(unicode)) {\r | |
2447 | PyErr_BadArgument();\r | |
2448 | return NULL;\r | |
2449 | }\r | |
2450 | return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),\r | |
2451 | PyUnicode_GET_SIZE(unicode),\r | |
2452 | NULL,\r | |
2453 | 0);\r | |
2454 | }\r | |
2455 | \r | |
2456 | /* --- UTF-16 Codec ------------------------------------------------------- */\r | |
2457 | \r | |
2458 | PyObject *\r | |
2459 | PyUnicode_DecodeUTF16(const char *s,\r | |
2460 | Py_ssize_t size,\r | |
2461 | const char *errors,\r | |
2462 | int *byteorder)\r | |
2463 | {\r | |
2464 | return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);\r | |
2465 | }\r | |
2466 | \r | |
2467 | PyObject *\r | |
2468 | PyUnicode_DecodeUTF16Stateful(const char *s,\r | |
2469 | Py_ssize_t size,\r | |
2470 | const char *errors,\r | |
2471 | int *byteorder,\r | |
2472 | Py_ssize_t *consumed)\r | |
2473 | {\r | |
2474 | const char *starts = s;\r | |
2475 | Py_ssize_t startinpos;\r | |
2476 | Py_ssize_t endinpos;\r | |
2477 | Py_ssize_t outpos;\r | |
2478 | PyUnicodeObject *unicode;\r | |
2479 | Py_UNICODE *p;\r | |
2480 | const unsigned char *q, *e;\r | |
2481 | int bo = 0; /* assume native ordering by default */\r | |
2482 | const char *errmsg = "";\r | |
2483 | /* Offsets from q for retrieving byte pairs in the right order. */\r | |
2484 | #ifdef BYTEORDER_IS_LITTLE_ENDIAN\r | |
2485 | int ihi = 1, ilo = 0;\r | |
2486 | #else\r | |
2487 | int ihi = 0, ilo = 1;\r | |
2488 | #endif\r | |
2489 | PyObject *errorHandler = NULL;\r | |
2490 | PyObject *exc = NULL;\r | |
2491 | \r | |
2492 | /* Note: size will always be longer than the resulting Unicode\r | |
2493 | character count */\r | |
2494 | unicode = _PyUnicode_New(size);\r | |
2495 | if (!unicode)\r | |
2496 | return NULL;\r | |
2497 | if (size == 0)\r | |
2498 | return (PyObject *)unicode;\r | |
2499 | \r | |
2500 | /* Unpack UTF-16 encoded data */\r | |
2501 | p = unicode->str;\r | |
2502 | q = (unsigned char *)s;\r | |
2503 | e = q + size;\r | |
2504 | \r | |
2505 | if (byteorder)\r | |
2506 | bo = *byteorder;\r | |
2507 | \r | |
2508 | /* Check for BOM marks (U+FEFF) in the input and adjust current\r | |
2509 | byte order setting accordingly. In native mode, the leading BOM\r | |
2510 | mark is skipped, in all other modes, it is copied to the output\r | |
2511 | stream as-is (giving a ZWNBSP character). */\r | |
2512 | if (bo == 0) {\r | |
2513 | if (size >= 2) {\r | |
2514 | const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];\r | |
2515 | #ifdef BYTEORDER_IS_LITTLE_ENDIAN\r | |
2516 | if (bom == 0xFEFF) {\r | |
2517 | q += 2;\r | |
2518 | bo = -1;\r | |
2519 | }\r | |
2520 | else if (bom == 0xFFFE) {\r | |
2521 | q += 2;\r | |
2522 | bo = 1;\r | |
2523 | }\r | |
2524 | #else\r | |
2525 | if (bom == 0xFEFF) {\r | |
2526 | q += 2;\r | |
2527 | bo = 1;\r | |
2528 | }\r | |
2529 | else if (bom == 0xFFFE) {\r | |
2530 | q += 2;\r | |
2531 | bo = -1;\r | |
2532 | }\r | |
2533 | #endif\r | |
2534 | }\r | |
2535 | }\r | |
2536 | \r | |
2537 | if (bo == -1) {\r | |
2538 | /* force LE */\r | |
2539 | ihi = 1;\r | |
2540 | ilo = 0;\r | |
2541 | }\r | |
2542 | else if (bo == 1) {\r | |
2543 | /* force BE */\r | |
2544 | ihi = 0;\r | |
2545 | ilo = 1;\r | |
2546 | }\r | |
2547 | \r | |
2548 | while (q < e) {\r | |
2549 | Py_UNICODE ch;\r | |
2550 | /* remaining bytes at the end? (size should be even) */\r | |
2551 | if (e-q<2) {\r | |
2552 | if (consumed)\r | |
2553 | break;\r | |
2554 | errmsg = "truncated data";\r | |
2555 | startinpos = ((const char *)q)-starts;\r | |
2556 | endinpos = ((const char *)e)-starts;\r | |
2557 | goto utf16Error;\r | |
2558 | /* The remaining input chars are ignored if the callback\r | |
2559 | chooses to skip the input */\r | |
2560 | }\r | |
2561 | ch = (q[ihi] << 8) | q[ilo];\r | |
2562 | \r | |
2563 | q += 2;\r | |
2564 | \r | |
2565 | if (ch < 0xD800 || ch > 0xDFFF) {\r | |
2566 | *p++ = ch;\r | |
2567 | continue;\r | |
2568 | }\r | |
2569 | \r | |
2570 | /* UTF-16 code pair: */\r | |
2571 | if (q >= e) {\r | |
2572 | errmsg = "unexpected end of data";\r | |
2573 | startinpos = (((const char *)q)-2)-starts;\r | |
2574 | endinpos = ((const char *)e)-starts;\r | |
2575 | goto utf16Error;\r | |
2576 | }\r | |
2577 | if (0xD800 <= ch && ch <= 0xDBFF) {\r | |
2578 | Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];\r | |
2579 | q += 2;\r | |
2580 | if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {\r | |
2581 | #ifndef Py_UNICODE_WIDE\r | |
2582 | *p++ = ch;\r | |
2583 | *p++ = ch2;\r | |
2584 | #else\r | |
2585 | *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;\r | |
2586 | #endif\r | |
2587 | continue;\r | |
2588 | }\r | |
2589 | else {\r | |
2590 | errmsg = "illegal UTF-16 surrogate";\r | |
2591 | startinpos = (((const char *)q)-4)-starts;\r | |
2592 | endinpos = startinpos+2;\r | |
2593 | goto utf16Error;\r | |
2594 | }\r | |
2595 | \r | |
2596 | }\r | |
2597 | errmsg = "illegal encoding";\r | |
2598 | startinpos = (((const char *)q)-2)-starts;\r | |
2599 | endinpos = startinpos+2;\r | |
2600 | /* Fall through to report the error */\r | |
2601 | \r | |
2602 | utf16Error:\r | |
2603 | outpos = p-PyUnicode_AS_UNICODE(unicode);\r | |
2604 | if (unicode_decode_call_errorhandler(\r | |
2605 | errors, &errorHandler,\r | |
2606 | "utf16", errmsg,\r | |
2607 | starts, size, &startinpos, &endinpos, &exc, (const char **)&q,\r | |
2608 | &unicode, &outpos, &p))\r | |
2609 | goto onError;\r | |
2610 | }\r | |
2611 | \r | |
2612 | if (byteorder)\r | |
2613 | *byteorder = bo;\r | |
2614 | \r | |
2615 | if (consumed)\r | |
2616 | *consumed = (const char *)q-starts;\r | |
2617 | \r | |
2618 | /* Adjust length */\r | |
2619 | if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)\r | |
2620 | goto onError;\r | |
2621 | \r | |
2622 | Py_XDECREF(errorHandler);\r | |
2623 | Py_XDECREF(exc);\r | |
2624 | return (PyObject *)unicode;\r | |
2625 | \r | |
2626 | onError:\r | |
2627 | Py_DECREF(unicode);\r | |
2628 | Py_XDECREF(errorHandler);\r | |
2629 | Py_XDECREF(exc);\r | |
2630 | return NULL;\r | |
2631 | }\r | |
2632 | \r | |
2633 | PyObject *\r | |
2634 | PyUnicode_EncodeUTF16(const Py_UNICODE *s,\r | |
2635 | Py_ssize_t size,\r | |
2636 | const char *errors,\r | |
2637 | int byteorder)\r | |
2638 | {\r | |
2639 | PyObject *v;\r | |
2640 | unsigned char *p;\r | |
2641 | Py_ssize_t nsize, bytesize;\r | |
2642 | #ifdef Py_UNICODE_WIDE\r | |
2643 | Py_ssize_t i, pairs;\r | |
2644 | #else\r | |
2645 | const int pairs = 0;\r | |
2646 | #endif\r | |
2647 | /* Offsets from p for storing byte pairs in the right order. */\r | |
2648 | #ifdef BYTEORDER_IS_LITTLE_ENDIAN\r | |
2649 | int ihi = 1, ilo = 0;\r | |
2650 | #else\r | |
2651 | int ihi = 0, ilo = 1;\r | |
2652 | #endif\r | |
2653 | \r | |
2654 | #define STORECHAR(CH) \\r | |
2655 | do { \\r | |
2656 | p[ihi] = ((CH) >> 8) & 0xff; \\r | |
2657 | p[ilo] = (CH) & 0xff; \\r | |
2658 | p += 2; \\r | |
2659 | } while(0)\r | |
2660 | \r | |
2661 | #ifdef Py_UNICODE_WIDE\r | |
2662 | for (i = pairs = 0; i < size; i++)\r | |
2663 | if (s[i] >= 0x10000)\r | |
2664 | pairs++;\r | |
2665 | #endif\r | |
2666 | /* 2 * (size + pairs + (byteorder == 0)) */\r | |
2667 | if (size > PY_SSIZE_T_MAX ||\r | |
2668 | size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))\r | |
2669 | return PyErr_NoMemory();\r | |
2670 | nsize = size + pairs + (byteorder == 0);\r | |
2671 | bytesize = nsize * 2;\r | |
2672 | if (bytesize / 2 != nsize)\r | |
2673 | return PyErr_NoMemory();\r | |
2674 | v = PyString_FromStringAndSize(NULL, bytesize);\r | |
2675 | if (v == NULL)\r | |
2676 | return NULL;\r | |
2677 | \r | |
2678 | p = (unsigned char *)PyString_AS_STRING(v);\r | |
2679 | if (byteorder == 0)\r | |
2680 | STORECHAR(0xFEFF);\r | |
2681 | if (size == 0)\r | |
2682 | return v;\r | |
2683 | \r | |
2684 | if (byteorder == -1) {\r | |
2685 | /* force LE */\r | |
2686 | ihi = 1;\r | |
2687 | ilo = 0;\r | |
2688 | }\r | |
2689 | else if (byteorder == 1) {\r | |
2690 | /* force BE */\r | |
2691 | ihi = 0;\r | |
2692 | ilo = 1;\r | |
2693 | }\r | |
2694 | \r | |
2695 | while (size-- > 0) {\r | |
2696 | Py_UNICODE ch = *s++;\r | |
2697 | Py_UNICODE ch2 = 0;\r | |
2698 | #ifdef Py_UNICODE_WIDE\r | |
2699 | if (ch >= 0x10000) {\r | |
2700 | ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);\r | |
2701 | ch = 0xD800 | ((ch-0x10000) >> 10);\r | |
2702 | }\r | |
2703 | #endif\r | |
2704 | STORECHAR(ch);\r | |
2705 | if (ch2)\r | |
2706 | STORECHAR(ch2);\r | |
2707 | }\r | |
2708 | return v;\r | |
2709 | #undef STORECHAR\r | |
2710 | }\r | |
2711 | \r | |
2712 | PyObject *PyUnicode_AsUTF16String(PyObject *unicode)\r | |
2713 | {\r | |
2714 | if (!PyUnicode_Check(unicode)) {\r | |
2715 | PyErr_BadArgument();\r | |
2716 | return NULL;\r | |
2717 | }\r | |
2718 | return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),\r | |
2719 | PyUnicode_GET_SIZE(unicode),\r | |
2720 | NULL,\r | |
2721 | 0);\r | |
2722 | }\r | |
2723 | \r | |
2724 | /* --- Unicode Escape Codec ----------------------------------------------- */\r | |
2725 | \r | |
2726 | static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;\r | |
2727 | \r | |
2728 | PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,\r | |
2729 | Py_ssize_t size,\r | |
2730 | const char *errors)\r | |
2731 | {\r | |
2732 | const char *starts = s;\r | |
2733 | Py_ssize_t startinpos;\r | |
2734 | Py_ssize_t endinpos;\r | |
2735 | Py_ssize_t outpos;\r | |
2736 | int i;\r | |
2737 | PyUnicodeObject *v;\r | |
2738 | Py_UNICODE *p;\r | |
2739 | const char *end;\r | |
2740 | char* message;\r | |
2741 | Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */\r | |
2742 | PyObject *errorHandler = NULL;\r | |
2743 | PyObject *exc = NULL;\r | |
2744 | \r | |
2745 | /* Escaped strings will always be longer than the resulting\r | |
2746 | Unicode string, so we start with size here and then reduce the\r | |
2747 | length after conversion to the true value.\r | |
2748 | (but if the error callback returns a long replacement string\r | |
2749 | we'll have to allocate more space) */\r | |
2750 | v = _PyUnicode_New(size);\r | |
2751 | if (v == NULL)\r | |
2752 | goto onError;\r | |
2753 | if (size == 0)\r | |
2754 | return (PyObject *)v;\r | |
2755 | \r | |
2756 | p = PyUnicode_AS_UNICODE(v);\r | |
2757 | end = s + size;\r | |
2758 | \r | |
2759 | while (s < end) {\r | |
2760 | unsigned char c;\r | |
2761 | Py_UNICODE x;\r | |
2762 | int digits;\r | |
2763 | \r | |
2764 | /* Non-escape characters are interpreted as Unicode ordinals */\r | |
2765 | if (*s != '\\') {\r | |
2766 | *p++ = (unsigned char) *s++;\r | |
2767 | continue;\r | |
2768 | }\r | |
2769 | \r | |
2770 | startinpos = s-starts;\r | |
2771 | /* \ - Escapes */\r | |
2772 | s++;\r | |
2773 | c = *s++;\r | |
2774 | if (s > end)\r | |
2775 | c = '\0'; /* Invalid after \ */\r | |
2776 | switch (c) {\r | |
2777 | \r | |
2778 | /* \x escapes */\r | |
2779 | case '\n': break;\r | |
2780 | case '\\': *p++ = '\\'; break;\r | |
2781 | case '\'': *p++ = '\''; break;\r | |
2782 | case '\"': *p++ = '\"'; break;\r | |
2783 | case 'b': *p++ = '\b'; break;\r | |
2784 | case 'f': *p++ = '\014'; break; /* FF */\r | |
2785 | case 't': *p++ = '\t'; break;\r | |
2786 | case 'n': *p++ = '\n'; break;\r | |
2787 | case 'r': *p++ = '\r'; break;\r | |
2788 | case 'v': *p++ = '\013'; break; /* VT */\r | |
2789 | case 'a': *p++ = '\007'; break; /* BEL, not classic C */\r | |
2790 | \r | |
2791 | /* \OOO (octal) escapes */\r | |
2792 | case '0': case '1': case '2': case '3':\r | |
2793 | case '4': case '5': case '6': case '7':\r | |
2794 | x = s[-1] - '0';\r | |
2795 | if (s < end && '0' <= *s && *s <= '7') {\r | |
2796 | x = (x<<3) + *s++ - '0';\r | |
2797 | if (s < end && '0' <= *s && *s <= '7')\r | |
2798 | x = (x<<3) + *s++ - '0';\r | |
2799 | }\r | |
2800 | *p++ = x;\r | |
2801 | break;\r | |
2802 | \r | |
2803 | /* hex escapes */\r | |
2804 | /* \xXX */\r | |
2805 | case 'x':\r | |
2806 | digits = 2;\r | |
2807 | message = "truncated \\xXX escape";\r | |
2808 | goto hexescape;\r | |
2809 | \r | |
2810 | /* \uXXXX */\r | |
2811 | case 'u':\r | |
2812 | digits = 4;\r | |
2813 | message = "truncated \\uXXXX escape";\r | |
2814 | goto hexescape;\r | |
2815 | \r | |
2816 | /* \UXXXXXXXX */\r | |
2817 | case 'U':\r | |
2818 | digits = 8;\r | |
2819 | message = "truncated \\UXXXXXXXX escape";\r | |
2820 | hexescape:\r | |
2821 | chr = 0;\r | |
2822 | outpos = p-PyUnicode_AS_UNICODE(v);\r | |
2823 | if (s+digits>end) {\r | |
2824 | endinpos = size;\r | |
2825 | if (unicode_decode_call_errorhandler(\r | |
2826 | errors, &errorHandler,\r | |
2827 | "unicodeescape", "end of string in escape sequence",\r | |
2828 | starts, size, &startinpos, &endinpos, &exc, &s,\r | |
2829 | &v, &outpos, &p))\r | |
2830 | goto onError;\r | |
2831 | goto nextByte;\r | |
2832 | }\r | |
2833 | for (i = 0; i < digits; ++i) {\r | |
2834 | c = (unsigned char) s[i];\r | |
2835 | if (!isxdigit(c)) {\r | |
2836 | endinpos = (s+i+1)-starts;\r | |
2837 | if (unicode_decode_call_errorhandler(\r | |
2838 | errors, &errorHandler,\r | |
2839 | "unicodeescape", message,\r | |
2840 | starts, size, &startinpos, &endinpos, &exc, &s,\r | |
2841 | &v, &outpos, &p))\r | |
2842 | goto onError;\r | |
2843 | goto nextByte;\r | |
2844 | }\r | |
2845 | chr = (chr<<4) & ~0xF;\r | |
2846 | if (c >= '0' && c <= '9')\r | |
2847 | chr += c - '0';\r | |
2848 | else if (c >= 'a' && c <= 'f')\r | |
2849 | chr += 10 + c - 'a';\r | |
2850 | else\r | |
2851 | chr += 10 + c - 'A';\r | |
2852 | }\r | |
2853 | s += i;\r | |
2854 | if (chr == 0xffffffff && PyErr_Occurred())\r | |
2855 | /* _decoding_error will have already written into the\r | |
2856 | target buffer. */\r | |
2857 | break;\r | |
2858 | store:\r | |
2859 | /* when we get here, chr is a 32-bit unicode character */\r | |
2860 | if (chr <= 0xffff)\r | |
2861 | /* UCS-2 character */\r | |
2862 | *p++ = (Py_UNICODE) chr;\r | |
2863 | else if (chr <= 0x10ffff) {\r | |
2864 | /* UCS-4 character. Either store directly, or as\r | |
2865 | surrogate pair. */\r | |
2866 | #ifdef Py_UNICODE_WIDE\r | |
2867 | *p++ = chr;\r | |
2868 | #else\r | |
2869 | chr -= 0x10000L;\r | |
2870 | *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);\r | |
2871 | *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);\r | |
2872 | #endif\r | |
2873 | } else {\r | |
2874 | endinpos = s-starts;\r | |
2875 | outpos = p-PyUnicode_AS_UNICODE(v);\r | |
2876 | if (unicode_decode_call_errorhandler(\r | |
2877 | errors, &errorHandler,\r | |
2878 | "unicodeescape", "illegal Unicode character",\r | |
2879 | starts, size, &startinpos, &endinpos, &exc, &s,\r | |
2880 | &v, &outpos, &p))\r | |
2881 | goto onError;\r | |
2882 | }\r | |
2883 | break;\r | |
2884 | \r | |
2885 | /* \N{name} */\r | |
2886 | case 'N':\r | |
2887 | message = "malformed \\N character escape";\r | |
2888 | if (ucnhash_CAPI == NULL) {\r | |
2889 | /* load the unicode data module */\r | |
2890 | ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);\r | |
2891 | if (ucnhash_CAPI == NULL)\r | |
2892 | goto ucnhashError;\r | |
2893 | }\r | |
2894 | if (*s == '{') {\r | |
2895 | const char *start = s+1;\r | |
2896 | /* look for the closing brace */\r | |
2897 | while (*s != '}' && s < end)\r | |
2898 | s++;\r | |
2899 | if (s > start && s < end && *s == '}') {\r | |
2900 | /* found a name. look it up in the unicode database */\r | |
2901 | message = "unknown Unicode character name";\r | |
2902 | s++;\r | |
2903 | if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))\r | |
2904 | goto store;\r | |
2905 | }\r | |
2906 | }\r | |
2907 | endinpos = s-starts;\r | |
2908 | outpos = p-PyUnicode_AS_UNICODE(v);\r | |
2909 | if (unicode_decode_call_errorhandler(\r | |
2910 | errors, &errorHandler,\r | |
2911 | "unicodeescape", message,\r | |
2912 | starts, size, &startinpos, &endinpos, &exc, &s,\r | |
2913 | &v, &outpos, &p))\r | |
2914 | goto onError;\r | |
2915 | break;\r | |
2916 | \r | |
2917 | default:\r | |
2918 | if (s > end) {\r | |
2919 | message = "\\ at end of string";\r | |
2920 | s--;\r | |
2921 | endinpos = s-starts;\r | |
2922 | outpos = p-PyUnicode_AS_UNICODE(v);\r | |
2923 | if (unicode_decode_call_errorhandler(\r | |
2924 | errors, &errorHandler,\r | |
2925 | "unicodeescape", message,\r | |
2926 | starts, size, &startinpos, &endinpos, &exc, &s,\r | |
2927 | &v, &outpos, &p))\r | |
2928 | goto onError;\r | |
2929 | }\r | |
2930 | else {\r | |
2931 | *p++ = '\\';\r | |
2932 | *p++ = (unsigned char)s[-1];\r | |
2933 | }\r | |
2934 | break;\r | |
2935 | }\r | |
2936 | nextByte:\r | |
2937 | ;\r | |
2938 | }\r | |
2939 | if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)\r | |
2940 | goto onError;\r | |
2941 | Py_XDECREF(errorHandler);\r | |
2942 | Py_XDECREF(exc);\r | |
2943 | return (PyObject *)v;\r | |
2944 | \r | |
2945 | ucnhashError:\r | |
2946 | PyErr_SetString(\r | |
2947 | PyExc_UnicodeError,\r | |
2948 | "\\N escapes not supported (can't load unicodedata module)"\r | |
2949 | );\r | |
2950 | Py_XDECREF(v);\r | |
2951 | Py_XDECREF(errorHandler);\r | |
2952 | Py_XDECREF(exc);\r | |
2953 | return NULL;\r | |
2954 | \r | |
2955 | onError:\r | |
2956 | Py_XDECREF(v);\r | |
2957 | Py_XDECREF(errorHandler);\r | |
2958 | Py_XDECREF(exc);\r | |
2959 | return NULL;\r | |
2960 | }\r | |
2961 | \r | |
2962 | /* Return a Unicode-Escape string version of the Unicode object.\r | |
2963 | \r | |
2964 | If quotes is true, the string is enclosed in u"" or u'' quotes as\r | |
2965 | appropriate.\r | |
2966 | \r | |
2967 | */\r | |
2968 | \r | |
2969 | Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,\r | |
2970 | Py_ssize_t size,\r | |
2971 | Py_UNICODE ch)\r | |
2972 | {\r | |
2973 | /* like wcschr, but doesn't stop at NULL characters */\r | |
2974 | \r | |
2975 | while (size-- > 0) {\r | |
2976 | if (*s == ch)\r | |
2977 | return s;\r | |
2978 | s++;\r | |
2979 | }\r | |
2980 | \r | |
2981 | return NULL;\r | |
2982 | }\r | |
2983 | \r | |
2984 | static\r | |
2985 | PyObject *unicodeescape_string(const Py_UNICODE *s,\r | |
2986 | Py_ssize_t size,\r | |
2987 | int quotes)\r | |
2988 | {\r | |
2989 | PyObject *repr;\r | |
2990 | char *p;\r | |
2991 | \r | |
2992 | static const char *hexdigit = "0123456789abcdef";\r | |
2993 | #ifdef Py_UNICODE_WIDE\r | |
2994 | const Py_ssize_t expandsize = 10;\r | |
2995 | #else\r | |
2996 | const Py_ssize_t expandsize = 6;\r | |
2997 | #endif\r | |
2998 | \r | |
2999 | /* XXX(nnorwitz): rather than over-allocating, it would be\r | |
3000 | better to choose a different scheme. Perhaps scan the\r | |
3001 | first N-chars of the string and allocate based on that size.\r | |
3002 | */\r | |
3003 | /* Initial allocation is based on the longest-possible unichr\r | |
3004 | escape.\r | |
3005 | \r | |
3006 | In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source\r | |
3007 | unichr, so in this case it's the longest unichr escape. In\r | |
3008 | narrow (UTF-16) builds this is five chars per source unichr\r | |
3009 | since there are two unichrs in the surrogate pair, so in narrow\r | |
3010 | (UTF-16) builds it's not the longest unichr escape.\r | |
3011 | \r | |
3012 | In wide or narrow builds '\uxxxx' is 6 chars per source unichr,\r | |
3013 | so in the narrow (UTF-16) build case it's the longest unichr\r | |
3014 | escape.\r | |
3015 | */\r | |
3016 | \r | |
3017 | if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)\r | |
3018 | return PyErr_NoMemory();\r | |
3019 | \r | |
3020 | repr = PyString_FromStringAndSize(NULL,\r | |
3021 | 2\r | |
3022 | + expandsize*size\r | |
3023 | + 1);\r | |
3024 | if (repr == NULL)\r | |
3025 | return NULL;\r | |
3026 | \r | |
3027 | p = PyString_AS_STRING(repr);\r | |
3028 | \r | |
3029 | if (quotes) {\r | |
3030 | *p++ = 'u';\r | |
3031 | *p++ = (findchar(s, size, '\'') &&\r | |
3032 | !findchar(s, size, '"')) ? '"' : '\'';\r | |
3033 | }\r | |
3034 | while (size-- > 0) {\r | |
3035 | Py_UNICODE ch = *s++;\r | |
3036 | \r | |
3037 | /* Escape quotes and backslashes */\r | |
3038 | if ((quotes &&\r | |
3039 | ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {\r | |
3040 | *p++ = '\\';\r | |
3041 | *p++ = (char) ch;\r | |
3042 | continue;\r | |
3043 | }\r | |
3044 | \r | |
3045 | #ifdef Py_UNICODE_WIDE\r | |
3046 | /* Map 21-bit characters to '\U00xxxxxx' */\r | |
3047 | else if (ch >= 0x10000) {\r | |
3048 | *p++ = '\\';\r | |
3049 | *p++ = 'U';\r | |
3050 | *p++ = hexdigit[(ch >> 28) & 0x0000000F];\r | |
3051 | *p++ = hexdigit[(ch >> 24) & 0x0000000F];\r | |
3052 | *p++ = hexdigit[(ch >> 20) & 0x0000000F];\r | |
3053 | *p++ = hexdigit[(ch >> 16) & 0x0000000F];\r | |
3054 | *p++ = hexdigit[(ch >> 12) & 0x0000000F];\r | |
3055 | *p++ = hexdigit[(ch >> 8) & 0x0000000F];\r | |
3056 | *p++ = hexdigit[(ch >> 4) & 0x0000000F];\r | |
3057 | *p++ = hexdigit[ch & 0x0000000F];\r | |
3058 | continue;\r | |
3059 | }\r | |
3060 | #else\r | |
3061 | /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */\r | |
3062 | else if (ch >= 0xD800 && ch < 0xDC00) {\r | |
3063 | Py_UNICODE ch2;\r | |
3064 | Py_UCS4 ucs;\r | |
3065 | \r | |
3066 | ch2 = *s++;\r | |
3067 | size--;\r | |
3068 | if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {\r | |
3069 | ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;\r | |
3070 | *p++ = '\\';\r | |
3071 | *p++ = 'U';\r | |
3072 | *p++ = hexdigit[(ucs >> 28) & 0x0000000F];\r | |
3073 | *p++ = hexdigit[(ucs >> 24) & 0x0000000F];\r | |
3074 | *p++ = hexdigit[(ucs >> 20) & 0x0000000F];\r | |
3075 | *p++ = hexdigit[(ucs >> 16) & 0x0000000F];\r | |
3076 | *p++ = hexdigit[(ucs >> 12) & 0x0000000F];\r | |
3077 | *p++ = hexdigit[(ucs >> 8) & 0x0000000F];\r | |
3078 | *p++ = hexdigit[(ucs >> 4) & 0x0000000F];\r | |
3079 | *p++ = hexdigit[ucs & 0x0000000F];\r | |
3080 | continue;\r | |
3081 | }\r | |
3082 | /* Fall through: isolated surrogates are copied as-is */\r | |
3083 | s--;\r | |
3084 | size++;\r | |
3085 | }\r | |
3086 | #endif\r | |
3087 | \r | |
3088 | /* Map 16-bit characters to '\uxxxx' */\r | |
3089 | if (ch >= 256) {\r | |
3090 | *p++ = '\\';\r | |
3091 | *p++ = 'u';\r | |
3092 | *p++ = hexdigit[(ch >> 12) & 0x000F];\r | |
3093 | *p++ = hexdigit[(ch >> 8) & 0x000F];\r | |
3094 | *p++ = hexdigit[(ch >> 4) & 0x000F];\r | |
3095 | *p++ = hexdigit[ch & 0x000F];\r | |
3096 | }\r | |
3097 | \r | |
3098 | /* Map special whitespace to '\t', \n', '\r' */\r | |
3099 | else if (ch == '\t') {\r | |
3100 | *p++ = '\\';\r | |
3101 | *p++ = 't';\r | |
3102 | }\r | |
3103 | else if (ch == '\n') {\r | |
3104 | *p++ = '\\';\r | |
3105 | *p++ = 'n';\r | |
3106 | }\r | |
3107 | else if (ch == '\r') {\r | |
3108 | *p++ = '\\';\r | |
3109 | *p++ = 'r';\r | |
3110 | }\r | |
3111 | \r | |
3112 | /* Map non-printable US ASCII to '\xhh' */\r | |
3113 | else if (ch < ' ' || ch >= 0x7F) {\r | |
3114 | *p++ = '\\';\r | |
3115 | *p++ = 'x';\r | |
3116 | *p++ = hexdigit[(ch >> 4) & 0x000F];\r | |
3117 | *p++ = hexdigit[ch & 0x000F];\r | |
3118 | }\r | |
3119 | \r | |
3120 | /* Copy everything else as-is */\r | |
3121 | else\r | |
3122 | *p++ = (char) ch;\r | |
3123 | }\r | |
3124 | if (quotes)\r | |
3125 | *p++ = PyString_AS_STRING(repr)[1];\r | |
3126 | \r | |
3127 | *p = '\0';\r | |
3128 | if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))\r | |
3129 | return NULL;\r | |
3130 | return repr;\r | |
3131 | }\r | |
3132 | \r | |
3133 | PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,\r | |
3134 | Py_ssize_t size)\r | |
3135 | {\r | |
3136 | return unicodeescape_string(s, size, 0);\r | |
3137 | }\r | |
3138 | \r | |
3139 | PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)\r | |
3140 | {\r | |
3141 | if (!PyUnicode_Check(unicode)) {\r | |
3142 | PyErr_BadArgument();\r | |
3143 | return NULL;\r | |
3144 | }\r | |
3145 | return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),\r | |
3146 | PyUnicode_GET_SIZE(unicode));\r | |
3147 | }\r | |
3148 | \r | |
3149 | /* --- Raw Unicode Escape Codec ------------------------------------------- */\r | |
3150 | \r | |
3151 | PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,\r | |
3152 | Py_ssize_t size,\r | |
3153 | const char *errors)\r | |
3154 | {\r | |
3155 | const char *starts = s;\r | |
3156 | Py_ssize_t startinpos;\r | |
3157 | Py_ssize_t endinpos;\r | |
3158 | Py_ssize_t outpos;\r | |
3159 | PyUnicodeObject *v;\r | |
3160 | Py_UNICODE *p;\r | |
3161 | const char *end;\r | |
3162 | const char *bs;\r | |
3163 | PyObject *errorHandler = NULL;\r | |
3164 | PyObject *exc = NULL;\r | |
3165 | \r | |
3166 | /* Escaped strings will always be longer than the resulting\r | |
3167 | Unicode string, so we start with size here and then reduce the\r | |
3168 | length after conversion to the true value. (But decoding error\r | |
3169 | handler might have to resize the string) */\r | |
3170 | v = _PyUnicode_New(size);\r | |
3171 | if (v == NULL)\r | |
3172 | goto onError;\r | |
3173 | if (size == 0)\r | |
3174 | return (PyObject *)v;\r | |
3175 | p = PyUnicode_AS_UNICODE(v);\r | |
3176 | end = s + size;\r | |
3177 | while (s < end) {\r | |
3178 | unsigned char c;\r | |
3179 | Py_UCS4 x;\r | |
3180 | int i;\r | |
3181 | int count;\r | |
3182 | \r | |
3183 | /* Non-escape characters are interpreted as Unicode ordinals */\r | |
3184 | if (*s != '\\') {\r | |
3185 | *p++ = (unsigned char)*s++;\r | |
3186 | continue;\r | |
3187 | }\r | |
3188 | startinpos = s-starts;\r | |
3189 | \r | |
3190 | /* \u-escapes are only interpreted iff the number of leading\r | |
3191 | backslashes if odd */\r | |
3192 | bs = s;\r | |
3193 | for (;s < end;) {\r | |
3194 | if (*s != '\\')\r | |
3195 | break;\r | |
3196 | *p++ = (unsigned char)*s++;\r | |
3197 | }\r | |
3198 | if (((s - bs) & 1) == 0 ||\r | |
3199 | s >= end ||\r | |
3200 | (*s != 'u' && *s != 'U')) {\r | |
3201 | continue;\r | |
3202 | }\r | |
3203 | p--;\r | |
3204 | count = *s=='u' ? 4 : 8;\r | |
3205 | s++;\r | |
3206 | \r | |
3207 | /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */\r | |
3208 | outpos = p-PyUnicode_AS_UNICODE(v);\r | |
3209 | for (x = 0, i = 0; i < count; ++i, ++s) {\r | |
3210 | c = (unsigned char)*s;\r | |
3211 | if (!isxdigit(c)) {\r | |
3212 | endinpos = s-starts;\r | |
3213 | if (unicode_decode_call_errorhandler(\r | |
3214 | errors, &errorHandler,\r | |
3215 | "rawunicodeescape", "truncated \\uXXXX",\r | |
3216 | starts, size, &startinpos, &endinpos, &exc, &s,\r | |
3217 | &v, &outpos, &p))\r | |
3218 | goto onError;\r | |
3219 | goto nextByte;\r | |
3220 | }\r | |
3221 | x = (x<<4) & ~0xF;\r | |
3222 | if (c >= '0' && c <= '9')\r | |
3223 | x += c - '0';\r | |
3224 | else if (c >= 'a' && c <= 'f')\r | |
3225 | x += 10 + c - 'a';\r | |
3226 | else\r | |
3227 | x += 10 + c - 'A';\r | |
3228 | }\r | |
3229 | if (x <= 0xffff)\r | |
3230 | /* UCS-2 character */\r | |
3231 | *p++ = (Py_UNICODE) x;\r | |
3232 | else if (x <= 0x10ffff) {\r | |
3233 | /* UCS-4 character. Either store directly, or as\r | |
3234 | surrogate pair. */\r | |
3235 | #ifdef Py_UNICODE_WIDE\r | |
3236 | *p++ = (Py_UNICODE) x;\r | |
3237 | #else\r | |
3238 | x -= 0x10000L;\r | |
3239 | *p++ = 0xD800 + (Py_UNICODE) (x >> 10);\r | |
3240 | *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);\r | |
3241 | #endif\r | |
3242 | } else {\r | |
3243 | endinpos = s-starts;\r | |
3244 | outpos = p-PyUnicode_AS_UNICODE(v);\r | |
3245 | if (unicode_decode_call_errorhandler(\r | |
3246 | errors, &errorHandler,\r | |
3247 | "rawunicodeescape", "\\Uxxxxxxxx out of range",\r | |
3248 | starts, size, &startinpos, &endinpos, &exc, &s,\r | |
3249 | &v, &outpos, &p))\r | |
3250 | goto onError;\r | |
3251 | }\r | |
3252 | nextByte:\r | |
3253 | ;\r | |
3254 | }\r | |
3255 | if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)\r | |
3256 | goto onError;\r | |
3257 | Py_XDECREF(errorHandler);\r | |
3258 | Py_XDECREF(exc);\r | |
3259 | return (PyObject *)v;\r | |
3260 | \r | |
3261 | onError:\r | |
3262 | Py_XDECREF(v);\r | |
3263 | Py_XDECREF(errorHandler);\r | |
3264 | Py_XDECREF(exc);\r | |
3265 | return NULL;\r | |
3266 | }\r | |
3267 | \r | |
3268 | PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,\r | |
3269 | Py_ssize_t size)\r | |
3270 | {\r | |
3271 | PyObject *repr;\r | |
3272 | char *p;\r | |
3273 | char *q;\r | |
3274 | \r | |
3275 | static const char *hexdigit = "0123456789abcdef";\r | |
3276 | #ifdef Py_UNICODE_WIDE\r | |
3277 | const Py_ssize_t expandsize = 10;\r | |
3278 | #else\r | |
3279 | const Py_ssize_t expandsize = 6;\r | |
3280 | #endif\r | |
3281 | \r | |
3282 | if (size > PY_SSIZE_T_MAX / expandsize)\r | |
3283 | return PyErr_NoMemory();\r | |
3284 | \r | |
3285 | repr = PyString_FromStringAndSize(NULL, expandsize * size);\r | |
3286 | if (repr == NULL)\r | |
3287 | return NULL;\r | |
3288 | if (size == 0)\r | |
3289 | return repr;\r | |
3290 | \r | |
3291 | p = q = PyString_AS_STRING(repr);\r | |
3292 | while (size-- > 0) {\r | |
3293 | Py_UNICODE ch = *s++;\r | |
3294 | #ifdef Py_UNICODE_WIDE\r | |
3295 | /* Map 32-bit characters to '\Uxxxxxxxx' */\r | |
3296 | if (ch >= 0x10000) {\r | |
3297 | *p++ = '\\';\r | |
3298 | *p++ = 'U';\r | |
3299 | *p++ = hexdigit[(ch >> 28) & 0xf];\r | |
3300 | *p++ = hexdigit[(ch >> 24) & 0xf];\r | |
3301 | *p++ = hexdigit[(ch >> 20) & 0xf];\r | |
3302 | *p++ = hexdigit[(ch >> 16) & 0xf];\r | |
3303 | *p++ = hexdigit[(ch >> 12) & 0xf];\r | |
3304 | *p++ = hexdigit[(ch >> 8) & 0xf];\r | |
3305 | *p++ = hexdigit[(ch >> 4) & 0xf];\r | |
3306 | *p++ = hexdigit[ch & 15];\r | |
3307 | }\r | |
3308 | else\r | |
3309 | #else\r | |
3310 | /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */\r | |
3311 | if (ch >= 0xD800 && ch < 0xDC00) {\r | |
3312 | Py_UNICODE ch2;\r | |
3313 | Py_UCS4 ucs;\r | |
3314 | \r | |
3315 | ch2 = *s++;\r | |
3316 | size--;\r | |
3317 | if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {\r | |
3318 | ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;\r | |
3319 | *p++ = '\\';\r | |
3320 | *p++ = 'U';\r | |
3321 | *p++ = hexdigit[(ucs >> 28) & 0xf];\r | |
3322 | *p++ = hexdigit[(ucs >> 24) & 0xf];\r | |
3323 | *p++ = hexdigit[(ucs >> 20) & 0xf];\r | |
3324 | *p++ = hexdigit[(ucs >> 16) & 0xf];\r | |
3325 | *p++ = hexdigit[(ucs >> 12) & 0xf];\r | |
3326 | *p++ = hexdigit[(ucs >> 8) & 0xf];\r | |
3327 | *p++ = hexdigit[(ucs >> 4) & 0xf];\r | |
3328 | *p++ = hexdigit[ucs & 0xf];\r | |
3329 | continue;\r | |
3330 | }\r | |
3331 | /* Fall through: isolated surrogates are copied as-is */\r | |
3332 | s--;\r | |
3333 | size++;\r | |
3334 | }\r | |
3335 | #endif\r | |
3336 | /* Map 16-bit characters to '\uxxxx' */\r | |
3337 | if (ch >= 256) {\r | |
3338 | *p++ = '\\';\r | |
3339 | *p++ = 'u';\r | |
3340 | *p++ = hexdigit[(ch >> 12) & 0xf];\r | |
3341 | *p++ = hexdigit[(ch >> 8) & 0xf];\r | |
3342 | *p++ = hexdigit[(ch >> 4) & 0xf];\r | |
3343 | *p++ = hexdigit[ch & 15];\r | |
3344 | }\r | |
3345 | /* Copy everything else as-is */\r | |
3346 | else\r | |
3347 | *p++ = (char) ch;\r | |
3348 | }\r | |
3349 | *p = '\0';\r | |
3350 | if (_PyString_Resize(&repr, p - q))\r | |
3351 | return NULL;\r | |
3352 | return repr;\r | |
3353 | }\r | |
3354 | \r | |
3355 | PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)\r | |
3356 | {\r | |
3357 | if (!PyUnicode_Check(unicode)) {\r | |
3358 | PyErr_BadArgument();\r | |
3359 | return NULL;\r | |
3360 | }\r | |
3361 | return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),\r | |
3362 | PyUnicode_GET_SIZE(unicode));\r | |
3363 | }\r | |
3364 | \r | |
3365 | /* --- Unicode Internal Codec ------------------------------------------- */\r | |
3366 | \r | |
3367 | PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,\r | |
3368 | Py_ssize_t size,\r | |
3369 | const char *errors)\r | |
3370 | {\r | |
3371 | const char *starts = s;\r | |
3372 | Py_ssize_t startinpos;\r | |
3373 | Py_ssize_t endinpos;\r | |
3374 | Py_ssize_t outpos;\r | |
3375 | PyUnicodeObject *v;\r | |
3376 | Py_UNICODE *p;\r | |
3377 | const char *end;\r | |
3378 | const char *reason;\r | |
3379 | PyObject *errorHandler = NULL;\r | |
3380 | PyObject *exc = NULL;\r | |
3381 | \r | |
3382 | #ifdef Py_UNICODE_WIDE\r | |
3383 | Py_UNICODE unimax = PyUnicode_GetMax();\r | |
3384 | #endif\r | |
3385 | \r | |
3386 | /* XXX overflow detection missing */\r | |
3387 | v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);\r | |
3388 | if (v == NULL)\r | |
3389 | goto onError;\r | |
3390 | if (PyUnicode_GetSize((PyObject *)v) == 0)\r | |
3391 | return (PyObject *)v;\r | |
3392 | p = PyUnicode_AS_UNICODE(v);\r | |
3393 | end = s + size;\r | |
3394 | \r | |
3395 | while (s < end) {\r | |
3396 | memcpy(p, s, sizeof(Py_UNICODE));\r | |
3397 | /* We have to sanity check the raw data, otherwise doom looms for\r | |
3398 | some malformed UCS-4 data. */\r | |
3399 | if (\r | |
3400 | #ifdef Py_UNICODE_WIDE\r | |
3401 | *p > unimax || *p < 0 ||\r | |
3402 | #endif\r | |
3403 | end-s < Py_UNICODE_SIZE\r | |
3404 | )\r | |
3405 | {\r | |
3406 | startinpos = s - starts;\r | |
3407 | if (end-s < Py_UNICODE_SIZE) {\r | |
3408 | endinpos = end-starts;\r | |
3409 | reason = "truncated input";\r | |
3410 | }\r | |
3411 | else {\r | |
3412 | endinpos = s - starts + Py_UNICODE_SIZE;\r | |
3413 | reason = "illegal code point (> 0x10FFFF)";\r | |
3414 | }\r | |
3415 | outpos = p - PyUnicode_AS_UNICODE(v);\r | |
3416 | if (unicode_decode_call_errorhandler(\r | |
3417 | errors, &errorHandler,\r | |
3418 | "unicode_internal", reason,\r | |
3419 | starts, size, &startinpos, &endinpos, &exc, &s,\r | |
3420 | &v, &outpos, &p)) {\r | |
3421 | goto onError;\r | |
3422 | }\r | |
3423 | }\r | |
3424 | else {\r | |
3425 | p++;\r | |
3426 | s += Py_UNICODE_SIZE;\r | |
3427 | }\r | |
3428 | }\r | |
3429 | \r | |
3430 | if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)\r | |
3431 | goto onError;\r | |
3432 | Py_XDECREF(errorHandler);\r | |
3433 | Py_XDECREF(exc);\r | |
3434 | return (PyObject *)v;\r | |
3435 | \r | |
3436 | onError:\r | |
3437 | Py_XDECREF(v);\r | |
3438 | Py_XDECREF(errorHandler);\r | |
3439 | Py_XDECREF(exc);\r | |
3440 | return NULL;\r | |
3441 | }\r | |
3442 | \r | |
3443 | /* --- Latin-1 Codec ------------------------------------------------------ */\r | |
3444 | \r | |
3445 | PyObject *PyUnicode_DecodeLatin1(const char *s,\r | |
3446 | Py_ssize_t size,\r | |
3447 | const char *errors)\r | |
3448 | {\r | |
3449 | PyUnicodeObject *v;\r | |
3450 | Py_UNICODE *p;\r | |
3451 | \r | |
3452 | /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */\r | |
3453 | if (size == 1) {\r | |
3454 | Py_UNICODE r = *(unsigned char*)s;\r | |
3455 | return PyUnicode_FromUnicode(&r, 1);\r | |
3456 | }\r | |
3457 | \r | |
3458 | v = _PyUnicode_New(size);\r | |
3459 | if (v == NULL)\r | |
3460 | goto onError;\r | |
3461 | if (size == 0)\r | |
3462 | return (PyObject *)v;\r | |
3463 | p = PyUnicode_AS_UNICODE(v);\r | |
3464 | while (size-- > 0)\r | |
3465 | *p++ = (unsigned char)*s++;\r | |
3466 | return (PyObject *)v;\r | |
3467 | \r | |
3468 | onError:\r | |
3469 | Py_XDECREF(v);\r | |
3470 | return NULL;\r | |
3471 | }\r | |
3472 | \r | |
3473 | /* create or adjust a UnicodeEncodeError */\r | |
3474 | static void make_encode_exception(PyObject **exceptionObject,\r | |
3475 | const char *encoding,\r | |
3476 | const Py_UNICODE *unicode, Py_ssize_t size,\r | |
3477 | Py_ssize_t startpos, Py_ssize_t endpos,\r | |
3478 | const char *reason)\r | |
3479 | {\r | |
3480 | if (*exceptionObject == NULL) {\r | |
3481 | *exceptionObject = PyUnicodeEncodeError_Create(\r | |
3482 | encoding, unicode, size, startpos, endpos, reason);\r | |
3483 | }\r | |
3484 | else {\r | |
3485 | if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))\r | |
3486 | goto onError;\r | |
3487 | if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))\r | |
3488 | goto onError;\r | |
3489 | if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))\r | |
3490 | goto onError;\r | |
3491 | return;\r | |
3492 | onError:\r | |
3493 | Py_DECREF(*exceptionObject);\r | |
3494 | *exceptionObject = NULL;\r | |
3495 | }\r | |
3496 | }\r | |
3497 | \r | |
3498 | /* raises a UnicodeEncodeError */\r | |
3499 | static void raise_encode_exception(PyObject **exceptionObject,\r | |
3500 | const char *encoding,\r | |
3501 | const Py_UNICODE *unicode, Py_ssize_t size,\r | |
3502 | Py_ssize_t startpos, Py_ssize_t endpos,\r | |
3503 | const char *reason)\r | |
3504 | {\r | |
3505 | make_encode_exception(exceptionObject,\r | |
3506 | encoding, unicode, size, startpos, endpos, reason);\r | |
3507 | if (*exceptionObject != NULL)\r | |
3508 | PyCodec_StrictErrors(*exceptionObject);\r | |
3509 | }\r | |
3510 | \r | |
3511 | /* error handling callback helper:\r | |
3512 | build arguments, call the callback and check the arguments,\r | |
3513 | put the result into newpos and return the replacement string, which\r | |
3514 | has to be freed by the caller */\r | |
3515 | static PyObject *unicode_encode_call_errorhandler(const char *errors,\r | |
3516 | PyObject **errorHandler,\r | |
3517 | const char *encoding, const char *reason,\r | |
3518 | const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,\r | |
3519 | Py_ssize_t startpos, Py_ssize_t endpos,\r | |
3520 | Py_ssize_t *newpos)\r | |
3521 | {\r | |
3522 | static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";\r | |
3523 | \r | |
3524 | PyObject *restuple;\r | |
3525 | PyObject *resunicode;\r | |
3526 | \r | |
3527 | if (*errorHandler == NULL) {\r | |
3528 | *errorHandler = PyCodec_LookupError(errors);\r | |
3529 | if (*errorHandler == NULL)\r | |
3530 | return NULL;\r | |
3531 | }\r | |
3532 | \r | |
3533 | make_encode_exception(exceptionObject,\r | |
3534 | encoding, unicode, size, startpos, endpos, reason);\r | |
3535 | if (*exceptionObject == NULL)\r | |
3536 | return NULL;\r | |
3537 | \r | |
3538 | restuple = PyObject_CallFunctionObjArgs(\r | |
3539 | *errorHandler, *exceptionObject, NULL);\r | |
3540 | if (restuple == NULL)\r | |
3541 | return NULL;\r | |
3542 | if (!PyTuple_Check(restuple)) {\r | |
3543 | PyErr_SetString(PyExc_TypeError, &argparse[4]);\r | |
3544 | Py_DECREF(restuple);\r | |
3545 | return NULL;\r | |
3546 | }\r | |
3547 | if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,\r | |
3548 | &resunicode, newpos)) {\r | |
3549 | Py_DECREF(restuple);\r | |
3550 | return NULL;\r | |
3551 | }\r | |
3552 | if (*newpos<0)\r | |
3553 | *newpos = size+*newpos;\r | |
3554 | if (*newpos<0 || *newpos>size) {\r | |
3555 | PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);\r | |
3556 | Py_DECREF(restuple);\r | |
3557 | return NULL;\r | |
3558 | }\r | |
3559 | Py_INCREF(resunicode);\r | |
3560 | Py_DECREF(restuple);\r | |
3561 | return resunicode;\r | |
3562 | }\r | |
3563 | \r | |
3564 | static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,\r | |
3565 | Py_ssize_t size,\r | |
3566 | const char *errors,\r | |
3567 | int limit)\r | |
3568 | {\r | |
3569 | /* output object */\r | |
3570 | PyObject *res;\r | |
3571 | /* pointers to the beginning and end+1 of input */\r | |
3572 | const Py_UNICODE *startp = p;\r | |
3573 | const Py_UNICODE *endp = p + size;\r | |
3574 | /* pointer to the beginning of the unencodable characters */\r | |
3575 | /* const Py_UNICODE *badp = NULL; */\r | |
3576 | /* pointer into the output */\r | |
3577 | char *str;\r | |
3578 | /* current output position */\r | |
3579 | Py_ssize_t respos = 0;\r | |
3580 | Py_ssize_t ressize;\r | |
3581 | const char *encoding = (limit == 256) ? "latin-1" : "ascii";\r | |
3582 | const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";\r | |
3583 | PyObject *errorHandler = NULL;\r | |
3584 | PyObject *exc = NULL;\r | |
3585 | /* the following variable is used for caching string comparisons\r | |
3586 | * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */\r | |
3587 | int known_errorHandler = -1;\r | |
3588 | \r | |
3589 | /* allocate enough for a simple encoding without\r | |
3590 | replacements, if we need more, we'll resize */\r | |
3591 | res = PyString_FromStringAndSize(NULL, size);\r | |
3592 | if (res == NULL)\r | |
3593 | goto onError;\r | |
3594 | if (size == 0)\r | |
3595 | return res;\r | |
3596 | str = PyString_AS_STRING(res);\r | |
3597 | ressize = size;\r | |
3598 | \r | |
3599 | while (p<endp) {\r | |
3600 | Py_UNICODE c = *p;\r | |
3601 | \r | |
3602 | /* can we encode this? */\r | |
3603 | if (c<limit) {\r | |
3604 | /* no overflow check, because we know that the space is enough */\r | |
3605 | *str++ = (char)c;\r | |
3606 | ++p;\r | |
3607 | }\r | |
3608 | else {\r | |
3609 | Py_ssize_t unicodepos = p-startp;\r | |
3610 | Py_ssize_t requiredsize;\r | |
3611 | PyObject *repunicode;\r | |
3612 | Py_ssize_t repsize;\r | |
3613 | Py_ssize_t newpos;\r | |
3614 | Py_ssize_t respos;\r | |
3615 | Py_UNICODE *uni2;\r | |
3616 | /* startpos for collecting unencodable chars */\r | |
3617 | const Py_UNICODE *collstart = p;\r | |
3618 | const Py_UNICODE *collend = p;\r | |
3619 | /* find all unecodable characters */\r | |
3620 | while ((collend < endp) && ((*collend)>=limit))\r | |
3621 | ++collend;\r | |
3622 | /* cache callback name lookup (if not done yet, i.e. it's the first error) */\r | |
3623 | if (known_errorHandler==-1) {\r | |
3624 | if ((errors==NULL) || (!strcmp(errors, "strict")))\r | |
3625 | known_errorHandler = 1;\r | |
3626 | else if (!strcmp(errors, "replace"))\r | |
3627 | known_errorHandler = 2;\r | |
3628 | else if (!strcmp(errors, "ignore"))\r | |
3629 | known_errorHandler = 3;\r | |
3630 | else if (!strcmp(errors, "xmlcharrefreplace"))\r | |
3631 | known_errorHandler = 4;\r | |
3632 | else\r | |
3633 | known_errorHandler = 0;\r | |
3634 | }\r | |
3635 | switch (known_errorHandler) {\r | |
3636 | case 1: /* strict */\r | |
3637 | raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);\r | |
3638 | goto onError;\r | |
3639 | case 2: /* replace */\r | |
3640 | while (collstart++<collend)\r | |
3641 | *str++ = '?'; /* fall through */\r | |
3642 | case 3: /* ignore */\r | |
3643 | p = collend;\r | |
3644 | break;\r | |
3645 | case 4: /* xmlcharrefreplace */\r | |
3646 | respos = str-PyString_AS_STRING(res);\r | |
3647 | /* determine replacement size (temporarily (mis)uses p) */\r | |
3648 | for (p = collstart, repsize = 0; p < collend; ++p) {\r | |
3649 | if (*p<10)\r | |
3650 | repsize += 2+1+1;\r | |
3651 | else if (*p<100)\r | |
3652 | repsize += 2+2+1;\r | |
3653 | else if (*p<1000)\r | |
3654 | repsize += 2+3+1;\r | |
3655 | else if (*p<10000)\r | |
3656 | repsize += 2+4+1;\r | |
3657 | #ifndef Py_UNICODE_WIDE\r | |
3658 | else\r | |
3659 | repsize += 2+5+1;\r | |
3660 | #else\r | |
3661 | else if (*p<100000)\r | |
3662 | repsize += 2+5+1;\r | |
3663 | else if (*p<1000000)\r | |
3664 | repsize += 2+6+1;\r | |
3665 | else\r | |
3666 | repsize += 2+7+1;\r | |
3667 | #endif\r | |
3668 | }\r | |
3669 | requiredsize = respos+repsize+(endp-collend);\r | |
3670 | if (requiredsize > ressize) {\r | |
3671 | if (requiredsize<2*ressize)\r | |
3672 | requiredsize = 2*ressize;\r | |
3673 | if (_PyString_Resize(&res, requiredsize))\r | |
3674 | goto onError;\r | |
3675 | str = PyString_AS_STRING(res) + respos;\r | |
3676 | ressize = requiredsize;\r | |
3677 | }\r | |
3678 | /* generate replacement (temporarily (mis)uses p) */\r | |
3679 | for (p = collstart; p < collend; ++p) {\r | |
3680 | str += sprintf(str, "&#%d;", (int)*p);\r | |
3681 | }\r | |
3682 | p = collend;\r | |
3683 | break;\r | |
3684 | default:\r | |
3685 | repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,\r | |
3686 | encoding, reason, startp, size, &exc,\r | |
3687 | collstart-startp, collend-startp, &newpos);\r | |
3688 | if (repunicode == NULL)\r | |
3689 | goto onError;\r | |
3690 | /* need more space? (at least enough for what we have+the\r | |
3691 | replacement+the rest of the string, so we won't have to\r | |
3692 | check space for encodable characters) */\r | |
3693 | respos = str-PyString_AS_STRING(res);\r | |
3694 | repsize = PyUnicode_GET_SIZE(repunicode);\r | |
3695 | requiredsize = respos+repsize+(endp-collend);\r | |
3696 | if (requiredsize > ressize) {\r | |
3697 | if (requiredsize<2*ressize)\r | |
3698 | requiredsize = 2*ressize;\r | |
3699 | if (_PyString_Resize(&res, requiredsize)) {\r | |
3700 | Py_DECREF(repunicode);\r | |
3701 | goto onError;\r | |
3702 | }\r | |
3703 | str = PyString_AS_STRING(res) + respos;\r | |
3704 | ressize = requiredsize;\r | |
3705 | }\r | |
3706 | /* check if there is anything unencodable in the replacement\r | |
3707 | and copy it to the output */\r | |
3708 | for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {\r | |
3709 | c = *uni2;\r | |
3710 | if (c >= limit) {\r | |
3711 | raise_encode_exception(&exc, encoding, startp, size,\r | |
3712 | unicodepos, unicodepos+1, reason);\r | |
3713 | Py_DECREF(repunicode);\r | |
3714 | goto onError;\r | |
3715 | }\r | |
3716 | *str = (char)c;\r | |
3717 | }\r | |
3718 | p = startp + newpos;\r | |
3719 | Py_DECREF(repunicode);\r | |
3720 | }\r | |
3721 | }\r | |
3722 | }\r | |
3723 | /* Resize if we allocated to much */\r | |
3724 | respos = str-PyString_AS_STRING(res);\r | |
3725 | if (respos<ressize)\r | |
3726 | /* If this falls res will be NULL */\r | |
3727 | _PyString_Resize(&res, respos);\r | |
3728 | Py_XDECREF(errorHandler);\r | |
3729 | Py_XDECREF(exc);\r | |
3730 | return res;\r | |
3731 | \r | |
3732 | onError:\r | |
3733 | Py_XDECREF(res);\r | |
3734 | Py_XDECREF(errorHandler);\r | |
3735 | Py_XDECREF(exc);\r | |
3736 | return NULL;\r | |
3737 | }\r | |
3738 | \r | |
3739 | PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,\r | |
3740 | Py_ssize_t size,\r | |
3741 | const char *errors)\r | |
3742 | {\r | |
3743 | return unicode_encode_ucs1(p, size, errors, 256);\r | |
3744 | }\r | |
3745 | \r | |
3746 | PyObject *PyUnicode_AsLatin1String(PyObject *unicode)\r | |
3747 | {\r | |
3748 | if (!PyUnicode_Check(unicode)) {\r | |
3749 | PyErr_BadArgument();\r | |
3750 | return NULL;\r | |
3751 | }\r | |
3752 | return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),\r | |
3753 | PyUnicode_GET_SIZE(unicode),\r | |
3754 | NULL);\r | |
3755 | }\r | |
3756 | \r | |
3757 | /* --- 7-bit ASCII Codec -------------------------------------------------- */\r | |
3758 | \r | |
3759 | PyObject *PyUnicode_DecodeASCII(const char *s,\r | |
3760 | Py_ssize_t size,\r | |
3761 | const char *errors)\r | |
3762 | {\r | |
3763 | const char *starts = s;\r | |
3764 | PyUnicodeObject *v;\r | |
3765 | Py_UNICODE *p;\r | |
3766 | Py_ssize_t startinpos;\r | |
3767 | Py_ssize_t endinpos;\r | |
3768 | Py_ssize_t outpos;\r | |
3769 | const char *e;\r | |
3770 | PyObject *errorHandler = NULL;\r | |
3771 | PyObject *exc = NULL;\r | |
3772 | \r | |
3773 | /* ASCII is equivalent to the first 128 ordinals in Unicode. */\r | |
3774 | if (size == 1 && *(unsigned char*)s < 128) {\r | |
3775 | Py_UNICODE r = *(unsigned char*)s;\r | |
3776 | return PyUnicode_FromUnicode(&r, 1);\r | |
3777 | }\r | |
3778 | \r | |
3779 | v = _PyUnicode_New(size);\r | |
3780 | if (v == NULL)\r | |
3781 | goto onError;\r | |
3782 | if (size == 0)\r | |
3783 | return (PyObject *)v;\r | |
3784 | p = PyUnicode_AS_UNICODE(v);\r | |
3785 | e = s + size;\r | |
3786 | while (s < e) {\r | |
3787 | register unsigned char c = (unsigned char)*s;\r | |
3788 | if (c < 128) {\r | |
3789 | *p++ = c;\r | |
3790 | ++s;\r | |
3791 | }\r | |
3792 | else {\r | |
3793 | startinpos = s-starts;\r | |
3794 | endinpos = startinpos + 1;\r | |
3795 | outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);\r | |
3796 | if (unicode_decode_call_errorhandler(\r | |
3797 | errors, &errorHandler,\r | |
3798 | "ascii", "ordinal not in range(128)",\r | |
3799 | starts, size, &startinpos, &endinpos, &exc, &s,\r | |
3800 | &v, &outpos, &p))\r | |
3801 | goto onError;\r | |
3802 | }\r | |
3803 | }\r | |
3804 | if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))\r | |
3805 | if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)\r | |
3806 | goto onError;\r | |
3807 | Py_XDECREF(errorHandler);\r | |
3808 | Py_XDECREF(exc);\r | |
3809 | return (PyObject *)v;\r | |
3810 | \r | |
3811 | onError:\r | |
3812 | Py_XDECREF(v);\r | |
3813 | Py_XDECREF(errorHandler);\r | |
3814 | Py_XDECREF(exc);\r | |
3815 | return NULL;\r | |
3816 | }\r | |
3817 | \r | |
3818 | PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,\r | |
3819 | Py_ssize_t size,\r | |
3820 | const char *errors)\r | |
3821 | {\r | |
3822 | return unicode_encode_ucs1(p, size, errors, 128);\r | |
3823 | }\r | |
3824 | \r | |
3825 | PyObject *PyUnicode_AsASCIIString(PyObject *unicode)\r | |
3826 | {\r | |
3827 | if (!PyUnicode_Check(unicode)) {\r | |
3828 | PyErr_BadArgument();\r | |
3829 | return NULL;\r | |
3830 | }\r | |
3831 | return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),\r | |
3832 | PyUnicode_GET_SIZE(unicode),\r | |
3833 | NULL);\r | |
3834 | }\r | |
3835 | \r | |
3836 | #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)\r | |
3837 | \r | |
3838 | /* --- MBCS codecs for Windows -------------------------------------------- */\r | |
3839 | \r | |
3840 | #if SIZEOF_INT < SIZEOF_SIZE_T\r | |
3841 | #define NEED_RETRY\r | |
3842 | #endif\r | |
3843 | \r | |
3844 | /* XXX This code is limited to "true" double-byte encodings, as\r | |
3845 | a) it assumes an incomplete character consists of a single byte, and\r | |
3846 | b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte\r | |
3847 | encodings, see IsDBCSLeadByteEx documentation. */\r | |
3848 | \r | |
/* Return non-zero if the byte at s[offset] is a DBCS lead byte, judged by
   IsDBCSLeadByte() on that byte and on the character CharPrev() reports
   just before it; return 0 otherwise. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    /* no previous character */
    if (prev == curr)
        return 1;
    /* previous character does not start with a lead byte */
    if (!IsDBCSLeadByte(*prev))
        return 1;
    /* previous character is two bytes long */
    return curr - prev == 2;
}
3859 | \r | |
3860 | /*\r | |
3861 | * Decode MBCS string into unicode object. If 'final' is set, converts\r | |
3862 | * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.\r | |
3863 | */\r | |
3864 | static int decode_mbcs(PyUnicodeObject **v,\r | |
3865 | const char *s, /* MBCS string */\r | |
3866 | int size, /* sizeof MBCS string */\r | |
3867 | int final)\r | |
3868 | {\r | |
3869 | Py_UNICODE *p;\r | |
3870 | Py_ssize_t n = 0;\r | |
3871 | int usize = 0;\r | |
3872 | \r | |
3873 | assert(size >= 0);\r | |
3874 | \r | |
3875 | /* Skip trailing lead-byte unless 'final' is set */\r | |
3876 | if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))\r | |
3877 | --size;\r | |
3878 | \r | |
3879 | /* First get the size of the result */\r | |
3880 | if (size > 0) {\r | |
3881 | usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);\r | |
3882 | if (usize == 0) {\r | |
3883 | PyErr_SetFromWindowsErrWithFilename(0, NULL);\r | |
3884 | return -1;\r | |
3885 | }\r | |
3886 | }\r | |
3887 | \r | |
3888 | if (*v == NULL) {\r | |
3889 | /* Create unicode object */\r | |
3890 | *v = _PyUnicode_New(usize);\r | |
3891 | if (*v == NULL)\r | |
3892 | return -1;\r | |
3893 | }\r | |
3894 | else {\r | |
3895 | /* Extend unicode object */\r | |
3896 | n = PyUnicode_GET_SIZE(*v);\r | |
3897 | if (_PyUnicode_Resize(v, n + usize) < 0)\r | |
3898 | return -1;\r | |
3899 | }\r | |
3900 | \r | |
3901 | /* Do the conversion */\r | |
3902 | if (size > 0) {\r | |
3903 | p = PyUnicode_AS_UNICODE(*v) + n;\r | |
3904 | if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {\r | |
3905 | PyErr_SetFromWindowsErrWithFilename(0, NULL);\r | |
3906 | return -1;\r | |
3907 | }\r | |
3908 | }\r | |
3909 | \r | |
3910 | return size;\r | |
3911 | }\r | |
3912 | \r | |
3913 | PyObject *PyUnicode_DecodeMBCSStateful(const char *s,\r | |
3914 | Py_ssize_t size,\r | |
3915 | const char *errors,\r | |
3916 | Py_ssize_t *consumed)\r | |
3917 | {\r | |
3918 | PyUnicodeObject *v = NULL;\r | |
3919 | int done;\r | |
3920 | \r | |
3921 | if (consumed)\r | |
3922 | *consumed = 0;\r | |
3923 | \r | |
3924 | #ifdef NEED_RETRY\r | |
3925 | retry:\r | |
3926 | if (size > INT_MAX)\r | |
3927 | done = decode_mbcs(&v, s, INT_MAX, 0);\r | |
3928 | else\r | |
3929 | #endif\r | |
3930 | done = decode_mbcs(&v, s, (int)size, !consumed);\r | |
3931 | \r | |
3932 | if (done < 0) {\r | |
3933 | Py_XDECREF(v);\r | |
3934 | return NULL;\r | |
3935 | }\r | |
3936 | \r | |
3937 | if (consumed)\r | |
3938 | *consumed += done;\r | |
3939 | \r | |
3940 | #ifdef NEED_RETRY\r | |
3941 | if (size > INT_MAX) {\r | |
3942 | s += done;\r | |
3943 | size -= done;\r | |
3944 | goto retry;\r | |
3945 | }\r | |
3946 | #endif\r | |
3947 | \r | |
3948 | return (PyObject *)v;\r | |
3949 | }\r | |
3950 | \r | |
3951 | PyObject *PyUnicode_DecodeMBCS(const char *s,\r | |
3952 | Py_ssize_t size,\r | |
3953 | const char *errors)\r | |
3954 | {\r | |
3955 | return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);\r | |
3956 | }\r | |
3957 | \r | |
3958 | /*\r | |
3959 | * Convert unicode into string object (MBCS).\r | |
3960 | * Returns 0 if succeed, -1 otherwise.\r | |
3961 | */\r | |
3962 | static int encode_mbcs(PyObject **repr,\r | |
3963 | const Py_UNICODE *p, /* unicode */\r | |
3964 | int size) /* size of unicode */\r | |
3965 | {\r | |
3966 | int mbcssize = 0;\r | |
3967 | Py_ssize_t n = 0;\r | |
3968 | \r | |
3969 | assert(size >= 0);\r | |
3970 | \r | |
3971 | /* First get the size of the result */\r | |
3972 | if (size > 0) {\r | |
3973 | mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);\r | |
3974 | if (mbcssize == 0) {\r | |
3975 | PyErr_SetFromWindowsErrWithFilename(0, NULL);\r | |
3976 | return -1;\r | |
3977 | }\r | |
3978 | }\r | |
3979 | \r | |
3980 | if (*repr == NULL) {\r | |
3981 | /* Create string object */\r | |
3982 | *repr = PyString_FromStringAndSize(NULL, mbcssize);\r | |
3983 | if (*repr == NULL)\r | |
3984 | return -1;\r | |
3985 | }\r | |
3986 | else {\r | |
3987 | /* Extend string object */\r | |
3988 | n = PyString_Size(*repr);\r | |
3989 | if (_PyString_Resize(repr, n + mbcssize) < 0)\r | |
3990 | return -1;\r | |
3991 | }\r | |
3992 | \r | |
3993 | /* Do the conversion */\r | |
3994 | if (size > 0) {\r | |
3995 | char *s = PyString_AS_STRING(*repr) + n;\r | |
3996 | if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {\r | |
3997 | PyErr_SetFromWindowsErrWithFilename(0, NULL);\r | |
3998 | return -1;\r | |
3999 | }\r | |
4000 | }\r | |
4001 | \r | |
4002 | return 0;\r | |
4003 | }\r | |
4004 | \r | |
4005 | PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,\r | |
4006 | Py_ssize_t size,\r | |
4007 | const char *errors)\r | |
4008 | {\r | |
4009 | PyObject *repr = NULL;\r | |
4010 | int ret;\r | |
4011 | \r | |
4012 | #ifdef NEED_RETRY\r | |
4013 | retry:\r | |
4014 | if (size > INT_MAX)\r | |
4015 | ret = encode_mbcs(&repr, p, INT_MAX);\r | |
4016 | else\r | |
4017 | #endif\r | |
4018 | ret = encode_mbcs(&repr, p, (int)size);\r | |
4019 | \r | |
4020 | if (ret < 0) {\r | |
4021 | Py_XDECREF(repr);\r | |
4022 | return NULL;\r | |
4023 | }\r | |
4024 | \r | |
4025 | #ifdef NEED_RETRY\r | |
4026 | if (size > INT_MAX) {\r | |
4027 | p += INT_MAX;\r | |
4028 | size -= INT_MAX;\r | |
4029 | goto retry;\r | |
4030 | }\r | |
4031 | #endif\r | |
4032 | \r | |
4033 | return repr;\r | |
4034 | }\r | |
4035 | \r | |
4036 | PyObject *PyUnicode_AsMBCSString(PyObject *unicode)\r | |
4037 | {\r | |
4038 | if (!PyUnicode_Check(unicode)) {\r | |
4039 | PyErr_BadArgument();\r | |
4040 | return NULL;\r | |
4041 | }\r | |
4042 | return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),\r | |
4043 | PyUnicode_GET_SIZE(unicode),\r | |
4044 | NULL);\r | |
4045 | }\r | |
4046 | \r | |
4047 | #undef NEED_RETRY\r | |
4048 | \r | |
4049 | #endif /* MS_WINDOWS */\r | |
4050 | \r | |
4051 | /* --- Character Mapping Codec -------------------------------------------- */\r | |
4052 | \r | |
4053 | PyObject *PyUnicode_DecodeCharmap(const char *s,\r | |
4054 | Py_ssize_t size,\r | |
4055 | PyObject *mapping,\r | |
4056 | const char *errors)\r | |
4057 | {\r | |
4058 | const char *starts = s;\r | |
4059 | Py_ssize_t startinpos;\r | |
4060 | Py_ssize_t endinpos;\r | |
4061 | Py_ssize_t outpos;\r | |
4062 | const char *e;\r | |
4063 | PyUnicodeObject *v;\r | |
4064 | Py_UNICODE *p;\r | |
4065 | Py_ssize_t extrachars = 0;\r | |
4066 | PyObject *errorHandler = NULL;\r | |
4067 | PyObject *exc = NULL;\r | |
4068 | Py_UNICODE *mapstring = NULL;\r | |
4069 | Py_ssize_t maplen = 0;\r | |
4070 | \r | |
4071 | /* Default to Latin-1 */\r | |
4072 | if (mapping == NULL)\r | |
4073 | return PyUnicode_DecodeLatin1(s, size, errors);\r | |
4074 | \r | |
4075 | v = _PyUnicode_New(size);\r | |
4076 | if (v == NULL)\r | |
4077 | goto onError;\r | |
4078 | if (size == 0)\r | |
4079 | return (PyObject *)v;\r | |
4080 | p = PyUnicode_AS_UNICODE(v);\r | |
4081 | e = s + size;\r | |
4082 | if (PyUnicode_CheckExact(mapping)) {\r | |
4083 | mapstring = PyUnicode_AS_UNICODE(mapping);\r | |
4084 | maplen = PyUnicode_GET_SIZE(mapping);\r | |
4085 | while (s < e) {\r | |
4086 | unsigned char ch = *s;\r | |
4087 | Py_UNICODE x = 0xfffe; /* illegal value */\r | |
4088 | \r | |
4089 | if (ch < maplen)\r | |
4090 | x = mapstring[ch];\r | |
4091 | \r | |
4092 | if (x == 0xfffe) {\r | |
4093 | /* undefined mapping */\r | |
4094 | outpos = p-PyUnicode_AS_UNICODE(v);\r | |
4095 | startinpos = s-starts;\r | |
4096 | endinpos = startinpos+1;\r | |
4097 | if (unicode_decode_call_errorhandler(\r | |
4098 | errors, &errorHandler,\r | |
4099 | "charmap", "character maps to <undefined>",\r | |
4100 | starts, size, &startinpos, &endinpos, &exc, &s,\r | |
4101 | &v, &outpos, &p)) {\r | |
4102 | goto onError;\r | |
4103 | }\r | |
4104 | continue;\r | |
4105 | }\r | |
4106 | *p++ = x;\r | |
4107 | ++s;\r | |
4108 | }\r | |
4109 | }\r | |
4110 | else {\r | |
4111 | while (s < e) {\r | |
4112 | unsigned char ch = *s;\r | |
4113 | PyObject *w, *x;\r | |
4114 | \r | |
4115 | /* Get mapping (char ordinal -> integer, Unicode char or None) */\r | |
4116 | w = PyInt_FromLong((long)ch);\r | |
4117 | if (w == NULL)\r | |
4118 | goto onError;\r | |
4119 | x = PyObject_GetItem(mapping, w);\r | |
4120 | Py_DECREF(w);\r | |
4121 | if (x == NULL) {\r | |
4122 | if (PyErr_ExceptionMatches(PyExc_LookupError)) {\r | |
4123 | /* No mapping found means: mapping is undefined. */\r | |
4124 | PyErr_Clear();\r | |
4125 | x = Py_None;\r | |
4126 | Py_INCREF(x);\r | |
4127 | } else\r | |
4128 | goto onError;\r | |
4129 | }\r | |
4130 | \r | |
4131 | /* Apply mapping */\r | |
4132 | if (PyInt_Check(x)) {\r | |
4133 | long value = PyInt_AS_LONG(x);\r | |
4134 | if (value < 0 || value > 65535) {\r | |
4135 | PyErr_SetString(PyExc_TypeError,\r | |
4136 | "character mapping must be in range(65536)");\r | |
4137 | Py_DECREF(x);\r | |
4138 | goto onError;\r | |
4139 | }\r | |
4140 | *p++ = (Py_UNICODE)value;\r | |
4141 | }\r | |
4142 | else if (x == Py_None) {\r | |
4143 | /* undefined mapping */\r | |
4144 | outpos = p-PyUnicode_AS_UNICODE(v);\r | |
4145 | startinpos = s-starts;\r | |
4146 | endinpos = startinpos+1;\r | |
4147 | if (unicode_decode_call_errorhandler(\r | |
4148 | errors, &errorHandler,\r | |
4149 | "charmap", "character maps to <undefined>",\r | |
4150 | starts, size, &startinpos, &endinpos, &exc, &s,\r | |
4151 | &v, &outpos, &p)) {\r | |
4152 | Py_DECREF(x);\r | |
4153 | goto onError;\r | |
4154 | }\r | |
4155 | Py_DECREF(x);\r | |
4156 | continue;\r | |
4157 | }\r | |
4158 | else if (PyUnicode_Check(x)) {\r | |
4159 | Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);\r | |
4160 | \r | |
4161 | if (targetsize == 1)\r | |
4162 | /* 1-1 mapping */\r | |
4163 | *p++ = *PyUnicode_AS_UNICODE(x);\r | |
4164 | \r | |
4165 | else if (targetsize > 1) {\r | |
4166 | /* 1-n mapping */\r | |
4167 | if (targetsize > extrachars) {\r | |
4168 | /* resize first */\r | |
4169 | Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);\r | |
4170 | Py_ssize_t needed = (targetsize - extrachars) + \\r | |
4171 | (targetsize << 2);\r | |
4172 | extrachars += needed;\r | |
4173 | /* XXX overflow detection missing */\r | |
4174 | if (_PyUnicode_Resize(&v,\r | |
4175 | PyUnicode_GET_SIZE(v) + needed) < 0) {\r | |
4176 | Py_DECREF(x);\r | |
4177 | goto onError;\r | |
4178 | }\r | |
4179 | p = PyUnicode_AS_UNICODE(v) + oldpos;\r | |
4180 | }\r | |
4181 | Py_UNICODE_COPY(p,\r | |
4182 | PyUnicode_AS_UNICODE(x),\r | |
4183 | targetsize);\r | |
4184 | p += targetsize;\r | |
4185 | extrachars -= targetsize;\r | |
4186 | }\r | |
4187 | /* 1-0 mapping: skip the character */\r | |
4188 | }\r | |
4189 | else {\r | |
4190 | /* wrong return value */\r | |
4191 | PyErr_SetString(PyExc_TypeError,\r | |
4192 | "character mapping must return integer, None or unicode");\r | |
4193 | Py_DECREF(x);\r | |
4194 | goto onError;\r | |
4195 | }\r | |
4196 | Py_DECREF(x);\r | |
4197 | ++s;\r | |
4198 | }\r | |
4199 | }\r | |
4200 | if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))\r | |
4201 | if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)\r | |
4202 | goto onError;\r | |
4203 | Py_XDECREF(errorHandler);\r | |
4204 | Py_XDECREF(exc);\r | |
4205 | return (PyObject *)v;\r | |
4206 | \r | |
4207 | onError:\r | |
4208 | Py_XDECREF(errorHandler);\r | |
4209 | Py_XDECREF(exc);\r | |
4210 | Py_XDECREF(v);\r | |
4211 | return NULL;\r | |
4212 | }\r | |
4213 | \r | |
4214 | /* Charmap encoding: the lookup table */\r | |
4215 | \r | |
4216 | struct encoding_map{\r | |
4217 | PyObject_HEAD\r | |
4218 | unsigned char level1[32];\r | |
4219 | int count2, count3;\r | |
4220 | unsigned char level23[1];\r | |
4221 | };\r | |
4222 | \r | |
4223 | static PyObject*\r | |
4224 | encoding_map_size(PyObject *obj, PyObject* args)\r | |
4225 | {\r | |
4226 | struct encoding_map *map = (struct encoding_map*)obj;\r | |
4227 | return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +\r | |
4228 | 128*map->count3);\r | |
4229 | }\r | |
4230 | \r | |
4231 | static PyMethodDef encoding_map_methods[] = {\r | |
4232 | {"size", encoding_map_size, METH_NOARGS,\r | |
4233 | PyDoc_STR("Return the size (in bytes) of this object") },\r | |
4234 | { 0 }\r | |
4235 | };\r | |
4236 | \r | |
4237 | static void\r | |
4238 | encoding_map_dealloc(PyObject* o)\r | |
4239 | {\r | |
4240 | PyObject_FREE(o);\r | |
4241 | }\r | |
4242 | \r | |
4243 | static PyTypeObject EncodingMapType = {\r | |
4244 | PyVarObject_HEAD_INIT(NULL, 0)\r | |
4245 | "EncodingMap", /*tp_name*/\r | |
4246 | sizeof(struct encoding_map), /*tp_basicsize*/\r | |
4247 | 0, /*tp_itemsize*/\r | |
4248 | /* methods */\r | |
4249 | encoding_map_dealloc, /*tp_dealloc*/\r | |
4250 | 0, /*tp_print*/\r | |
4251 | 0, /*tp_getattr*/\r | |
4252 | 0, /*tp_setattr*/\r | |
4253 | 0, /*tp_compare*/\r | |
4254 | 0, /*tp_repr*/\r | |
4255 | 0, /*tp_as_number*/\r | |
4256 | 0, /*tp_as_sequence*/\r | |
4257 | 0, /*tp_as_mapping*/\r | |
4258 | 0, /*tp_hash*/\r | |
4259 | 0, /*tp_call*/\r | |
4260 | 0, /*tp_str*/\r | |
4261 | 0, /*tp_getattro*/\r | |
4262 | 0, /*tp_setattro*/\r | |
4263 | 0, /*tp_as_buffer*/\r | |
4264 | Py_TPFLAGS_DEFAULT, /*tp_flags*/\r | |
4265 | 0, /*tp_doc*/\r | |
4266 | 0, /*tp_traverse*/\r | |
4267 | 0, /*tp_clear*/\r | |
4268 | 0, /*tp_richcompare*/\r | |
4269 | 0, /*tp_weaklistoffset*/\r | |
4270 | 0, /*tp_iter*/\r | |
4271 | 0, /*tp_iternext*/\r | |
4272 | encoding_map_methods, /*tp_methods*/\r | |
4273 | 0, /*tp_members*/\r | |
4274 | 0, /*tp_getset*/\r | |
4275 | 0, /*tp_base*/\r | |
4276 | 0, /*tp_dict*/\r | |
4277 | 0, /*tp_descr_get*/\r | |
4278 | 0, /*tp_descr_set*/\r | |
4279 | 0, /*tp_dictoffset*/\r | |
4280 | 0, /*tp_init*/\r | |
4281 | 0, /*tp_alloc*/\r | |
4282 | 0, /*tp_new*/\r | |
4283 | 0, /*tp_free*/\r | |
4284 | 0, /*tp_is_gc*/\r | |
4285 | };\r | |
4286 | \r | |
4287 | PyObject*\r | |
4288 | PyUnicode_BuildEncodingMap(PyObject* string)\r | |
4289 | {\r | |
4290 | Py_UNICODE *decode;\r | |
4291 | PyObject *result;\r | |
4292 | struct encoding_map *mresult;\r | |
4293 | int i;\r | |
4294 | int need_dict = 0;\r | |
4295 | unsigned char level1[32];\r | |
4296 | unsigned char level2[512];\r | |
4297 | unsigned char *mlevel1, *mlevel2, *mlevel3;\r | |
4298 | int count2 = 0, count3 = 0;\r | |
4299 | \r | |
4300 | if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {\r | |
4301 | PyErr_BadArgument();\r | |
4302 | return NULL;\r | |
4303 | }\r | |
4304 | decode = PyUnicode_AS_UNICODE(string);\r | |
4305 | memset(level1, 0xFF, sizeof level1);\r | |
4306 | memset(level2, 0xFF, sizeof level2);\r | |
4307 | \r | |
4308 | /* If there isn't a one-to-one mapping of NULL to \0,\r | |
4309 | or if there are non-BMP characters, we need to use\r | |
4310 | a mapping dictionary. */\r | |
4311 | if (decode[0] != 0)\r | |
4312 | need_dict = 1;\r | |
4313 | for (i = 1; i < 256; i++) {\r | |
4314 | int l1, l2;\r | |
4315 | if (decode[i] == 0\r | |
4316 | #ifdef Py_UNICODE_WIDE\r | |
4317 | || decode[i] > 0xFFFF\r | |
4318 | #endif\r | |
4319 | ) {\r | |
4320 | need_dict = 1;\r | |
4321 | break;\r | |
4322 | }\r | |
4323 | if (decode[i] == 0xFFFE)\r | |
4324 | /* unmapped character */\r | |
4325 | continue;\r | |
4326 | l1 = decode[i] >> 11;\r | |
4327 | l2 = decode[i] >> 7;\r | |
4328 | if (level1[l1] == 0xFF)\r | |
4329 | level1[l1] = count2++;\r | |
4330 | if (level2[l2] == 0xFF)\r | |
4331 | level2[l2] = count3++;\r | |
4332 | }\r | |
4333 | \r | |
4334 | if (count2 >= 0xFF || count3 >= 0xFF)\r | |
4335 | need_dict = 1;\r | |
4336 | \r | |
4337 | if (need_dict) {\r | |
4338 | PyObject *result = PyDict_New();\r | |
4339 | PyObject *key, *value;\r | |
4340 | if (!result)\r | |
4341 | return NULL;\r | |
4342 | for (i = 0; i < 256; i++) {\r | |
4343 | value = NULL;\r | |
4344 | key = PyInt_FromLong(decode[i]);\r | |
4345 | value = PyInt_FromLong(i);\r | |
4346 | if (!key || !value)\r | |
4347 | goto failed1;\r | |
4348 | if (PyDict_SetItem(result, key, value) == -1)\r | |
4349 | goto failed1;\r | |
4350 | Py_DECREF(key);\r | |
4351 | Py_DECREF(value);\r | |
4352 | }\r | |
4353 | return result;\r | |
4354 | failed1:\r | |
4355 | Py_XDECREF(key);\r | |
4356 | Py_XDECREF(value);\r | |
4357 | Py_DECREF(result);\r | |
4358 | return NULL;\r | |
4359 | }\r | |
4360 | \r | |
4361 | /* Create a three-level trie */\r | |
4362 | result = PyObject_MALLOC(sizeof(struct encoding_map) +\r | |
4363 | 16*count2 + 128*count3 - 1);\r | |
4364 | if (!result)\r | |
4365 | return PyErr_NoMemory();\r | |
4366 | PyObject_Init(result, &EncodingMapType);\r | |
4367 | mresult = (struct encoding_map*)result;\r | |
4368 | mresult->count2 = count2;\r | |
4369 | mresult->count3 = count3;\r | |
4370 | mlevel1 = mresult->level1;\r | |
4371 | mlevel2 = mresult->level23;\r | |
4372 | mlevel3 = mresult->level23 + 16*count2;\r | |
4373 | memcpy(mlevel1, level1, 32);\r | |
4374 | memset(mlevel2, 0xFF, 16*count2);\r | |
4375 | memset(mlevel3, 0, 128*count3);\r | |
4376 | count3 = 0;\r | |
4377 | for (i = 1; i < 256; i++) {\r | |
4378 | int o1, o2, o3, i2, i3;\r | |
4379 | if (decode[i] == 0xFFFE)\r | |
4380 | /* unmapped character */\r | |
4381 | continue;\r | |
4382 | o1 = decode[i]>>11;\r | |
4383 | o2 = (decode[i]>>7) & 0xF;\r | |
4384 | i2 = 16*mlevel1[o1] + o2;\r | |
4385 | if (mlevel2[i2] == 0xFF)\r | |
4386 | mlevel2[i2] = count3++;\r | |
4387 | o3 = decode[i] & 0x7F;\r | |
4388 | i3 = 128*mlevel2[i2] + o3;\r | |
4389 | mlevel3[i3] = i;\r | |
4390 | }\r | |
4391 | return result;\r | |
4392 | }\r | |
4393 | \r | |
4394 | static int\r | |
4395 | encoding_map_lookup(Py_UNICODE c, PyObject *mapping)\r | |
4396 | {\r | |
4397 | struct encoding_map *map = (struct encoding_map*)mapping;\r | |
4398 | int l1 = c>>11;\r | |
4399 | int l2 = (c>>7) & 0xF;\r | |
4400 | int l3 = c & 0x7F;\r | |
4401 | int i;\r | |
4402 | \r | |
4403 | #ifdef Py_UNICODE_WIDE\r | |
4404 | if (c > 0xFFFF) {\r | |
4405 | return -1;\r | |
4406 | }\r | |
4407 | #endif\r | |
4408 | if (c == 0)\r | |
4409 | return 0;\r | |
4410 | /* level 1*/\r | |
4411 | i = map->level1[l1];\r | |
4412 | if (i == 0xFF) {\r | |
4413 | return -1;\r | |
4414 | }\r | |
4415 | /* level 2*/\r | |
4416 | i = map->level23[16*i+l2];\r | |
4417 | if (i == 0xFF) {\r | |
4418 | return -1;\r | |
4419 | }\r | |
4420 | /* level 3 */\r | |
4421 | i = map->level23[16*map->count2 + 128*i + l3];\r | |
4422 | if (i == 0) {\r | |
4423 | return -1;\r | |
4424 | }\r | |
4425 | return i;\r | |
4426 | }\r | |
4427 | \r | |
4428 | /* Lookup the character ch in the mapping. If the character\r | |
4429 | can't be found, Py_None is returned (or NULL, if another\r | |
4430 | error occurred). */\r | |
4431 | static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)\r | |
4432 | {\r | |
4433 | PyObject *w = PyInt_FromLong((long)c);\r | |
4434 | PyObject *x;\r | |
4435 | \r | |
4436 | if (w == NULL)\r | |
4437 | return NULL;\r | |
4438 | x = PyObject_GetItem(mapping, w);\r | |
4439 | Py_DECREF(w);\r | |
4440 | if (x == NULL) {\r | |
4441 | if (PyErr_ExceptionMatches(PyExc_LookupError)) {\r | |
4442 | /* No mapping found means: mapping is undefined. */\r | |
4443 | PyErr_Clear();\r | |
4444 | x = Py_None;\r | |
4445 | Py_INCREF(x);\r | |
4446 | return x;\r | |
4447 | } else\r | |
4448 | return NULL;\r | |
4449 | }\r | |
4450 | else if (x == Py_None)\r | |
4451 | return x;\r | |
4452 | else if (PyInt_Check(x)) {\r | |
4453 | long value = PyInt_AS_LONG(x);\r | |
4454 | if (value < 0 || value > 255) {\r | |
4455 | PyErr_SetString(PyExc_TypeError,\r | |
4456 | "character mapping must be in range(256)");\r | |
4457 | Py_DECREF(x);\r | |
4458 | return NULL;\r | |
4459 | }\r | |
4460 | return x;\r | |
4461 | }\r | |
4462 | else if (PyString_Check(x))\r | |
4463 | return x;\r | |
4464 | else {\r | |
4465 | /* wrong return value */\r | |
4466 | PyErr_SetString(PyExc_TypeError,\r | |
4467 | "character mapping must return integer, None or str");\r | |
4468 | Py_DECREF(x);\r | |
4469 | return NULL;\r | |
4470 | }\r | |
4471 | }\r | |
4472 | \r | |
4473 | static int\r | |
4474 | charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)\r | |
4475 | {\r | |
4476 | Py_ssize_t outsize = PyString_GET_SIZE(*outobj);\r | |
4477 | /* exponentially overallocate to minimize reallocations */\r | |
4478 | if (requiredsize < 2*outsize)\r | |
4479 | requiredsize = 2*outsize;\r | |
4480 | if (_PyString_Resize(outobj, requiredsize)) {\r | |
4481 | return 0;\r | |
4482 | }\r | |
4483 | return 1;\r | |
4484 | }\r | |
4485 | \r | |
/* Status codes returned by charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* the lookup or the output resize failed */
} charmapencode_result;
4489 | /* lookup the character, put the result in the output string and adjust\r | |
4490 | various state variables. Reallocate the output string if not enough\r | |
4491 | space is available. Return a new reference to the object that\r | |
4492 | was put in the output buffer, or Py_None, if the mapping was undefined\r | |
4493 | (in which case no character was written) or NULL, if a\r | |
4494 | reallocation error occurred. The caller must decref the result */\r | |
4495 | static\r | |
4496 | charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,\r | |
4497 | PyObject **outobj, Py_ssize_t *outpos)\r | |
4498 | {\r | |
4499 | PyObject *rep;\r | |
4500 | char *outstart;\r | |
4501 | Py_ssize_t outsize = PyString_GET_SIZE(*outobj);\r | |
4502 | \r | |
4503 | if (Py_TYPE(mapping) == &EncodingMapType) {\r | |
4504 | int res = encoding_map_lookup(c, mapping);\r | |
4505 | Py_ssize_t requiredsize = *outpos+1;\r | |
4506 | if (res == -1)\r | |
4507 | return enc_FAILED;\r | |
4508 | if (outsize<requiredsize)\r | |
4509 | if (!charmapencode_resize(outobj, outpos, requiredsize))\r | |
4510 | return enc_EXCEPTION;\r | |
4511 | outstart = PyString_AS_STRING(*outobj);\r | |
4512 | outstart[(*outpos)++] = (char)res;\r | |
4513 | return enc_SUCCESS;\r | |
4514 | }\r | |
4515 | \r | |
4516 | rep = charmapencode_lookup(c, mapping);\r | |
4517 | if (rep==NULL)\r | |
4518 | return enc_EXCEPTION;\r | |
4519 | else if (rep==Py_None) {\r | |
4520 | Py_DECREF(rep);\r | |
4521 | return enc_FAILED;\r | |
4522 | } else {\r | |
4523 | if (PyInt_Check(rep)) {\r | |
4524 | Py_ssize_t requiredsize = *outpos+1;\r | |
4525 | if (outsize<requiredsize)\r | |
4526 | if (!charmapencode_resize(outobj, outpos, requiredsize)) {\r | |
4527 | Py_DECREF(rep);\r | |
4528 | return enc_EXCEPTION;\r | |
4529 | }\r | |
4530 | outstart = PyString_AS_STRING(*outobj);\r | |
4531 | outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);\r | |
4532 | }\r | |
4533 | else {\r | |
4534 | const char *repchars = PyString_AS_STRING(rep);\r | |
4535 | Py_ssize_t repsize = PyString_GET_SIZE(rep);\r | |
4536 | Py_ssize_t requiredsize = *outpos+repsize;\r | |
4537 | if (outsize<requiredsize)\r | |
4538 | if (!charmapencode_resize(outobj, outpos, requiredsize)) {\r | |
4539 | Py_DECREF(rep);\r | |
4540 | return enc_EXCEPTION;\r | |
4541 | }\r | |
4542 | outstart = PyString_AS_STRING(*outobj);\r | |
4543 | memcpy(outstart + *outpos, repchars, repsize);\r | |
4544 | *outpos += repsize;\r | |
4545 | }\r | |
4546 | }\r | |
4547 | Py_DECREF(rep);\r | |
4548 | return enc_SUCCESS;\r | |
4549 | }\r | |
4550 | \r | |
4551 | /* handle an error in PyUnicode_EncodeCharmap\r | |
4552 | Return 0 on success, -1 on error */\r | |
4553 | static\r | |
4554 | int charmap_encoding_error(\r | |
4555 | const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,\r | |
4556 | PyObject **exceptionObject,\r | |
4557 | int *known_errorHandler, PyObject **errorHandler, const char *errors,\r | |
4558 | PyObject **res, Py_ssize_t *respos)\r | |
4559 | {\r | |
4560 | PyObject *repunicode = NULL; /* initialize to prevent gcc warning */\r | |
4561 | Py_ssize_t repsize;\r | |
4562 | Py_ssize_t newpos;\r | |
4563 | Py_UNICODE *uni2;\r | |
4564 | /* startpos for collecting unencodable chars */\r | |
4565 | Py_ssize_t collstartpos = *inpos;\r | |
4566 | Py_ssize_t collendpos = *inpos+1;\r | |
4567 | Py_ssize_t collpos;\r | |
4568 | char *encoding = "charmap";\r | |
4569 | char *reason = "character maps to <undefined>";\r | |
4570 | charmapencode_result x;\r | |
4571 | \r | |
4572 | /* find all unencodable characters */\r | |
4573 | while (collendpos < size) {\r | |
4574 | PyObject *rep;\r | |
4575 | if (Py_TYPE(mapping) == &EncodingMapType) {\r | |
4576 | int res = encoding_map_lookup(p[collendpos], mapping);\r | |
4577 | if (res != -1)\r | |
4578 | break;\r | |
4579 | ++collendpos;\r | |
4580 | continue;\r | |
4581 | }\r | |
4582 | \r | |
4583 | rep = charmapencode_lookup(p[collendpos], mapping);\r | |
4584 | if (rep==NULL)\r | |
4585 | return -1;\r | |
4586 | else if (rep!=Py_None) {\r | |
4587 | Py_DECREF(rep);\r | |
4588 | break;\r | |
4589 | }\r | |
4590 | Py_DECREF(rep);\r | |
4591 | ++collendpos;\r | |
4592 | }\r | |
4593 | /* cache callback name lookup\r | |
4594 | * (if not done yet, i.e. it's the first error) */\r | |
4595 | if (*known_errorHandler==-1) {\r | |
4596 | if ((errors==NULL) || (!strcmp(errors, "strict")))\r | |
4597 | *known_errorHandler = 1;\r | |
4598 | else if (!strcmp(errors, "replace"))\r | |
4599 | *known_errorHandler = 2;\r | |
4600 | else if (!strcmp(errors, "ignore"))\r | |
4601 | *known_errorHandler = 3;\r | |
4602 | else if (!strcmp(errors, "xmlcharrefreplace"))\r | |
4603 | *known_errorHandler = 4;\r | |
4604 | else\r | |
4605 | *known_errorHandler = 0;\r | |
4606 | }\r | |
4607 | switch (*known_errorHandler) {\r | |
4608 | case 1: /* strict */\r | |
4609 | raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);\r | |
4610 | return -1;\r | |
4611 | case 2: /* replace */\r | |
4612 | for (collpos = collstartpos; collpos<collendpos; ++collpos) {\r | |
4613 | x = charmapencode_output('?', mapping, res, respos);\r | |
4614 | if (x==enc_EXCEPTION) {\r | |
4615 | return -1;\r | |
4616 | }\r | |
4617 | else if (x==enc_FAILED) {\r | |
4618 | raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);\r | |
4619 | return -1;\r | |
4620 | }\r | |
4621 | }\r | |
4622 | /* fall through */\r | |
4623 | case 3: /* ignore */\r | |
4624 | *inpos = collendpos;\r | |
4625 | break;\r | |
4626 | case 4: /* xmlcharrefreplace */\r | |
4627 | /* generate replacement (temporarily (mis)uses p) */\r | |
4628 | for (collpos = collstartpos; collpos < collendpos; ++collpos) {\r | |
4629 | char buffer[2+29+1+1];\r | |
4630 | char *cp;\r | |
4631 | sprintf(buffer, "&#%d;", (int)p[collpos]);\r | |
4632 | for (cp = buffer; *cp; ++cp) {\r | |
4633 | x = charmapencode_output(*cp, mapping, res, respos);\r | |
4634 | if (x==enc_EXCEPTION)\r | |
4635 | return -1;\r | |
4636 | else if (x==enc_FAILED) {\r | |
4637 | raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);\r | |
4638 | return -1;\r | |
4639 | }\r | |
4640 | }\r | |
4641 | }\r | |
4642 | *inpos = collendpos;\r | |
4643 | break;\r | |
4644 | default:\r | |
4645 | repunicode = unicode_encode_call_errorhandler(errors, errorHandler,\r | |
4646 | encoding, reason, p, size, exceptionObject,\r | |
4647 | collstartpos, collendpos, &newpos);\r | |
4648 | if (repunicode == NULL)\r | |
4649 | return -1;\r | |
4650 | /* generate replacement */\r | |
4651 | repsize = PyUnicode_GET_SIZE(repunicode);\r | |
4652 | for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {\r | |
4653 | x = charmapencode_output(*uni2, mapping, res, respos);\r | |
4654 | if (x==enc_EXCEPTION) {\r | |
4655 | return -1;\r | |
4656 | }\r | |
4657 | else if (x==enc_FAILED) {\r | |
4658 | Py_DECREF(repunicode);\r | |
4659 | raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);\r | |
4660 | return -1;\r | |
4661 | }\r | |
4662 | }\r | |
4663 | *inpos = newpos;\r | |
4664 | Py_DECREF(repunicode);\r | |
4665 | }\r | |
4666 | return 0;\r | |
4667 | }\r | |
4668 | \r | |
4669 | PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,\r | |
4670 | Py_ssize_t size,\r | |
4671 | PyObject *mapping,\r | |
4672 | const char *errors)\r | |
4673 | {\r | |
4674 | /* output object */\r | |
4675 | PyObject *res = NULL;\r | |
4676 | /* current input position */\r | |
4677 | Py_ssize_t inpos = 0;\r | |
4678 | /* current output position */\r | |
4679 | Py_ssize_t respos = 0;\r | |
4680 | PyObject *errorHandler = NULL;\r | |
4681 | PyObject *exc = NULL;\r | |
4682 | /* the following variable is used for caching string comparisons\r | |
4683 | * -1=not initialized, 0=unknown, 1=strict, 2=replace,\r | |
4684 | * 3=ignore, 4=xmlcharrefreplace */\r | |
4685 | int known_errorHandler = -1;\r | |
4686 | \r | |
4687 | /* Default to Latin-1 */\r | |
4688 | if (mapping == NULL)\r | |
4689 | return PyUnicode_EncodeLatin1(p, size, errors);\r | |
4690 | \r | |
4691 | /* allocate enough for a simple encoding without\r | |
4692 | replacements, if we need more, we'll resize */\r | |
4693 | res = PyString_FromStringAndSize(NULL, size);\r | |
4694 | if (res == NULL)\r | |
4695 | goto onError;\r | |
4696 | if (size == 0)\r | |
4697 | return res;\r | |
4698 | \r | |
4699 | while (inpos<size) {\r | |
4700 | /* try to encode it */\r | |
4701 | charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);\r | |
4702 | if (x==enc_EXCEPTION) /* error */\r | |
4703 | goto onError;\r | |
4704 | if (x==enc_FAILED) { /* unencodable character */\r | |
4705 | if (charmap_encoding_error(p, size, &inpos, mapping,\r | |
4706 | &exc,\r | |
4707 | &known_errorHandler, &errorHandler, errors,\r | |
4708 | &res, &respos)) {\r | |
4709 | goto onError;\r | |
4710 | }\r | |
4711 | }\r | |
4712 | else\r | |
4713 | /* done with this character => adjust input position */\r | |
4714 | ++inpos;\r | |
4715 | }\r | |
4716 | \r | |
4717 | /* Resize if we allocated to much */\r | |
4718 | if (respos<PyString_GET_SIZE(res)) {\r | |
4719 | if (_PyString_Resize(&res, respos))\r | |
4720 | goto onError;\r | |
4721 | }\r | |
4722 | Py_XDECREF(exc);\r | |
4723 | Py_XDECREF(errorHandler);\r | |
4724 | return res;\r | |
4725 | \r | |
4726 | onError:\r | |
4727 | Py_XDECREF(res);\r | |
4728 | Py_XDECREF(exc);\r | |
4729 | Py_XDECREF(errorHandler);\r | |
4730 | return NULL;\r | |
4731 | }\r | |
4732 | \r | |
4733 | PyObject *PyUnicode_AsCharmapString(PyObject *unicode,\r | |
4734 | PyObject *mapping)\r | |
4735 | {\r | |
4736 | if (!PyUnicode_Check(unicode) || mapping == NULL) {\r | |
4737 | PyErr_BadArgument();\r | |
4738 | return NULL;\r | |
4739 | }\r | |
4740 | return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),\r | |
4741 | PyUnicode_GET_SIZE(unicode),\r | |
4742 | mapping,\r | |
4743 | NULL);\r | |
4744 | }\r | |
4745 | \r | |
4746 | /* create or adjust a UnicodeTranslateError */\r | |
4747 | static void make_translate_exception(PyObject **exceptionObject,\r | |
4748 | const Py_UNICODE *unicode, Py_ssize_t size,\r | |
4749 | Py_ssize_t startpos, Py_ssize_t endpos,\r | |
4750 | const char *reason)\r | |
4751 | {\r | |
4752 | if (*exceptionObject == NULL) {\r | |
4753 | *exceptionObject = PyUnicodeTranslateError_Create(\r | |
4754 | unicode, size, startpos, endpos, reason);\r | |
4755 | }\r | |
4756 | else {\r | |
4757 | if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))\r | |
4758 | goto onError;\r | |
4759 | if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))\r | |
4760 | goto onError;\r | |
4761 | if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))\r | |
4762 | goto onError;\r | |
4763 | return;\r | |
4764 | onError:\r | |
4765 | Py_DECREF(*exceptionObject);\r | |
4766 | *exceptionObject = NULL;\r | |
4767 | }\r | |
4768 | }\r | |
4769 | \r | |
4770 | /* raises a UnicodeTranslateError */\r | |
4771 | static void raise_translate_exception(PyObject **exceptionObject,\r | |
4772 | const Py_UNICODE *unicode, Py_ssize_t size,\r | |
4773 | Py_ssize_t startpos, Py_ssize_t endpos,\r | |
4774 | const char *reason)\r | |
4775 | {\r | |
4776 | make_translate_exception(exceptionObject,\r | |
4777 | unicode, size, startpos, endpos, reason);\r | |
4778 | if (*exceptionObject != NULL)\r | |
4779 | PyCodec_StrictErrors(*exceptionObject);\r | |
4780 | }\r | |
4781 | \r | |
4782 | /* error handling callback helper:\r | |
4783 | build arguments, call the callback and check the arguments,\r | |
4784 | put the result into newpos and return the replacement string, which\r | |
4785 | has to be freed by the caller */\r | |
4786 | static PyObject *unicode_translate_call_errorhandler(const char *errors,\r | |
4787 | PyObject **errorHandler,\r | |
4788 | const char *reason,\r | |
4789 | const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,\r | |
4790 | Py_ssize_t startpos, Py_ssize_t endpos,\r | |
4791 | Py_ssize_t *newpos)\r | |
4792 | {\r | |
4793 | static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";\r | |
4794 | \r | |
4795 | Py_ssize_t i_newpos;\r | |
4796 | PyObject *restuple;\r | |
4797 | PyObject *resunicode;\r | |
4798 | \r | |
4799 | if (*errorHandler == NULL) {\r | |
4800 | *errorHandler = PyCodec_LookupError(errors);\r | |
4801 | if (*errorHandler == NULL)\r | |
4802 | return NULL;\r | |
4803 | }\r | |
4804 | \r | |
4805 | make_translate_exception(exceptionObject,\r | |
4806 | unicode, size, startpos, endpos, reason);\r | |
4807 | if (*exceptionObject == NULL)\r | |
4808 | return NULL;\r | |
4809 | \r | |
4810 | restuple = PyObject_CallFunctionObjArgs(\r | |
4811 | *errorHandler, *exceptionObject, NULL);\r | |
4812 | if (restuple == NULL)\r | |
4813 | return NULL;\r | |
4814 | if (!PyTuple_Check(restuple)) {\r | |
4815 | PyErr_SetString(PyExc_TypeError, &argparse[4]);\r | |
4816 | Py_DECREF(restuple);\r | |
4817 | return NULL;\r | |
4818 | }\r | |
4819 | if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,\r | |
4820 | &resunicode, &i_newpos)) {\r | |
4821 | Py_DECREF(restuple);\r | |
4822 | return NULL;\r | |
4823 | }\r | |
4824 | if (i_newpos<0)\r | |
4825 | *newpos = size+i_newpos;\r | |
4826 | else\r | |
4827 | *newpos = i_newpos;\r | |
4828 | if (*newpos<0 || *newpos>size) {\r | |
4829 | PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);\r | |
4830 | Py_DECREF(restuple);\r | |
4831 | return NULL;\r | |
4832 | }\r | |
4833 | Py_INCREF(resunicode);\r | |
4834 | Py_DECREF(restuple);\r | |
4835 | return resunicode;\r | |
4836 | }\r | |
4837 | \r | |
4838 | /* Lookup the character ch in the mapping and put the result in result,\r | |
4839 | which must be decrefed by the caller.\r | |
4840 | Return 0 on success, -1 on error */\r | |
4841 | static\r | |
4842 | int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)\r | |
4843 | {\r | |
4844 | PyObject *w = PyInt_FromLong((long)c);\r | |
4845 | PyObject *x;\r | |
4846 | \r | |
4847 | if (w == NULL)\r | |
4848 | return -1;\r | |
4849 | x = PyObject_GetItem(mapping, w);\r | |
4850 | Py_DECREF(w);\r | |
4851 | if (x == NULL) {\r | |
4852 | if (PyErr_ExceptionMatches(PyExc_LookupError)) {\r | |
4853 | /* No mapping found means: use 1:1 mapping. */\r | |
4854 | PyErr_Clear();\r | |
4855 | *result = NULL;\r | |
4856 | return 0;\r | |
4857 | } else\r | |
4858 | return -1;\r | |
4859 | }\r | |
4860 | else if (x == Py_None) {\r | |
4861 | *result = x;\r | |
4862 | return 0;\r | |
4863 | }\r | |
4864 | else if (PyInt_Check(x)) {\r | |
4865 | long value = PyInt_AS_LONG(x);\r | |
4866 | long max = PyUnicode_GetMax();\r | |
4867 | if (value < 0 || value > max) {\r | |
4868 | PyErr_Format(PyExc_TypeError,\r | |
4869 | "character mapping must be in range(0x%lx)", max+1);\r | |
4870 | Py_DECREF(x);\r | |
4871 | return -1;\r | |
4872 | }\r | |
4873 | *result = x;\r | |
4874 | return 0;\r | |
4875 | }\r | |
4876 | else if (PyUnicode_Check(x)) {\r | |
4877 | *result = x;\r | |
4878 | return 0;\r | |
4879 | }\r | |
4880 | else {\r | |
4881 | /* wrong return value */\r | |
4882 | PyErr_SetString(PyExc_TypeError,\r | |
4883 | "character mapping must return integer, None or unicode");\r | |
4884 | Py_DECREF(x);\r | |
4885 | return -1;\r | |
4886 | }\r | |
4887 | }\r | |
4888 | /* ensure that *outobj is at least requiredsize characters long,\r | |
4889 | if not reallocate and adjust various state variables.\r | |
4890 | Return 0 on success, -1 on error */\r | |
4891 | static\r | |
4892 | int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,\r | |
4893 | Py_ssize_t requiredsize)\r | |
4894 | {\r | |
4895 | Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);\r | |
4896 | if (requiredsize > oldsize) {\r | |
4897 | /* remember old output position */\r | |
4898 | Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);\r | |
4899 | /* exponentially overallocate to minimize reallocations */\r | |
4900 | if (requiredsize < 2 * oldsize)\r | |
4901 | requiredsize = 2 * oldsize;\r | |
4902 | if (PyUnicode_Resize(outobj, requiredsize) < 0)\r | |
4903 | return -1;\r | |
4904 | *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;\r | |
4905 | }\r | |
4906 | return 0;\r | |
4907 | }\r | |
4908 | /* lookup the character, put the result in the output string and adjust\r | |
4909 | various state variables. Return a new reference to the object that\r | |
4910 | was put in the output buffer in *result, or Py_None, if the mapping was\r | |
4911 | undefined (in which case no character was written).\r | |
4912 | The called must decref result.\r | |
4913 | Return 0 on success, -1 on error. */\r | |
4914 | static\r | |
4915 | int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,\r | |
4916 | Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,\r | |
4917 | PyObject **res)\r | |
4918 | {\r | |
4919 | if (charmaptranslate_lookup(*curinp, mapping, res))\r | |
4920 | return -1;\r | |
4921 | if (*res==NULL) {\r | |
4922 | /* not found => default to 1:1 mapping */\r | |
4923 | *(*outp)++ = *curinp;\r | |
4924 | }\r | |
4925 | else if (*res==Py_None)\r | |
4926 | ;\r | |
4927 | else if (PyInt_Check(*res)) {\r | |
4928 | /* no overflow check, because we know that the space is enough */\r | |
4929 | *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);\r | |
4930 | }\r | |
4931 | else if (PyUnicode_Check(*res)) {\r | |
4932 | Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);\r | |
4933 | if (repsize==1) {\r | |
4934 | /* no overflow check, because we know that the space is enough */\r | |
4935 | *(*outp)++ = *PyUnicode_AS_UNICODE(*res);\r | |
4936 | }\r | |
4937 | else if (repsize!=0) {\r | |
4938 | /* more than one character */\r | |
4939 | Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +\r | |
4940 | (insize - (curinp-startinp)) +\r | |
4941 | repsize - 1;\r | |
4942 | if (charmaptranslate_makespace(outobj, outp, requiredsize))\r | |
4943 | return -1;\r | |
4944 | memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);\r | |
4945 | *outp += repsize;\r | |
4946 | }\r | |
4947 | }\r | |
4948 | else\r | |
4949 | return -1;\r | |
4950 | return 0;\r | |
4951 | }\r | |
4952 | \r | |
4953 | PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,\r | |
4954 | Py_ssize_t size,\r | |
4955 | PyObject *mapping,\r | |
4956 | const char *errors)\r | |
4957 | {\r | |
4958 | /* output object */\r | |
4959 | PyObject *res = NULL;\r | |
4960 | /* pointers to the beginning and end+1 of input */\r | |
4961 | const Py_UNICODE *startp = p;\r | |
4962 | const Py_UNICODE *endp = p + size;\r | |
4963 | /* pointer into the output */\r | |
4964 | Py_UNICODE *str;\r | |
4965 | /* current output position */\r | |
4966 | Py_ssize_t respos = 0;\r | |
4967 | char *reason = "character maps to <undefined>";\r | |
4968 | PyObject *errorHandler = NULL;\r | |
4969 | PyObject *exc = NULL;\r | |
4970 | /* the following variable is used for caching string comparisons\r | |
4971 | * -1=not initialized, 0=unknown, 1=strict, 2=replace,\r | |
4972 | * 3=ignore, 4=xmlcharrefreplace */\r | |
4973 | int known_errorHandler = -1;\r | |
4974 | \r | |
4975 | if (mapping == NULL) {\r | |
4976 | PyErr_BadArgument();\r | |
4977 | return NULL;\r | |
4978 | }\r | |
4979 | \r | |
4980 | /* allocate enough for a simple 1:1 translation without\r | |
4981 | replacements, if we need more, we'll resize */\r | |
4982 | res = PyUnicode_FromUnicode(NULL, size);\r | |
4983 | if (res == NULL)\r | |
4984 | goto onError;\r | |
4985 | if (size == 0)\r | |
4986 | return res;\r | |
4987 | str = PyUnicode_AS_UNICODE(res);\r | |
4988 | \r | |
4989 | while (p<endp) {\r | |
4990 | /* try to encode it */\r | |
4991 | PyObject *x = NULL;\r | |
4992 | if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {\r | |
4993 | Py_XDECREF(x);\r | |
4994 | goto onError;\r | |
4995 | }\r | |
4996 | Py_XDECREF(x);\r | |
4997 | if (x!=Py_None) /* it worked => adjust input pointer */\r | |
4998 | ++p;\r | |
4999 | else { /* untranslatable character */\r | |
5000 | PyObject *repunicode = NULL; /* initialize to prevent gcc warning */\r | |
5001 | Py_ssize_t repsize;\r | |
5002 | Py_ssize_t newpos;\r | |
5003 | Py_UNICODE *uni2;\r | |
5004 | /* startpos for collecting untranslatable chars */\r | |
5005 | const Py_UNICODE *collstart = p;\r | |
5006 | const Py_UNICODE *collend = p+1;\r | |
5007 | const Py_UNICODE *coll;\r | |
5008 | \r | |
5009 | /* find all untranslatable characters */\r | |
5010 | while (collend < endp) {\r | |
5011 | if (charmaptranslate_lookup(*collend, mapping, &x))\r | |
5012 | goto onError;\r | |
5013 | Py_XDECREF(x);\r | |
5014 | if (x!=Py_None)\r | |
5015 | break;\r | |
5016 | ++collend;\r | |
5017 | }\r | |
5018 | /* cache callback name lookup\r | |
5019 | * (if not done yet, i.e. it's the first error) */\r | |
5020 | if (known_errorHandler==-1) {\r | |
5021 | if ((errors==NULL) || (!strcmp(errors, "strict")))\r | |
5022 | known_errorHandler = 1;\r | |
5023 | else if (!strcmp(errors, "replace"))\r | |
5024 | known_errorHandler = 2;\r | |
5025 | else if (!strcmp(errors, "ignore"))\r | |
5026 | known_errorHandler = 3;\r | |
5027 | else if (!strcmp(errors, "xmlcharrefreplace"))\r | |
5028 | known_errorHandler = 4;\r | |
5029 | else\r | |
5030 | known_errorHandler = 0;\r | |
5031 | }\r | |
5032 | switch (known_errorHandler) {\r | |
5033 | case 1: /* strict */\r | |
5034 | raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);\r | |
5035 | goto onError;\r | |
5036 | case 2: /* replace */\r | |
5037 | /* No need to check for space, this is a 1:1 replacement */\r | |
5038 | for (coll = collstart; coll<collend; ++coll)\r | |
5039 | *str++ = '?';\r | |
5040 | /* fall through */\r | |
5041 | case 3: /* ignore */\r | |
5042 | p = collend;\r | |
5043 | break;\r | |
5044 | case 4: /* xmlcharrefreplace */\r | |
5045 | /* generate replacement (temporarily (mis)uses p) */\r | |
5046 | for (p = collstart; p < collend; ++p) {\r | |
5047 | char buffer[2+29+1+1];\r | |
5048 | char *cp;\r | |
5049 | sprintf(buffer, "&#%d;", (int)*p);\r | |
5050 | if (charmaptranslate_makespace(&res, &str,\r | |
5051 | (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))\r | |
5052 | goto onError;\r | |
5053 | for (cp = buffer; *cp; ++cp)\r | |
5054 | *str++ = *cp;\r | |
5055 | }\r | |
5056 | p = collend;\r | |
5057 | break;\r | |
5058 | default:\r | |
5059 | repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,\r | |
5060 | reason, startp, size, &exc,\r | |
5061 | collstart-startp, collend-startp, &newpos);\r | |
5062 | if (repunicode == NULL)\r | |
5063 | goto onError;\r | |
5064 | /* generate replacement */\r | |
5065 | repsize = PyUnicode_GET_SIZE(repunicode);\r | |
5066 | if (charmaptranslate_makespace(&res, &str,\r | |
5067 | (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {\r | |
5068 | Py_DECREF(repunicode);\r | |
5069 | goto onError;\r | |
5070 | }\r | |
5071 | for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)\r | |
5072 | *str++ = *uni2;\r | |
5073 | p = startp + newpos;\r | |
5074 | Py_DECREF(repunicode);\r | |
5075 | }\r | |
5076 | }\r | |
5077 | }\r | |
5078 | /* Resize if we allocated to much */\r | |
5079 | respos = str-PyUnicode_AS_UNICODE(res);\r | |
5080 | if (respos<PyUnicode_GET_SIZE(res)) {\r | |
5081 | if (PyUnicode_Resize(&res, respos) < 0)\r | |
5082 | goto onError;\r | |
5083 | }\r | |
5084 | Py_XDECREF(exc);\r | |
5085 | Py_XDECREF(errorHandler);\r | |
5086 | return res;\r | |
5087 | \r | |
5088 | onError:\r | |
5089 | Py_XDECREF(res);\r | |
5090 | Py_XDECREF(exc);\r | |
5091 | Py_XDECREF(errorHandler);\r | |
5092 | return NULL;\r | |
5093 | }\r | |
5094 | \r | |
5095 | PyObject *PyUnicode_Translate(PyObject *str,\r | |
5096 | PyObject *mapping,\r | |
5097 | const char *errors)\r | |
5098 | {\r | |
5099 | PyObject *result;\r | |
5100 | \r | |
5101 | str = PyUnicode_FromObject(str);\r | |
5102 | if (str == NULL)\r | |
5103 | goto onError;\r | |
5104 | result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),\r | |
5105 | PyUnicode_GET_SIZE(str),\r | |
5106 | mapping,\r | |
5107 | errors);\r | |
5108 | Py_DECREF(str);\r | |
5109 | return result;\r | |
5110 | \r | |
5111 | onError:\r | |
5112 | Py_XDECREF(str);\r | |
5113 | return NULL;\r | |
5114 | }\r | |
5115 | \r | |
5116 | /* --- Decimal Encoder ---------------------------------------------------- */\r | |
5117 | \r | |
5118 | int PyUnicode_EncodeDecimal(Py_UNICODE *s,\r | |
5119 | Py_ssize_t length,\r | |
5120 | char *output,\r | |
5121 | const char *errors)\r | |
5122 | {\r | |
5123 | Py_UNICODE *p, *end;\r | |
5124 | PyObject *errorHandler = NULL;\r | |
5125 | PyObject *exc = NULL;\r | |
5126 | const char *encoding = "decimal";\r | |
5127 | const char *reason = "invalid decimal Unicode string";\r | |
5128 | /* the following variable is used for caching string comparisons\r | |
5129 | * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */\r | |
5130 | int known_errorHandler = -1;\r | |
5131 | \r | |
5132 | if (output == NULL) {\r | |
5133 | PyErr_BadArgument();\r | |
5134 | return -1;\r | |
5135 | }\r | |
5136 | \r | |
5137 | p = s;\r | |
5138 | end = s + length;\r | |
5139 | while (p < end) {\r | |
5140 | register Py_UNICODE ch = *p;\r | |
5141 | int decimal;\r | |
5142 | PyObject *repunicode;\r | |
5143 | Py_ssize_t repsize;\r | |
5144 | Py_ssize_t newpos;\r | |
5145 | Py_UNICODE *uni2;\r | |
5146 | Py_UNICODE *collstart;\r | |
5147 | Py_UNICODE *collend;\r | |
5148 | \r | |
5149 | if (Py_UNICODE_ISSPACE(ch)) {\r | |
5150 | *output++ = ' ';\r | |
5151 | ++p;\r | |
5152 | continue;\r | |
5153 | }\r | |
5154 | decimal = Py_UNICODE_TODECIMAL(ch);\r | |
5155 | if (decimal >= 0) {\r | |
5156 | *output++ = '0' + decimal;\r | |
5157 | ++p;\r | |
5158 | continue;\r | |
5159 | }\r | |
5160 | if (0 < ch && ch < 256) {\r | |
5161 | *output++ = (char)ch;\r | |
5162 | ++p;\r | |
5163 | continue;\r | |
5164 | }\r | |
5165 | /* All other characters are considered unencodable */\r | |
5166 | collstart = p;\r | |
5167 | collend = p+1;\r | |
5168 | while (collend < end) {\r | |
5169 | if ((0 < *collend && *collend < 256) ||\r | |
5170 | !Py_UNICODE_ISSPACE(*collend) ||\r | |
5171 | Py_UNICODE_TODECIMAL(*collend))\r | |
5172 | break;\r | |
5173 | }\r | |
5174 | /* cache callback name lookup\r | |
5175 | * (if not done yet, i.e. it's the first error) */\r | |
5176 | if (known_errorHandler==-1) {\r | |
5177 | if ((errors==NULL) || (!strcmp(errors, "strict")))\r | |
5178 | known_errorHandler = 1;\r | |
5179 | else if (!strcmp(errors, "replace"))\r | |
5180 | known_errorHandler = 2;\r | |
5181 | else if (!strcmp(errors, "ignore"))\r | |
5182 | known_errorHandler = 3;\r | |
5183 | else if (!strcmp(errors, "xmlcharrefreplace"))\r | |
5184 | known_errorHandler = 4;\r | |
5185 | else\r | |
5186 | known_errorHandler = 0;\r | |
5187 | }\r | |
5188 | switch (known_errorHandler) {\r | |
5189 | case 1: /* strict */\r | |
5190 | raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);\r | |
5191 | goto onError;\r | |
5192 | case 2: /* replace */\r | |
5193 | for (p = collstart; p < collend; ++p)\r | |
5194 | *output++ = '?';\r | |
5195 | /* fall through */\r | |
5196 | case 3: /* ignore */\r | |
5197 | p = collend;\r | |
5198 | break;\r | |
5199 | case 4: /* xmlcharrefreplace */\r | |
5200 | /* generate replacement (temporarily (mis)uses p) */\r | |
5201 | for (p = collstart; p < collend; ++p)\r | |
5202 | output += sprintf(output, "&#%d;", (int)*p);\r | |
5203 | p = collend;\r | |
5204 | break;\r | |
5205 | default:\r | |
5206 | repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,\r | |
5207 | encoding, reason, s, length, &exc,\r | |
5208 | collstart-s, collend-s, &newpos);\r | |
5209 | if (repunicode == NULL)\r | |
5210 | goto onError;\r | |
5211 | /* generate replacement */\r | |
5212 | repsize = PyUnicode_GET_SIZE(repunicode);\r | |
5213 | for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {\r | |
5214 | Py_UNICODE ch = *uni2;\r | |
5215 | if (Py_UNICODE_ISSPACE(ch))\r | |
5216 | *output++ = ' ';\r | |
5217 | else {\r | |
5218 | decimal = Py_UNICODE_TODECIMAL(ch);\r | |
5219 | if (decimal >= 0)\r | |
5220 | *output++ = '0' + decimal;\r | |
5221 | else if (0 < ch && ch < 256)\r | |
5222 | *output++ = (char)ch;\r | |
5223 | else {\r | |
5224 | Py_DECREF(repunicode);\r | |
5225 | raise_encode_exception(&exc, encoding,\r | |
5226 | s, length, collstart-s, collend-s, reason);\r | |
5227 | goto onError;\r | |
5228 | }\r | |
5229 | }\r | |
5230 | }\r | |
5231 | p = s + newpos;\r | |
5232 | Py_DECREF(repunicode);\r | |
5233 | }\r | |
5234 | }\r | |
5235 | /* 0-terminate the output string */\r | |
5236 | *output++ = '\0';\r | |
5237 | Py_XDECREF(exc);\r | |
5238 | Py_XDECREF(errorHandler);\r | |
5239 | return 0;\r | |
5240 | \r | |
5241 | onError:\r | |
5242 | Py_XDECREF(exc);\r | |
5243 | Py_XDECREF(errorHandler);\r | |
5244 | return -1;\r | |
5245 | }\r | |
5246 | \r | |
5247 | /* --- Helpers ------------------------------------------------------------ */\r | |
5248 | \r | |
5249 | #include "stringlib/unicodedefs.h"\r | |
5250 | #include "stringlib/fastsearch.h"\r | |
5251 | \r | |
5252 | #include "stringlib/count.h"\r | |
5253 | #include "stringlib/find.h"\r | |
5254 | #include "stringlib/partition.h"\r | |
5255 | #include "stringlib/split.h"\r | |
5256 | \r | |
/* Helper macro to fix up start/end slice values in place for a sequence of
   length len:
     - end greater than len is clamped to len
     - a negative end or start is taken relative to len and clamped at 0
   A start greater than len is left unchanged.  start and end must be
   modifiable lvalues; each argument is evaluated more than once.

   The body is wrapped in do { } while (0) so that the macro behaves as a
   single statement (safe in an unbraced if/else); invoke it followed by a
   semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
5271 | \r | |
5272 | Py_ssize_t PyUnicode_Count(PyObject *str,\r | |
5273 | PyObject *substr,\r | |
5274 | Py_ssize_t start,\r | |
5275 | Py_ssize_t end)\r | |
5276 | {\r | |
5277 | Py_ssize_t result;\r | |
5278 | PyUnicodeObject* str_obj;\r | |
5279 | PyUnicodeObject* sub_obj;\r | |
5280 | \r | |
5281 | str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);\r | |
5282 | if (!str_obj)\r | |
5283 | return -1;\r | |
5284 | sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);\r | |
5285 | if (!sub_obj) {\r | |
5286 | Py_DECREF(str_obj);\r | |
5287 | return -1;\r | |
5288 | }\r | |
5289 | \r | |
5290 | ADJUST_INDICES(start, end, str_obj->length);\r | |
5291 | result = stringlib_count(\r | |
5292 | str_obj->str + start, end - start, sub_obj->str, sub_obj->length,\r | |
5293 | PY_SSIZE_T_MAX\r | |
5294 | );\r | |
5295 | \r | |
5296 | Py_DECREF(sub_obj);\r | |
5297 | Py_DECREF(str_obj);\r | |
5298 | \r | |
5299 | return result;\r | |
5300 | }\r | |
5301 | \r | |
5302 | Py_ssize_t PyUnicode_Find(PyObject *str,\r | |
5303 | PyObject *sub,\r | |
5304 | Py_ssize_t start,\r | |
5305 | Py_ssize_t end,\r | |
5306 | int direction)\r | |
5307 | {\r | |
5308 | Py_ssize_t result;\r | |
5309 | \r | |
5310 | str = PyUnicode_FromObject(str);\r | |
5311 | if (!str)\r | |
5312 | return -2;\r | |
5313 | sub = PyUnicode_FromObject(sub);\r | |
5314 | if (!sub) {\r | |
5315 | Py_DECREF(str);\r | |
5316 | return -2;\r | |
5317 | }\r | |
5318 | \r | |
5319 | if (direction > 0)\r | |
5320 | result = stringlib_find_slice(\r | |
5321 | PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),\r | |
5322 | PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),\r | |
5323 | start, end\r | |
5324 | );\r | |
5325 | else\r | |
5326 | result = stringlib_rfind_slice(\r | |
5327 | PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),\r | |
5328 | PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),\r | |
5329 | start, end\r | |
5330 | );\r | |
5331 | \r | |
5332 | Py_DECREF(str);\r | |
5333 | Py_DECREF(sub);\r | |
5334 | \r | |
5335 | return result;\r | |
5336 | }\r | |
5337 | \r | |
5338 | static\r | |
5339 | int tailmatch(PyUnicodeObject *self,\r | |
5340 | PyUnicodeObject *substring,\r | |
5341 | Py_ssize_t start,\r | |
5342 | Py_ssize_t end,\r | |
5343 | int direction)\r | |
5344 | {\r | |
5345 | if (substring->length == 0)\r | |
5346 | return 1;\r | |
5347 | \r | |
5348 | ADJUST_INDICES(start, end, self->length);\r | |
5349 | end -= substring->length;\r | |
5350 | if (end < start)\r | |
5351 | return 0;\r | |
5352 | \r | |
5353 | if (direction > 0) {\r | |
5354 | if (Py_UNICODE_MATCH(self, end, substring))\r | |
5355 | return 1;\r | |
5356 | } else {\r | |
5357 | if (Py_UNICODE_MATCH(self, start, substring))\r | |
5358 | return 1;\r | |
5359 | }\r | |
5360 | \r | |
5361 | return 0;\r | |
5362 | }\r | |
5363 | \r | |
5364 | Py_ssize_t PyUnicode_Tailmatch(PyObject *str,\r | |
5365 | PyObject *substr,\r | |
5366 | Py_ssize_t start,\r | |
5367 | Py_ssize_t end,\r | |
5368 | int direction)\r | |
5369 | {\r | |
5370 | Py_ssize_t result;\r | |
5371 | \r | |
5372 | str = PyUnicode_FromObject(str);\r | |
5373 | if (str == NULL)\r | |
5374 | return -1;\r | |
5375 | substr = PyUnicode_FromObject(substr);\r | |
5376 | if (substr == NULL) {\r | |
5377 | Py_DECREF(str);\r | |
5378 | return -1;\r | |
5379 | }\r | |
5380 | \r | |
5381 | result = tailmatch((PyUnicodeObject *)str,\r | |
5382 | (PyUnicodeObject *)substr,\r | |
5383 | start, end, direction);\r | |
5384 | Py_DECREF(str);\r | |
5385 | Py_DECREF(substr);\r | |
5386 | return result;\r | |
5387 | }\r | |
5388 | \r | |
5389 | /* Apply fixfct filter to the Unicode object self and return a\r | |
5390 | reference to the modified object */\r | |
5391 | \r | |
5392 | static\r | |
5393 | PyObject *fixup(PyUnicodeObject *self,\r | |
5394 | int (*fixfct)(PyUnicodeObject *s))\r | |
5395 | {\r | |
5396 | \r | |
5397 | PyUnicodeObject *u;\r | |
5398 | \r | |
5399 | u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);\r | |
5400 | if (u == NULL)\r | |
5401 | return NULL;\r | |
5402 | \r | |
5403 | Py_UNICODE_COPY(u->str, self->str, self->length);\r | |
5404 | \r | |
5405 | if (!fixfct(u) && PyUnicode_CheckExact(self)) {\r | |
5406 | /* fixfct should return TRUE if it modified the buffer. If\r | |
5407 | FALSE, return a reference to the original buffer instead\r | |
5408 | (to save space, not time) */\r | |
5409 | Py_INCREF(self);\r | |
5410 | Py_DECREF(u);\r | |
5411 | return (PyObject*) self;\r | |
5412 | }\r | |
5413 | return (PyObject*) u;\r | |
5414 | }\r | |
5415 | \r | |
5416 | static\r | |
5417 | int fixupper(PyUnicodeObject *self)\r | |
5418 | {\r | |
5419 | Py_ssize_t len = self->length;\r | |
5420 | Py_UNICODE *s = self->str;\r | |
5421 | int status = 0;\r | |
5422 | \r | |
5423 | while (len-- > 0) {\r | |
5424 | register Py_UNICODE ch;\r | |
5425 | \r | |
5426 | ch = Py_UNICODE_TOUPPER(*s);\r | |
5427 | if (ch != *s) {\r | |
5428 | status = 1;\r | |
5429 | *s = ch;\r | |
5430 | }\r | |
5431 | s++;\r | |
5432 | }\r | |
5433 | \r | |
5434 | return status;\r | |
5435 | }\r | |
5436 | \r | |
5437 | static\r | |
5438 | int fixlower(PyUnicodeObject *self)\r | |
5439 | {\r | |
5440 | Py_ssize_t len = self->length;\r | |
5441 | Py_UNICODE *s = self->str;\r | |
5442 | int status = 0;\r | |
5443 | \r | |
5444 | while (len-- > 0) {\r | |
5445 | register Py_UNICODE ch;\r | |
5446 | \r | |
5447 | ch = Py_UNICODE_TOLOWER(*s);\r | |
5448 | if (ch != *s) {\r | |
5449 | status = 1;\r | |
5450 | *s = ch;\r | |
5451 | }\r | |
5452 | s++;\r | |
5453 | }\r | |
5454 | \r | |
5455 | return status;\r | |
5456 | }\r | |
5457 | \r | |
5458 | static\r | |
5459 | int fixswapcase(PyUnicodeObject *self)\r | |
5460 | {\r | |
5461 | Py_ssize_t len = self->length;\r | |
5462 | Py_UNICODE *s = self->str;\r | |
5463 | int status = 0;\r | |
5464 | \r | |
5465 | while (len-- > 0) {\r | |
5466 | if (Py_UNICODE_ISUPPER(*s)) {\r | |
5467 | *s = Py_UNICODE_TOLOWER(*s);\r | |
5468 | status = 1;\r | |
5469 | } else if (Py_UNICODE_ISLOWER(*s)) {\r | |
5470 | *s = Py_UNICODE_TOUPPER(*s);\r | |
5471 | status = 1;\r | |
5472 | }\r | |
5473 | s++;\r | |
5474 | }\r | |
5475 | \r | |
5476 | return status;\r | |
5477 | }\r | |
5478 | \r | |
5479 | static\r | |
5480 | int fixcapitalize(PyUnicodeObject *self)\r | |
5481 | {\r | |
5482 | Py_ssize_t len = self->length;\r | |
5483 | Py_UNICODE *s = self->str;\r | |
5484 | int status = 0;\r | |
5485 | \r | |
5486 | if (len == 0)\r | |
5487 | return 0;\r | |
5488 | if (Py_UNICODE_ISLOWER(*s)) {\r | |
5489 | *s = Py_UNICODE_TOUPPER(*s);\r | |
5490 | status = 1;\r | |
5491 | }\r | |
5492 | s++;\r | |
5493 | while (--len > 0) {\r | |
5494 | if (Py_UNICODE_ISUPPER(*s)) {\r | |
5495 | *s = Py_UNICODE_TOLOWER(*s);\r | |
5496 | status = 1;\r | |
5497 | }\r | |
5498 | s++;\r | |
5499 | }\r | |
5500 | return status;\r | |
5501 | }\r | |
5502 | \r | |
5503 | static\r | |
5504 | int fixtitle(PyUnicodeObject *self)\r | |
5505 | {\r | |
5506 | register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);\r | |
5507 | register Py_UNICODE *e;\r | |
5508 | int previous_is_cased;\r | |
5509 | \r | |
5510 | /* Shortcut for single character strings */\r | |
5511 | if (PyUnicode_GET_SIZE(self) == 1) {\r | |
5512 | Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);\r | |
5513 | if (*p != ch) {\r | |
5514 | *p = ch;\r | |
5515 | return 1;\r | |
5516 | }\r | |
5517 | else\r | |
5518 | return 0;\r | |
5519 | }\r | |
5520 | \r | |
5521 | e = p + PyUnicode_GET_SIZE(self);\r | |
5522 | previous_is_cased = 0;\r | |
5523 | for (; p < e; p++) {\r | |
5524 | register const Py_UNICODE ch = *p;\r | |
5525 | \r | |
5526 | if (previous_is_cased)\r | |
5527 | *p = Py_UNICODE_TOLOWER(ch);\r | |
5528 | else\r | |
5529 | *p = Py_UNICODE_TOTITLE(ch);\r | |
5530 | \r | |
5531 | if (Py_UNICODE_ISLOWER(ch) ||\r | |
5532 | Py_UNICODE_ISUPPER(ch) ||\r | |
5533 | Py_UNICODE_ISTITLE(ch))\r | |
5534 | previous_is_cased = 1;\r | |
5535 | else\r | |
5536 | previous_is_cased = 0;\r | |
5537 | }\r | |
5538 | return 1;\r | |
5539 | }\r | |
5540 | \r | |
5541 | PyObject *\r | |
5542 | PyUnicode_Join(PyObject *separator, PyObject *seq)\r | |
5543 | {\r | |
5544 | PyObject *internal_separator = NULL;\r | |
5545 | const Py_UNICODE blank = ' ';\r | |
5546 | const Py_UNICODE *sep = ␣\r | |
5547 | Py_ssize_t seplen = 1;\r | |
5548 | PyUnicodeObject *res = NULL; /* the result */\r | |
5549 | Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */\r | |
5550 | Py_ssize_t res_used; /* # used bytes */\r | |
5551 | Py_UNICODE *res_p; /* pointer to free byte in res's string area */\r | |
5552 | PyObject *fseq; /* PySequence_Fast(seq) */\r | |
5553 | Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */\r | |
5554 | PyObject *item;\r | |
5555 | Py_ssize_t i;\r | |
5556 | \r | |
5557 | fseq = PySequence_Fast(seq, "");\r | |
5558 | if (fseq == NULL) {\r | |
5559 | return NULL;\r | |
5560 | }\r | |
5561 | \r | |
5562 | /* Grrrr. A codec may be invoked to convert str objects to\r | |
5563 | * Unicode, and so it's possible to call back into Python code\r | |
5564 | * during PyUnicode_FromObject(), and so it's possible for a sick\r | |
5565 | * codec to change the size of fseq (if seq is a list). Therefore\r | |
5566 | * we have to keep refetching the size -- can't assume seqlen\r | |
5567 | * is invariant.\r | |
5568 | */\r | |
5569 | seqlen = PySequence_Fast_GET_SIZE(fseq);\r | |
5570 | /* If empty sequence, return u"". */\r | |
5571 | if (seqlen == 0) {\r | |
5572 | res = _PyUnicode_New(0); /* empty sequence; return u"" */\r | |
5573 | goto Done;\r | |
5574 | }\r | |
5575 | /* If singleton sequence with an exact Unicode, return that. */\r | |
5576 | if (seqlen == 1) {\r | |
5577 | item = PySequence_Fast_GET_ITEM(fseq, 0);\r | |
5578 | if (PyUnicode_CheckExact(item)) {\r | |
5579 | Py_INCREF(item);\r | |
5580 | res = (PyUnicodeObject *)item;\r | |
5581 | goto Done;\r | |
5582 | }\r | |
5583 | }\r | |
5584 | \r | |
5585 | /* At least two items to join, or one that isn't exact Unicode. */\r | |
5586 | if (seqlen > 1) {\r | |
5587 | /* Set up sep and seplen -- they're needed. */\r | |
5588 | if (separator == NULL) {\r | |
5589 | sep = ␣\r | |
5590 | seplen = 1;\r | |
5591 | }\r | |
5592 | else {\r | |
5593 | internal_separator = PyUnicode_FromObject(separator);\r | |
5594 | if (internal_separator == NULL)\r | |
5595 | goto onError;\r | |
5596 | sep = PyUnicode_AS_UNICODE(internal_separator);\r | |
5597 | seplen = PyUnicode_GET_SIZE(internal_separator);\r | |
5598 | /* In case PyUnicode_FromObject() mutated seq. */\r | |
5599 | seqlen = PySequence_Fast_GET_SIZE(fseq);\r | |
5600 | }\r | |
5601 | }\r | |
5602 | \r | |
5603 | /* Get space. */\r | |
5604 | res = _PyUnicode_New(res_alloc);\r | |
5605 | if (res == NULL)\r | |
5606 | goto onError;\r | |
5607 | res_p = PyUnicode_AS_UNICODE(res);\r | |
5608 | res_used = 0;\r | |
5609 | \r | |
5610 | for (i = 0; i < seqlen; ++i) {\r | |
5611 | Py_ssize_t itemlen;\r | |
5612 | Py_ssize_t new_res_used;\r | |
5613 | \r | |
5614 | item = PySequence_Fast_GET_ITEM(fseq, i);\r | |
5615 | /* Convert item to Unicode. */\r | |
5616 | if (! PyUnicode_Check(item) && ! PyString_Check(item)) {\r | |
5617 | PyErr_Format(PyExc_TypeError,\r | |
5618 | "sequence item %zd: expected string or Unicode,"\r | |
5619 | " %.80s found",\r | |
5620 | i, Py_TYPE(item)->tp_name);\r | |
5621 | goto onError;\r | |
5622 | }\r | |
5623 | item = PyUnicode_FromObject(item);\r | |
5624 | if (item == NULL)\r | |
5625 | goto onError;\r | |
5626 | /* We own a reference to item from here on. */\r | |
5627 | \r | |
5628 | /* In case PyUnicode_FromObject() mutated seq. */\r | |
5629 | seqlen = PySequence_Fast_GET_SIZE(fseq);\r | |
5630 | \r | |
5631 | /* Make sure we have enough space for the separator and the item. */\r | |
5632 | itemlen = PyUnicode_GET_SIZE(item);\r | |
5633 | new_res_used = res_used + itemlen;\r | |
5634 | if (new_res_used < 0)\r | |
5635 | goto Overflow;\r | |
5636 | if (i < seqlen - 1) {\r | |
5637 | new_res_used += seplen;\r | |
5638 | if (new_res_used < 0)\r | |
5639 | goto Overflow;\r | |
5640 | }\r | |
5641 | if (new_res_used > res_alloc) {\r | |
5642 | /* double allocated size until it's big enough */\r | |
5643 | do {\r | |
5644 | res_alloc += res_alloc;\r | |
5645 | if (res_alloc <= 0)\r | |
5646 | goto Overflow;\r | |
5647 | } while (new_res_used > res_alloc);\r | |
5648 | if (_PyUnicode_Resize(&res, res_alloc) < 0) {\r | |
5649 | Py_DECREF(item);\r | |
5650 | goto onError;\r | |
5651 | }\r | |
5652 | res_p = PyUnicode_AS_UNICODE(res) + res_used;\r | |
5653 | }\r | |
5654 | \r | |
5655 | /* Copy item, and maybe the separator. */\r | |
5656 | Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);\r | |
5657 | res_p += itemlen;\r | |
5658 | if (i < seqlen - 1) {\r | |
5659 | Py_UNICODE_COPY(res_p, sep, seplen);\r | |
5660 | res_p += seplen;\r | |
5661 | }\r | |
5662 | Py_DECREF(item);\r | |
5663 | res_used = new_res_used;\r | |
5664 | }\r | |
5665 | \r | |
5666 | /* Shrink res to match the used area; this probably can't fail,\r | |
5667 | * but it's cheap to check.\r | |
5668 | */\r | |
5669 | if (_PyUnicode_Resize(&res, res_used) < 0)\r | |
5670 | goto onError;\r | |
5671 | \r | |
5672 | Done:\r | |
5673 | Py_XDECREF(internal_separator);\r | |
5674 | Py_DECREF(fseq);\r | |
5675 | return (PyObject *)res;\r | |
5676 | \r | |
5677 | Overflow:\r | |
5678 | PyErr_SetString(PyExc_OverflowError,\r | |
5679 | "join() result is too long for a Python string");\r | |
5680 | Py_DECREF(item);\r | |
5681 | /* fall through */\r | |
5682 | \r | |
5683 | onError:\r | |
5684 | Py_XDECREF(internal_separator);\r | |
5685 | Py_DECREF(fseq);\r | |
5686 | Py_XDECREF(res);\r | |
5687 | return NULL;\r | |
5688 | }\r | |
5689 | \r | |
5690 | static\r | |
5691 | PyUnicodeObject *pad(PyUnicodeObject *self,\r | |
5692 | Py_ssize_t left,\r | |
5693 | Py_ssize_t right,\r | |
5694 | Py_UNICODE fill)\r | |
5695 | {\r | |
5696 | PyUnicodeObject *u;\r | |
5697 | \r | |
5698 | if (left < 0)\r | |
5699 | left = 0;\r | |
5700 | if (right < 0)\r | |
5701 | right = 0;\r | |
5702 | \r | |
5703 | if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {\r | |
5704 | Py_INCREF(self);\r | |
5705 | return self;\r | |
5706 | }\r | |
5707 | \r | |
5708 | if (left > PY_SSIZE_T_MAX - self->length ||\r | |
5709 | right > PY_SSIZE_T_MAX - (left + self->length)) {\r | |
5710 | PyErr_SetString(PyExc_OverflowError, "padded string is too long");\r | |
5711 | return NULL;\r | |
5712 | }\r | |
5713 | u = _PyUnicode_New(left + self->length + right);\r | |
5714 | if (u) {\r | |
5715 | if (left)\r | |
5716 | Py_UNICODE_FILL(u->str, fill, left);\r | |
5717 | Py_UNICODE_COPY(u->str + left, self->str, self->length);\r | |
5718 | if (right)\r | |
5719 | Py_UNICODE_FILL(u->str + left + self->length, fill, right);\r | |
5720 | }\r | |
5721 | \r | |
5722 | return u;\r | |
5723 | }\r | |
5724 | \r | |
5725 | PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)\r | |
5726 | {\r | |
5727 | PyObject *list;\r | |
5728 | \r | |
5729 | string = PyUnicode_FromObject(string);\r | |
5730 | if (string == NULL)\r | |
5731 | return NULL;\r | |
5732 | \r | |
5733 | list = stringlib_splitlines(\r | |
5734 | (PyObject*) string, PyUnicode_AS_UNICODE(string),\r | |
5735 | PyUnicode_GET_SIZE(string), keepends);\r | |
5736 | \r | |
5737 | Py_DECREF(string);\r | |
5738 | return list;\r | |
5739 | }\r | |
5740 | \r | |
5741 | static\r | |
5742 | PyObject *split(PyUnicodeObject *self,\r | |
5743 | PyUnicodeObject *substring,\r | |
5744 | Py_ssize_t maxcount)\r | |
5745 | {\r | |
5746 | if (maxcount < 0)\r | |
5747 | maxcount = PY_SSIZE_T_MAX;\r | |
5748 | \r | |
5749 | if (substring == NULL)\r | |
5750 | return stringlib_split_whitespace(\r | |
5751 | (PyObject*) self, self->str, self->length, maxcount\r | |
5752 | );\r | |
5753 | \r | |
5754 | return stringlib_split(\r | |
5755 | (PyObject*) self, self->str, self->length,\r | |
5756 | substring->str, substring->length,\r | |
5757 | maxcount\r | |
5758 | );\r | |
5759 | }\r | |
5760 | \r | |
5761 | static\r | |
5762 | PyObject *rsplit(PyUnicodeObject *self,\r | |
5763 | PyUnicodeObject *substring,\r | |
5764 | Py_ssize_t maxcount)\r | |
5765 | {\r | |
5766 | if (maxcount < 0)\r | |
5767 | maxcount = PY_SSIZE_T_MAX;\r | |
5768 | \r | |
5769 | if (substring == NULL)\r | |
5770 | return stringlib_rsplit_whitespace(\r | |
5771 | (PyObject*) self, self->str, self->length, maxcount\r | |
5772 | );\r | |
5773 | \r | |
5774 | return stringlib_rsplit(\r | |
5775 | (PyObject*) self, self->str, self->length,\r | |
5776 | substring->str, substring->length,\r | |
5777 | maxcount\r | |
5778 | );\r | |
5779 | }\r | |
5780 | \r | |
5781 | static\r | |
5782 | PyObject *replace(PyUnicodeObject *self,\r | |
5783 | PyUnicodeObject *str1,\r | |
5784 | PyUnicodeObject *str2,\r | |
5785 | Py_ssize_t maxcount)\r | |
5786 | {\r | |
5787 | PyUnicodeObject *u;\r | |
5788 | \r | |
5789 | if (maxcount < 0)\r | |
5790 | maxcount = PY_SSIZE_T_MAX;\r | |
5791 | else if (maxcount == 0 || self->length == 0)\r | |
5792 | goto nothing;\r | |
5793 | \r | |
5794 | if (str1->length == str2->length) {\r | |
5795 | Py_ssize_t i;\r | |
5796 | /* same length */\r | |
5797 | if (str1->length == 0)\r | |
5798 | goto nothing;\r | |
5799 | if (str1->length == 1) {\r | |
5800 | /* replace characters */\r | |
5801 | Py_UNICODE u1, u2;\r | |
5802 | if (!findchar(self->str, self->length, str1->str[0]))\r | |
5803 | goto nothing;\r | |
5804 | u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);\r | |
5805 | if (!u)\r | |
5806 | return NULL;\r | |
5807 | Py_UNICODE_COPY(u->str, self->str, self->length);\r | |
5808 | u1 = str1->str[0];\r | |
5809 | u2 = str2->str[0];\r | |
5810 | for (i = 0; i < u->length; i++)\r | |
5811 | if (u->str[i] == u1) {\r | |
5812 | if (--maxcount < 0)\r | |
5813 | break;\r | |
5814 | u->str[i] = u2;\r | |
5815 | }\r | |
5816 | } else {\r | |
5817 | i = stringlib_find(\r | |
5818 | self->str, self->length, str1->str, str1->length, 0\r | |
5819 | );\r | |
5820 | if (i < 0)\r | |
5821 | goto nothing;\r | |
5822 | u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);\r | |
5823 | if (!u)\r | |
5824 | return NULL;\r | |
5825 | Py_UNICODE_COPY(u->str, self->str, self->length);\r | |
5826 | \r | |
5827 | /* change everything in-place, starting with this one */\r | |
5828 | Py_UNICODE_COPY(u->str+i, str2->str, str2->length);\r | |
5829 | i += str1->length;\r | |
5830 | \r | |
5831 | while ( --maxcount > 0) {\r | |
5832 | i = stringlib_find(self->str+i, self->length-i,\r | |
5833 | str1->str, str1->length,\r | |
5834 | i);\r | |
5835 | if (i == -1)\r | |
5836 | break;\r | |
5837 | Py_UNICODE_COPY(u->str+i, str2->str, str2->length);\r | |
5838 | i += str1->length;\r | |
5839 | }\r | |
5840 | }\r | |
5841 | } else {\r | |
5842 | \r | |
5843 | Py_ssize_t n, i, j;\r | |
5844 | Py_ssize_t product, new_size, delta;\r | |
5845 | Py_UNICODE *p;\r | |
5846 | \r | |
5847 | /* replace strings */\r | |
5848 | n = stringlib_count(self->str, self->length, str1->str, str1->length,\r | |
5849 | maxcount);\r | |
5850 | if (n == 0)\r | |
5851 | goto nothing;\r | |
5852 | /* new_size = self->length + n * (str2->length - str1->length)); */\r | |
5853 | delta = (str2->length - str1->length);\r | |
5854 | if (delta == 0) {\r | |
5855 | new_size = self->length;\r | |
5856 | } else {\r | |
5857 | product = n * (str2->length - str1->length);\r | |
5858 | if ((product / (str2->length - str1->length)) != n) {\r | |
5859 | PyErr_SetString(PyExc_OverflowError,\r | |
5860 | "replace string is too long");\r | |
5861 | return NULL;\r | |
5862 | }\r | |
5863 | new_size = self->length + product;\r | |
5864 | if (new_size < 0) {\r | |
5865 | PyErr_SetString(PyExc_OverflowError,\r | |
5866 | "replace string is too long");\r | |
5867 | return NULL;\r | |
5868 | }\r | |
5869 | }\r | |
5870 | u = _PyUnicode_New(new_size);\r | |
5871 | if (!u)\r | |
5872 | return NULL;\r | |
5873 | i = 0;\r | |
5874 | p = u->str;\r | |
5875 | if (str1->length > 0) {\r | |
5876 | while (n-- > 0) {\r | |
5877 | /* look for next match */\r | |
5878 | j = stringlib_find(self->str+i, self->length-i,\r | |
5879 | str1->str, str1->length,\r | |
5880 | i);\r | |
5881 | if (j == -1)\r | |
5882 | break;\r | |
5883 | else if (j > i) {\r | |
5884 | /* copy unchanged part [i:j] */\r | |
5885 | Py_UNICODE_COPY(p, self->str+i, j-i);\r | |
5886 | p += j - i;\r | |
5887 | }\r | |
5888 | /* copy substitution string */\r | |
5889 | if (str2->length > 0) {\r | |
5890 | Py_UNICODE_COPY(p, str2->str, str2->length);\r | |
5891 | p += str2->length;\r | |
5892 | }\r | |
5893 | i = j + str1->length;\r | |
5894 | }\r | |
5895 | if (i < self->length)\r | |
5896 | /* copy tail [i:] */\r | |
5897 | Py_UNICODE_COPY(p, self->str+i, self->length-i);\r | |
5898 | } else {\r | |
5899 | /* interleave */\r | |
5900 | while (n > 0) {\r | |
5901 | Py_UNICODE_COPY(p, str2->str, str2->length);\r | |
5902 | p += str2->length;\r | |
5903 | if (--n <= 0)\r | |
5904 | break;\r | |
5905 | *p++ = self->str[i++];\r | |
5906 | }\r | |
5907 | Py_UNICODE_COPY(p, self->str+i, self->length-i);\r | |
5908 | }\r | |
5909 | }\r | |
5910 | return (PyObject *) u;\r | |
5911 | \r | |
5912 | nothing:\r | |
5913 | /* nothing to replace; return original string (when possible) */\r | |
5914 | if (PyUnicode_CheckExact(self)) {\r | |
5915 | Py_INCREF(self);\r | |
5916 | return (PyObject *) self;\r | |
5917 | }\r | |
5918 | return PyUnicode_FromUnicode(self->str, self->length);\r | |
5919 | }\r | |
5920 | \r | |
5921 | /* --- Unicode Object Methods --------------------------------------------- */\r | |
5922 | \r | |
5923 | PyDoc_STRVAR(title__doc__,\r | |
5924 | "S.title() -> unicode\n\\r | |
5925 | \n\\r | |
5926 | Return a titlecased version of S, i.e. words start with title case\n\\r | |
5927 | characters, all remaining cased characters have lower case.");\r | |
5928 | \r | |
5929 | static PyObject*\r | |
5930 | unicode_title(PyUnicodeObject *self)\r | |
5931 | {\r | |
5932 | return fixup(self, fixtitle);\r | |
5933 | }\r | |
5934 | \r | |
5935 | PyDoc_STRVAR(capitalize__doc__,\r | |
5936 | "S.capitalize() -> unicode\n\\r | |
5937 | \n\\r | |
5938 | Return a capitalized version of S, i.e. make the first character\n\\r | |
5939 | have upper case and the rest lower case.");\r | |
5940 | \r | |
5941 | static PyObject*\r | |
5942 | unicode_capitalize(PyUnicodeObject *self)\r | |
5943 | {\r | |
5944 | return fixup(self, fixcapitalize);\r | |
5945 | }\r | |
5946 | \r | |
#if 0
/* Disabled: this block is compiled out by the surrounding #if 0. */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> unicode\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entries one by one */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        result = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                       fixcapitalize);
        if (result == NULL)
            goto onError;
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, result);
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

  onError:
    /* On the error path result is NULL, which is what gets returned. */
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
5984 | \r | |
5985 | /* Argument converter. Coerces to a single unicode character */\r | |
5986 | \r | |
5987 | static int\r | |
5988 | convert_uc(PyObject *obj, void *addr)\r | |
5989 | {\r | |
5990 | Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;\r | |
5991 | PyObject *uniobj;\r | |
5992 | Py_UNICODE *unistr;\r | |
5993 | \r | |
5994 | uniobj = PyUnicode_FromObject(obj);\r | |
5995 | if (uniobj == NULL) {\r | |
5996 | PyErr_SetString(PyExc_TypeError,\r | |
5997 | "The fill character cannot be converted to Unicode");\r | |
5998 | return 0;\r | |
5999 | }\r | |
6000 | if (PyUnicode_GET_SIZE(uniobj) != 1) {\r | |
6001 | PyErr_SetString(PyExc_TypeError,\r | |
6002 | "The fill character must be exactly one character long");\r | |
6003 | Py_DECREF(uniobj);\r | |
6004 | return 0;\r | |
6005 | }\r | |
6006 | unistr = PyUnicode_AS_UNICODE(uniobj);\r | |
6007 | *fillcharloc = unistr[0];\r | |
6008 | Py_DECREF(uniobj);\r | |
6009 | return 1;\r | |
6010 | }\r | |
6011 | \r | |
6012 | PyDoc_STRVAR(center__doc__,\r | |
6013 | "S.center(width[, fillchar]) -> unicode\n\\r | |
6014 | \n\\r | |
6015 | Return S centered in a Unicode string of length width. Padding is\n\\r | |
6016 | done using the specified fill character (default is a space)");\r | |
6017 | \r | |
6018 | static PyObject *\r | |
6019 | unicode_center(PyUnicodeObject *self, PyObject *args)\r | |
6020 | {\r | |
6021 | Py_ssize_t marg, left;\r | |
6022 | Py_ssize_t width;\r | |
6023 | Py_UNICODE fillchar = ' ';\r | |
6024 | \r | |
6025 | if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))\r | |
6026 | return NULL;\r | |
6027 | \r | |
6028 | if (self->length >= width && PyUnicode_CheckExact(self)) {\r | |
6029 | Py_INCREF(self);\r | |
6030 | return (PyObject*) self;\r | |
6031 | }\r | |
6032 | \r | |
6033 | marg = width - self->length;\r | |
6034 | left = marg / 2 + (marg & width & 1);\r | |
6035 | \r | |
6036 | return (PyObject*) pad(self, left, marg - left, fillchar);\r | |
6037 | }\r | |
6038 | \r | |
6039 | #if 0\r | |
6040 | \r | |
6041 | /* This code should go into some future Unicode collation support\r | |
6042 | module. The basic comparison should compare ordinals on a naive\r | |
6043 | basis (this is what Java does and thus Jython too). */\r | |
6044 | \r | |
6045 | /* speedy UTF-16 code point order comparison */\r | |
6046 | /* gleaned from: */\r | |
6047 | /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */\r | |
6048 | \r | |
6049 | static short utf16Fixup[32] =\r | |
6050 | {\r | |
6051 | 0, 0, 0, 0, 0, 0, 0, 0,\r | |
6052 | 0, 0, 0, 0, 0, 0, 0, 0,\r | |
6053 | 0, 0, 0, 0, 0, 0, 0, 0,\r | |
6054 | 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800\r | |
6055 | };\r | |
6056 | \r | |
6057 | static int\r | |
6058 | unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)\r | |
6059 | {\r | |
6060 | Py_ssize_t len1, len2;\r | |
6061 | \r | |
6062 | Py_UNICODE *s1 = str1->str;\r | |
6063 | Py_UNICODE *s2 = str2->str;\r | |
6064 | \r | |
6065 | len1 = str1->length;\r | |
6066 | len2 = str2->length;\r | |
6067 | \r | |
6068 | while (len1 > 0 && len2 > 0) {\r | |
6069 | Py_UNICODE c1, c2;\r | |
6070 | \r | |
6071 | c1 = *s1++;\r | |
6072 | c2 = *s2++;\r | |
6073 | \r | |
6074 | if (c1 > (1<<11) * 26)\r | |
6075 | c1 += utf16Fixup[c1>>11];\r | |
6076 | if (c2 > (1<<11) * 26)\r | |
6077 | c2 += utf16Fixup[c2>>11];\r | |
6078 | /* now c1 and c2 are in UTF-32-compatible order */\r | |
6079 | \r | |
6080 | if (c1 != c2)\r | |
6081 | return (c1 < c2) ? -1 : 1;\r | |
6082 | \r | |
6083 | len1--; len2--;\r | |
6084 | }\r | |
6085 | \r | |
6086 | return (len1 < len2) ? -1 : (len1 != len2);\r | |
6087 | }\r | |
6088 | \r | |
6089 | #else\r | |
6090 | \r | |
6091 | static int\r | |
6092 | unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)\r | |
6093 | {\r | |
6094 | register Py_ssize_t len1, len2;\r | |
6095 | \r | |
6096 | Py_UNICODE *s1 = str1->str;\r | |
6097 | Py_UNICODE *s2 = str2->str;\r | |
6098 | \r | |
6099 | len1 = str1->length;\r | |
6100 | len2 = str2->length;\r | |
6101 | \r | |
6102 | while (len1 > 0 && len2 > 0) {\r | |
6103 | Py_UNICODE c1, c2;\r | |
6104 | \r | |
6105 | c1 = *s1++;\r | |
6106 | c2 = *s2++;\r | |
6107 | \r | |
6108 | if (c1 != c2)\r | |
6109 | return (c1 < c2) ? -1 : 1;\r | |
6110 | \r | |
6111 | len1--; len2--;\r | |
6112 | }\r | |
6113 | \r | |
6114 | return (len1 < len2) ? -1 : (len1 != len2);\r | |
6115 | }\r | |
6116 | \r | |
6117 | #endif\r | |
6118 | \r | |
6119 | int PyUnicode_Compare(PyObject *left,\r | |
6120 | PyObject *right)\r | |
6121 | {\r | |
6122 | PyUnicodeObject *u = NULL, *v = NULL;\r | |
6123 | int result;\r | |
6124 | \r | |
6125 | /* Coerce the two arguments */\r | |
6126 | u = (PyUnicodeObject *)PyUnicode_FromObject(left);\r | |
6127 | if (u == NULL)\r | |
6128 | goto onError;\r | |
6129 | v = (PyUnicodeObject *)PyUnicode_FromObject(right);\r | |
6130 | if (v == NULL)\r | |
6131 | goto onError;\r | |
6132 | \r | |
6133 | /* Shortcut for empty or interned objects */\r | |
6134 | if (v == u) {\r | |
6135 | Py_DECREF(u);\r | |
6136 | Py_DECREF(v);\r | |
6137 | return 0;\r | |
6138 | }\r | |
6139 | \r | |
6140 | result = unicode_compare(u, v);\r | |
6141 | \r | |
6142 | Py_DECREF(u);\r | |
6143 | Py_DECREF(v);\r | |
6144 | return result;\r | |
6145 | \r | |
6146 | onError:\r | |
6147 | Py_XDECREF(u);\r | |
6148 | Py_XDECREF(v);\r | |
6149 | return -1;\r | |
6150 | }\r | |
6151 | \r | |
6152 | PyObject *PyUnicode_RichCompare(PyObject *left,\r | |
6153 | PyObject *right,\r | |
6154 | int op)\r | |
6155 | {\r | |
6156 | int result;\r | |
6157 | \r | |
6158 | result = PyUnicode_Compare(left, right);\r | |
6159 | if (result == -1 && PyErr_Occurred())\r | |
6160 | goto onError;\r | |
6161 | \r | |
6162 | /* Convert the return value to a Boolean */\r | |
6163 | switch (op) {\r | |
6164 | case Py_EQ:\r | |
6165 | result = (result == 0);\r | |
6166 | break;\r | |
6167 | case Py_NE:\r | |
6168 | result = (result != 0);\r | |
6169 | break;\r | |
6170 | case Py_LE:\r | |
6171 | result = (result <= 0);\r | |
6172 | break;\r | |
6173 | case Py_GE:\r | |
6174 | result = (result >= 0);\r | |
6175 | break;\r | |
6176 | case Py_LT:\r | |
6177 | result = (result == -1);\r | |
6178 | break;\r | |
6179 | case Py_GT:\r | |
6180 | result = (result == 1);\r | |
6181 | break;\r | |
6182 | }\r | |
6183 | return PyBool_FromLong(result);\r | |
6184 | \r | |
6185 | onError:\r | |
6186 | \r | |
6187 | /* Standard case\r | |
6188 | \r | |
6189 | Type errors mean that PyUnicode_FromObject() could not convert\r | |
6190 | one of the arguments (usually the right hand side) to Unicode,\r | |
6191 | ie. we can't handle the comparison request. However, it is\r | |
6192 | possible that the other object knows a comparison method, which\r | |
6193 | is why we return Py_NotImplemented to give the other object a\r | |
6194 | chance.\r | |
6195 | \r | |
6196 | */\r | |
6197 | if (PyErr_ExceptionMatches(PyExc_TypeError)) {\r | |
6198 | PyErr_Clear();\r | |
6199 | Py_INCREF(Py_NotImplemented);\r | |
6200 | return Py_NotImplemented;\r | |
6201 | }\r | |
6202 | if (op != Py_EQ && op != Py_NE)\r | |
6203 | return NULL;\r | |
6204 | \r | |
6205 | /* Equality comparison.\r | |
6206 | \r | |
6207 | This is a special case: we silence any PyExc_UnicodeDecodeError\r | |
6208 | and instead turn it into a PyErr_UnicodeWarning.\r | |
6209 | \r | |
6210 | */\r | |
6211 | if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))\r | |
6212 | return NULL;\r | |
6213 | PyErr_Clear();\r | |
6214 | if (PyErr_Warn(PyExc_UnicodeWarning,\r | |
6215 | (op == Py_EQ) ?\r | |
6216 | "Unicode equal comparison "\r | |
6217 | "failed to convert both arguments to Unicode - "\r | |
6218 | "interpreting them as being unequal" :\r | |
6219 | "Unicode unequal comparison "\r | |
6220 | "failed to convert both arguments to Unicode - "\r | |
6221 | "interpreting them as being unequal"\r | |
6222 | ) < 0)\r | |
6223 | return NULL;\r | |
6224 | result = (op == Py_NE);\r | |
6225 | return PyBool_FromLong(result);\r | |
6226 | }\r | |
6227 | \r | |
6228 | int PyUnicode_Contains(PyObject *container,\r | |
6229 | PyObject *element)\r | |
6230 | {\r | |
6231 | PyObject *str, *sub;\r | |
6232 | int result;\r | |
6233 | \r | |
6234 | /* Coerce the two arguments */\r | |
6235 | sub = PyUnicode_FromObject(element);\r | |
6236 | if (!sub) {\r | |
6237 | return -1;\r | |
6238 | }\r | |
6239 | \r | |
6240 | str = PyUnicode_FromObject(container);\r | |
6241 | if (!str) {\r | |
6242 | Py_DECREF(sub);\r | |
6243 | return -1;\r | |
6244 | }\r | |
6245 | \r | |
6246 | result = stringlib_contains_obj(str, sub);\r | |
6247 | \r | |
6248 | Py_DECREF(str);\r | |
6249 | Py_DECREF(sub);\r | |
6250 | \r | |
6251 | return result;\r | |
6252 | }\r | |
6253 | \r | |
6254 | /* Concat to string or Unicode object giving a new Unicode object. */\r | |
6255 | \r | |
6256 | PyObject *PyUnicode_Concat(PyObject *left,\r | |
6257 | PyObject *right)\r | |
6258 | {\r | |
6259 | PyUnicodeObject *u = NULL, *v = NULL, *w;\r | |
6260 | \r | |
6261 | /* Coerce the two arguments */\r | |
6262 | u = (PyUnicodeObject *)PyUnicode_FromObject(left);\r | |
6263 | if (u == NULL)\r | |
6264 | goto onError;\r | |
6265 | v = (PyUnicodeObject *)PyUnicode_FromObject(right);\r | |
6266 | if (v == NULL)\r | |
6267 | goto onError;\r | |
6268 | \r | |
6269 | /* Shortcuts */\r | |
6270 | if (v == unicode_empty) {\r | |
6271 | Py_DECREF(v);\r | |
6272 | return (PyObject *)u;\r | |
6273 | }\r | |
6274 | if (u == unicode_empty) {\r | |
6275 | Py_DECREF(u);\r | |
6276 | return (PyObject *)v;\r | |
6277 | }\r | |
6278 | \r | |
6279 | /* Concat the two Unicode strings */\r | |
6280 | w = _PyUnicode_New(u->length + v->length);\r | |
6281 | if (w == NULL)\r | |
6282 | goto onError;\r | |
6283 | Py_UNICODE_COPY(w->str, u->str, u->length);\r | |
6284 | Py_UNICODE_COPY(w->str + u->length, v->str, v->length);\r | |
6285 | \r | |
6286 | Py_DECREF(u);\r | |
6287 | Py_DECREF(v);\r | |
6288 | return (PyObject *)w;\r | |
6289 | \r | |
6290 | onError:\r | |
6291 | Py_XDECREF(u);\r | |
6292 | Py_XDECREF(v);\r | |
6293 | return NULL;\r | |
6294 | }\r | |
6295 | \r | |
6296 | PyDoc_STRVAR(count__doc__,\r | |
6297 | "S.count(sub[, start[, end]]) -> int\n\\r | |
6298 | \n\\r | |
6299 | Return the number of non-overlapping occurrences of substring sub in\n\\r | |
6300 | Unicode string S[start:end]. Optional arguments start and end are\n\\r | |
6301 | interpreted as in slice notation.");\r | |
6302 | \r | |
6303 | static PyObject *\r | |
6304 | unicode_count(PyUnicodeObject *self, PyObject *args)\r | |
6305 | {\r | |
6306 | PyUnicodeObject *substring;\r | |
6307 | Py_ssize_t start = 0;\r | |
6308 | Py_ssize_t end = PY_SSIZE_T_MAX;\r | |
6309 | PyObject *result;\r | |
6310 | \r | |
6311 | if (!stringlib_parse_args_finds_unicode("count", args, &substring,\r | |
6312 | &start, &end))\r | |
6313 | return NULL;\r | |
6314 | \r | |
6315 | ADJUST_INDICES(start, end, self->length);\r | |
6316 | result = PyInt_FromSsize_t(\r | |
6317 | stringlib_count(self->str + start, end - start,\r | |
6318 | substring->str, substring->length,\r | |
6319 | PY_SSIZE_T_MAX)\r | |
6320 | );\r | |
6321 | \r | |
6322 | Py_DECREF(substring);\r | |
6323 | \r | |
6324 | return result;\r | |
6325 | }\r | |
6326 | \r | |
6327 | PyDoc_STRVAR(encode__doc__,\r | |
6328 | "S.encode([encoding[,errors]]) -> string or unicode\n\\r | |
6329 | \n\\r | |
6330 | Encodes S using the codec registered for encoding. encoding defaults\n\\r | |
6331 | to the default encoding. errors may be given to set a different error\n\\r | |
6332 | handling scheme. Default is 'strict' meaning that encoding errors raise\n\\r | |
6333 | a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\\r | |
6334 | 'xmlcharrefreplace' as well as any other name registered with\n\\r | |
6335 | codecs.register_error that can handle UnicodeEncodeErrors.");\r | |
6336 | \r | |
6337 | static PyObject *\r | |
6338 | unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)\r | |
6339 | {\r | |
6340 | static char *kwlist[] = {"encoding", "errors", 0};\r | |
6341 | char *encoding = NULL;\r | |
6342 | char *errors = NULL;\r | |
6343 | PyObject *v;\r | |
6344 | \r | |
6345 | if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",\r | |
6346 | kwlist, &encoding, &errors))\r | |
6347 | return NULL;\r | |
6348 | v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);\r | |
6349 | if (v == NULL)\r | |
6350 | goto onError;\r | |
6351 | if (!PyString_Check(v) && !PyUnicode_Check(v)) {\r | |
6352 | PyErr_Format(PyExc_TypeError,\r | |
6353 | "encoder did not return a string/unicode object "\r | |
6354 | "(type=%.400s)",\r | |
6355 | Py_TYPE(v)->tp_name);\r | |
6356 | Py_DECREF(v);\r | |
6357 | return NULL;\r | |
6358 | }\r | |
6359 | return v;\r | |
6360 | \r | |
6361 | onError:\r | |
6362 | return NULL;\r | |
6363 | }\r | |
6364 | \r | |
6365 | PyDoc_STRVAR(decode__doc__,\r | |
6366 | "S.decode([encoding[,errors]]) -> string or unicode\n\\r | |
6367 | \n\\r | |
6368 | Decodes S using the codec registered for encoding. encoding defaults\n\\r | |
6369 | to the default encoding. errors may be given to set a different error\n\\r | |
6370 | handling scheme. Default is 'strict' meaning that encoding errors raise\n\\r | |
6371 | a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\\r | |
6372 | as well as any other name registerd with codecs.register_error that is\n\\r | |
6373 | able to handle UnicodeDecodeErrors.");\r | |
6374 | \r | |
6375 | static PyObject *\r | |
6376 | unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)\r | |
6377 | {\r | |
6378 | static char *kwlist[] = {"encoding", "errors", 0};\r | |
6379 | char *encoding = NULL;\r | |
6380 | char *errors = NULL;\r | |
6381 | PyObject *v;\r | |
6382 | \r | |
6383 | if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",\r | |
6384 | kwlist, &encoding, &errors))\r | |
6385 | return NULL;\r | |
6386 | v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);\r | |
6387 | if (v == NULL)\r | |
6388 | goto onError;\r | |
6389 | if (!PyString_Check(v) && !PyUnicode_Check(v)) {\r | |
6390 | PyErr_Format(PyExc_TypeError,\r | |
6391 | "decoder did not return a string/unicode object "\r | |
6392 | "(type=%.400s)",\r | |
6393 | Py_TYPE(v)->tp_name);\r | |
6394 | Py_DECREF(v);\r | |
6395 | return NULL;\r | |
6396 | }\r | |
6397 | return v;\r | |
6398 | \r | |
6399 | onError:\r | |
6400 | return NULL;\r | |
6401 | }\r | |
6402 | \r | |
6403 | PyDoc_STRVAR(expandtabs__doc__,\r | |
6404 | "S.expandtabs([tabsize]) -> unicode\n\\r | |
6405 | \n\\r | |
6406 | Return a copy of S where all tab characters are expanded using spaces.\n\\r | |
6407 | If tabsize is not given, a tab size of 8 characters is assumed.");\r | |
6408 | \r | |
6409 | static PyObject*\r | |
6410 | unicode_expandtabs(PyUnicodeObject *self, PyObject *args)\r | |
6411 | {\r | |
6412 | Py_UNICODE *e;\r | |
6413 | Py_UNICODE *p;\r | |
6414 | Py_UNICODE *q;\r | |
6415 | Py_UNICODE *qe;\r | |
6416 | Py_ssize_t i, j, incr;\r | |
6417 | PyUnicodeObject *u;\r | |
6418 | int tabsize = 8;\r | |
6419 | \r | |
6420 | if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))\r | |
6421 | return NULL;\r | |
6422 | \r | |
6423 | /* First pass: determine size of output string */\r | |
6424 | i = 0; /* chars up to and including most recent \n or \r */\r | |
6425 | j = 0; /* chars since most recent \n or \r (use in tab calculations) */\r | |
6426 | e = self->str + self->length; /* end of input */\r | |
6427 | for (p = self->str; p < e; p++)\r | |
6428 | if (*p == '\t') {\r | |
6429 | if (tabsize > 0) {\r | |
6430 | incr = tabsize - (j % tabsize); /* cannot overflow */\r | |
6431 | if (j > PY_SSIZE_T_MAX - incr)\r | |
6432 | goto overflow1;\r | |
6433 | j += incr;\r | |
6434 | }\r | |
6435 | }\r | |
6436 | else {\r | |
6437 | if (j > PY_SSIZE_T_MAX - 1)\r | |
6438 | goto overflow1;\r | |
6439 | j++;\r | |
6440 | if (*p == '\n' || *p == '\r') {\r | |
6441 | if (i > PY_SSIZE_T_MAX - j)\r | |
6442 | goto overflow1;\r | |
6443 | i += j;\r | |
6444 | j = 0;\r | |
6445 | }\r | |
6446 | }\r | |
6447 | \r | |
6448 | if (i > PY_SSIZE_T_MAX - j)\r | |
6449 | goto overflow1;\r | |
6450 | \r | |
6451 | /* Second pass: create output string and fill it */\r | |
6452 | u = _PyUnicode_New(i + j);\r | |
6453 | if (!u)\r | |
6454 | return NULL;\r | |
6455 | \r | |
6456 | j = 0; /* same as in first pass */\r | |
6457 | q = u->str; /* next output char */\r | |
6458 | qe = u->str + u->length; /* end of output */\r | |
6459 | \r | |
6460 | for (p = self->str; p < e; p++)\r | |
6461 | if (*p == '\t') {\r | |
6462 | if (tabsize > 0) {\r | |
6463 | i = tabsize - (j % tabsize);\r | |
6464 | j += i;\r | |
6465 | while (i--) {\r | |
6466 | if (q >= qe)\r | |
6467 | goto overflow2;\r | |
6468 | *q++ = ' ';\r | |
6469 | }\r | |
6470 | }\r | |
6471 | }\r | |
6472 | else {\r | |
6473 | if (q >= qe)\r | |
6474 | goto overflow2;\r | |
6475 | *q++ = *p;\r | |
6476 | j++;\r | |
6477 | if (*p == '\n' || *p == '\r')\r | |
6478 | j = 0;\r | |
6479 | }\r | |
6480 | \r | |
6481 | return (PyObject*) u;\r | |
6482 | \r | |
6483 | overflow2:\r | |
6484 | Py_DECREF(u);\r | |
6485 | overflow1:\r | |
6486 | PyErr_SetString(PyExc_OverflowError, "new string is too long");\r | |
6487 | return NULL;\r | |
6488 | }\r | |
6489 | \r | |
6490 | PyDoc_STRVAR(find__doc__,\r | |
6491 | "S.find(sub [,start [,end]]) -> int\n\\r | |
6492 | \n\\r | |
6493 | Return the lowest index in S where substring sub is found,\n\\r | |
6494 | such that sub is contained within s[start:end]. Optional\n\\r | |
6495 | arguments start and end are interpreted as in slice notation.\n\\r | |
6496 | \n\\r | |
6497 | Return -1 on failure.");\r | |
6498 | \r | |
6499 | static PyObject *\r | |
6500 | unicode_find(PyUnicodeObject *self, PyObject *args)\r | |
6501 | {\r | |
6502 | PyUnicodeObject *substring;\r | |
6503 | Py_ssize_t start;\r | |
6504 | Py_ssize_t end;\r | |
6505 | Py_ssize_t result;\r | |
6506 | \r | |
6507 | if (!stringlib_parse_args_finds_unicode("find", args, &substring,\r | |
6508 | &start, &end))\r | |
6509 | return NULL;\r | |
6510 | \r | |
6511 | result = stringlib_find_slice(\r | |
6512 | PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),\r | |
6513 | PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),\r | |
6514 | start, end\r | |
6515 | );\r | |
6516 | \r | |
6517 | Py_DECREF(substring);\r | |
6518 | \r | |
6519 | return PyInt_FromSsize_t(result);\r | |
6520 | }\r | |
6521 | \r | |
6522 | static PyObject *\r | |
6523 | unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)\r | |
6524 | {\r | |
6525 | if (index < 0 || index >= self->length) {\r | |
6526 | PyErr_SetString(PyExc_IndexError, "string index out of range");\r | |
6527 | return NULL;\r | |
6528 | }\r | |
6529 | \r | |
6530 | return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);\r | |
6531 | }\r | |
6532 | \r | |
6533 | static long\r | |
6534 | unicode_hash(PyUnicodeObject *self)\r | |
6535 | {\r | |
6536 | /* Since Unicode objects compare equal to their ASCII string\r | |
6537 | counterparts, they should use the individual character values\r | |
6538 | as basis for their hash value. This is needed to assure that\r | |
6539 | strings and Unicode objects behave in the same way as\r | |
6540 | dictionary keys. */\r | |
6541 | \r | |
6542 | register Py_ssize_t len;\r | |
6543 | register Py_UNICODE *p;\r | |
6544 | register long x;\r | |
6545 | \r | |
6546 | if (self->hash != -1)\r | |
6547 | return self->hash;\r | |
6548 | len = PyUnicode_GET_SIZE(self);\r | |
6549 | p = PyUnicode_AS_UNICODE(self);\r | |
6550 | x = *p << 7;\r | |
6551 | while (--len >= 0)\r | |
6552 | x = (1000003*x) ^ *p++;\r | |
6553 | x ^= PyUnicode_GET_SIZE(self);\r | |
6554 | if (x == -1)\r | |
6555 | x = -2;\r | |
6556 | self->hash = x;\r | |
6557 | return x;\r | |
6558 | }\r | |
6559 | \r | |
6560 | PyDoc_STRVAR(index__doc__,\r | |
6561 | "S.index(sub [,start [,end]]) -> int\n\\r | |
6562 | \n\\r | |
6563 | Like S.find() but raise ValueError when the substring is not found.");\r | |
6564 | \r | |
6565 | static PyObject *\r | |
6566 | unicode_index(PyUnicodeObject *self, PyObject *args)\r | |
6567 | {\r | |
6568 | Py_ssize_t result;\r | |
6569 | PyUnicodeObject *substring;\r | |
6570 | Py_ssize_t start;\r | |
6571 | Py_ssize_t end;\r | |
6572 | \r | |
6573 | if (!stringlib_parse_args_finds_unicode("index", args, &substring,\r | |
6574 | &start, &end))\r | |
6575 | return NULL;\r | |
6576 | \r | |
6577 | result = stringlib_find_slice(\r | |
6578 | PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),\r | |
6579 | PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),\r | |
6580 | start, end\r | |
6581 | );\r | |
6582 | \r | |
6583 | Py_DECREF(substring);\r | |
6584 | \r | |
6585 | if (result < 0) {\r | |
6586 | PyErr_SetString(PyExc_ValueError, "substring not found");\r | |
6587 | return NULL;\r | |
6588 | }\r | |
6589 | \r | |
6590 | return PyInt_FromSsize_t(result);\r | |
6591 | }\r | |
6592 | \r | |
6593 | PyDoc_STRVAR(islower__doc__,\r | |
6594 | "S.islower() -> bool\n\\r | |
6595 | \n\\r | |
6596 | Return True if all cased characters in S are lowercase and there is\n\\r | |
6597 | at least one cased character in S, False otherwise.");\r | |
6598 | \r | |
6599 | static PyObject*\r | |
6600 | unicode_islower(PyUnicodeObject *self)\r | |
6601 | {\r | |
6602 | register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);\r | |
6603 | register const Py_UNICODE *e;\r | |
6604 | int cased;\r | |
6605 | \r | |
6606 | /* Shortcut for single character strings */\r | |
6607 | if (PyUnicode_GET_SIZE(self) == 1)\r | |
6608 | return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));\r | |
6609 | \r | |
6610 | /* Special case for empty strings */\r | |
6611 | if (PyUnicode_GET_SIZE(self) == 0)\r | |
6612 | return PyBool_FromLong(0);\r | |
6613 | \r | |
6614 | e = p + PyUnicode_GET_SIZE(self);\r | |
6615 | cased = 0;\r | |
6616 | for (; p < e; p++) {\r | |
6617 | register const Py_UNICODE ch = *p;\r | |
6618 | \r | |
6619 | if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))\r | |
6620 | return PyBool_FromLong(0);\r | |
6621 | else if (!cased && Py_UNICODE_ISLOWER(ch))\r | |
6622 | cased = 1;\r | |
6623 | }\r | |
6624 | return PyBool_FromLong(cased);\r | |
6625 | }\r | |
6626 | \r | |
6627 | PyDoc_STRVAR(isupper__doc__,\r | |
6628 | "S.isupper() -> bool\n\\r | |
6629 | \n\\r | |
6630 | Return True if all cased characters in S are uppercase and there is\n\\r | |
6631 | at least one cased character in S, False otherwise.");\r | |
6632 | \r | |
6633 | static PyObject*\r | |
6634 | unicode_isupper(PyUnicodeObject *self)\r | |
6635 | {\r | |
6636 | register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);\r | |
6637 | register const Py_UNICODE *e;\r | |
6638 | int cased;\r | |
6639 | \r | |
6640 | /* Shortcut for single character strings */\r | |
6641 | if (PyUnicode_GET_SIZE(self) == 1)\r | |
6642 | return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);\r | |
6643 | \r | |
6644 | /* Special case for empty strings */\r | |
6645 | if (PyUnicode_GET_SIZE(self) == 0)\r | |
6646 | return PyBool_FromLong(0);\r | |
6647 | \r | |
6648 | e = p + PyUnicode_GET_SIZE(self);\r | |
6649 | cased = 0;\r | |
6650 | for (; p < e; p++) {\r | |
6651 | register const Py_UNICODE ch = *p;\r | |
6652 | \r | |
6653 | if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))\r | |
6654 | return PyBool_FromLong(0);\r | |
6655 | else if (!cased && Py_UNICODE_ISUPPER(ch))\r | |
6656 | cased = 1;\r | |
6657 | }\r | |
6658 | return PyBool_FromLong(cased);\r | |
6659 | }\r | |
6660 | \r | |
6661 | PyDoc_STRVAR(istitle__doc__,\r | |
6662 | "S.istitle() -> bool\n\\r | |
6663 | \n\\r | |
6664 | Return True if S is a titlecased string and there is at least one\n\\r | |
6665 | character in S, i.e. upper- and titlecase characters may only\n\\r | |
6666 | follow uncased characters and lowercase characters only cased ones.\n\\r | |
6667 | Return False otherwise.");\r | |
6668 | \r | |
6669 | static PyObject*\r | |
6670 | unicode_istitle(PyUnicodeObject *self)\r | |
6671 | {\r | |
6672 | register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);\r | |
6673 | register const Py_UNICODE *e;\r | |
6674 | int cased, previous_is_cased;\r | |
6675 | \r | |
6676 | /* Shortcut for single character strings */\r | |
6677 | if (PyUnicode_GET_SIZE(self) == 1)\r | |
6678 | return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||\r | |
6679 | (Py_UNICODE_ISUPPER(*p) != 0));\r | |
6680 | \r | |
6681 | /* Special case for empty strings */\r | |
6682 | if (PyUnicode_GET_SIZE(self) == 0)\r | |
6683 | return PyBool_FromLong(0);\r | |
6684 | \r | |
6685 | e = p + PyUnicode_GET_SIZE(self);\r | |
6686 | cased = 0;\r | |
6687 | previous_is_cased = 0;\r | |
6688 | for (; p < e; p++) {\r | |
6689 | register const Py_UNICODE ch = *p;\r | |
6690 | \r | |
6691 | if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {\r | |
6692 | if (previous_is_cased)\r | |
6693 | return PyBool_FromLong(0);\r | |
6694 | previous_is_cased = 1;\r | |
6695 | cased = 1;\r | |
6696 | }\r | |
6697 | else if (Py_UNICODE_ISLOWER(ch)) {\r | |
6698 | if (!previous_is_cased)\r | |
6699 | return PyBool_FromLong(0);\r | |
6700 | previous_is_cased = 1;\r | |
6701 | cased = 1;\r | |
6702 | }\r | |
6703 | else\r | |
6704 | previous_is_cased = 0;\r | |
6705 | }\r | |
6706 | return PyBool_FromLong(cased);\r | |
6707 | }\r | |
6708 | \r | |
6709 | PyDoc_STRVAR(isspace__doc__,\r | |
6710 | "S.isspace() -> bool\n\\r | |
6711 | \n\\r | |
6712 | Return True if all characters in S are whitespace\n\\r | |
6713 | and there is at least one character in S, False otherwise.");\r | |
6714 | \r | |
6715 | static PyObject*\r | |
6716 | unicode_isspace(PyUnicodeObject *self)\r | |
6717 | {\r | |
6718 | register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);\r | |
6719 | register const Py_UNICODE *e;\r | |
6720 | \r | |
6721 | /* Shortcut for single character strings */\r | |
6722 | if (PyUnicode_GET_SIZE(self) == 1 &&\r | |
6723 | Py_UNICODE_ISSPACE(*p))\r | |
6724 | return PyBool_FromLong(1);\r | |
6725 | \r | |
6726 | /* Special case for empty strings */\r | |
6727 | if (PyUnicode_GET_SIZE(self) == 0)\r | |
6728 | return PyBool_FromLong(0);\r | |
6729 | \r | |
6730 | e = p + PyUnicode_GET_SIZE(self);\r | |
6731 | for (; p < e; p++) {\r | |
6732 | if (!Py_UNICODE_ISSPACE(*p))\r | |
6733 | return PyBool_FromLong(0);\r | |
6734 | }\r | |
6735 | return PyBool_FromLong(1);\r | |
6736 | }\r | |
6737 | \r | |
6738 | PyDoc_STRVAR(isalpha__doc__,\r | |
6739 | "S.isalpha() -> bool\n\\r | |
6740 | \n\\r | |
6741 | Return True if all characters in S are alphabetic\n\\r | |
6742 | and there is at least one character in S, False otherwise.");\r | |
6743 | \r | |
6744 | static PyObject*\r | |
6745 | unicode_isalpha(PyUnicodeObject *self)\r | |
6746 | {\r | |
6747 | register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);\r | |
6748 | register const Py_UNICODE *e;\r | |
6749 | \r | |
6750 | /* Shortcut for single character strings */\r | |
6751 | if (PyUnicode_GET_SIZE(self) == 1 &&\r | |
6752 | Py_UNICODE_ISALPHA(*p))\r | |
6753 | return PyBool_FromLong(1);\r | |
6754 | \r | |
6755 | /* Special case for empty strings */\r | |
6756 | if (PyUnicode_GET_SIZE(self) == 0)\r | |
6757 | return PyBool_FromLong(0);\r | |
6758 | \r | |
6759 | e = p + PyUnicode_GET_SIZE(self);\r | |
6760 | for (; p < e; p++) {\r | |
6761 | if (!Py_UNICODE_ISALPHA(*p))\r | |
6762 | return PyBool_FromLong(0);\r | |
6763 | }\r | |
6764 | return PyBool_FromLong(1);\r | |
6765 | }\r | |
6766 | \r | |
6767 | PyDoc_STRVAR(isalnum__doc__,\r | |
6768 | "S.isalnum() -> bool\n\\r | |
6769 | \n\\r | |
6770 | Return True if all characters in S are alphanumeric\n\\r | |
6771 | and there is at least one character in S, False otherwise.");\r | |
6772 | \r | |
6773 | static PyObject*\r | |
6774 | unicode_isalnum(PyUnicodeObject *self)\r | |
6775 | {\r | |
6776 | register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);\r | |
6777 | register const Py_UNICODE *e;\r | |
6778 | \r | |
6779 | /* Shortcut for single character strings */\r | |
6780 | if (PyUnicode_GET_SIZE(self) == 1 &&\r | |
6781 | Py_UNICODE_ISALNUM(*p))\r | |
6782 | return PyBool_FromLong(1);\r | |
6783 | \r | |
6784 | /* Special case for empty strings */\r | |
6785 | if (PyUnicode_GET_SIZE(self) == 0)\r | |
6786 | return PyBool_FromLong(0);\r | |
6787 | \r | |
6788 | e = p + PyUnicode_GET_SIZE(self);\r | |
6789 | for (; p < e; p++) {\r | |
6790 | if (!Py_UNICODE_ISALNUM(*p))\r | |
6791 | return PyBool_FromLong(0);\r | |
6792 | }\r | |
6793 | return PyBool_FromLong(1);\r | |
6794 | }\r | |
6795 | \r | |
6796 | PyDoc_STRVAR(isdecimal__doc__,\r | |
6797 | "S.isdecimal() -> bool\n\\r | |
6798 | \n\\r | |
6799 | Return True if there are only decimal characters in S,\n\\r | |
6800 | False otherwise.");\r | |
6801 | \r | |
6802 | static PyObject*\r | |
6803 | unicode_isdecimal(PyUnicodeObject *self)\r | |
6804 | {\r | |
6805 | register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);\r | |
6806 | register const Py_UNICODE *e;\r | |
6807 | \r | |
6808 | /* Shortcut for single character strings */\r | |
6809 | if (PyUnicode_GET_SIZE(self) == 1 &&\r | |
6810 | Py_UNICODE_ISDECIMAL(*p))\r | |
6811 | return PyBool_FromLong(1);\r | |
6812 | \r | |
6813 | /* Special case for empty strings */\r | |
6814 | if (PyUnicode_GET_SIZE(self) == 0)\r | |
6815 | return PyBool_FromLong(0);\r | |
6816 | \r | |
6817 | e = p + PyUnicode_GET_SIZE(self);\r | |
6818 | for (; p < e; p++) {\r | |
6819 | if (!Py_UNICODE_ISDECIMAL(*p))\r | |
6820 | return PyBool_FromLong(0);\r | |
6821 | }\r | |
6822 | return PyBool_FromLong(1);\r | |
6823 | }\r | |
6824 | \r | |
6825 | PyDoc_STRVAR(isdigit__doc__,\r | |
6826 | "S.isdigit() -> bool\n\\r | |
6827 | \n\\r | |
6828 | Return True if all characters in S are digits\n\\r | |
6829 | and there is at least one character in S, False otherwise.");\r | |
6830 | \r | |
6831 | static PyObject*\r | |
6832 | unicode_isdigit(PyUnicodeObject *self)\r | |
6833 | {\r | |
6834 | register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);\r | |
6835 | register const Py_UNICODE *e;\r | |
6836 | \r | |
6837 | /* Shortcut for single character strings */\r | |
6838 | if (PyUnicode_GET_SIZE(self) == 1 &&\r | |
6839 | Py_UNICODE_ISDIGIT(*p))\r | |
6840 | return PyBool_FromLong(1);\r | |
6841 | \r | |
6842 | /* Special case for empty strings */\r | |
6843 | if (PyUnicode_GET_SIZE(self) == 0)\r | |
6844 | return PyBool_FromLong(0);\r | |
6845 | \r | |
6846 | e = p + PyUnicode_GET_SIZE(self);\r | |
6847 | for (; p < e; p++) {\r | |
6848 | if (!Py_UNICODE_ISDIGIT(*p))\r | |
6849 | return PyBool_FromLong(0);\r | |
6850 | }\r | |
6851 | return PyBool_FromLong(1);\r | |
6852 | }\r | |
6853 | \r | |
6854 | PyDoc_STRVAR(isnumeric__doc__,\r | |
6855 | "S.isnumeric() -> bool\n\\r | |
6856 | \n\\r | |
6857 | Return True if there are only numeric characters in S,\n\\r | |
6858 | False otherwise.");\r | |
6859 | \r | |
6860 | static PyObject*\r | |
6861 | unicode_isnumeric(PyUnicodeObject *self)\r | |
6862 | {\r | |
6863 | register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);\r | |
6864 | register const Py_UNICODE *e;\r | |
6865 | \r | |
6866 | /* Shortcut for single character strings */\r | |
6867 | if (PyUnicode_GET_SIZE(self) == 1 &&\r | |
6868 | Py_UNICODE_ISNUMERIC(*p))\r | |
6869 | return PyBool_FromLong(1);\r | |
6870 | \r | |
6871 | /* Special case for empty strings */\r | |
6872 | if (PyUnicode_GET_SIZE(self) == 0)\r | |
6873 | return PyBool_FromLong(0);\r | |
6874 | \r | |
6875 | e = p + PyUnicode_GET_SIZE(self);\r | |
6876 | for (; p < e; p++) {\r | |
6877 | if (!Py_UNICODE_ISNUMERIC(*p))\r | |
6878 | return PyBool_FromLong(0);\r | |
6879 | }\r | |
6880 | return PyBool_FromLong(1);\r | |
6881 | }\r | |
6882 | \r | |
6883 | PyDoc_STRVAR(join__doc__,\r | |
6884 | "S.join(iterable) -> unicode\n\\r | |
6885 | \n\\r | |
6886 | Return a string which is the concatenation of the strings in the\n\\r | |
6887 | iterable. The separator between elements is S.");\r | |
6888 | \r | |
6889 | static PyObject*\r | |
6890 | unicode_join(PyObject *self, PyObject *data)\r | |
6891 | {\r | |
6892 | return PyUnicode_Join(self, data);\r | |
6893 | }\r | |
6894 | \r | |
6895 | static Py_ssize_t\r | |
6896 | unicode_length(PyUnicodeObject *self)\r | |
6897 | {\r | |
6898 | return self->length;\r | |
6899 | }\r | |
6900 | \r | |
6901 | PyDoc_STRVAR(ljust__doc__,\r | |
6902 | "S.ljust(width[, fillchar]) -> int\n\\r | |
6903 | \n\\r | |
6904 | Return S left-justified in a Unicode string of length width. Padding is\n\\r | |
6905 | done using the specified fill character (default is a space).");\r | |
6906 | \r | |
6907 | static PyObject *\r | |
6908 | unicode_ljust(PyUnicodeObject *self, PyObject *args)\r | |
6909 | {\r | |
6910 | Py_ssize_t width;\r | |
6911 | Py_UNICODE fillchar = ' ';\r | |
6912 | \r | |
6913 | if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))\r | |
6914 | return NULL;\r | |
6915 | \r | |
6916 | if (self->length >= width && PyUnicode_CheckExact(self)) {\r | |
6917 | Py_INCREF(self);\r | |
6918 | return (PyObject*) self;\r | |
6919 | }\r | |
6920 | \r | |
6921 | return (PyObject*) pad(self, 0, width - self->length, fillchar);\r | |
6922 | }\r | |
6923 | \r | |
6924 | PyDoc_STRVAR(lower__doc__,\r | |
6925 | "S.lower() -> unicode\n\\r | |
6926 | \n\\r | |
6927 | Return a copy of the string S converted to lowercase.");\r | |
6928 | \r | |
6929 | static PyObject*\r | |
6930 | unicode_lower(PyUnicodeObject *self)\r | |
6931 | {\r | |
6932 | return fixup(self, fixlower);\r | |
6933 | }\r | |
6934 | \r | |
/* Strip modes; they double as indices into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, one per strip mode */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for a strip mode: the format string minus its leading
   three characters ("|O:"). */
#define STRIPNAME(i) (&stripformat[(i)][3])
6943 | \r | |
6944 | /* externally visible for str.strip(unicode) */\r | |
6945 | PyObject *\r | |
6946 | _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)\r | |
6947 | {\r | |
6948 | Py_UNICODE *s = PyUnicode_AS_UNICODE(self);\r | |
6949 | Py_ssize_t len = PyUnicode_GET_SIZE(self);\r | |
6950 | Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);\r | |
6951 | Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);\r | |
6952 | Py_ssize_t i, j;\r | |
6953 | \r | |
6954 | BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);\r | |
6955 | \r | |
6956 | i = 0;\r | |
6957 | if (striptype != RIGHTSTRIP) {\r | |
6958 | while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {\r | |
6959 | i++;\r | |
6960 | }\r | |
6961 | }\r | |
6962 | \r | |
6963 | j = len;\r | |
6964 | if (striptype != LEFTSTRIP) {\r | |
6965 | do {\r | |
6966 | j--;\r | |
6967 | } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));\r | |
6968 | j++;\r | |
6969 | }\r | |
6970 | \r | |
6971 | if (i == 0 && j == len && PyUnicode_CheckExact(self)) {\r | |
6972 | Py_INCREF(self);\r | |
6973 | return (PyObject*)self;\r | |
6974 | }\r | |
6975 | else\r | |
6976 | return PyUnicode_FromUnicode(s+i, j-i);\r | |
6977 | }\r | |
6978 | \r | |
6979 | \r | |
6980 | static PyObject *\r | |
6981 | do_strip(PyUnicodeObject *self, int striptype)\r | |
6982 | {\r | |
6983 | Py_UNICODE *s = PyUnicode_AS_UNICODE(self);\r | |
6984 | Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;\r | |
6985 | \r | |
6986 | i = 0;\r | |
6987 | if (striptype != RIGHTSTRIP) {\r | |
6988 | while (i < len && Py_UNICODE_ISSPACE(s[i])) {\r | |
6989 | i++;\r | |
6990 | }\r | |
6991 | }\r | |
6992 | \r | |
6993 | j = len;\r | |
6994 | if (striptype != LEFTSTRIP) {\r | |
6995 | do {\r | |
6996 | j--;\r | |
6997 | } while (j >= i && Py_UNICODE_ISSPACE(s[j]));\r | |
6998 | j++;\r | |
6999 | }\r | |
7000 | \r | |
7001 | if (i == 0 && j == len && PyUnicode_CheckExact(self)) {\r | |
7002 | Py_INCREF(self);\r | |
7003 | return (PyObject*)self;\r | |
7004 | }\r | |
7005 | else\r | |
7006 | return PyUnicode_FromUnicode(s+i, j-i);\r | |
7007 | }\r | |
7008 | \r | |
7009 | \r | |
7010 | static PyObject *\r | |
7011 | do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)\r | |
7012 | {\r | |
7013 | PyObject *sep = NULL;\r | |
7014 | \r | |
7015 | if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))\r | |
7016 | return NULL;\r | |
7017 | \r | |
7018 | if (sep != NULL && sep != Py_None) {\r | |
7019 | if (PyUnicode_Check(sep))\r | |
7020 | return _PyUnicode_XStrip(self, striptype, sep);\r | |
7021 | else if (PyString_Check(sep)) {\r | |
7022 | PyObject *res;\r | |
7023 | sep = PyUnicode_FromObject(sep);\r | |
7024 | if (sep==NULL)\r | |
7025 | return NULL;\r | |
7026 | res = _PyUnicode_XStrip(self, striptype, sep);\r | |
7027 | Py_DECREF(sep);\r | |
7028 | return res;\r | |
7029 | }\r | |
7030 | else {\r | |
7031 | PyErr_Format(PyExc_TypeError,\r | |
7032 | "%s arg must be None, unicode or str",\r | |
7033 | STRIPNAME(striptype));\r | |
7034 | return NULL;\r | |
7035 | }\r | |
7036 | }\r | |
7037 | \r | |
7038 | return do_strip(self, striptype);\r | |
7039 | }\r | |
7040 | \r | |
7041 | \r | |
7042 | PyDoc_STRVAR(strip__doc__,\r | |
7043 | "S.strip([chars]) -> unicode\n\\r | |
7044 | \n\\r | |
7045 | Return a copy of the string S with leading and trailing\n\\r | |
7046 | whitespace removed.\n\\r | |
7047 | If chars is given and not None, remove characters in chars instead.\n\\r | |
7048 | If chars is a str, it will be converted to unicode before stripping");\r | |
7049 | \r | |
7050 | static PyObject *\r | |
7051 | unicode_strip(PyUnicodeObject *self, PyObject *args)\r | |
7052 | {\r | |
7053 | if (PyTuple_GET_SIZE(args) == 0)\r | |
7054 | return do_strip(self, BOTHSTRIP); /* Common case */\r | |
7055 | else\r | |
7056 | return do_argstrip(self, BOTHSTRIP, args);\r | |
7057 | }\r | |
7058 | \r | |
7059 | \r | |
7060 | PyDoc_STRVAR(lstrip__doc__,\r | |
7061 | "S.lstrip([chars]) -> unicode\n\\r | |
7062 | \n\\r | |
7063 | Return a copy of the string S with leading whitespace removed.\n\\r | |
7064 | If chars is given and not None, remove characters in chars instead.\n\\r | |
7065 | If chars is a str, it will be converted to unicode before stripping");\r | |
7066 | \r | |
7067 | static PyObject *\r | |
7068 | unicode_lstrip(PyUnicodeObject *self, PyObject *args)\r | |
7069 | {\r | |
7070 | if (PyTuple_GET_SIZE(args) == 0)\r | |
7071 | return do_strip(self, LEFTSTRIP); /* Common case */\r | |
7072 | else\r | |
7073 | return do_argstrip(self, LEFTSTRIP, args);\r | |
7074 | }\r | |
7075 | \r | |
7076 | \r | |
7077 | PyDoc_STRVAR(rstrip__doc__,\r | |
7078 | "S.rstrip([chars]) -> unicode\n\\r | |
7079 | \n\\r | |
7080 | Return a copy of the string S with trailing whitespace removed.\n\\r | |
7081 | If chars is given and not None, remove characters in chars instead.\n\\r | |
7082 | If chars is a str, it will be converted to unicode before stripping");\r | |
7083 | \r | |
7084 | static PyObject *\r | |
7085 | unicode_rstrip(PyUnicodeObject *self, PyObject *args)\r | |
7086 | {\r | |
7087 | if (PyTuple_GET_SIZE(args) == 0)\r | |
7088 | return do_strip(self, RIGHTSTRIP); /* Common case */\r | |
7089 | else\r | |
7090 | return do_argstrip(self, RIGHTSTRIP, args);\r | |
7091 | }\r | |
7092 | \r | |
7093 | \r | |
7094 | static PyObject*\r | |
7095 | unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)\r | |
7096 | {\r | |
7097 | PyUnicodeObject *u;\r | |
7098 | Py_UNICODE *p;\r | |
7099 | Py_ssize_t nchars;\r | |
7100 | size_t nbytes;\r | |
7101 | \r | |
7102 | if (len < 0)\r | |
7103 | len = 0;\r | |
7104 | \r | |
7105 | if (len == 1 && PyUnicode_CheckExact(str)) {\r | |
7106 | /* no repeat, return original string */\r | |
7107 | Py_INCREF(str);\r | |
7108 | return (PyObject*) str;\r | |
7109 | }\r | |
7110 | \r | |
7111 | /* ensure # of chars needed doesn't overflow int and # of bytes\r | |
7112 | * needed doesn't overflow size_t\r | |
7113 | */\r | |
7114 | nchars = len * str->length;\r | |
7115 | if (len && nchars / len != str->length) {\r | |
7116 | PyErr_SetString(PyExc_OverflowError,\r | |
7117 | "repeated string is too long");\r | |
7118 | return NULL;\r | |
7119 | }\r | |
7120 | nbytes = (nchars + 1) * sizeof(Py_UNICODE);\r | |
7121 | if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {\r | |
7122 | PyErr_SetString(PyExc_OverflowError,\r | |
7123 | "repeated string is too long");\r | |
7124 | return NULL;\r | |
7125 | }\r | |
7126 | u = _PyUnicode_New(nchars);\r | |
7127 | if (!u)\r | |
7128 | return NULL;\r | |
7129 | \r | |
7130 | p = u->str;\r | |
7131 | \r | |
7132 | if (str->length == 1 && len > 0) {\r | |
7133 | Py_UNICODE_FILL(p, str->str[0], len);\r | |
7134 | } else {\r | |
7135 | Py_ssize_t done = 0; /* number of characters copied this far */\r | |
7136 | if (done < nchars) {\r | |
7137 | Py_UNICODE_COPY(p, str->str, str->length);\r | |
7138 | done = str->length;\r | |
7139 | }\r | |
7140 | while (done < nchars) {\r | |
7141 | Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;\r | |
7142 | Py_UNICODE_COPY(p+done, p, n);\r | |
7143 | done += n;\r | |
7144 | }\r | |
7145 | }\r | |
7146 | \r | |
7147 | return (PyObject*) u;\r | |
7148 | }\r | |
7149 | \r | |
7150 | PyObject *PyUnicode_Replace(PyObject *obj,\r | |
7151 | PyObject *subobj,\r | |
7152 | PyObject *replobj,\r | |
7153 | Py_ssize_t maxcount)\r | |
7154 | {\r | |
7155 | PyObject *self;\r | |
7156 | PyObject *str1;\r | |
7157 | PyObject *str2;\r | |
7158 | PyObject *result;\r | |
7159 | \r | |
7160 | self = PyUnicode_FromObject(obj);\r | |
7161 | if (self == NULL)\r | |
7162 | return NULL;\r | |
7163 | str1 = PyUnicode_FromObject(subobj);\r | |
7164 | if (str1 == NULL) {\r | |
7165 | Py_DECREF(self);\r | |
7166 | return NULL;\r | |
7167 | }\r | |
7168 | str2 = PyUnicode_FromObject(replobj);\r | |
7169 | if (str2 == NULL) {\r | |
7170 | Py_DECREF(self);\r | |
7171 | Py_DECREF(str1);\r | |
7172 | return NULL;\r | |
7173 | }\r | |
7174 | result = replace((PyUnicodeObject *)self,\r | |
7175 | (PyUnicodeObject *)str1,\r | |
7176 | (PyUnicodeObject *)str2,\r | |
7177 | maxcount);\r | |
7178 | Py_DECREF(self);\r | |
7179 | Py_DECREF(str1);\r | |
7180 | Py_DECREF(str2);\r | |
7181 | return result;\r | |
7182 | }\r | |
7183 | \r | |
7184 | PyDoc_STRVAR(replace__doc__,\r | |
7185 | "S.replace(old, new[, count]) -> unicode\n\\r | |
7186 | \n\\r | |
7187 | Return a copy of S with all occurrences of substring\n\\r | |
7188 | old replaced by new. If the optional argument count is\n\\r | |
7189 | given, only the first count occurrences are replaced.");\r | |
7190 | \r | |
7191 | static PyObject*\r | |
7192 | unicode_replace(PyUnicodeObject *self, PyObject *args)\r | |
7193 | {\r | |
7194 | PyUnicodeObject *str1;\r | |
7195 | PyUnicodeObject *str2;\r | |
7196 | Py_ssize_t maxcount = -1;\r | |
7197 | PyObject *result;\r | |
7198 | \r | |
7199 | if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))\r | |
7200 | return NULL;\r | |
7201 | str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);\r | |
7202 | if (str1 == NULL)\r | |
7203 | return NULL;\r | |
7204 | str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);\r | |
7205 | if (str2 == NULL) {\r | |
7206 | Py_DECREF(str1);\r | |
7207 | return NULL;\r | |
7208 | }\r | |
7209 | \r | |
7210 | result = replace(self, str1, str2, maxcount);\r | |
7211 | \r | |
7212 | Py_DECREF(str1);\r | |
7213 | Py_DECREF(str2);\r | |
7214 | return result;\r | |
7215 | }\r | |
7216 | \r | |
7217 | static\r | |
7218 | PyObject *unicode_repr(PyObject *unicode)\r | |
7219 | {\r | |
7220 | return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),\r | |
7221 | PyUnicode_GET_SIZE(unicode),\r | |
7222 | 1);\r | |
7223 | }\r | |
7224 | \r | |
7225 | PyDoc_STRVAR(rfind__doc__,\r | |
7226 | "S.rfind(sub [,start [,end]]) -> int\n\\r | |
7227 | \n\\r | |
7228 | Return the highest index in S where substring sub is found,\n\\r | |
7229 | such that sub is contained within s[start:end]. Optional\n\\r | |
7230 | arguments start and end are interpreted as in slice notation.\n\\r | |
7231 | \n\\r | |
7232 | Return -1 on failure.");\r | |
7233 | \r | |
7234 | static PyObject *\r | |
7235 | unicode_rfind(PyUnicodeObject *self, PyObject *args)\r | |
7236 | {\r | |
7237 | PyUnicodeObject *substring;\r | |
7238 | Py_ssize_t start;\r | |
7239 | Py_ssize_t end;\r | |
7240 | Py_ssize_t result;\r | |
7241 | \r | |
7242 | if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,\r | |
7243 | &start, &end))\r | |
7244 | return NULL;\r | |
7245 | \r | |
7246 | result = stringlib_rfind_slice(\r | |
7247 | PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),\r | |
7248 | PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),\r | |
7249 | start, end\r | |
7250 | );\r | |
7251 | \r | |
7252 | Py_DECREF(substring);\r | |
7253 | \r | |
7254 | return PyInt_FromSsize_t(result);\r | |
7255 | }\r | |
7256 | \r | |
7257 | PyDoc_STRVAR(rindex__doc__,\r | |
7258 | "S.rindex(sub [,start [,end]]) -> int\n\\r | |
7259 | \n\\r | |
7260 | Like S.rfind() but raise ValueError when the substring is not found.");\r | |
7261 | \r | |
7262 | static PyObject *\r | |
7263 | unicode_rindex(PyUnicodeObject *self, PyObject *args)\r | |
7264 | {\r | |
7265 | PyUnicodeObject *substring;\r | |
7266 | Py_ssize_t start;\r | |
7267 | Py_ssize_t end;\r | |
7268 | Py_ssize_t result;\r | |
7269 | \r | |
7270 | if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,\r | |
7271 | &start, &end))\r | |
7272 | return NULL;\r | |
7273 | \r | |
7274 | result = stringlib_rfind_slice(\r | |
7275 | PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),\r | |
7276 | PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),\r | |
7277 | start, end\r | |
7278 | );\r | |
7279 | \r | |
7280 | Py_DECREF(substring);\r | |
7281 | \r | |
7282 | if (result < 0) {\r | |
7283 | PyErr_SetString(PyExc_ValueError, "substring not found");\r | |
7284 | return NULL;\r | |
7285 | }\r | |
7286 | return PyInt_FromSsize_t(result);\r | |
7287 | }\r | |
7288 | \r | |
7289 | PyDoc_STRVAR(rjust__doc__,\r | |
7290 | "S.rjust(width[, fillchar]) -> unicode\n\\r | |
7291 | \n\\r | |
7292 | Return S right-justified in a Unicode string of length width. Padding is\n\\r | |
7293 | done using the specified fill character (default is a space).");\r | |
7294 | \r | |
7295 | static PyObject *\r | |
7296 | unicode_rjust(PyUnicodeObject *self, PyObject *args)\r | |
7297 | {\r | |
7298 | Py_ssize_t width;\r | |
7299 | Py_UNICODE fillchar = ' ';\r | |
7300 | \r | |
7301 | if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))\r | |
7302 | return NULL;\r | |
7303 | \r | |
7304 | if (self->length >= width && PyUnicode_CheckExact(self)) {\r | |
7305 | Py_INCREF(self);\r | |
7306 | return (PyObject*) self;\r | |
7307 | }\r | |
7308 | \r | |
7309 | return (PyObject*) pad(self, width - self->length, 0, fillchar);\r | |
7310 | }\r | |
7311 | \r | |
7312 | static PyObject*\r | |
7313 | unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)\r | |
7314 | {\r | |
7315 | /* standard clamping */\r | |
7316 | if (start < 0)\r | |
7317 | start = 0;\r | |
7318 | if (end < 0)\r | |
7319 | end = 0;\r | |
7320 | if (end > self->length)\r | |
7321 | end = self->length;\r | |
7322 | if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {\r | |
7323 | /* full slice, return original string */\r | |
7324 | Py_INCREF(self);\r | |
7325 | return (PyObject*) self;\r | |
7326 | }\r | |
7327 | if (start > end)\r | |
7328 | start = end;\r | |
7329 | /* copy slice */\r | |
7330 | return (PyObject*) PyUnicode_FromUnicode(self->str + start,\r | |
7331 | end - start);\r | |
7332 | }\r | |
7333 | \r | |
7334 | PyObject *PyUnicode_Split(PyObject *s,\r | |
7335 | PyObject *sep,\r | |
7336 | Py_ssize_t maxsplit)\r | |
7337 | {\r | |
7338 | PyObject *result;\r | |
7339 | \r | |
7340 | s = PyUnicode_FromObject(s);\r | |
7341 | if (s == NULL)\r | |
7342 | return NULL;\r | |
7343 | if (sep != NULL) {\r | |
7344 | sep = PyUnicode_FromObject(sep);\r | |
7345 | if (sep == NULL) {\r | |
7346 | Py_DECREF(s);\r | |
7347 | return NULL;\r | |
7348 | }\r | |
7349 | }\r | |
7350 | \r | |
7351 | result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);\r | |
7352 | \r | |
7353 | Py_DECREF(s);\r | |
7354 | Py_XDECREF(sep);\r | |
7355 | return result;\r | |
7356 | }\r | |
7357 | \r | |
7358 | PyDoc_STRVAR(split__doc__,\r | |
7359 | "S.split([sep [,maxsplit]]) -> list of strings\n\\r | |
7360 | \n\\r | |
7361 | Return a list of the words in S, using sep as the\n\\r | |
7362 | delimiter string. If maxsplit is given, at most maxsplit\n\\r | |
7363 | splits are done. If sep is not specified or is None, any\n\\r | |
7364 | whitespace string is a separator and empty strings are\n\\r | |
7365 | removed from the result.");\r | |
7366 | \r | |
7367 | static PyObject*\r | |
7368 | unicode_split(PyUnicodeObject *self, PyObject *args)\r | |
7369 | {\r | |
7370 | PyObject *substring = Py_None;\r | |
7371 | Py_ssize_t maxcount = -1;\r | |
7372 | \r | |
7373 | if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))\r | |
7374 | return NULL;\r | |
7375 | \r | |
7376 | if (substring == Py_None)\r | |
7377 | return split(self, NULL, maxcount);\r | |
7378 | else if (PyUnicode_Check(substring))\r | |
7379 | return split(self, (PyUnicodeObject *)substring, maxcount);\r | |
7380 | else\r | |
7381 | return PyUnicode_Split((PyObject *)self, substring, maxcount);\r | |
7382 | }\r | |
7383 | \r | |
7384 | PyObject *\r | |
7385 | PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)\r | |
7386 | {\r | |
7387 | PyObject* str_obj;\r | |
7388 | PyObject* sep_obj;\r | |
7389 | PyObject* out;\r | |
7390 | \r | |
7391 | str_obj = PyUnicode_FromObject(str_in);\r | |
7392 | if (!str_obj)\r | |
7393 | return NULL;\r | |
7394 | sep_obj = PyUnicode_FromObject(sep_in);\r | |
7395 | if (!sep_obj) {\r | |
7396 | Py_DECREF(str_obj);\r | |
7397 | return NULL;\r | |
7398 | }\r | |
7399 | \r | |
7400 | out = stringlib_partition(\r | |
7401 | str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),\r | |
7402 | sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)\r | |
7403 | );\r | |
7404 | \r | |
7405 | Py_DECREF(sep_obj);\r | |
7406 | Py_DECREF(str_obj);\r | |
7407 | \r | |
7408 | return out;\r | |
7409 | }\r | |
7410 | \r | |
7411 | \r | |
7412 | PyObject *\r | |
7413 | PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)\r | |
7414 | {\r | |
7415 | PyObject* str_obj;\r | |
7416 | PyObject* sep_obj;\r | |
7417 | PyObject* out;\r | |
7418 | \r | |
7419 | str_obj = PyUnicode_FromObject(str_in);\r | |
7420 | if (!str_obj)\r | |
7421 | return NULL;\r | |
7422 | sep_obj = PyUnicode_FromObject(sep_in);\r | |
7423 | if (!sep_obj) {\r | |
7424 | Py_DECREF(str_obj);\r | |
7425 | return NULL;\r | |
7426 | }\r | |
7427 | \r | |
7428 | out = stringlib_rpartition(\r | |
7429 | str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),\r | |
7430 | sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)\r | |
7431 | );\r | |
7432 | \r | |
7433 | Py_DECREF(sep_obj);\r | |
7434 | Py_DECREF(str_obj);\r | |
7435 | \r | |
7436 | return out;\r | |
7437 | }\r | |
7438 | \r | |
7439 | PyDoc_STRVAR(partition__doc__,\r | |
7440 | "S.partition(sep) -> (head, sep, tail)\n\\r | |
7441 | \n\\r | |
7442 | Search for the separator sep in S, and return the part before it,\n\\r | |
7443 | the separator itself, and the part after it. If the separator is not\n\\r | |
7444 | found, return S and two empty strings.");\r | |
7445 | \r | |
7446 | static PyObject*\r | |
7447 | unicode_partition(PyUnicodeObject *self, PyObject *separator)\r | |
7448 | {\r | |
7449 | return PyUnicode_Partition((PyObject *)self, separator);\r | |
7450 | }\r | |
7451 | \r | |
7452 | PyDoc_STRVAR(rpartition__doc__,\r | |
7453 | "S.rpartition(sep) -> (head, sep, tail)\n\\r | |
7454 | \n\\r | |
7455 | Search for the separator sep in S, starting at the end of S, and return\n\\r | |
7456 | the part before it, the separator itself, and the part after it. If the\n\\r | |
7457 | separator is not found, return two empty strings and S.");\r | |
7458 | \r | |
7459 | static PyObject*\r | |
7460 | unicode_rpartition(PyUnicodeObject *self, PyObject *separator)\r | |
7461 | {\r | |
7462 | return PyUnicode_RPartition((PyObject *)self, separator);\r | |
7463 | }\r | |
7464 | \r | |
7465 | PyObject *PyUnicode_RSplit(PyObject *s,\r | |
7466 | PyObject *sep,\r | |
7467 | Py_ssize_t maxsplit)\r | |
7468 | {\r | |
7469 | PyObject *result;\r | |
7470 | \r | |
7471 | s = PyUnicode_FromObject(s);\r | |
7472 | if (s == NULL)\r | |
7473 | return NULL;\r | |
7474 | if (sep != NULL) {\r | |
7475 | sep = PyUnicode_FromObject(sep);\r | |
7476 | if (sep == NULL) {\r | |
7477 | Py_DECREF(s);\r | |
7478 | return NULL;\r | |
7479 | }\r | |
7480 | }\r | |
7481 | \r | |
7482 | result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);\r | |
7483 | \r | |
7484 | Py_DECREF(s);\r | |
7485 | Py_XDECREF(sep);\r | |
7486 | return result;\r | |
7487 | }\r | |
7488 | \r | |
7489 | PyDoc_STRVAR(rsplit__doc__,\r | |
7490 | "S.rsplit([sep [,maxsplit]]) -> list of strings\n\\r | |
7491 | \n\\r | |
7492 | Return a list of the words in S, using sep as the\n\\r | |
7493 | delimiter string, starting at the end of the string and\n\\r | |
7494 | working to the front. If maxsplit is given, at most maxsplit\n\\r | |
7495 | splits are done. If sep is not specified, any whitespace string\n\\r | |
7496 | is a separator.");\r | |
7497 | \r | |
7498 | static PyObject*\r | |
7499 | unicode_rsplit(PyUnicodeObject *self, PyObject *args)\r | |
7500 | {\r | |
7501 | PyObject *substring = Py_None;\r | |
7502 | Py_ssize_t maxcount = -1;\r | |
7503 | \r | |
7504 | if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))\r | |
7505 | return NULL;\r | |
7506 | \r | |
7507 | if (substring == Py_None)\r | |
7508 | return rsplit(self, NULL, maxcount);\r | |
7509 | else if (PyUnicode_Check(substring))\r | |
7510 | return rsplit(self, (PyUnicodeObject *)substring, maxcount);\r | |
7511 | else\r | |
7512 | return PyUnicode_RSplit((PyObject *)self, substring, maxcount);\r | |
7513 | }\r | |
7514 | \r | |
7515 | PyDoc_STRVAR(splitlines__doc__,\r | |
7516 | "S.splitlines([keepends]) -> list of strings\n\\r | |
7517 | \n\\r | |
7518 | Return a list of the lines in S, breaking at line boundaries.\n\\r | |
7519 | Line breaks are not included in the resulting list unless keepends\n\\r | |
7520 | is given and true.");\r | |
7521 | \r | |
7522 | static PyObject*\r | |
7523 | unicode_splitlines(PyUnicodeObject *self, PyObject *args)\r | |
7524 | {\r | |
7525 | int keepends = 0;\r | |
7526 | \r | |
7527 | if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))\r | |
7528 | return NULL;\r | |
7529 | \r | |
7530 | return PyUnicode_Splitlines((PyObject *)self, keepends);\r | |
7531 | }\r | |
7532 | \r | |
7533 | static\r | |
7534 | PyObject *unicode_str(PyUnicodeObject *self)\r | |
7535 | {\r | |
7536 | return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);\r | |
7537 | }\r | |
7538 | \r | |
7539 | PyDoc_STRVAR(swapcase__doc__,\r | |
7540 | "S.swapcase() -> unicode\n\\r | |
7541 | \n\\r | |
7542 | Return a copy of S with uppercase characters converted to lowercase\n\\r | |
7543 | and vice versa.");\r | |
7544 | \r | |
7545 | static PyObject*\r | |
7546 | unicode_swapcase(PyUnicodeObject *self)\r | |
7547 | {\r | |
7548 | return fixup(self, fixswapcase);\r | |
7549 | }\r | |
7550 | \r | |
7551 | PyDoc_STRVAR(translate__doc__,\r | |
7552 | "S.translate(table) -> unicode\n\\r | |
7553 | \n\\r | |
7554 | Return a copy of the string S, where all characters have been mapped\n\\r | |
7555 | through the given translation table, which must be a mapping of\n\\r | |
7556 | Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\\r | |
7557 | Unmapped characters are left untouched. Characters mapped to None\n\\r | |
7558 | are deleted.");\r | |
7559 | \r | |
7560 | static PyObject*\r | |
7561 | unicode_translate(PyUnicodeObject *self, PyObject *table)\r | |
7562 | {\r | |
7563 | return PyUnicode_TranslateCharmap(self->str,\r | |
7564 | self->length,\r | |
7565 | table,\r | |
7566 | "ignore");\r | |
7567 | }\r | |
7568 | \r | |
7569 | PyDoc_STRVAR(upper__doc__,\r | |
7570 | "S.upper() -> unicode\n\\r | |
7571 | \n\\r | |
7572 | Return a copy of S converted to uppercase.");\r | |
7573 | \r | |
7574 | static PyObject*\r | |
7575 | unicode_upper(PyUnicodeObject *self)\r | |
7576 | {\r | |
7577 | return fixup(self, fixupper);\r | |
7578 | }\r | |
7579 | \r | |
7580 | PyDoc_STRVAR(zfill__doc__,\r | |
7581 | "S.zfill(width) -> unicode\n\\r | |
7582 | \n\\r | |
7583 | Pad a numeric string S with zeros on the left, to fill a field\n\\r | |
7584 | of the specified width. The string S is never truncated.");\r | |
7585 | \r | |
7586 | static PyObject *\r | |
7587 | unicode_zfill(PyUnicodeObject *self, PyObject *args)\r | |
7588 | {\r | |
7589 | Py_ssize_t fill;\r | |
7590 | PyUnicodeObject *u;\r | |
7591 | \r | |
7592 | Py_ssize_t width;\r | |
7593 | if (!PyArg_ParseTuple(args, "n:zfill", &width))\r | |
7594 | return NULL;\r | |
7595 | \r | |
7596 | if (self->length >= width) {\r | |
7597 | if (PyUnicode_CheckExact(self)) {\r | |
7598 | Py_INCREF(self);\r | |
7599 | return (PyObject*) self;\r | |
7600 | }\r | |
7601 | else\r | |
7602 | return PyUnicode_FromUnicode(\r | |
7603 | PyUnicode_AS_UNICODE(self),\r | |
7604 | PyUnicode_GET_SIZE(self)\r | |
7605 | );\r | |
7606 | }\r | |
7607 | \r | |
7608 | fill = width - self->length;\r | |
7609 | \r | |
7610 | u = pad(self, fill, 0, '0');\r | |
7611 | \r | |
7612 | if (u == NULL)\r | |
7613 | return NULL;\r | |
7614 | \r | |
7615 | if (u->str[fill] == '+' || u->str[fill] == '-') {\r | |
7616 | /* move sign to beginning of string */\r | |
7617 | u->str[0] = u->str[fill];\r | |
7618 | u->str[fill] = '0';\r | |
7619 | }\r | |
7620 | \r | |
7621 | return (PyObject*) u;\r | |
7622 | }\r | |
7623 | \r | |
#if 0
/* Disabled debugging helper: report the current value of `numfree`
   as a Python int. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    PyObject *count = PyInt_FromLong(numfree);

    return count;
}
#endif
7631 | \r | |
7632 | PyDoc_STRVAR(startswith__doc__,\r | |
7633 | "S.startswith(prefix[, start[, end]]) -> bool\n\\r | |
7634 | \n\\r | |
7635 | Return True if S starts with the specified prefix, False otherwise.\n\\r | |
7636 | With optional start, test S beginning at that position.\n\\r | |
7637 | With optional end, stop comparing S at that position.\n\\r | |
7638 | prefix can also be a tuple of strings to try.");\r | |
7639 | \r | |
7640 | static PyObject *\r | |
7641 | unicode_startswith(PyUnicodeObject *self,\r | |
7642 | PyObject *args)\r | |
7643 | {\r | |
7644 | PyObject *subobj;\r | |
7645 | PyUnicodeObject *substring;\r | |
7646 | Py_ssize_t start = 0;\r | |
7647 | Py_ssize_t end = PY_SSIZE_T_MAX;\r | |
7648 | int result;\r | |
7649 | \r | |
7650 | if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))\r | |
7651 | return NULL;\r | |
7652 | if (PyTuple_Check(subobj)) {\r | |
7653 | Py_ssize_t i;\r | |
7654 | for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {\r | |
7655 | substring = (PyUnicodeObject *)PyUnicode_FromObject(\r | |
7656 | PyTuple_GET_ITEM(subobj, i));\r | |
7657 | if (substring == NULL)\r | |
7658 | return NULL;\r | |
7659 | result = tailmatch(self, substring, start, end, -1);\r | |
7660 | Py_DECREF(substring);\r | |
7661 | if (result) {\r | |
7662 | Py_RETURN_TRUE;\r | |
7663 | }\r | |
7664 | }\r | |
7665 | /* nothing matched */\r | |
7666 | Py_RETURN_FALSE;\r | |
7667 | }\r | |
7668 | substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);\r | |
7669 | if (substring == NULL) {\r | |
7670 | if (PyErr_ExceptionMatches(PyExc_TypeError))\r | |
7671 | PyErr_Format(PyExc_TypeError, "startswith first arg must be str, "\r | |
7672 | "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);\r | |
7673 | return NULL;\r | |
7674 | }\r | |
7675 | result = tailmatch(self, substring, start, end, -1);\r | |
7676 | Py_DECREF(substring);\r | |
7677 | return PyBool_FromLong(result);\r | |
7678 | }\r | |
7679 | \r | |
7680 | \r | |
7681 | PyDoc_STRVAR(endswith__doc__,\r | |
7682 | "S.endswith(suffix[, start[, end]]) -> bool\n\\r | |
7683 | \n\\r | |
7684 | Return True if S ends with the specified suffix, False otherwise.\n\\r | |
7685 | With optional start, test S beginning at that position.\n\\r | |
7686 | With optional end, stop comparing S at that position.\n\\r | |
7687 | suffix can also be a tuple of strings to try.");\r | |
7688 | \r | |
7689 | static PyObject *\r | |
7690 | unicode_endswith(PyUnicodeObject *self,\r | |
7691 | PyObject *args)\r | |
7692 | {\r | |
7693 | PyObject *subobj;\r | |
7694 | PyUnicodeObject *substring;\r | |
7695 | Py_ssize_t start = 0;\r | |
7696 | Py_ssize_t end = PY_SSIZE_T_MAX;\r | |
7697 | int result;\r | |
7698 | \r | |
7699 | if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))\r | |
7700 | return NULL;\r | |
7701 | if (PyTuple_Check(subobj)) {\r | |
7702 | Py_ssize_t i;\r | |
7703 | for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {\r | |
7704 | substring = (PyUnicodeObject *)PyUnicode_FromObject(\r | |
7705 | PyTuple_GET_ITEM(subobj, i));\r | |
7706 | if (substring == NULL)\r | |
7707 | return NULL;\r | |
7708 | result = tailmatch(self, substring, start, end, +1);\r | |
7709 | Py_DECREF(substring);\r | |
7710 | if (result) {\r | |
7711 | Py_RETURN_TRUE;\r | |
7712 | }\r | |
7713 | }\r | |
7714 | Py_RETURN_FALSE;\r | |
7715 | }\r | |
7716 | substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);\r | |
7717 | if (substring == NULL) {\r | |
7718 | if (PyErr_ExceptionMatches(PyExc_TypeError))\r | |
7719 | PyErr_Format(PyExc_TypeError, "endswith first arg must be str, "\r | |
7720 | "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);\r | |
7721 | return NULL;\r | |
7722 | }\r | |
7723 | result = tailmatch(self, substring, start, end, +1);\r | |
7724 | Py_DECREF(substring);\r | |
7725 | return PyBool_FromLong(result);\r | |
7726 | }\r | |
7727 | \r | |
7728 | \r | |
7729 | /* Implements do_string_format, which is unicode because of stringlib */\r | |
7730 | #include "stringlib/string_format.h"\r | |
7731 | \r | |
7732 | PyDoc_STRVAR(format__doc__,\r | |
7733 | "S.format(*args, **kwargs) -> unicode\n\\r | |
7734 | \n\\r | |
7735 | Return a formatted version of S, using substitutions from args and kwargs.\n\\r | |
7736 | The substitutions are identified by braces ('{' and '}').");\r | |
7737 | \r | |
7738 | static PyObject *\r | |
7739 | unicode__format__(PyObject *self, PyObject *args)\r | |
7740 | {\r | |
7741 | PyObject *format_spec;\r | |
7742 | PyObject *result = NULL;\r | |
7743 | PyObject *tmp = NULL;\r | |
7744 | \r | |
7745 | /* If 2.x, convert format_spec to the same type as value */\r | |
7746 | /* This is to allow things like u''.format('') */\r | |
7747 | if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))\r | |
7748 | goto done;\r | |
7749 | if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {\r | |
7750 | PyErr_Format(PyExc_TypeError, "__format__ arg must be str "\r | |
7751 | "or unicode, not %s", Py_TYPE(format_spec)->tp_name);\r | |
7752 | goto done;\r | |
7753 | }\r | |
7754 | tmp = PyObject_Unicode(format_spec);\r | |
7755 | if (tmp == NULL)\r | |
7756 | goto done;\r | |
7757 | format_spec = tmp;\r | |
7758 | \r | |
7759 | result = _PyUnicode_FormatAdvanced(self,\r | |
7760 | PyUnicode_AS_UNICODE(format_spec),\r | |
7761 | PyUnicode_GET_SIZE(format_spec));\r | |
7762 | done:\r | |
7763 | Py_XDECREF(tmp);\r | |
7764 | return result;\r | |
7765 | }\r | |
7766 | \r | |
7767 | PyDoc_STRVAR(p_format__doc__,\r | |
7768 | "S.__format__(format_spec) -> unicode\n\\r | |
7769 | \n\\r | |
7770 | Return a formatted version of S as described by format_spec.");\r | |
7771 | \r | |
7772 | static PyObject *\r | |
7773 | unicode__sizeof__(PyUnicodeObject *v)\r | |
7774 | {\r | |
7775 | return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +\r | |
7776 | sizeof(Py_UNICODE) * (v->length + 1));\r | |
7777 | }\r | |
7778 | \r | |
7779 | PyDoc_STRVAR(sizeof__doc__,\r | |
7780 | "S.__sizeof__() -> size of S in memory, in bytes\n\\r | |
7781 | \n\\r | |
7782 | ");\r | |
7783 | \r | |
7784 | static PyObject *\r | |
7785 | unicode_getnewargs(PyUnicodeObject *v)\r | |
7786 | {\r | |
7787 | return Py_BuildValue("(u#)", v->str, v->length);\r | |
7788 | }\r | |
7789 | \r | |
7790 | \r | |
7791 | static PyMethodDef unicode_methods[] = {\r | |
7792 | \r | |
7793 | /* Order is according to common usage: often used methods should\r | |
7794 | appear first, since lookup is done sequentially. */\r | |
7795 | \r | |
7796 | {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},\r | |
7797 | {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},\r | |
7798 | {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},\r | |
7799 | {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},\r | |
7800 | {"join", (PyCFunction) unicode_join, METH_O, join__doc__},\r | |
7801 | {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},\r | |
7802 | {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},\r | |
7803 | {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},\r | |
7804 | {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},\r | |
7805 | {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},\r | |
7806 | {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},\r | |
7807 | {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},\r | |
7808 | {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},\r | |
7809 | {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},\r | |
7810 | {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},\r | |
7811 | {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},\r | |
7812 | {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},\r | |
7813 | /* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */\r | |
7814 | {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},\r | |
7815 | {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},\r | |
7816 | {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},\r | |
7817 | {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},\r | |
7818 | {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},\r | |
7819 | {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},\r | |
7820 | {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},\r | |
7821 | {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},\r | |
7822 | {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},\r | |
7823 | {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},\r | |
7824 | {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},\r | |
7825 | {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},\r | |
7826 | {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},\r | |
7827 | {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},\r | |
7828 | {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},\r | |
7829 | {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},\r | |
7830 | {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},\r | |
7831 | {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},\r | |
7832 | {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},\r | |
7833 | {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},\r | |
7834 | {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},\r | |
7835 | {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},\r | |
7836 | {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},\r | |
7837 | {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},\r | |
7838 | {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},\r | |
7839 | {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},\r | |
7840 | {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},\r | |
7841 | #if 0\r | |
7842 | {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},\r | |
7843 | #endif\r | |
7844 | \r | |
7845 | #if 0\r | |
7846 | /* This one is just used for debugging the implementation. */\r | |
7847 | {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},\r | |
7848 | #endif\r | |
7849 | \r | |
7850 | {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},\r | |
7851 | {NULL, NULL}\r | |
7852 | };\r | |
7853 | \r | |
7854 | static PyObject *\r | |
7855 | unicode_mod(PyObject *v, PyObject *w)\r | |
7856 | {\r | |
7857 | if (!PyUnicode_Check(v)) {\r | |
7858 | Py_INCREF(Py_NotImplemented);\r | |
7859 | return Py_NotImplemented;\r | |
7860 | }\r | |
7861 | return PyUnicode_Format(v, w);\r | |
7862 | }\r | |
7863 | \r | |
7864 | static PyNumberMethods unicode_as_number = {\r | |
7865 | 0, /*nb_add*/\r | |
7866 | 0, /*nb_subtract*/\r | |
7867 | 0, /*nb_multiply*/\r | |
7868 | 0, /*nb_divide*/\r | |
7869 | unicode_mod, /*nb_remainder*/\r | |
7870 | };\r | |
7871 | \r | |
7872 | static PySequenceMethods unicode_as_sequence = {\r | |
7873 | (lenfunc) unicode_length, /* sq_length */\r | |
7874 | PyUnicode_Concat, /* sq_concat */\r | |
7875 | (ssizeargfunc) unicode_repeat, /* sq_repeat */\r | |
7876 | (ssizeargfunc) unicode_getitem, /* sq_item */\r | |
7877 | (ssizessizeargfunc) unicode_slice, /* sq_slice */\r | |
7878 | 0, /* sq_ass_item */\r | |
7879 | 0, /* sq_ass_slice */\r | |
7880 | PyUnicode_Contains, /* sq_contains */\r | |
7881 | };\r | |
7882 | \r | |
7883 | static PyObject*\r | |
7884 | unicode_subscript(PyUnicodeObject* self, PyObject* item)\r | |
7885 | {\r | |
7886 | if (PyIndex_Check(item)) {\r | |
7887 | Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);\r | |
7888 | if (i == -1 && PyErr_Occurred())\r | |
7889 | return NULL;\r | |
7890 | if (i < 0)\r | |
7891 | i += PyUnicode_GET_SIZE(self);\r | |
7892 | return unicode_getitem(self, i);\r | |
7893 | } else if (PySlice_Check(item)) {\r | |
7894 | Py_ssize_t start, stop, step, slicelength, cur, i;\r | |
7895 | Py_UNICODE* source_buf;\r | |
7896 | Py_UNICODE* result_buf;\r | |
7897 | PyObject* result;\r | |
7898 | \r | |
7899 | if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),\r | |
7900 | &start, &stop, &step, &slicelength) < 0) {\r | |
7901 | return NULL;\r | |
7902 | }\r | |
7903 | \r | |
7904 | if (slicelength <= 0) {\r | |
7905 | return PyUnicode_FromUnicode(NULL, 0);\r | |
7906 | } else if (start == 0 && step == 1 && slicelength == self->length &&\r | |
7907 | PyUnicode_CheckExact(self)) {\r | |
7908 | Py_INCREF(self);\r | |
7909 | return (PyObject *)self;\r | |
7910 | } else if (step == 1) {\r | |
7911 | return PyUnicode_FromUnicode(self->str + start, slicelength);\r | |
7912 | } else {\r | |
7913 | source_buf = PyUnicode_AS_UNICODE((PyObject*)self);\r | |
7914 | result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*\r | |
7915 | sizeof(Py_UNICODE));\r | |
7916 | \r | |
7917 | if (result_buf == NULL)\r | |
7918 | return PyErr_NoMemory();\r | |
7919 | \r | |
7920 | for (cur = start, i = 0; i < slicelength; cur += step, i++) {\r | |
7921 | result_buf[i] = source_buf[cur];\r | |
7922 | }\r | |
7923 | \r | |
7924 | result = PyUnicode_FromUnicode(result_buf, slicelength);\r | |
7925 | PyObject_FREE(result_buf);\r | |
7926 | return result;\r | |
7927 | }\r | |
7928 | } else {\r | |
7929 | PyErr_SetString(PyExc_TypeError, "string indices must be integers");\r | |
7930 | return NULL;\r | |
7931 | }\r | |
7932 | }\r | |
7933 | \r | |
7934 | static PyMappingMethods unicode_as_mapping = {\r | |
7935 | (lenfunc)unicode_length, /* mp_length */\r | |
7936 | (binaryfunc)unicode_subscript, /* mp_subscript */\r | |
7937 | (objobjargproc)0, /* mp_ass_subscript */\r | |
7938 | };\r | |
7939 | \r | |
7940 | static Py_ssize_t\r | |
7941 | unicode_buffer_getreadbuf(PyUnicodeObject *self,\r | |
7942 | Py_ssize_t index,\r | |
7943 | const void **ptr)\r | |
7944 | {\r | |
7945 | if (index != 0) {\r | |
7946 | PyErr_SetString(PyExc_SystemError,\r | |
7947 | "accessing non-existent unicode segment");\r | |
7948 | return -1;\r | |
7949 | }\r | |
7950 | *ptr = (void *) self->str;\r | |
7951 | return PyUnicode_GET_DATA_SIZE(self);\r | |
7952 | }\r | |
7953 | \r | |
7954 | static Py_ssize_t\r | |
7955 | unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,\r | |
7956 | const void **ptr)\r | |
7957 | {\r | |
7958 | PyErr_SetString(PyExc_TypeError,\r | |
7959 | "cannot use unicode as modifiable buffer");\r | |
7960 | return -1;\r | |
7961 | }\r | |
7962 | \r | |
7963 | static int\r | |
7964 | unicode_buffer_getsegcount(PyUnicodeObject *self,\r | |
7965 | Py_ssize_t *lenp)\r | |
7966 | {\r | |
7967 | if (lenp)\r | |
7968 | *lenp = PyUnicode_GET_DATA_SIZE(self);\r | |
7969 | return 1;\r | |
7970 | }\r | |
7971 | \r | |
7972 | static Py_ssize_t\r | |
7973 | unicode_buffer_getcharbuf(PyUnicodeObject *self,\r | |
7974 | Py_ssize_t index,\r | |
7975 | const void **ptr)\r | |
7976 | {\r | |
7977 | PyObject *str;\r | |
7978 | \r | |
7979 | if (index != 0) {\r | |
7980 | PyErr_SetString(PyExc_SystemError,\r | |
7981 | "accessing non-existent unicode segment");\r | |
7982 | return -1;\r | |
7983 | }\r | |
7984 | str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);\r | |
7985 | if (str == NULL)\r | |
7986 | return -1;\r | |
7987 | *ptr = (void *) PyString_AS_STRING(str);\r | |
7988 | return PyString_GET_SIZE(str);\r | |
7989 | }\r | |
7990 | \r | |
7991 | /* Helpers for PyUnicode_Format() */\r | |
7992 | \r | |
7993 | static PyObject *\r | |
7994 | getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)\r | |
7995 | {\r | |
7996 | Py_ssize_t argidx = *p_argidx;\r | |
7997 | if (argidx < arglen) {\r | |
7998 | (*p_argidx)++;\r | |
7999 | if (arglen < 0)\r | |
8000 | return args;\r | |
8001 | else\r | |
8002 | return PyTuple_GetItem(args, argidx);\r | |
8003 | }\r | |
8004 | PyErr_SetString(PyExc_TypeError,\r | |
8005 | "not enough arguments for format string");\r | |
8006 | return NULL;\r | |
8007 | }\r | |
8008 | \r | |
/* Bit flags for the format helpers below (F_ALT is tested by
   formatfloat).  NOTE(review): the names suggest the '-', '+', ' ',
   '#' and '0' conversion flags -- confirm against PyUnicode_Format. */
#define F_LJUST     (1<<0)      /* bit 0 */
#define F_SIGN      (1<<1)      /* bit 1 */
#define F_BLANK     (1<<2)      /* bit 2 */
#define F_ALT       (1<<3)      /* bit 3 */
#define F_ZERO      (1<<4)      /* bit 4 */
8014 | \r | |
8015 | static Py_ssize_t\r | |
8016 | strtounicode(Py_UNICODE *buffer, const char *charbuffer)\r | |
8017 | {\r | |
8018 | register Py_ssize_t i;\r | |
8019 | Py_ssize_t len = strlen(charbuffer);\r | |
8020 | for (i = len - 1; i >= 0; i--)\r | |
8021 | buffer[i] = (Py_UNICODE) charbuffer[i];\r | |
8022 | \r | |
8023 | return len;\r | |
8024 | }\r | |
8025 | \r | |
8026 | static int\r | |
8027 | longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)\r | |
8028 | {\r | |
8029 | Py_ssize_t result;\r | |
8030 | \r | |
8031 | PyOS_snprintf((char *)buffer, len, format, x);\r | |
8032 | result = strtounicode(buffer, (char *)buffer);\r | |
8033 | return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);\r | |
8034 | }\r | |
8035 | \r | |
8036 | /* XXX To save some code duplication, formatfloat/long/int could have been\r | |
8037 | shared with stringobject.c, converting from 8-bit to Unicode after the\r | |
8038 | formatting is done. */\r | |
8039 | \r | |
8040 | /* Returns a new reference to a PyUnicode object, or NULL on failure. */\r | |
8041 | \r | |
8042 | static PyObject *\r | |
8043 | formatfloat(PyObject *v, int flags, int prec, int type)\r | |
8044 | {\r | |
8045 | char *p;\r | |
8046 | PyObject *result;\r | |
8047 | double x;\r | |
8048 | \r | |
8049 | x = PyFloat_AsDouble(v);\r | |
8050 | if (x == -1.0 && PyErr_Occurred())\r | |
8051 | return NULL;\r | |
8052 | \r | |
8053 | if (prec < 0)\r | |
8054 | prec = 6;\r | |
8055 | \r | |
8056 | p = PyOS_double_to_string(x, type, prec,\r | |
8057 | (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);\r | |
8058 | if (p == NULL)\r | |
8059 | return NULL;\r | |
8060 | result = PyUnicode_FromStringAndSize(p, strlen(p));\r | |
8061 | PyMem_Free(p);\r | |
8062 | return result;\r | |
8063 | }\r | |
8064 | \r | |
8065 | static PyObject*\r | |
8066 | formatlong(PyObject *val, int flags, int prec, int type)\r | |
8067 | {\r | |
8068 | char *buf;\r | |
8069 | int i, len;\r | |
8070 | PyObject *str; /* temporary string object. */\r | |
8071 | PyUnicodeObject *result;\r | |
8072 | \r | |
8073 | str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);\r | |
8074 | if (!str)\r | |
8075 | return NULL;\r | |
8076 | result = _PyUnicode_New(len);\r | |
8077 | if (!result) {\r | |
8078 | Py_DECREF(str);\r | |
8079 | return NULL;\r | |
8080 | }\r | |
8081 | for (i = 0; i < len; i++)\r | |
8082 | result->str[i] = buf[i];\r | |
8083 | result->str[len] = 0;\r | |
8084 | Py_DECREF(str);\r | |
8085 | return (PyObject*)result;\r | |
8086 | }\r | |
8087 | \r | |
8088 | static int\r | |
8089 | formatint(Py_UNICODE *buf,\r | |
8090 | size_t buflen,\r | |
8091 | int flags,\r | |
8092 | int prec,\r | |
8093 | int type,\r | |
8094 | PyObject *v)\r | |
8095 | {\r | |
8096 | /* fmt = '%#.' + `prec` + 'l' + `type`\r | |
8097 | * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)\r | |
8098 | * + 1 + 1\r | |
8099 | * = 24\r | |
8100 | */\r | |
8101 | char fmt[64]; /* plenty big enough! */\r | |
8102 | char *sign;\r | |
8103 | long x;\r | |
8104 | \r | |
8105 | x = PyInt_AsLong(v);\r | |
8106 | if (x == -1 && PyErr_Occurred())\r | |
8107 | return -1;\r | |
8108 | if (x < 0 && type == 'u') {\r | |
8109 | type = 'd';\r | |
8110 | }\r | |
8111 | if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))\r | |
8112 | sign = "-";\r | |
8113 | else\r | |
8114 | sign = "";\r | |
8115 | if (prec < 0)\r | |
8116 | prec = 1;\r | |
8117 | \r | |
8118 | /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))\r | |
8119 | * worst case buf = '-0x' + [0-9]*prec, where prec >= 11\r | |
8120 | */\r | |
8121 | if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {\r | |
8122 | PyErr_SetString(PyExc_OverflowError,\r | |
8123 | "formatted integer is too long (precision too large?)");\r | |
8124 | return -1;\r | |
8125 | }\r | |
8126 | \r | |
8127 | if ((flags & F_ALT) &&\r | |
8128 | (type == 'x' || type == 'X')) {\r | |
8129 | /* When converting under %#x or %#X, there are a number\r | |
8130 | * of issues that cause pain:\r | |
8131 | * - when 0 is being converted, the C standard leaves off\r | |
8132 | * the '0x' or '0X', which is inconsistent with other\r | |
8133 | * %#x/%#X conversions and inconsistent with Python's\r | |
8134 | * hex() function\r | |
8135 | * - there are platforms that violate the standard and\r | |
8136 | * convert 0 with the '0x' or '0X'\r | |
8137 | * (Metrowerks, Compaq Tru64)\r | |
8138 | * - there are platforms that give '0x' when converting\r | |
8139 | * under %#X, but convert 0 in accordance with the\r | |
8140 | * standard (OS/2 EMX)\r | |
8141 | *\r | |
8142 | * We can achieve the desired consistency by inserting our\r | |
8143 | * own '0x' or '0X' prefix, and substituting %x/%X in place\r | |
8144 | * of %#x/%#X.\r | |
8145 | *\r | |
8146 | * Note that this is the same approach as used in\r | |
8147 | * formatint() in stringobject.c\r | |
8148 | */\r | |
8149 | PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",\r | |
8150 | sign, type, prec, type);\r | |
8151 | }\r | |
8152 | else {\r | |
8153 | PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",\r | |
8154 | sign, (flags&F_ALT) ? "#" : "",\r | |
8155 | prec, type);\r | |
8156 | }\r | |
8157 | if (sign[0])\r | |
8158 | return longtounicode(buf, buflen, fmt, -x);\r | |
8159 | else\r | |
8160 | return longtounicode(buf, buflen, fmt, x);\r | |
8161 | }\r | |
8162 | \r | |
8163 | static int\r | |
8164 | formatchar(Py_UNICODE *buf,\r | |
8165 | size_t buflen,\r | |
8166 | PyObject *v)\r | |
8167 | {\r | |
8168 | PyObject *unistr;\r | |
8169 | char *str;\r | |
8170 | /* presume that the buffer is at least 2 characters long */\r | |
8171 | if (PyUnicode_Check(v)) {\r | |
8172 | if (PyUnicode_GET_SIZE(v) != 1)\r | |
8173 | goto onError;\r | |
8174 | buf[0] = PyUnicode_AS_UNICODE(v)[0];\r | |
8175 | }\r | |
8176 | \r | |
8177 | else if (PyString_Check(v)) {\r | |
8178 | if (PyString_GET_SIZE(v) != 1)\r | |
8179 | goto onError;\r | |
8180 | /* #7649: "u'%c' % char" should behave like "u'%s' % char" and fail\r | |
8181 | with a UnicodeDecodeError if 'char' is not decodable with the\r | |
8182 | default encoding (usually ASCII, but it might be something else) */\r | |
8183 | str = PyString_AS_STRING(v);\r | |
8184 | if ((unsigned char)str[0] > 0x7F) {\r | |
8185 | /* the char is not ASCII; try to decode the string using the\r | |
8186 | default encoding and return -1 to let the UnicodeDecodeError\r | |
8187 | be raised if the string can't be decoded */\r | |
8188 | unistr = PyUnicode_Decode(str, 1, NULL, "strict");\r | |
8189 | if (unistr == NULL)\r | |
8190 | return -1;\r | |
8191 | buf[0] = PyUnicode_AS_UNICODE(unistr)[0];\r | |
8192 | Py_DECREF(unistr);\r | |
8193 | }\r | |
8194 | else\r | |
8195 | buf[0] = (Py_UNICODE)str[0];\r | |
8196 | }\r | |
8197 | \r | |
8198 | else {\r | |
8199 | /* Integer input truncated to a character */\r | |
8200 | long x;\r | |
8201 | x = PyInt_AsLong(v);\r | |
8202 | if (x == -1 && PyErr_Occurred())\r | |
8203 | goto onError;\r | |
8204 | #ifdef Py_UNICODE_WIDE\r | |
8205 | if (x < 0 || x > 0x10ffff) {\r | |
8206 | PyErr_SetString(PyExc_OverflowError,\r | |
8207 | "%c arg not in range(0x110000) "\r | |
8208 | "(wide Python build)");\r | |
8209 | return -1;\r | |
8210 | }\r | |
8211 | #else\r | |
8212 | if (x < 0 || x > 0xffff) {\r | |
8213 | PyErr_SetString(PyExc_OverflowError,\r | |
8214 | "%c arg not in range(0x10000) "\r | |
8215 | "(narrow Python build)");\r | |
8216 | return -1;\r | |
8217 | }\r | |
8218 | #endif\r | |
8219 | buf[0] = (Py_UNICODE) x;\r | |
8220 | }\r | |
8221 | buf[1] = '\0';\r | |
8222 | return 1;\r | |
8223 | \r | |
8224 | onError:\r | |
8225 | PyErr_SetString(PyExc_TypeError,\r | |
8226 | "%c requires int or char");\r | |
8227 | return -1;\r | |
8228 | }\r | |
8229 | \r | |
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the ints &
   chars are formatted. XXX This is a magic number. Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format. For now, the current solution is sufficient.

   The expansion is fully parenthesised so that the macro behaves as a
   single operand wherever it is used (e.g. after sizeof).
*/
#define FORMATBUFLEN ((size_t)120)
8239 | \r | |
8240 | PyObject *PyUnicode_Format(PyObject *format,\r | |
8241 | PyObject *args)\r | |
8242 | {\r | |
8243 | Py_UNICODE *fmt, *res;\r | |
8244 | Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;\r | |
8245 | int args_owned = 0;\r | |
8246 | PyUnicodeObject *result = NULL;\r | |
8247 | PyObject *dict = NULL;\r | |
8248 | PyObject *uformat;\r | |
8249 | \r | |
8250 | if (format == NULL || args == NULL) {\r | |
8251 | PyErr_BadInternalCall();\r | |
8252 | return NULL;\r | |
8253 | }\r | |
8254 | uformat = PyUnicode_FromObject(format);\r | |
8255 | if (uformat == NULL)\r | |
8256 | return NULL;\r | |
8257 | fmt = PyUnicode_AS_UNICODE(uformat);\r | |
8258 | fmtcnt = PyUnicode_GET_SIZE(uformat);\r | |
8259 | \r | |
8260 | reslen = rescnt = fmtcnt + 100;\r | |
8261 | result = _PyUnicode_New(reslen);\r | |
8262 | if (result == NULL)\r | |
8263 | goto onError;\r | |
8264 | res = PyUnicode_AS_UNICODE(result);\r | |
8265 | \r | |
8266 | if (PyTuple_Check(args)) {\r | |
8267 | arglen = PyTuple_Size(args);\r | |
8268 | argidx = 0;\r | |
8269 | }\r | |
8270 | else {\r | |
8271 | arglen = -1;\r | |
8272 | argidx = -2;\r | |
8273 | }\r | |
8274 | if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&\r | |
8275 | !PyObject_TypeCheck(args, &PyBaseString_Type))\r | |
8276 | dict = args;\r | |
8277 | \r | |
8278 | while (--fmtcnt >= 0) {\r | |
8279 | if (*fmt != '%') {\r | |
8280 | if (--rescnt < 0) {\r | |
8281 | rescnt = fmtcnt + 100;\r | |
8282 | reslen += rescnt;\r | |
8283 | if (_PyUnicode_Resize(&result, reslen) < 0)\r | |
8284 | goto onError;\r | |
8285 | res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;\r | |
8286 | --rescnt;\r | |
8287 | }\r | |
8288 | *res++ = *fmt++;\r | |
8289 | }\r | |
8290 | else {\r | |
8291 | /* Got a format specifier */\r | |
8292 | int flags = 0;\r | |
8293 | Py_ssize_t width = -1;\r | |
8294 | int prec = -1;\r | |
8295 | Py_UNICODE c = '\0';\r | |
8296 | Py_UNICODE fill;\r | |
8297 | int isnumok;\r | |
de08c53b DM |
8298 | PyObject *v = NULL;\r |
8299 | PyObject *temp = NULL;\r | |
8300 | Py_UNICODE *pbuf = NULL;\r | |
4710c53d | 8301 | Py_UNICODE sign;\r |
8302 | Py_ssize_t len;\r | |
8303 | Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */\r | |
8304 | \r | |
8305 | fmt++;\r | |
8306 | if (*fmt == '(') {\r | |
8307 | Py_UNICODE *keystart;\r | |
8308 | Py_ssize_t keylen;\r | |
8309 | PyObject *key;\r | |
8310 | int pcount = 1;\r | |
8311 | \r | |
8312 | if (dict == NULL) {\r | |
8313 | PyErr_SetString(PyExc_TypeError,\r | |
8314 | "format requires a mapping");\r | |
8315 | goto onError;\r | |
8316 | }\r | |
8317 | ++fmt;\r | |
8318 | --fmtcnt;\r | |
8319 | keystart = fmt;\r | |
8320 | /* Skip over balanced parentheses */\r | |
8321 | while (pcount > 0 && --fmtcnt >= 0) {\r | |
8322 | if (*fmt == ')')\r | |
8323 | --pcount;\r | |
8324 | else if (*fmt == '(')\r | |
8325 | ++pcount;\r | |
8326 | fmt++;\r | |
8327 | }\r | |
8328 | keylen = fmt - keystart - 1;\r | |
8329 | if (fmtcnt < 0 || pcount > 0) {\r | |
8330 | PyErr_SetString(PyExc_ValueError,\r | |
8331 | "incomplete format key");\r | |
8332 | goto onError;\r | |
8333 | }\r | |
8334 | #if 0\r | |
8335 | /* keys are converted to strings using UTF-8 and\r | |
8336 | then looked up since Python uses strings to hold\r | |
8337 | variables names etc. in its namespaces and we\r | |
8338 | wouldn't want to break common idioms. */\r | |
8339 | key = PyUnicode_EncodeUTF8(keystart,\r | |
8340 | keylen,\r | |
8341 | NULL);\r | |
8342 | #else\r | |
8343 | key = PyUnicode_FromUnicode(keystart, keylen);\r | |
8344 | #endif\r | |
8345 | if (key == NULL)\r | |
8346 | goto onError;\r | |
8347 | if (args_owned) {\r | |
8348 | Py_DECREF(args);\r | |
8349 | args_owned = 0;\r | |
8350 | }\r | |
8351 | args = PyObject_GetItem(dict, key);\r | |
8352 | Py_DECREF(key);\r | |
8353 | if (args == NULL) {\r | |
8354 | goto onError;\r | |
8355 | }\r | |
8356 | args_owned = 1;\r | |
8357 | arglen = -1;\r | |
8358 | argidx = -2;\r | |
8359 | }\r | |
8360 | while (--fmtcnt >= 0) {\r | |
8361 | switch (c = *fmt++) {\r | |
8362 | case '-': flags |= F_LJUST; continue;\r | |
8363 | case '+': flags |= F_SIGN; continue;\r | |
8364 | case ' ': flags |= F_BLANK; continue;\r | |
8365 | case '#': flags |= F_ALT; continue;\r | |
8366 | case '0': flags |= F_ZERO; continue;\r | |
8367 | }\r | |
8368 | break;\r | |
8369 | }\r | |
8370 | if (c == '*') {\r | |
8371 | v = getnextarg(args, arglen, &argidx);\r | |
8372 | if (v == NULL)\r | |
8373 | goto onError;\r | |
8374 | if (!PyInt_Check(v)) {\r | |
8375 | PyErr_SetString(PyExc_TypeError,\r | |
8376 | "* wants int");\r | |
8377 | goto onError;\r | |
8378 | }\r | |
8379 | width = PyInt_AsLong(v);\r | |
8380 | if (width < 0) {\r | |
8381 | flags |= F_LJUST;\r | |
8382 | width = -width;\r | |
8383 | }\r | |
8384 | if (--fmtcnt >= 0)\r | |
8385 | c = *fmt++;\r | |
8386 | }\r | |
8387 | else if (c >= '0' && c <= '9') {\r | |
8388 | width = c - '0';\r | |
8389 | while (--fmtcnt >= 0) {\r | |
8390 | c = *fmt++;\r | |
8391 | if (c < '0' || c > '9')\r | |
8392 | break;\r | |
8393 | if ((width*10) / 10 != width) {\r | |
8394 | PyErr_SetString(PyExc_ValueError,\r | |
8395 | "width too big");\r | |
8396 | goto onError;\r | |
8397 | }\r | |
8398 | width = width*10 + (c - '0');\r | |
8399 | }\r | |
8400 | }\r | |
8401 | if (c == '.') {\r | |
8402 | prec = 0;\r | |
8403 | if (--fmtcnt >= 0)\r | |
8404 | c = *fmt++;\r | |
8405 | if (c == '*') {\r | |
8406 | v = getnextarg(args, arglen, &argidx);\r | |
8407 | if (v == NULL)\r | |
8408 | goto onError;\r | |
8409 | if (!PyInt_Check(v)) {\r | |
8410 | PyErr_SetString(PyExc_TypeError,\r | |
8411 | "* wants int");\r | |
8412 | goto onError;\r | |
8413 | }\r | |
8414 | prec = PyInt_AsLong(v);\r | |
8415 | if (prec < 0)\r | |
8416 | prec = 0;\r | |
8417 | if (--fmtcnt >= 0)\r | |
8418 | c = *fmt++;\r | |
8419 | }\r | |
8420 | else if (c >= '0' && c <= '9') {\r | |
8421 | prec = c - '0';\r | |
8422 | while (--fmtcnt >= 0) {\r | |
8423 | c = *fmt++;\r | |
8424 | if (c < '0' || c > '9')\r | |
8425 | break;\r | |
8426 | if ((prec*10) / 10 != prec) {\r | |
8427 | PyErr_SetString(PyExc_ValueError,\r | |
8428 | "prec too big");\r | |
8429 | goto onError;\r | |
8430 | }\r | |
8431 | prec = prec*10 + (c - '0');\r | |
8432 | }\r | |
8433 | }\r | |
8434 | } /* prec */\r | |
8435 | if (fmtcnt >= 0) {\r | |
8436 | if (c == 'h' || c == 'l' || c == 'L') {\r | |
8437 | if (--fmtcnt >= 0)\r | |
8438 | c = *fmt++;\r | |
8439 | }\r | |
8440 | }\r | |
8441 | if (fmtcnt < 0) {\r | |
8442 | PyErr_SetString(PyExc_ValueError,\r | |
8443 | "incomplete format");\r | |
8444 | goto onError;\r | |
8445 | }\r | |
8446 | if (c != '%') {\r | |
8447 | v = getnextarg(args, arglen, &argidx);\r | |
8448 | if (v == NULL)\r | |
8449 | goto onError;\r | |
8450 | }\r | |
8451 | sign = 0;\r | |
8452 | fill = ' ';\r | |
8453 | switch (c) {\r | |
8454 | \r | |
8455 | case '%':\r | |
8456 | pbuf = formatbuf;\r | |
8457 | /* presume that buffer length is at least 1 */\r | |
8458 | pbuf[0] = '%';\r | |
8459 | len = 1;\r | |
8460 | break;\r | |
8461 | \r | |
8462 | case 's':\r | |
8463 | case 'r':\r | |
8464 | if (PyUnicode_CheckExact(v) && c == 's') {\r | |
8465 | temp = v;\r | |
8466 | Py_INCREF(temp);\r | |
8467 | }\r | |
8468 | else {\r | |
8469 | PyObject *unicode;\r | |
8470 | if (c == 's')\r | |
8471 | temp = PyObject_Unicode(v);\r | |
8472 | else\r | |
8473 | temp = PyObject_Repr(v);\r | |
8474 | if (temp == NULL)\r | |
8475 | goto onError;\r | |
8476 | if (PyUnicode_Check(temp))\r | |
8477 | /* nothing to do */;\r | |
8478 | else if (PyString_Check(temp)) {\r | |
8479 | /* convert to string to Unicode */\r | |
8480 | unicode = PyUnicode_Decode(PyString_AS_STRING(temp),\r | |
8481 | PyString_GET_SIZE(temp),\r | |
8482 | NULL,\r | |
8483 | "strict");\r | |
8484 | Py_DECREF(temp);\r | |
8485 | temp = unicode;\r | |
8486 | if (temp == NULL)\r | |
8487 | goto onError;\r | |
8488 | }\r | |
8489 | else {\r | |
8490 | Py_DECREF(temp);\r | |
8491 | PyErr_SetString(PyExc_TypeError,\r | |
8492 | "%s argument has non-string str()");\r | |
8493 | goto onError;\r | |
8494 | }\r | |
8495 | }\r | |
8496 | pbuf = PyUnicode_AS_UNICODE(temp);\r | |
8497 | len = PyUnicode_GET_SIZE(temp);\r | |
8498 | if (prec >= 0 && len > prec)\r | |
8499 | len = prec;\r | |
8500 | break;\r | |
8501 | \r | |
8502 | case 'i':\r | |
8503 | case 'd':\r | |
8504 | case 'u':\r | |
8505 | case 'o':\r | |
8506 | case 'x':\r | |
8507 | case 'X':\r | |
8508 | if (c == 'i')\r | |
8509 | c = 'd';\r | |
8510 | isnumok = 0;\r | |
8511 | if (PyNumber_Check(v)) {\r | |
8512 | PyObject *iobj=NULL;\r | |
8513 | \r | |
8514 | if (PyInt_Check(v) || (PyLong_Check(v))) {\r | |
8515 | iobj = v;\r | |
8516 | Py_INCREF(iobj);\r | |
8517 | }\r | |
8518 | else {\r | |
8519 | iobj = PyNumber_Int(v);\r | |
8520 | if (iobj==NULL) iobj = PyNumber_Long(v);\r | |
8521 | }\r | |
8522 | if (iobj!=NULL) {\r | |
8523 | if (PyInt_Check(iobj)) {\r | |
8524 | isnumok = 1;\r | |
8525 | pbuf = formatbuf;\r | |
8526 | len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),\r | |
8527 | flags, prec, c, iobj);\r | |
8528 | Py_DECREF(iobj);\r | |
8529 | if (len < 0)\r | |
8530 | goto onError;\r | |
8531 | sign = 1;\r | |
8532 | }\r | |
8533 | else if (PyLong_Check(iobj)) {\r | |
8534 | isnumok = 1;\r | |
8535 | temp = formatlong(iobj, flags, prec, c);\r | |
8536 | Py_DECREF(iobj);\r | |
8537 | if (!temp)\r | |
8538 | goto onError;\r | |
8539 | pbuf = PyUnicode_AS_UNICODE(temp);\r | |
8540 | len = PyUnicode_GET_SIZE(temp);\r | |
8541 | sign = 1;\r | |
8542 | }\r | |
8543 | else {\r | |
8544 | Py_DECREF(iobj);\r | |
8545 | }\r | |
8546 | }\r | |
8547 | }\r | |
8548 | if (!isnumok) {\r | |
8549 | PyErr_Format(PyExc_TypeError,\r | |
8550 | "%%%c format: a number is required, "\r | |
8551 | "not %.200s", (char)c, Py_TYPE(v)->tp_name);\r | |
8552 | goto onError;\r | |
8553 | }\r | |
8554 | if (flags & F_ZERO)\r | |
8555 | fill = '0';\r | |
8556 | break;\r | |
8557 | \r | |
8558 | case 'e':\r | |
8559 | case 'E':\r | |
8560 | case 'f':\r | |
8561 | case 'F':\r | |
8562 | case 'g':\r | |
8563 | case 'G':\r | |
8564 | temp = formatfloat(v, flags, prec, c);\r | |
8565 | if (temp == NULL)\r | |
8566 | goto onError;\r | |
8567 | pbuf = PyUnicode_AS_UNICODE(temp);\r | |
8568 | len = PyUnicode_GET_SIZE(temp);\r | |
8569 | sign = 1;\r | |
8570 | if (flags & F_ZERO)\r | |
8571 | fill = '0';\r | |
8572 | break;\r | |
8573 | \r | |
8574 | case 'c':\r | |
8575 | pbuf = formatbuf;\r | |
8576 | len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);\r | |
8577 | if (len < 0)\r | |
8578 | goto onError;\r | |
8579 | break;\r | |
8580 | \r | |
8581 | default:\r | |
8582 | PyErr_Format(PyExc_ValueError,\r | |
8583 | "unsupported format character '%c' (0x%x) "\r | |
8584 | "at index %zd",\r | |
8585 | (31<=c && c<=126) ? (char)c : '?',\r | |
8586 | (int)c,\r | |
8587 | (Py_ssize_t)(fmt - 1 -\r | |
8588 | PyUnicode_AS_UNICODE(uformat)));\r | |
8589 | goto onError;\r | |
8590 | }\r | |
8591 | if (sign) {\r | |
8592 | if (*pbuf == '-' || *pbuf == '+') {\r | |
8593 | sign = *pbuf++;\r | |
8594 | len--;\r | |
8595 | }\r | |
8596 | else if (flags & F_SIGN)\r | |
8597 | sign = '+';\r | |
8598 | else if (flags & F_BLANK)\r | |
8599 | sign = ' ';\r | |
8600 | else\r | |
8601 | sign = 0;\r | |
8602 | }\r | |
8603 | if (width < len)\r | |
8604 | width = len;\r | |
8605 | if (rescnt - (sign != 0) < width) {\r | |
8606 | reslen -= rescnt;\r | |
8607 | rescnt = width + fmtcnt + 100;\r | |
8608 | reslen += rescnt;\r | |
8609 | if (reslen < 0) {\r | |
8610 | Py_XDECREF(temp);\r | |
8611 | PyErr_NoMemory();\r | |
8612 | goto onError;\r | |
8613 | }\r | |
8614 | if (_PyUnicode_Resize(&result, reslen) < 0) {\r | |
8615 | Py_XDECREF(temp);\r | |
8616 | goto onError;\r | |
8617 | }\r | |
8618 | res = PyUnicode_AS_UNICODE(result)\r | |
8619 | + reslen - rescnt;\r | |
8620 | }\r | |
8621 | if (sign) {\r | |
8622 | if (fill != ' ')\r | |
8623 | *res++ = sign;\r | |
8624 | rescnt--;\r | |
8625 | if (width > len)\r | |
8626 | width--;\r | |
8627 | }\r | |
8628 | if ((flags & F_ALT) && (c == 'x' || c == 'X')) {\r | |
8629 | assert(pbuf[0] == '0');\r | |
8630 | assert(pbuf[1] == c);\r | |
8631 | if (fill != ' ') {\r | |
8632 | *res++ = *pbuf++;\r | |
8633 | *res++ = *pbuf++;\r | |
8634 | }\r | |
8635 | rescnt -= 2;\r | |
8636 | width -= 2;\r | |
8637 | if (width < 0)\r | |
8638 | width = 0;\r | |
8639 | len -= 2;\r | |
8640 | }\r | |
8641 | if (width > len && !(flags & F_LJUST)) {\r | |
8642 | do {\r | |
8643 | --rescnt;\r | |
8644 | *res++ = fill;\r | |
8645 | } while (--width > len);\r | |
8646 | }\r | |
8647 | if (fill == ' ') {\r | |
8648 | if (sign)\r | |
8649 | *res++ = sign;\r | |
8650 | if ((flags & F_ALT) && (c == 'x' || c == 'X')) {\r | |
8651 | assert(pbuf[0] == '0');\r | |
8652 | assert(pbuf[1] == c);\r | |
8653 | *res++ = *pbuf++;\r | |
8654 | *res++ = *pbuf++;\r | |
8655 | }\r | |
8656 | }\r | |
8657 | Py_UNICODE_COPY(res, pbuf, len);\r | |
8658 | res += len;\r | |
8659 | rescnt -= len;\r | |
8660 | while (--width >= len) {\r | |
8661 | --rescnt;\r | |
8662 | *res++ = ' ';\r | |
8663 | }\r | |
8664 | if (dict && (argidx < arglen) && c != '%') {\r | |
8665 | PyErr_SetString(PyExc_TypeError,\r | |
8666 | "not all arguments converted during string formatting");\r | |
8667 | Py_XDECREF(temp);\r | |
8668 | goto onError;\r | |
8669 | }\r | |
8670 | Py_XDECREF(temp);\r | |
8671 | } /* '%' */\r | |
8672 | } /* until end */\r | |
8673 | if (argidx < arglen && !dict) {\r | |
8674 | PyErr_SetString(PyExc_TypeError,\r | |
8675 | "not all arguments converted during string formatting");\r | |
8676 | goto onError;\r | |
8677 | }\r | |
8678 | \r | |
8679 | if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)\r | |
8680 | goto onError;\r | |
8681 | if (args_owned) {\r | |
8682 | Py_DECREF(args);\r | |
8683 | }\r | |
8684 | Py_DECREF(uformat);\r | |
8685 | return (PyObject *)result;\r | |
8686 | \r | |
8687 | onError:\r | |
8688 | Py_XDECREF(result);\r | |
8689 | Py_DECREF(uformat);\r | |
8690 | if (args_owned) {\r | |
8691 | Py_DECREF(args);\r | |
8692 | }\r | |
8693 | return NULL;\r | |
8694 | }\r | |
8695 | \r | |
8696 | static PyBufferProcs unicode_as_buffer = {\r | |
8697 | (readbufferproc) unicode_buffer_getreadbuf,\r | |
8698 | (writebufferproc) unicode_buffer_getwritebuf,\r | |
8699 | (segcountproc) unicode_buffer_getsegcount,\r | |
8700 | (charbufferproc) unicode_buffer_getcharbuf,\r | |
8701 | };\r | |
8702 | \r | |
8703 | static PyObject *\r | |
8704 | unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);\r | |
8705 | \r | |
8706 | static PyObject *\r | |
8707 | unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)\r | |
8708 | {\r | |
8709 | PyObject *x = NULL;\r | |
8710 | static char *kwlist[] = {"string", "encoding", "errors", 0};\r | |
8711 | char *encoding = NULL;\r | |
8712 | char *errors = NULL;\r | |
8713 | \r | |
8714 | if (type != &PyUnicode_Type)\r | |
8715 | return unicode_subtype_new(type, args, kwds);\r | |
8716 | if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",\r | |
8717 | kwlist, &x, &encoding, &errors))\r | |
8718 | return NULL;\r | |
8719 | if (x == NULL)\r | |
8720 | return (PyObject *)_PyUnicode_New(0);\r | |
8721 | if (encoding == NULL && errors == NULL)\r | |
8722 | return PyObject_Unicode(x);\r | |
8723 | else\r | |
8724 | return PyUnicode_FromEncodedObject(x, encoding, errors);\r | |
8725 | }\r | |
8726 | \r | |
8727 | static PyObject *\r | |
8728 | unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)\r | |
8729 | {\r | |
8730 | PyUnicodeObject *tmp, *pnew;\r | |
8731 | Py_ssize_t n;\r | |
8732 | \r | |
8733 | assert(PyType_IsSubtype(type, &PyUnicode_Type));\r | |
8734 | tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);\r | |
8735 | if (tmp == NULL)\r | |
8736 | return NULL;\r | |
8737 | assert(PyUnicode_Check(tmp));\r | |
8738 | pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);\r | |
8739 | if (pnew == NULL) {\r | |
8740 | Py_DECREF(tmp);\r | |
8741 | return NULL;\r | |
8742 | }\r | |
8743 | pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));\r | |
8744 | if (pnew->str == NULL) {\r | |
8745 | _Py_ForgetReference((PyObject *)pnew);\r | |
8746 | PyObject_Del(pnew);\r | |
8747 | Py_DECREF(tmp);\r | |
8748 | return PyErr_NoMemory();\r | |
8749 | }\r | |
8750 | Py_UNICODE_COPY(pnew->str, tmp->str, n+1);\r | |
8751 | pnew->length = n;\r | |
8752 | pnew->hash = tmp->hash;\r | |
8753 | Py_DECREF(tmp);\r | |
8754 | return (PyObject *)pnew;\r | |
8755 | }\r | |
8756 | \r | |
8757 | PyDoc_STRVAR(unicode_doc,\r | |
8758 | "unicode(string [, encoding[, errors]]) -> object\n\\r | |
8759 | \n\\r | |
8760 | Create a new Unicode object from the given encoded string.\n\\r | |
8761 | encoding defaults to the current default string encoding.\n\\r | |
8762 | errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");\r | |
8763 | \r | |
8764 | PyTypeObject PyUnicode_Type = {\r | |
8765 | PyVarObject_HEAD_INIT(&PyType_Type, 0)\r | |
8766 | "unicode", /* tp_name */\r | |
8767 | sizeof(PyUnicodeObject), /* tp_size */\r | |
8768 | 0, /* tp_itemsize */\r | |
8769 | /* Slots */\r | |
8770 | (destructor)unicode_dealloc, /* tp_dealloc */\r | |
8771 | 0, /* tp_print */\r | |
8772 | 0, /* tp_getattr */\r | |
8773 | 0, /* tp_setattr */\r | |
8774 | 0, /* tp_compare */\r | |
8775 | unicode_repr, /* tp_repr */\r | |
8776 | &unicode_as_number, /* tp_as_number */\r | |
8777 | &unicode_as_sequence, /* tp_as_sequence */\r | |
8778 | &unicode_as_mapping, /* tp_as_mapping */\r | |
8779 | (hashfunc) unicode_hash, /* tp_hash*/\r | |
8780 | 0, /* tp_call*/\r | |
8781 | (reprfunc) unicode_str, /* tp_str */\r | |
8782 | PyObject_GenericGetAttr, /* tp_getattro */\r | |
8783 | 0, /* tp_setattro */\r | |
8784 | &unicode_as_buffer, /* tp_as_buffer */\r | |
8785 | Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |\r | |
8786 | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */\r | |
8787 | unicode_doc, /* tp_doc */\r | |
8788 | 0, /* tp_traverse */\r | |
8789 | 0, /* tp_clear */\r | |
8790 | PyUnicode_RichCompare, /* tp_richcompare */\r | |
8791 | 0, /* tp_weaklistoffset */\r | |
8792 | 0, /* tp_iter */\r | |
8793 | 0, /* tp_iternext */\r | |
8794 | unicode_methods, /* tp_methods */\r | |
8795 | 0, /* tp_members */\r | |
8796 | 0, /* tp_getset */\r | |
8797 | &PyBaseString_Type, /* tp_base */\r | |
8798 | 0, /* tp_dict */\r | |
8799 | 0, /* tp_descr_get */\r | |
8800 | 0, /* tp_descr_set */\r | |
8801 | 0, /* tp_dictoffset */\r | |
8802 | 0, /* tp_init */\r | |
8803 | 0, /* tp_alloc */\r | |
8804 | unicode_new, /* tp_new */\r | |
8805 | PyObject_Del, /* tp_free */\r | |
8806 | };\r | |
8807 | \r | |
8808 | /* Initialize the Unicode implementation */\r | |
8809 | \r | |
8810 | void _PyUnicode_Init(void)\r | |
8811 | {\r | |
8812 | int i;\r | |
8813 | \r | |
8814 | /* XXX - move this array to unicodectype.c ? */\r | |
8815 | Py_UNICODE linebreak[] = {\r | |
8816 | 0x000A, /* LINE FEED */\r | |
8817 | 0x000D, /* CARRIAGE RETURN */\r | |
8818 | 0x001C, /* FILE SEPARATOR */\r | |
8819 | 0x001D, /* GROUP SEPARATOR */\r | |
8820 | 0x001E, /* RECORD SEPARATOR */\r | |
8821 | 0x0085, /* NEXT LINE */\r | |
8822 | 0x2028, /* LINE SEPARATOR */\r | |
8823 | 0x2029, /* PARAGRAPH SEPARATOR */\r | |
8824 | };\r | |
8825 | \r | |
8826 | /* Init the implementation */\r | |
8827 | free_list = NULL;\r | |
8828 | numfree = 0;\r | |
8829 | unicode_empty = _PyUnicode_New(0);\r | |
8830 | if (!unicode_empty)\r | |
8831 | return;\r | |
8832 | \r | |
8833 | strcpy(unicode_default_encoding, "ascii");\r | |
8834 | for (i = 0; i < 256; i++)\r | |
8835 | unicode_latin1[i] = NULL;\r | |
8836 | if (PyType_Ready(&PyUnicode_Type) < 0)\r | |
8837 | Py_FatalError("Can't initialize 'unicode'");\r | |
8838 | \r | |
8839 | /* initialize the linebreak bloom filter */\r | |
8840 | bloom_linebreak = make_bloom_mask(\r | |
8841 | linebreak, sizeof(linebreak) / sizeof(linebreak[0])\r | |
8842 | );\r | |
8843 | \r | |
8844 | PyType_Ready(&EncodingMapType);\r | |
8845 | }\r | |
8846 | \r | |
8847 | /* Finalize the Unicode implementation */\r | |
8848 | \r | |
8849 | int\r | |
8850 | PyUnicode_ClearFreeList(void)\r | |
8851 | {\r | |
8852 | int freelist_size = numfree;\r | |
8853 | PyUnicodeObject *u;\r | |
8854 | \r | |
8855 | for (u = free_list; u != NULL;) {\r | |
8856 | PyUnicodeObject *v = u;\r | |
8857 | u = *(PyUnicodeObject **)u;\r | |
8858 | if (v->str)\r | |
8859 | PyObject_DEL(v->str);\r | |
8860 | Py_XDECREF(v->defenc);\r | |
8861 | PyObject_Del(v);\r | |
8862 | numfree--;\r | |
8863 | }\r | |
8864 | free_list = NULL;\r | |
8865 | assert(numfree == 0);\r | |
8866 | return freelist_size;\r | |
8867 | }\r | |
8868 | \r | |
8869 | void\r | |
8870 | _PyUnicode_Fini(void)\r | |
8871 | {\r | |
8872 | int i;\r | |
8873 | \r | |
8874 | Py_XDECREF(unicode_empty);\r | |
8875 | unicode_empty = NULL;\r | |
8876 | \r | |
8877 | for (i = 0; i < 256; i++) {\r | |
8878 | if (unicode_latin1[i]) {\r | |
8879 | Py_DECREF(unicode_latin1[i]);\r | |
8880 | unicode_latin1[i] = NULL;\r | |
8881 | }\r | |
8882 | }\r | |
8883 | (void)PyUnicode_ClearFreeList();\r | |
8884 | }\r | |
8885 | \r | |
8886 | #ifdef __cplusplus\r | |
8887 | }\r | |
8888 | #endif\r |