]> git.proxmox.com Git - mirror_edk2.git/blame - AppPkg/Applications/Python/Python-2.7.2/Objects/unicodeobject.c
EmbeddedPkg: Extend NvVarStoreFormattedLib LIBRARY_CLASS
[mirror_edk2.git] / AppPkg / Applications / Python / Python-2.7.2 / Objects / unicodeobject.c
CommitLineData
4710c53d 1/*\r
2\r
3Unicode implementation based on original code by Fredrik Lundh,\r
4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the\r
5Unicode Integration Proposal (see file Misc/unicode.txt).\r
6\r
7Major speed upgrades to the method implementations at the Reykjavik\r
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.\r
9\r
10Copyright (c) Corporation for National Research Initiatives.\r
11\r
12--------------------------------------------------------------------\r
13The original string type implementation is:\r
14\r
15 Copyright (c) 1999 by Secret Labs AB\r
16 Copyright (c) 1999 by Fredrik Lundh\r
17\r
18By obtaining, using, and/or copying this software and/or its\r
19associated documentation, you agree that you have read, understood,\r
20and will comply with the following terms and conditions:\r
21\r
22Permission to use, copy, modify, and distribute this software and its\r
23associated documentation for any purpose and without fee is hereby\r
24granted, provided that the above copyright notice appears in all\r
25copies, and that both that copyright notice and this permission notice\r
26appear in supporting documentation, and that the name of Secret Labs\r
27AB or the author not be used in advertising or publicity pertaining to\r
28distribution of the software without specific, written prior\r
29permission.\r
30\r
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO\r
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND\r
33FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR\r
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES\r
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN\r
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT\r
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.\r
38--------------------------------------------------------------------\r
39\r
40*/\r
41\r
42#define PY_SSIZE_T_CLEAN\r
43#include "Python.h"\r
44\r
45#include "unicodeobject.h"\r
46#include "ucnhash.h"\r
47\r
48#ifdef MS_WINDOWS\r
49#include <windows.h>\r
50#endif\r
51\r
52/* Limit for the Unicode object free list */\r
53\r
54#define PyUnicode_MAXFREELIST 1024\r
55\r
56/* Limit for the Unicode object free list stay alive optimization.\r
57\r
58 The implementation will keep allocated Unicode memory intact for\r
59 all objects on the free list having a size less than this\r
60 limit. This reduces malloc() overhead for small Unicode objects.\r
61\r
62 At worst this will result in PyUnicode_MAXFREELIST *\r
63 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +\r
64 malloc()-overhead) bytes of unused garbage.\r
65\r
66 Setting the limit to 0 effectively turns the feature off.\r
67\r
68 Note: This is an experimental feature ! If you get core dumps when\r
69 using Unicode objects, turn this feature off.\r
70\r
71*/\r
72\r
73#define KEEPALIVE_SIZE_LIMIT 9\r
74\r
75/* Endianness switches; defaults to little endian */\r
76\r
77#ifdef WORDS_BIGENDIAN\r
78# define BYTEORDER_IS_BIG_ENDIAN\r
79#else\r
80# define BYTEORDER_IS_LITTLE_ENDIAN\r
81#endif\r
82\r
83/* --- Globals ------------------------------------------------------------\r
84\r
85 The globals are initialized by the _PyUnicode_Init() API and should\r
86 not be used before calling that API.\r
87\r
88*/\r
89\r
90\r
91#ifdef __cplusplus\r
92extern "C" {\r
93#endif\r
94\r
95/* Free list for Unicode objects */\r
96static PyUnicodeObject *free_list;\r
97static int numfree;\r
98\r
99/* The empty Unicode object is shared to improve performance. */\r
100static PyUnicodeObject *unicode_empty;\r
101\r
102/* Single character Unicode strings in the Latin-1 range are being\r
103 shared as well. */\r
104static PyUnicodeObject *unicode_latin1[256];\r
105\r
106/* Default encoding to use and assume when NULL is passed as encoding\r
107 parameter; it is initialized by _PyUnicode_Init().\r
108\r
109 Always use the PyUnicode_SetDefaultEncoding() and\r
110 PyUnicode_GetDefaultEncoding() APIs to access this global.\r
111\r
112*/\r
113static char unicode_default_encoding[100];\r
114\r
115/* Fast detection of the most frequent whitespace characters */\r
/* Lookup table indexed by ASCII code point (0..127); an entry is 1 when
   the code point counts as whitespace and 0 otherwise.  The flagged
   entries are:
     0x09 CHARACTER TABULATION   0x0A LINE FEED
     0x0B LINE TABULATION        0x0C FORM FEED
     0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR       0x1F UNIT SEPARATOR
     0x20 SPACE */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 */ 1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
145\r
146/* Same for linebreaks */\r
/* Lookup table indexed by ASCII code point (0..127); an entry is 1 when
   the code point is a line break and 0 otherwise.  The table is only
   ever read, so it is declared const.  The flagged entries are:
     0x0A LINE FEED              0x0B LINE TABULATION
     0x0C FORM FEED              0x0D CARRIAGE RETURN
     0x1C FILE SEPARATOR         0x1D GROUP SEPARATOR
     0x1E RECORD SEPARATOR */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
173\r
174\r
175Py_UNICODE\r
176PyUnicode_GetMax(void)\r
177{\r
178#ifdef Py_UNICODE_WIDE\r
179 return 0x10FFFF;\r
180#else\r
181 /* This is actually an illegal character, so it should\r
182 not be passed to unichr. */\r
183 return 0xFFFF;\r
184#endif\r
185}\r
186\r
187/* --- Bloom Filters ----------------------------------------------------- */\r
188\r
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits of each Unicode character as the bit index. */
192\r
193/* the linebreak mask is set up by Unicode_Init below */\r
194\r
195#if LONG_BIT >= 128\r
196#define BLOOM_WIDTH 128\r
197#elif LONG_BIT >= 64\r
198#define BLOOM_WIDTH 64\r
199#elif LONG_BIT >= 32\r
200#define BLOOM_WIDTH 32\r
201#else\r
202#error "LONG_BIT is smaller than 32"\r
203#endif\r
204\r
205#define BLOOM_MASK unsigned long\r
206\r
207static BLOOM_MASK bloom_linebreak;\r
208\r
209#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))\r
210#define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))\r
211\r
212#define BLOOM_LINEBREAK(ch) \\r
213 ((ch) < 128U ? ascii_linebreak[(ch)] : \\r
214 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))\r
215\r
216Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)\r
217{\r
218 /* calculate simple bloom-style bitmask for a given unicode string */\r
219\r
220 BLOOM_MASK mask;\r
221 Py_ssize_t i;\r
222\r
223 mask = 0;\r
224 for (i = 0; i < len; i++)\r
225 BLOOM_ADD(mask, ptr[i]);\r
226\r
227 return mask;\r
228}\r
229\r
230Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)\r
231{\r
232 Py_ssize_t i;\r
233\r
234 for (i = 0; i < setlen; i++)\r
235 if (set[i] == chr)\r
236 return 1;\r
237\r
238 return 0;\r
239}\r
240\r
/* True when `chr` is a member of `set`: the bloom mask is consulted
   first as a cheap filter, and only on a hit is the exact linear search
   performed.  The whole expansion is parenthesised so the macro can be
   negated or combined with other operators safely. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    ((BLOOM(mask, chr)) && unicode_member(chr, set, setlen))
243\r
244/* --- Unicode Object ----------------------------------------------------- */\r
245\r
246static\r
247int unicode_resize(register PyUnicodeObject *unicode,\r
248 Py_ssize_t length)\r
249{\r
250 void *oldstr;\r
251\r
252 /* Shortcut if there's nothing much to do. */\r
253 if (unicode->length == length)\r
254 goto reset;\r
255\r
256 /* Resizing shared object (unicode_empty or single character\r
257 objects) in-place is not allowed. Use PyUnicode_Resize()\r
258 instead ! */\r
259\r
260 if (unicode == unicode_empty ||\r
261 (unicode->length == 1 &&\r
262 unicode->str[0] < 256U &&\r
263 unicode_latin1[unicode->str[0]] == unicode)) {\r
264 PyErr_SetString(PyExc_SystemError,\r
265 "can't resize shared unicode objects");\r
266 return -1;\r
267 }\r
268\r
269 /* We allocate one more byte to make sure the string is Ux0000 terminated.\r
270 The overallocation is also used by fastsearch, which assumes that it's\r
271 safe to look at str[length] (without making any assumptions about what\r
272 it contains). */\r
273\r
274 oldstr = unicode->str;\r
275 unicode->str = PyObject_REALLOC(unicode->str,\r
276 sizeof(Py_UNICODE) * (length + 1));\r
277 if (!unicode->str) {\r
278 unicode->str = (Py_UNICODE *)oldstr;\r
279 PyErr_NoMemory();\r
280 return -1;\r
281 }\r
282 unicode->str[length] = 0;\r
283 unicode->length = length;\r
284\r
285 reset:\r
286 /* Reset the object caches */\r
287 if (unicode->defenc) {\r
288 Py_CLEAR(unicode->defenc);\r
289 }\r
290 unicode->hash = -1;\r
291\r
292 return 0;\r
293}\r
294\r
295/* We allocate one more byte to make sure the string is\r
296 Ux0000 terminated; some code relies on that.\r
297\r
298 XXX This allocator could further be enhanced by assuring that the\r
299 free list never reduces its size below 1.\r
300\r
301*/\r
302\r
303static\r
304PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)\r
305{\r
306 register PyUnicodeObject *unicode;\r
307\r
308 /* Optimization for empty strings */\r
309 if (length == 0 && unicode_empty != NULL) {\r
310 Py_INCREF(unicode_empty);\r
311 return unicode_empty;\r
312 }\r
313\r
314 /* Ensure we won't overflow the size. */\r
315 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {\r
316 return (PyUnicodeObject *)PyErr_NoMemory();\r
317 }\r
318\r
319 /* Unicode freelist & memory allocation */\r
320 if (free_list) {\r
321 unicode = free_list;\r
322 free_list = *(PyUnicodeObject **)unicode;\r
323 numfree--;\r
324 if (unicode->str) {\r
325 /* Keep-Alive optimization: we only upsize the buffer,\r
326 never downsize it. */\r
327 if ((unicode->length < length) &&\r
328 unicode_resize(unicode, length) < 0) {\r
329 PyObject_DEL(unicode->str);\r
330 unicode->str = NULL;\r
331 }\r
332 }\r
333 else {\r
334 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);\r
335 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);\r
336 }\r
337 PyObject_INIT(unicode, &PyUnicode_Type);\r
338 }\r
339 else {\r
340 size_t new_size;\r
341 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);\r
342 if (unicode == NULL)\r
343 return NULL;\r
344 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);\r
345 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);\r
346 }\r
347\r
348 if (!unicode->str) {\r
349 PyErr_NoMemory();\r
350 goto onError;\r
351 }\r
352 /* Initialize the first element to guard against cases where\r
353 * the caller fails before initializing str -- unicode_resize()\r
354 * reads str[0], and the Keep-Alive optimization can keep memory\r
355 * allocated for str alive across a call to unicode_dealloc(unicode).\r
356 * We don't want unicode_resize to read uninitialized memory in\r
357 * that case.\r
358 */\r
359 unicode->str[0] = 0;\r
360 unicode->str[length] = 0;\r
361 unicode->length = length;\r
362 unicode->hash = -1;\r
363 unicode->defenc = NULL;\r
364 return unicode;\r
365\r
366 onError:\r
367 /* XXX UNREF/NEWREF interface should be more symmetrical */\r
368 _Py_DEC_REFTOTAL;\r
369 _Py_ForgetReference((PyObject *)unicode);\r
370 PyObject_Del(unicode);\r
371 return NULL;\r
372}\r
373\r
374static\r
375void unicode_dealloc(register PyUnicodeObject *unicode)\r
376{\r
377 if (PyUnicode_CheckExact(unicode) &&\r
378 numfree < PyUnicode_MAXFREELIST) {\r
379 /* Keep-Alive optimization */\r
380 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {\r
381 PyObject_DEL(unicode->str);\r
382 unicode->str = NULL;\r
383 unicode->length = 0;\r
384 }\r
385 if (unicode->defenc) {\r
386 Py_CLEAR(unicode->defenc);\r
387 }\r
388 /* Add to free list */\r
389 *(PyUnicodeObject **)unicode = free_list;\r
390 free_list = unicode;\r
391 numfree++;\r
392 }\r
393 else {\r
394 PyObject_DEL(unicode->str);\r
395 Py_XDECREF(unicode->defenc);\r
396 Py_TYPE(unicode)->tp_free((PyObject *)unicode);\r
397 }\r
398}\r
399\r
400static\r
401int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)\r
402{\r
403 register PyUnicodeObject *v;\r
404\r
405 /* Argument checks */\r
406 if (unicode == NULL) {\r
407 PyErr_BadInternalCall();\r
408 return -1;\r
409 }\r
410 v = *unicode;\r
411 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {\r
412 PyErr_BadInternalCall();\r
413 return -1;\r
414 }\r
415\r
416 /* Resizing unicode_empty and single character objects is not\r
417 possible since these are being shared. We simply return a fresh\r
418 copy with the same Unicode content. */\r
419 if (v->length != length &&\r
420 (v == unicode_empty || v->length == 1)) {\r
421 PyUnicodeObject *w = _PyUnicode_New(length);\r
422 if (w == NULL)\r
423 return -1;\r
424 Py_UNICODE_COPY(w->str, v->str,\r
425 length < v->length ? length : v->length);\r
426 Py_DECREF(*unicode);\r
427 *unicode = w;\r
428 return 0;\r
429 }\r
430\r
431 /* Note that we don't have to modify *unicode for unshared Unicode\r
432 objects, since we can modify them in-place. */\r
433 return unicode_resize(v, length);\r
434}\r
435\r
436int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)\r
437{\r
438 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);\r
439}\r
440\r
441PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,\r
442 Py_ssize_t size)\r
443{\r
444 PyUnicodeObject *unicode;\r
445\r
446 /* If the Unicode data is known at construction time, we can apply\r
447 some optimizations which share commonly used objects. */\r
448 if (u != NULL) {\r
449\r
450 /* Optimization for empty strings */\r
451 if (size == 0 && unicode_empty != NULL) {\r
452 Py_INCREF(unicode_empty);\r
453 return (PyObject *)unicode_empty;\r
454 }\r
455\r
456 /* Single character Unicode objects in the Latin-1 range are\r
457 shared when using this constructor */\r
458 if (size == 1 && *u < 256) {\r
459 unicode = unicode_latin1[*u];\r
460 if (!unicode) {\r
461 unicode = _PyUnicode_New(1);\r
462 if (!unicode)\r
463 return NULL;\r
464 unicode->str[0] = *u;\r
465 unicode_latin1[*u] = unicode;\r
466 }\r
467 Py_INCREF(unicode);\r
468 return (PyObject *)unicode;\r
469 }\r
470 }\r
471\r
472 unicode = _PyUnicode_New(size);\r
473 if (!unicode)\r
474 return NULL;\r
475\r
476 /* Copy the Unicode data into the new object */\r
477 if (u != NULL)\r
478 Py_UNICODE_COPY(unicode->str, u, size);\r
479\r
480 return (PyObject *)unicode;\r
481}\r
482\r
483PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)\r
484{\r
485 PyUnicodeObject *unicode;\r
486\r
487 if (size < 0) {\r
488 PyErr_SetString(PyExc_SystemError,\r
489 "Negative size passed to PyUnicode_FromStringAndSize");\r
490 return NULL;\r
491 }\r
492\r
493 /* If the Unicode data is known at construction time, we can apply\r
494 some optimizations which share commonly used objects.\r
495 Also, this means the input must be UTF-8, so fall back to the\r
496 UTF-8 decoder at the end. */\r
497 if (u != NULL) {\r
498\r
499 /* Optimization for empty strings */\r
500 if (size == 0 && unicode_empty != NULL) {\r
501 Py_INCREF(unicode_empty);\r
502 return (PyObject *)unicode_empty;\r
503 }\r
504\r
505 /* Single characters are shared when using this constructor.\r
506 Restrict to ASCII, since the input must be UTF-8. */\r
507 if (size == 1 && Py_CHARMASK(*u) < 128) {\r
508 unicode = unicode_latin1[Py_CHARMASK(*u)];\r
509 if (!unicode) {\r
510 unicode = _PyUnicode_New(1);\r
511 if (!unicode)\r
512 return NULL;\r
513 unicode->str[0] = Py_CHARMASK(*u);\r
514 unicode_latin1[Py_CHARMASK(*u)] = unicode;\r
515 }\r
516 Py_INCREF(unicode);\r
517 return (PyObject *)unicode;\r
518 }\r
519\r
520 return PyUnicode_DecodeUTF8(u, size, NULL);\r
521 }\r
522\r
523 unicode = _PyUnicode_New(size);\r
524 if (!unicode)\r
525 return NULL;\r
526\r
527 return (PyObject *)unicode;\r
528}\r
529\r
530PyObject *PyUnicode_FromString(const char *u)\r
531{\r
532 size_t size = strlen(u);\r
533 if (size > PY_SSIZE_T_MAX) {\r
534 PyErr_SetString(PyExc_OverflowError, "input too long");\r
535 return NULL;\r
536 }\r
537\r
538 return PyUnicode_FromStringAndSize(u, size);\r
539}\r
540\r
541#ifdef HAVE_WCHAR_H\r
542\r
543#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)\r
544# define CONVERT_WCHAR_TO_SURROGATES\r
545#endif\r
546\r
547#ifdef CONVERT_WCHAR_TO_SURROGATES\r
548\r
549/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need\r
550 to convert from UTF32 to UTF16. */\r
551\r
552PyObject *PyUnicode_FromWideChar(register const wchar_t *w,\r
553 Py_ssize_t size)\r
554{\r
555 PyUnicodeObject *unicode;\r
556 register Py_ssize_t i;\r
557 Py_ssize_t alloc;\r
558 const wchar_t *orig_w;\r
559\r
560 if (w == NULL) {\r
561 PyErr_BadInternalCall();\r
562 return NULL;\r
563 }\r
564\r
565 alloc = size;\r
566 orig_w = w;\r
567 for (i = size; i > 0; i--) {\r
568 if (*w > 0xFFFF)\r
569 alloc++;\r
570 w++;\r
571 }\r
572 w = orig_w;\r
573 unicode = _PyUnicode_New(alloc);\r
574 if (!unicode)\r
575 return NULL;\r
576\r
577 /* Copy the wchar_t data into the new object */\r
578 {\r
579 register Py_UNICODE *u;\r
580 u = PyUnicode_AS_UNICODE(unicode);\r
581 for (i = size; i > 0; i--) {\r
582 if (*w > 0xFFFF) {\r
583 wchar_t ordinal = *w++;\r
584 ordinal -= 0x10000;\r
585 *u++ = 0xD800 | (ordinal >> 10);\r
586 *u++ = 0xDC00 | (ordinal & 0x3FF);\r
587 }\r
588 else\r
589 *u++ = *w++;\r
590 }\r
591 }\r
592 return (PyObject *)unicode;\r
593}\r
594\r
595#else\r
596\r
597PyObject *PyUnicode_FromWideChar(register const wchar_t *w,\r
598 Py_ssize_t size)\r
599{\r
600 PyUnicodeObject *unicode;\r
601\r
602 if (w == NULL) {\r
603 PyErr_BadInternalCall();\r
604 return NULL;\r
605 }\r
606\r
607 unicode = _PyUnicode_New(size);\r
608 if (!unicode)\r
609 return NULL;\r
610\r
611 /* Copy the wchar_t data into the new object */\r
612#ifdef HAVE_USABLE_WCHAR_T\r
613 memcpy(unicode->str, w, size * sizeof(wchar_t));\r
614#else\r
615 {\r
616 register Py_UNICODE *u;\r
617 register Py_ssize_t i;\r
618 u = PyUnicode_AS_UNICODE(unicode);\r
619 for (i = size; i > 0; i--)\r
620 *u++ = *w++;\r
621 }\r
622#endif\r
623\r
624 return (PyObject *)unicode;\r
625}\r
626\r
627#endif /* CONVERT_WCHAR_TO_SURROGATES */\r
628\r
629#undef CONVERT_WCHAR_TO_SURROGATES\r
630\r
631static void\r
632makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)\r
633{\r
634 *fmt++ = '%';\r
635 if (width) {\r
636 if (zeropad)\r
637 *fmt++ = '0';\r
638 fmt += sprintf(fmt, "%d", width);\r
639 }\r
640 if (precision)\r
641 fmt += sprintf(fmt, ".%d", precision);\r
642 if (longflag)\r
643 *fmt++ = 'l';\r
644 else if (size_tflag) {\r
645 char *f = PY_FORMAT_SIZE_T;\r
646 while (*f)\r
647 *fmt++ = *f++;\r
648 }\r
649 *fmt++ = c;\r
650 *fmt = '\0';\r
651}\r
652\r
/* Append the NUL-terminated char string `string` to the output cursor
   `s`, widening each char.  Uses the caller's `copy` and `s` variables.
   Wrapped in do { } while (0) so that `appendstring(x);` is a single
   statement and stays correct inside an unbraced if/else. */
#define appendstring(string) \
    do { for (copy = (string); *copy; ) *s++ = *copy++; } while (0)
654\r
655PyObject *\r
656PyUnicode_FromFormatV(const char *format, va_list vargs)\r
657{\r
658 va_list count;\r
659 Py_ssize_t callcount = 0;\r
660 PyObject **callresults = NULL;\r
661 PyObject **callresult = NULL;\r
662 Py_ssize_t n = 0;\r
663 int width = 0;\r
664 int precision = 0;\r
665 int zeropad;\r
666 const char* f;\r
667 Py_UNICODE *s;\r
668 PyObject *string;\r
669 /* used by sprintf */\r
670 char buffer[21];\r
671 /* use abuffer instead of buffer, if we need more space\r
672 * (which can happen if there's a format specifier with width). */\r
673 char *abuffer = NULL;\r
674 char *realbuffer;\r
675 Py_ssize_t abuffersize = 0;\r
676 char fmt[60]; /* should be enough for %0width.precisionld */\r
677 const char *copy;\r
678\r
679#ifdef VA_LIST_IS_ARRAY\r
680 Py_MEMCPY(count, vargs, sizeof(va_list));\r
681#else\r
682#ifdef __va_copy\r
683 __va_copy(count, vargs);\r
684#else\r
685 count = vargs;\r
686#endif\r
687#endif\r
688 /* step 1: count the number of %S/%R/%s format specifications\r
689 * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these\r
690 * objects once during step 3 and put the result in an array) */\r
691 for (f = format; *f; f++) {\r
692 if (*f == '%') {\r
693 if (*(f+1)=='%')\r
694 continue;\r
695 if (*(f+1)=='S' || *(f+1)=='R')\r
696 ++callcount;\r
697 while (isdigit((unsigned)*f))\r
698 width = (width*10) + *f++ - '0';\r
699 while (*++f && *f != '%' && !isalpha((unsigned)*f))\r
700 ;\r
701 if (*f == 's')\r
702 ++callcount;\r
703 }\r
704 }\r
705 /* step 2: allocate memory for the results of\r
706 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */\r
707 if (callcount) {\r
708 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);\r
709 if (!callresults) {\r
710 PyErr_NoMemory();\r
711 return NULL;\r
712 }\r
713 callresult = callresults;\r
714 }\r
715 /* step 3: figure out how large a buffer we need */\r
716 for (f = format; *f; f++) {\r
717 if (*f == '%') {\r
718 const char* p = f;\r
719 width = 0;\r
720 while (isdigit((unsigned)*f))\r
721 width = (width*10) + *f++ - '0';\r
722 while (*++f && *f != '%' && !isalpha((unsigned)*f))\r
723 ;\r
724\r
725 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since\r
726 * they don't affect the amount of space we reserve.\r
727 */\r
728 if ((*f == 'l' || *f == 'z') &&\r
729 (f[1] == 'd' || f[1] == 'u'))\r
730 ++f;\r
731\r
732 switch (*f) {\r
733 case 'c':\r
734 (void)va_arg(count, int);\r
735 /* fall through... */\r
736 case '%':\r
737 n++;\r
738 break;\r
739 case 'd': case 'u': case 'i': case 'x':\r
740 (void) va_arg(count, int);\r
741 /* 20 bytes is enough to hold a 64-bit\r
742 integer. Decimal takes the most space.\r
743 This isn't enough for octal.\r
744 If a width is specified we need more\r
745 (which we allocate later). */\r
746 if (width < 20)\r
747 width = 20;\r
748 n += width;\r
749 if (abuffersize < width)\r
750 abuffersize = width;\r
751 break;\r
752 case 's':\r
753 {\r
754 /* UTF-8 */\r
755 const char *s = va_arg(count, const char*);\r
756 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");\r
757 if (!str)\r
758 goto fail;\r
759 n += PyUnicode_GET_SIZE(str);\r
760 /* Remember the str and switch to the next slot */\r
761 *callresult++ = str;\r
762 break;\r
763 }\r
764 case 'U':\r
765 {\r
766 PyObject *obj = va_arg(count, PyObject *);\r
767 assert(obj && PyUnicode_Check(obj));\r
768 n += PyUnicode_GET_SIZE(obj);\r
769 break;\r
770 }\r
771 case 'V':\r
772 {\r
773 PyObject *obj = va_arg(count, PyObject *);\r
774 const char *str = va_arg(count, const char *);\r
775 assert(obj || str);\r
776 assert(!obj || PyUnicode_Check(obj));\r
777 if (obj)\r
778 n += PyUnicode_GET_SIZE(obj);\r
779 else\r
780 n += strlen(str);\r
781 break;\r
782 }\r
783 case 'S':\r
784 {\r
785 PyObject *obj = va_arg(count, PyObject *);\r
786 PyObject *str;\r
787 assert(obj);\r
788 str = PyObject_Str(obj);\r
789 if (!str)\r
790 goto fail;\r
791 n += PyUnicode_GET_SIZE(str);\r
792 /* Remember the str and switch to the next slot */\r
793 *callresult++ = str;\r
794 break;\r
795 }\r
796 case 'R':\r
797 {\r
798 PyObject *obj = va_arg(count, PyObject *);\r
799 PyObject *repr;\r
800 assert(obj);\r
801 repr = PyObject_Repr(obj);\r
802 if (!repr)\r
803 goto fail;\r
804 n += PyUnicode_GET_SIZE(repr);\r
805 /* Remember the repr and switch to the next slot */\r
806 *callresult++ = repr;\r
807 break;\r
808 }\r
809 case 'p':\r
810 (void) va_arg(count, int);\r
811 /* maximum 64-bit pointer representation:\r
812 * 0xffffffffffffffff\r
813 * so 19 characters is enough.\r
814 * XXX I count 18 -- what's the extra for?\r
815 */\r
816 n += 19;\r
817 break;\r
818 default:\r
819 /* if we stumble upon an unknown\r
820 formatting code, copy the rest of\r
821 the format string to the output\r
822 string. (we cannot just skip the\r
823 code, since there's no way to know\r
824 what's in the argument list) */\r
825 n += strlen(p);\r
826 goto expand;\r
827 }\r
828 } else\r
829 n++;\r
830 }\r
831 expand:\r
832 if (abuffersize > 20) {\r
833 abuffer = PyObject_Malloc(abuffersize);\r
834 if (!abuffer) {\r
835 PyErr_NoMemory();\r
836 goto fail;\r
837 }\r
838 realbuffer = abuffer;\r
839 }\r
840 else\r
841 realbuffer = buffer;\r
842 /* step 4: fill the buffer */\r
843 /* Since we've analyzed how much space we need for the worst case,\r
844 we don't have to resize the string.\r
845 There can be no errors beyond this point. */\r
846 string = PyUnicode_FromUnicode(NULL, n);\r
847 if (!string)\r
848 goto fail;\r
849\r
850 s = PyUnicode_AS_UNICODE(string);\r
851 callresult = callresults;\r
852\r
853 for (f = format; *f; f++) {\r
854 if (*f == '%') {\r
855 const char* p = f++;\r
856 int longflag = 0;\r
857 int size_tflag = 0;\r
858 zeropad = (*f == '0');\r
859 /* parse the width.precision part */\r
860 width = 0;\r
861 while (isdigit((unsigned)*f))\r
862 width = (width*10) + *f++ - '0';\r
863 precision = 0;\r
864 if (*f == '.') {\r
865 f++;\r
866 while (isdigit((unsigned)*f))\r
867 precision = (precision*10) + *f++ - '0';\r
868 }\r
869 /* handle the long flag, but only for %ld and %lu.\r
870 others can be added when necessary. */\r
871 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {\r
872 longflag = 1;\r
873 ++f;\r
874 }\r
875 /* handle the size_t flag. */\r
876 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {\r
877 size_tflag = 1;\r
878 ++f;\r
879 }\r
880\r
881 switch (*f) {\r
882 case 'c':\r
883 *s++ = va_arg(vargs, int);\r
884 break;\r
885 case 'd':\r
886 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');\r
887 if (longflag)\r
888 sprintf(realbuffer, fmt, va_arg(vargs, long));\r
889 else if (size_tflag)\r
890 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));\r
891 else\r
892 sprintf(realbuffer, fmt, va_arg(vargs, int));\r
893 appendstring(realbuffer);\r
894 break;\r
895 case 'u':\r
896 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');\r
897 if (longflag)\r
898 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));\r
899 else if (size_tflag)\r
900 sprintf(realbuffer, fmt, va_arg(vargs, size_t));\r
901 else\r
902 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));\r
903 appendstring(realbuffer);\r
904 break;\r
905 case 'i':\r
906 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');\r
907 sprintf(realbuffer, fmt, va_arg(vargs, int));\r
908 appendstring(realbuffer);\r
909 break;\r
910 case 'x':\r
911 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');\r
912 sprintf(realbuffer, fmt, va_arg(vargs, int));\r
913 appendstring(realbuffer);\r
914 break;\r
915 case 's':\r
916 {\r
917 /* unused, since we already have the result */\r
918 (void) va_arg(vargs, char *);\r
919 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),\r
920 PyUnicode_GET_SIZE(*callresult));\r
921 s += PyUnicode_GET_SIZE(*callresult);\r
922 /* We're done with the unicode()/repr() => forget it */\r
923 Py_DECREF(*callresult);\r
924 /* switch to next unicode()/repr() result */\r
925 ++callresult;\r
926 break;\r
927 }\r
928 case 'U':\r
929 {\r
930 PyObject *obj = va_arg(vargs, PyObject *);\r
931 Py_ssize_t size = PyUnicode_GET_SIZE(obj);\r
932 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);\r
933 s += size;\r
934 break;\r
935 }\r
936 case 'V':\r
937 {\r
938 PyObject *obj = va_arg(vargs, PyObject *);\r
939 const char *str = va_arg(vargs, const char *);\r
940 if (obj) {\r
941 Py_ssize_t size = PyUnicode_GET_SIZE(obj);\r
942 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);\r
943 s += size;\r
944 } else {\r
945 appendstring(str);\r
946 }\r
947 break;\r
948 }\r
949 case 'S':\r
950 case 'R':\r
951 {\r
952 Py_UNICODE *ucopy;\r
953 Py_ssize_t usize;\r
954 Py_ssize_t upos;\r
955 /* unused, since we already have the result */\r
956 (void) va_arg(vargs, PyObject *);\r
957 ucopy = PyUnicode_AS_UNICODE(*callresult);\r
958 usize = PyUnicode_GET_SIZE(*callresult);\r
959 for (upos = 0; upos<usize;)\r
960 *s++ = ucopy[upos++];\r
961 /* We're done with the unicode()/repr() => forget it */\r
962 Py_DECREF(*callresult);\r
963 /* switch to next unicode()/repr() result */\r
964 ++callresult;\r
965 break;\r
966 }\r
967 case 'p':\r
968 sprintf(buffer, "%p", va_arg(vargs, void*));\r
969 /* %p is ill-defined: ensure leading 0x. */\r
970 if (buffer[1] == 'X')\r
971 buffer[1] = 'x';\r
972 else if (buffer[1] != 'x') {\r
973 memmove(buffer+2, buffer, strlen(buffer)+1);\r
974 buffer[0] = '0';\r
975 buffer[1] = 'x';\r
976 }\r
977 appendstring(buffer);\r
978 break;\r
979 case '%':\r
980 *s++ = '%';\r
981 break;\r
982 default:\r
983 appendstring(p);\r
984 goto end;\r
985 }\r
986 } else\r
987 *s++ = *f;\r
988 }\r
989\r
990 end:\r
991 if (callresults)\r
992 PyObject_Free(callresults);\r
993 if (abuffer)\r
994 PyObject_Free(abuffer);\r
995 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));\r
996 return string;\r
997 fail:\r
998 if (callresults) {\r
999 PyObject **callresult2 = callresults;\r
1000 while (callresult2 < callresult) {\r
1001 Py_DECREF(*callresult2);\r
1002 ++callresult2;\r
1003 }\r
1004 PyObject_Free(callresults);\r
1005 }\r
1006 if (abuffer)\r
1007 PyObject_Free(abuffer);\r
1008 return NULL;\r
1009}\r
1010\r
1011#undef appendstring\r
1012\r
1013PyObject *\r
1014PyUnicode_FromFormat(const char *format, ...)\r
1015{\r
1016 PyObject* ret;\r
1017 va_list vargs;\r
1018\r
1019#ifdef HAVE_STDARG_PROTOTYPES\r
1020 va_start(vargs, format);\r
1021#else\r
1022 va_start(vargs);\r
1023#endif\r
1024 ret = PyUnicode_FromFormatV(format, vargs);\r
1025 va_end(vargs);\r
1026 return ret;\r
1027}\r
1028\r
1029Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,\r
1030 wchar_t *w,\r
1031 Py_ssize_t size)\r
1032{\r
1033 if (unicode == NULL) {\r
1034 PyErr_BadInternalCall();\r
1035 return -1;\r
1036 }\r
1037\r
1038 /* If possible, try to copy the 0-termination as well */\r
1039 if (size > PyUnicode_GET_SIZE(unicode))\r
1040 size = PyUnicode_GET_SIZE(unicode) + 1;\r
1041\r
1042#ifdef HAVE_USABLE_WCHAR_T\r
1043 memcpy(w, unicode->str, size * sizeof(wchar_t));\r
1044#else\r
1045 {\r
1046 register Py_UNICODE *u;\r
1047 register Py_ssize_t i;\r
1048 u = PyUnicode_AS_UNICODE(unicode);\r
1049 for (i = size; i > 0; i--)\r
1050 *w++ = *u++;\r
1051 }\r
1052#endif\r
1053\r
1054 if (size > PyUnicode_GET_SIZE(unicode))\r
1055 return PyUnicode_GET_SIZE(unicode);\r
1056 else\r
1057 return size;\r
1058}\r
1059\r
1060#endif\r
1061\r
1062PyObject *PyUnicode_FromOrdinal(int ordinal)\r
1063{\r
1064 Py_UNICODE s[1];\r
1065\r
1066#ifdef Py_UNICODE_WIDE\r
1067 if (ordinal < 0 || ordinal > 0x10ffff) {\r
1068 PyErr_SetString(PyExc_ValueError,\r
1069 "unichr() arg not in range(0x110000) "\r
1070 "(wide Python build)");\r
1071 return NULL;\r
1072 }\r
1073#else\r
1074 if (ordinal < 0 || ordinal > 0xffff) {\r
1075 PyErr_SetString(PyExc_ValueError,\r
1076 "unichr() arg not in range(0x10000) "\r
1077 "(narrow Python build)");\r
1078 return NULL;\r
1079 }\r
1080#endif\r
1081\r
1082 s[0] = (Py_UNICODE)ordinal;\r
1083 return PyUnicode_FromUnicode(s, 1);\r
1084}\r
1085\r
1086PyObject *PyUnicode_FromObject(register PyObject *obj)\r
1087{\r
1088 /* XXX Perhaps we should make this API an alias of\r
1089 PyObject_Unicode() instead ?! */\r
1090 if (PyUnicode_CheckExact(obj)) {\r
1091 Py_INCREF(obj);\r
1092 return obj;\r
1093 }\r
1094 if (PyUnicode_Check(obj)) {\r
1095 /* For a Unicode subtype that's not a Unicode object,\r
1096 return a true Unicode object with the same data. */\r
1097 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),\r
1098 PyUnicode_GET_SIZE(obj));\r
1099 }\r
1100 return PyUnicode_FromEncodedObject(obj, NULL, "strict");\r
1101}\r
1102\r
1103PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,\r
1104 const char *encoding,\r
1105 const char *errors)\r
1106{\r
1107 const char *s = NULL;\r
1108 Py_ssize_t len;\r
1109 PyObject *v;\r
1110\r
1111 if (obj == NULL) {\r
1112 PyErr_BadInternalCall();\r
1113 return NULL;\r
1114 }\r
1115\r
1116#if 0\r
1117 /* For b/w compatibility we also accept Unicode objects provided\r
1118 that no encodings is given and then redirect to\r
1119 PyObject_Unicode() which then applies the additional logic for\r
1120 Unicode subclasses.\r
1121\r
1122 NOTE: This API should really only be used for object which\r
1123 represent *encoded* Unicode !\r
1124\r
1125 */\r
1126 if (PyUnicode_Check(obj)) {\r
1127 if (encoding) {\r
1128 PyErr_SetString(PyExc_TypeError,\r
1129 "decoding Unicode is not supported");\r
1130 return NULL;\r
1131 }\r
1132 return PyObject_Unicode(obj);\r
1133 }\r
1134#else\r
1135 if (PyUnicode_Check(obj)) {\r
1136 PyErr_SetString(PyExc_TypeError,\r
1137 "decoding Unicode is not supported");\r
1138 return NULL;\r
1139 }\r
1140#endif\r
1141\r
1142 /* Coerce object */\r
1143 if (PyString_Check(obj)) {\r
1144 s = PyString_AS_STRING(obj);\r
1145 len = PyString_GET_SIZE(obj);\r
1146 }\r
1147 else if (PyByteArray_Check(obj)) {\r
1148 /* Python 2.x specific */\r
1149 PyErr_Format(PyExc_TypeError,\r
1150 "decoding bytearray is not supported");\r
1151 return NULL;\r
1152 }\r
1153 else if (PyObject_AsCharBuffer(obj, &s, &len)) {\r
1154 /* Overwrite the error message with something more useful in\r
1155 case of a TypeError. */\r
1156 if (PyErr_ExceptionMatches(PyExc_TypeError))\r
1157 PyErr_Format(PyExc_TypeError,\r
1158 "coercing to Unicode: need string or buffer, "\r
1159 "%.80s found",\r
1160 Py_TYPE(obj)->tp_name);\r
1161 goto onError;\r
1162 }\r
1163\r
1164 /* Convert to Unicode */\r
1165 if (len == 0) {\r
1166 Py_INCREF(unicode_empty);\r
1167 v = (PyObject *)unicode_empty;\r
1168 }\r
1169 else\r
1170 v = PyUnicode_Decode(s, len, encoding, errors);\r
1171\r
1172 return v;\r
1173\r
1174 onError:\r
1175 return NULL;\r
1176}\r
1177\r
1178PyObject *PyUnicode_Decode(const char *s,\r
1179 Py_ssize_t size,\r
1180 const char *encoding,\r
1181 const char *errors)\r
1182{\r
1183 PyObject *buffer = NULL, *unicode;\r
1184\r
1185 if (encoding == NULL)\r
1186 encoding = PyUnicode_GetDefaultEncoding();\r
1187\r
1188 /* Shortcuts for common default encodings */\r
1189 if (strcmp(encoding, "utf-8") == 0)\r
1190 return PyUnicode_DecodeUTF8(s, size, errors);\r
1191 else if (strcmp(encoding, "latin-1") == 0)\r
1192 return PyUnicode_DecodeLatin1(s, size, errors);\r
1193#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)\r
1194 else if (strcmp(encoding, "mbcs") == 0)\r
1195 return PyUnicode_DecodeMBCS(s, size, errors);\r
1196#endif\r
1197 else if (strcmp(encoding, "ascii") == 0)\r
1198 return PyUnicode_DecodeASCII(s, size, errors);\r
1199\r
1200 /* Decode via the codec registry */\r
1201 buffer = PyBuffer_FromMemory((void *)s, size);\r
1202 if (buffer == NULL)\r
1203 goto onError;\r
1204 unicode = PyCodec_Decode(buffer, encoding, errors);\r
1205 if (unicode == NULL)\r
1206 goto onError;\r
1207 if (!PyUnicode_Check(unicode)) {\r
1208 PyErr_Format(PyExc_TypeError,\r
1209 "decoder did not return an unicode object (type=%.400s)",\r
1210 Py_TYPE(unicode)->tp_name);\r
1211 Py_DECREF(unicode);\r
1212 goto onError;\r
1213 }\r
1214 Py_DECREF(buffer);\r
1215 return unicode;\r
1216\r
1217 onError:\r
1218 Py_XDECREF(buffer);\r
1219 return NULL;\r
1220}\r
1221\r
1222PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,\r
1223 const char *encoding,\r
1224 const char *errors)\r
1225{\r
1226 PyObject *v;\r
1227\r
1228 if (!PyUnicode_Check(unicode)) {\r
1229 PyErr_BadArgument();\r
1230 goto onError;\r
1231 }\r
1232\r
1233 if (encoding == NULL)\r
1234 encoding = PyUnicode_GetDefaultEncoding();\r
1235\r
1236 /* Decode via the codec registry */\r
1237 v = PyCodec_Decode(unicode, encoding, errors);\r
1238 if (v == NULL)\r
1239 goto onError;\r
1240 return v;\r
1241\r
1242 onError:\r
1243 return NULL;\r
1244}\r
1245\r
1246PyObject *PyUnicode_Encode(const Py_UNICODE *s,\r
1247 Py_ssize_t size,\r
1248 const char *encoding,\r
1249 const char *errors)\r
1250{\r
1251 PyObject *v, *unicode;\r
1252\r
1253 unicode = PyUnicode_FromUnicode(s, size);\r
1254 if (unicode == NULL)\r
1255 return NULL;\r
1256 v = PyUnicode_AsEncodedString(unicode, encoding, errors);\r
1257 Py_DECREF(unicode);\r
1258 return v;\r
1259}\r
1260\r
1261PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,\r
1262 const char *encoding,\r
1263 const char *errors)\r
1264{\r
1265 PyObject *v;\r
1266\r
1267 if (!PyUnicode_Check(unicode)) {\r
1268 PyErr_BadArgument();\r
1269 goto onError;\r
1270 }\r
1271\r
1272 if (encoding == NULL)\r
1273 encoding = PyUnicode_GetDefaultEncoding();\r
1274\r
1275 /* Encode via the codec registry */\r
1276 v = PyCodec_Encode(unicode, encoding, errors);\r
1277 if (v == NULL)\r
1278 goto onError;\r
1279 return v;\r
1280\r
1281 onError:\r
1282 return NULL;\r
1283}\r
1284\r
1285PyObject *PyUnicode_AsEncodedString(PyObject *unicode,\r
1286 const char *encoding,\r
1287 const char *errors)\r
1288{\r
1289 PyObject *v;\r
1290\r
1291 if (!PyUnicode_Check(unicode)) {\r
1292 PyErr_BadArgument();\r
1293 goto onError;\r
1294 }\r
1295\r
1296 if (encoding == NULL)\r
1297 encoding = PyUnicode_GetDefaultEncoding();\r
1298\r
1299 /* Shortcuts for common default encodings */\r
1300 if (errors == NULL) {\r
1301 if (strcmp(encoding, "utf-8") == 0)\r
1302 return PyUnicode_AsUTF8String(unicode);\r
1303 else if (strcmp(encoding, "latin-1") == 0)\r
1304 return PyUnicode_AsLatin1String(unicode);\r
1305#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)\r
1306 else if (strcmp(encoding, "mbcs") == 0)\r
1307 return PyUnicode_AsMBCSString(unicode);\r
1308#endif\r
1309 else if (strcmp(encoding, "ascii") == 0)\r
1310 return PyUnicode_AsASCIIString(unicode);\r
1311 }\r
1312\r
1313 /* Encode via the codec registry */\r
1314 v = PyCodec_Encode(unicode, encoding, errors);\r
1315 if (v == NULL)\r
1316 goto onError;\r
1317 if (!PyString_Check(v)) {\r
1318 PyErr_Format(PyExc_TypeError,\r
1319 "encoder did not return a string object (type=%.400s)",\r
1320 Py_TYPE(v)->tp_name);\r
1321 Py_DECREF(v);\r
1322 goto onError;\r
1323 }\r
1324 return v;\r
1325\r
1326 onError:\r
1327 return NULL;\r
1328}\r
1329\r
1330PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,\r
1331 const char *errors)\r
1332{\r
1333 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;\r
1334\r
1335 if (v)\r
1336 return v;\r
1337 v = PyUnicode_AsEncodedString(unicode, NULL, errors);\r
1338 if (v && errors == NULL)\r
1339 ((PyUnicodeObject *)unicode)->defenc = v;\r
1340 return v;\r
1341}\r
1342\r
1343Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)\r
1344{\r
1345 if (!PyUnicode_Check(unicode)) {\r
1346 PyErr_BadArgument();\r
1347 goto onError;\r
1348 }\r
1349 return PyUnicode_AS_UNICODE(unicode);\r
1350\r
1351 onError:\r
1352 return NULL;\r
1353}\r
1354\r
1355Py_ssize_t PyUnicode_GetSize(PyObject *unicode)\r
1356{\r
1357 if (!PyUnicode_Check(unicode)) {\r
1358 PyErr_BadArgument();\r
1359 goto onError;\r
1360 }\r
1361 return PyUnicode_GET_SIZE(unicode);\r
1362\r
1363 onError:\r
1364 return -1;\r
1365}\r
1366\r
1367const char *PyUnicode_GetDefaultEncoding(void)\r
1368{\r
1369 return unicode_default_encoding;\r
1370}\r
1371\r
1372int PyUnicode_SetDefaultEncoding(const char *encoding)\r
1373{\r
1374 PyObject *v;\r
1375\r
1376 /* Make sure the encoding is valid. As side effect, this also\r
1377 loads the encoding into the codec registry cache. */\r
1378 v = _PyCodec_Lookup(encoding);\r
1379 if (v == NULL)\r
1380 goto onError;\r
1381 Py_DECREF(v);\r
1382 strncpy(unicode_default_encoding,\r
1383 encoding,\r
1384 sizeof(unicode_default_encoding));\r
1385 return 0;\r
1386\r
1387 onError:\r
1388 return -1;\r
1389}\r
1390\r
1391/* error handling callback helper:\r
1392 build arguments, call the callback and check the arguments,\r
1393 if no exception occurred, copy the replacement to the output\r
1394 and adjust various state variables.\r
1395 return 0 on success, -1 on error\r
1396*/\r
1397\r
1398static\r
1399int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,\r
1400 const char *encoding, const char *reason,\r
1401 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,\r
1402 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,\r
1403 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)\r
1404{\r
1405 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";\r
1406\r
1407 PyObject *restuple = NULL;\r
1408 PyObject *repunicode = NULL;\r
1409 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);\r
1410 Py_ssize_t requiredsize;\r
1411 Py_ssize_t newpos;\r
1412 Py_UNICODE *repptr;\r
1413 Py_ssize_t repsize;\r
1414 int res = -1;\r
1415\r
1416 if (*errorHandler == NULL) {\r
1417 *errorHandler = PyCodec_LookupError(errors);\r
1418 if (*errorHandler == NULL)\r
1419 goto onError;\r
1420 }\r
1421\r
1422 if (*exceptionObject == NULL) {\r
1423 *exceptionObject = PyUnicodeDecodeError_Create(\r
1424 encoding, input, insize, *startinpos, *endinpos, reason);\r
1425 if (*exceptionObject == NULL)\r
1426 goto onError;\r
1427 }\r
1428 else {\r
1429 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))\r
1430 goto onError;\r
1431 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))\r
1432 goto onError;\r
1433 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))\r
1434 goto onError;\r
1435 }\r
1436\r
1437 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);\r
1438 if (restuple == NULL)\r
1439 goto onError;\r
1440 if (!PyTuple_Check(restuple)) {\r
1441 PyErr_SetString(PyExc_TypeError, &argparse[4]);\r
1442 goto onError;\r
1443 }\r
1444 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))\r
1445 goto onError;\r
1446 if (newpos<0)\r
1447 newpos = insize+newpos;\r
1448 if (newpos<0 || newpos>insize) {\r
1449 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);\r
1450 goto onError;\r
1451 }\r
1452\r
1453 /* need more space? (at least enough for what we\r
1454 have+the replacement+the rest of the string (starting\r
1455 at the new input position), so we won't have to check space\r
1456 when there are no errors in the rest of the string) */\r
1457 repptr = PyUnicode_AS_UNICODE(repunicode);\r
1458 repsize = PyUnicode_GET_SIZE(repunicode);\r
1459 requiredsize = *outpos + repsize + insize-newpos;\r
1460 if (requiredsize > outsize) {\r
1461 if (requiredsize<2*outsize)\r
1462 requiredsize = 2*outsize;\r
1463 if (_PyUnicode_Resize(output, requiredsize) < 0)\r
1464 goto onError;\r
1465 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;\r
1466 }\r
1467 *endinpos = newpos;\r
1468 *inptr = input + newpos;\r
1469 Py_UNICODE_COPY(*outptr, repptr, repsize);\r
1470 *outptr += repsize;\r
1471 *outpos += repsize;\r
1472 /* we made it! */\r
1473 res = 0;\r
1474\r
1475 onError:\r
1476 Py_XDECREF(restuple);\r
1477 return res;\r
1478}\r
1479\r
1480/* --- UTF-7 Codec -------------------------------------------------------- */\r
1481\r
1482/* See RFC2152 for details. We encode conservatively and decode liberally. */\r
1483\r
/* Three simple macros defining base-64. */

/* Is c a base-64 character?
 *
 * Explicit ASCII range tests are used instead of isalnum(): isalnum() is
 * locale-dependent (it may accept bytes > 127 in non-C locales, which
 * FROM_BASE64 would then silently map to 63) and has undefined behaviour
 * for arguments outside the unsigned char range. */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
1511\r
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * NUL and anything >= 128 is never direct.  The flag arguments are
 * parenthesized so that compound expressions (e.g. `a || b`) are not
 * re-associated with the surrounding `&&`. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
1557\r
1558PyObject *PyUnicode_DecodeUTF7(const char *s,\r
1559 Py_ssize_t size,\r
1560 const char *errors)\r
1561{\r
1562 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);\r
1563}\r
1564\r
1565/* The decoder. The only state we preserve is our read position,\r
1566 * i.e. how many characters we have consumed. So if we end in the\r
1567 * middle of a shift sequence we have to back off the read position\r
1568 * and the output to the beginning of the sequence, otherwise we lose\r
1569 * all the shift state (seen bits, number of bits seen, high\r
1570 * surrogate). */\r
1571\r
1572PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,\r
1573 Py_ssize_t size,\r
1574 const char *errors,\r
1575 Py_ssize_t *consumed)\r
1576{\r
1577 const char *starts = s;\r
1578 Py_ssize_t startinpos;\r
1579 Py_ssize_t endinpos;\r
1580 Py_ssize_t outpos;\r
1581 const char *e;\r
1582 PyUnicodeObject *unicode;\r
1583 Py_UNICODE *p;\r
1584 const char *errmsg = "";\r
1585 int inShift = 0;\r
1586 Py_UNICODE *shiftOutStart;\r
1587 unsigned int base64bits = 0;\r
1588 unsigned long base64buffer = 0;\r
1589 Py_UNICODE surrogate = 0;\r
1590 PyObject *errorHandler = NULL;\r
1591 PyObject *exc = NULL;\r
1592\r
1593 unicode = _PyUnicode_New(size);\r
1594 if (!unicode)\r
1595 return NULL;\r
1596 if (size == 0) {\r
1597 if (consumed)\r
1598 *consumed = 0;\r
1599 return (PyObject *)unicode;\r
1600 }\r
1601\r
1602 p = unicode->str;\r
1603 shiftOutStart = p;\r
1604 e = s + size;\r
1605\r
1606 while (s < e) {\r
1607 Py_UNICODE ch = (unsigned char) *s;\r
1608\r
1609 if (inShift) { /* in a base-64 section */\r
1610 if (IS_BASE64(ch)) { /* consume a base-64 character */\r
1611 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);\r
1612 base64bits += 6;\r
1613 s++;\r
1614 if (base64bits >= 16) {\r
1615 /* we have enough bits for a UTF-16 value */\r
1616 Py_UNICODE outCh = (Py_UNICODE)\r
1617 (base64buffer >> (base64bits-16));\r
1618 base64bits -= 16;\r
1619 base64buffer &= (1 << base64bits) - 1; /* clear high bits */\r
1620 if (surrogate) {\r
1621 /* expecting a second surrogate */\r
1622 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {\r
1623#ifdef Py_UNICODE_WIDE\r
1624 *p++ = (((surrogate & 0x3FF)<<10)\r
1625 | (outCh & 0x3FF)) + 0x10000;\r
1626#else\r
1627 *p++ = surrogate;\r
1628 *p++ = outCh;\r
1629#endif\r
1630 surrogate = 0;\r
1631 }\r
1632 else {\r
1633 surrogate = 0;\r
1634 errmsg = "second surrogate missing";\r
1635 goto utf7Error;\r
1636 }\r
1637 }\r
1638 else if (outCh >= 0xD800 && outCh <= 0xDBFF) {\r
1639 /* first surrogate */\r
1640 surrogate = outCh;\r
1641 }\r
1642 else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {\r
1643 errmsg = "unexpected second surrogate";\r
1644 goto utf7Error;\r
1645 }\r
1646 else {\r
1647 *p++ = outCh;\r
1648 }\r
1649 }\r
1650 }\r
1651 else { /* now leaving a base-64 section */\r
1652 inShift = 0;\r
1653 s++;\r
1654 if (surrogate) {\r
1655 errmsg = "second surrogate missing at end of shift sequence";\r
1656 goto utf7Error;\r
1657 }\r
1658 if (base64bits > 0) { /* left-over bits */\r
1659 if (base64bits >= 6) {\r
1660 /* We've seen at least one base-64 character */\r
1661 errmsg = "partial character in shift sequence";\r
1662 goto utf7Error;\r
1663 }\r
1664 else {\r
1665 /* Some bits remain; they should be zero */\r
1666 if (base64buffer != 0) {\r
1667 errmsg = "non-zero padding bits in shift sequence";\r
1668 goto utf7Error;\r
1669 }\r
1670 }\r
1671 }\r
1672 if (ch != '-') {\r
1673 /* '-' is absorbed; other terminating\r
1674 characters are preserved */\r
1675 *p++ = ch;\r
1676 }\r
1677 }\r
1678 }\r
1679 else if ( ch == '+' ) {\r
1680 startinpos = s-starts;\r
1681 s++; /* consume '+' */\r
1682 if (s < e && *s == '-') { /* '+-' encodes '+' */\r
1683 s++;\r
1684 *p++ = '+';\r
1685 }\r
1686 else { /* begin base64-encoded section */\r
1687 inShift = 1;\r
1688 shiftOutStart = p;\r
1689 base64bits = 0;\r
1690 }\r
1691 }\r
1692 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */\r
1693 *p++ = ch;\r
1694 s++;\r
1695 }\r
1696 else {\r
1697 startinpos = s-starts;\r
1698 s++;\r
1699 errmsg = "unexpected special character";\r
1700 goto utf7Error;\r
1701 }\r
1702 continue;\r
1703utf7Error:\r
1704 outpos = p-PyUnicode_AS_UNICODE(unicode);\r
1705 endinpos = s-starts;\r
1706 if (unicode_decode_call_errorhandler(\r
1707 errors, &errorHandler,\r
1708 "utf7", errmsg,\r
1709 starts, size, &startinpos, &endinpos, &exc, &s,\r
1710 &unicode, &outpos, &p))\r
1711 goto onError;\r
1712 }\r
1713\r
1714 /* end of string */\r
1715\r
1716 if (inShift && !consumed) { /* in shift sequence, no more to follow */\r
1717 /* if we're in an inconsistent state, that's an error */\r
1718 if (surrogate ||\r
1719 (base64bits >= 6) ||\r
1720 (base64bits > 0 && base64buffer != 0)) {\r
1721 outpos = p-PyUnicode_AS_UNICODE(unicode);\r
1722 endinpos = size;\r
1723 if (unicode_decode_call_errorhandler(\r
1724 errors, &errorHandler,\r
1725 "utf7", "unterminated shift sequence",\r
1726 starts, size, &startinpos, &endinpos, &exc, &s,\r
1727 &unicode, &outpos, &p))\r
1728 goto onError;\r
1729 }\r
1730 }\r
1731\r
1732 /* return state */\r
1733 if (consumed) {\r
1734 if (inShift) {\r
1735 p = shiftOutStart; /* back off output */\r
1736 *consumed = startinpos;\r
1737 }\r
1738 else {\r
1739 *consumed = s-starts;\r
1740 }\r
1741 }\r
1742\r
1743 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)\r
1744 goto onError;\r
1745\r
1746 Py_XDECREF(errorHandler);\r
1747 Py_XDECREF(exc);\r
1748 return (PyObject *)unicode;\r
1749\r
1750 onError:\r
1751 Py_XDECREF(errorHandler);\r
1752 Py_XDECREF(exc);\r
1753 Py_DECREF(unicode);\r
1754 return NULL;\r
1755}\r
1756\r
1757\r
1758PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,\r
1759 Py_ssize_t size,\r
1760 int base64SetO,\r
1761 int base64WhiteSpace,\r
1762 const char *errors)\r
1763{\r
1764 PyObject *v;\r
1765 /* It might be possible to tighten this worst case */\r
1766 Py_ssize_t allocated = 8 * size;\r
1767 int inShift = 0;\r
1768 Py_ssize_t i = 0;\r
1769 unsigned int base64bits = 0;\r
1770 unsigned long base64buffer = 0;\r
1771 char * out;\r
1772 char * start;\r
1773\r
1774 if (allocated / 8 != size)\r
1775 return PyErr_NoMemory();\r
1776\r
1777 if (size == 0)\r
1778 return PyString_FromStringAndSize(NULL, 0);\r
1779\r
1780 v = PyString_FromStringAndSize(NULL, allocated);\r
1781 if (v == NULL)\r
1782 return NULL;\r
1783\r
1784 start = out = PyString_AS_STRING(v);\r
1785 for (;i < size; ++i) {\r
1786 Py_UNICODE ch = s[i];\r
1787\r
1788 if (inShift) {\r
1789 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {\r
1790 /* shifting out */\r
1791 if (base64bits) { /* output remaining bits */\r
1792 *out++ = TO_BASE64(base64buffer << (6-base64bits));\r
1793 base64buffer = 0;\r
1794 base64bits = 0;\r
1795 }\r
1796 inShift = 0;\r
1797 /* Characters not in the BASE64 set implicitly unshift the sequence\r
1798 so no '-' is required, except if the character is itself a '-' */\r
1799 if (IS_BASE64(ch) || ch == '-') {\r
1800 *out++ = '-';\r
1801 }\r
1802 *out++ = (char) ch;\r
1803 }\r
1804 else {\r
1805 goto encode_char;\r
1806 }\r
1807 }\r
1808 else { /* not in a shift sequence */\r
1809 if (ch == '+') {\r
1810 *out++ = '+';\r
1811 *out++ = '-';\r
1812 }\r
1813 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {\r
1814 *out++ = (char) ch;\r
1815 }\r
1816 else {\r
1817 *out++ = '+';\r
1818 inShift = 1;\r
1819 goto encode_char;\r
1820 }\r
1821 }\r
1822 continue;\r
1823encode_char:\r
1824#ifdef Py_UNICODE_WIDE\r
1825 if (ch >= 0x10000) {\r
1826 /* code first surrogate */\r
1827 base64bits += 16;\r
1828 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);\r
1829 while (base64bits >= 6) {\r
1830 *out++ = TO_BASE64(base64buffer >> (base64bits-6));\r
1831 base64bits -= 6;\r
1832 }\r
1833 /* prepare second surrogate */\r
1834 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);\r
1835 }\r
1836#endif\r
1837 base64bits += 16;\r
1838 base64buffer = (base64buffer << 16) | ch;\r
1839 while (base64bits >= 6) {\r
1840 *out++ = TO_BASE64(base64buffer >> (base64bits-6));\r
1841 base64bits -= 6;\r
1842 }\r
1843 }\r
1844 if (base64bits)\r
1845 *out++= TO_BASE64(base64buffer << (6-base64bits) );\r
1846 if (inShift)\r
1847 *out++ = '-';\r
1848\r
1849 if (_PyString_Resize(&v, out - start))\r
1850 return NULL;\r
1851 return v;\r
1852}\r
1853\r
1854#undef IS_BASE64\r
1855#undef FROM_BASE64\r
1856#undef TO_BASE64\r
1857#undef DECODE_DIRECT\r
1858#undef ENCODE_DIRECT\r
1859\r
1860/* --- UTF-8 Codec -------------------------------------------------------- */\r
1861\r
/* Map a UTF-8 lead byte to the total length of its sequence.
   Zero means the byte is not a legal lead byte (continuation bytes
   80-BF, the overlong lead bytes C0/C1, and F5-FF).
   See RFC 3629 for details. */
static
char utf8_code_length[256] = {
    /* 00-7F: single-byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    /* 80-BF: continuation bytes, never valid as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    /* C0-FF: multi-byte lead bytes */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
1883\r
1884PyObject *PyUnicode_DecodeUTF8(const char *s,\r
1885 Py_ssize_t size,\r
1886 const char *errors)\r
1887{\r
1888 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);\r
1889}\r
1890\r
1891PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,\r
1892 Py_ssize_t size,\r
1893 const char *errors,\r
1894 Py_ssize_t *consumed)\r
1895{\r
1896 const char *starts = s;\r
1897 int n;\r
1898 int k;\r
1899 Py_ssize_t startinpos;\r
1900 Py_ssize_t endinpos;\r
1901 Py_ssize_t outpos;\r
1902 const char *e;\r
1903 PyUnicodeObject *unicode;\r
1904 Py_UNICODE *p;\r
1905 const char *errmsg = "";\r
1906 PyObject *errorHandler = NULL;\r
1907 PyObject *exc = NULL;\r
1908\r
1909 /* Note: size will always be longer than the resulting Unicode\r
1910 character count */\r
1911 unicode = _PyUnicode_New(size);\r
1912 if (!unicode)\r
1913 return NULL;\r
1914 if (size == 0) {\r
1915 if (consumed)\r
1916 *consumed = 0;\r
1917 return (PyObject *)unicode;\r
1918 }\r
1919\r
1920 /* Unpack UTF-8 encoded data */\r
1921 p = unicode->str;\r
1922 e = s + size;\r
1923\r
1924 while (s < e) {\r
1925 Py_UCS4 ch = (unsigned char)*s;\r
1926\r
1927 if (ch < 0x80) {\r
1928 *p++ = (Py_UNICODE)ch;\r
1929 s++;\r
1930 continue;\r
1931 }\r
1932\r
1933 n = utf8_code_length[ch];\r
1934\r
1935 if (s + n > e) {\r
1936 if (consumed)\r
1937 break;\r
1938 else {\r
1939 errmsg = "unexpected end of data";\r
1940 startinpos = s-starts;\r
1941 endinpos = startinpos+1;\r
1942 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)\r
1943 endinpos++;\r
1944 goto utf8Error;\r
1945 }\r
1946 }\r
1947\r
1948 switch (n) {\r
1949\r
1950 case 0:\r
1951 errmsg = "invalid start byte";\r
1952 startinpos = s-starts;\r
1953 endinpos = startinpos+1;\r
1954 goto utf8Error;\r
1955\r
1956 case 1:\r
1957 errmsg = "internal error";\r
1958 startinpos = s-starts;\r
1959 endinpos = startinpos+1;\r
1960 goto utf8Error;\r
1961\r
1962 case 2:\r
1963 if ((s[1] & 0xc0) != 0x80) {\r
1964 errmsg = "invalid continuation byte";\r
1965 startinpos = s-starts;\r
1966 endinpos = startinpos + 1;\r
1967 goto utf8Error;\r
1968 }\r
1969 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);\r
1970 assert ((ch > 0x007F) && (ch <= 0x07FF));\r
1971 *p++ = (Py_UNICODE)ch;\r
1972 break;\r
1973\r
1974 case 3:\r
1975 /* XXX: surrogates shouldn't be valid UTF-8!\r
1976 see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf\r
1977 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt\r
1978 Uncomment the 2 lines below to make them invalid,\r
1979 codepoints: d800-dfff; UTF-8: \xed\xa0\x80-\xed\xbf\xbf. */\r
1980 if ((s[1] & 0xc0) != 0x80 ||\r
1981 (s[2] & 0xc0) != 0x80 ||\r
1982 ((unsigned char)s[0] == 0xE0 &&\r
1983 (unsigned char)s[1] < 0xA0)/* ||\r
1984 ((unsigned char)s[0] == 0xED &&\r
1985 (unsigned char)s[1] > 0x9F)*/) {\r
1986 errmsg = "invalid continuation byte";\r
1987 startinpos = s-starts;\r
1988 endinpos = startinpos + 1;\r
1989\r
1990 /* if s[1] first two bits are 1 and 0, then the invalid\r
1991 continuation byte is s[2], so increment endinpos by 1,\r
1992 if not, s[1] is invalid and endinpos doesn't need to\r
1993 be incremented. */\r
1994 if ((s[1] & 0xC0) == 0x80)\r
1995 endinpos++;\r
1996 goto utf8Error;\r
1997 }\r
1998 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);\r
1999 assert ((ch > 0x07FF) && (ch <= 0xFFFF));\r
2000 *p++ = (Py_UNICODE)ch;\r
2001 break;\r
2002\r
2003 case 4:\r
2004 if ((s[1] & 0xc0) != 0x80 ||\r
2005 (s[2] & 0xc0) != 0x80 ||\r
2006 (s[3] & 0xc0) != 0x80 ||\r
2007 ((unsigned char)s[0] == 0xF0 &&\r
2008 (unsigned char)s[1] < 0x90) ||\r
2009 ((unsigned char)s[0] == 0xF4 &&\r
2010 (unsigned char)s[1] > 0x8F)) {\r
2011 errmsg = "invalid continuation byte";\r
2012 startinpos = s-starts;\r
2013 endinpos = startinpos + 1;\r
2014 if ((s[1] & 0xC0) == 0x80) {\r
2015 endinpos++;\r
2016 if ((s[2] & 0xC0) == 0x80)\r
2017 endinpos++;\r
2018 }\r
2019 goto utf8Error;\r
2020 }\r
2021 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +\r
2022 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);\r
2023 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));\r
2024\r
2025#ifdef Py_UNICODE_WIDE\r
2026 *p++ = (Py_UNICODE)ch;\r
2027#else\r
2028 /* compute and append the two surrogates: */\r
2029\r
2030 /* translate from 10000..10FFFF to 0..FFFF */\r
2031 ch -= 0x10000;\r
2032\r
2033 /* high surrogate = top 10 bits added to D800 */\r
2034 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));\r
2035\r
2036 /* low surrogate = bottom 10 bits added to DC00 */\r
2037 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));\r
2038#endif\r
2039 break;\r
2040 }\r
2041 s += n;\r
2042 continue;\r
2043\r
2044 utf8Error:\r
2045 outpos = p-PyUnicode_AS_UNICODE(unicode);\r
2046 if (unicode_decode_call_errorhandler(\r
2047 errors, &errorHandler,\r
2048 "utf8", errmsg,\r
2049 starts, size, &startinpos, &endinpos, &exc, &s,\r
2050 &unicode, &outpos, &p))\r
2051 goto onError;\r
2052 }\r
2053 if (consumed)\r
2054 *consumed = s-starts;\r
2055\r
2056 /* Adjust length */\r
2057 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)\r
2058 goto onError;\r
2059\r
2060 Py_XDECREF(errorHandler);\r
2061 Py_XDECREF(exc);\r
2062 return (PyObject *)unicode;\r
2063\r
2064 onError:\r
2065 Py_XDECREF(errorHandler);\r
2066 Py_XDECREF(exc);\r
2067 Py_DECREF(unicode);\r
2068 return NULL;\r
2069}\r
2070\r
2071/* Allocation strategy: if the string is short, convert into a stack buffer\r
2072 and allocate exactly as much space needed at the end. Else allocate the\r
2073 maximum possible needed (4 result bytes per Unicode character), and return\r
2074 the excess memory at the end.\r
2075*/\r
2076PyObject *\r
2077PyUnicode_EncodeUTF8(const Py_UNICODE *s,\r
2078 Py_ssize_t size,\r
2079 const char *errors)\r
2080{\r
2081#define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */\r
2082\r
2083 Py_ssize_t i; /* index into s of next input byte */\r
2084 PyObject *v; /* result string object */\r
2085 char *p; /* next free byte in output buffer */\r
2086 Py_ssize_t nallocated; /* number of result bytes allocated */\r
2087 Py_ssize_t nneeded; /* number of result bytes needed */\r
2088 char stackbuf[MAX_SHORT_UNICHARS * 4];\r
2089\r
2090 assert(s != NULL);\r
2091 assert(size >= 0);\r
2092\r
2093 if (size <= MAX_SHORT_UNICHARS) {\r
2094 /* Write into the stack buffer; nallocated can't overflow.\r
2095 * At the end, we'll allocate exactly as much heap space as it\r
2096 * turns out we need.\r
2097 */\r
2098 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);\r
2099 v = NULL; /* will allocate after we're done */\r
2100 p = stackbuf;\r
2101 }\r
2102 else {\r
2103 /* Overallocate on the heap, and give the excess back at the end. */\r
2104 nallocated = size * 4;\r
2105 if (nallocated / 4 != size) /* overflow! */\r
2106 return PyErr_NoMemory();\r
2107 v = PyString_FromStringAndSize(NULL, nallocated);\r
2108 if (v == NULL)\r
2109 return NULL;\r
2110 p = PyString_AS_STRING(v);\r
2111 }\r
2112\r
2113 for (i = 0; i < size;) {\r
2114 Py_UCS4 ch = s[i++];\r
2115\r
2116 if (ch < 0x80)\r
2117 /* Encode ASCII */\r
2118 *p++ = (char) ch;\r
2119\r
2120 else if (ch < 0x0800) {\r
2121 /* Encode Latin-1 */\r
2122 *p++ = (char)(0xc0 | (ch >> 6));\r
2123 *p++ = (char)(0x80 | (ch & 0x3f));\r
2124 }\r
2125 else {\r
2126 /* Encode UCS2 Unicode ordinals */\r
2127 if (ch < 0x10000) {\r
2128 /* Special case: check for high surrogate */\r
2129 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {\r
2130 Py_UCS4 ch2 = s[i];\r
2131 /* Check for low surrogate and combine the two to\r
2132 form a UCS4 value */\r
2133 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {\r
2134 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;\r
2135 i++;\r
2136 goto encodeUCS4;\r
2137 }\r
2138 /* Fall through: handles isolated high surrogates */\r
2139 }\r
2140 *p++ = (char)(0xe0 | (ch >> 12));\r
2141 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));\r
2142 *p++ = (char)(0x80 | (ch & 0x3f));\r
2143 continue;\r
2144 }\r
2145 encodeUCS4:\r
2146 /* Encode UCS4 Unicode ordinals */\r
2147 *p++ = (char)(0xf0 | (ch >> 18));\r
2148 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));\r
2149 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));\r
2150 *p++ = (char)(0x80 | (ch & 0x3f));\r
2151 }\r
2152 }\r
2153\r
2154 if (v == NULL) {\r
2155 /* This was stack allocated. */\r
2156 nneeded = p - stackbuf;\r
2157 assert(nneeded <= nallocated);\r
2158 v = PyString_FromStringAndSize(stackbuf, nneeded);\r
2159 }\r
2160 else {\r
2161 /* Cut back to size actually needed. */\r
2162 nneeded = p - PyString_AS_STRING(v);\r
2163 assert(nneeded <= nallocated);\r
2164 if (_PyString_Resize(&v, nneeded))\r
2165 return NULL;\r
2166 }\r
2167 return v;\r
2168\r
2169#undef MAX_SHORT_UNICHARS\r
2170}\r
2171\r
2172PyObject *PyUnicode_AsUTF8String(PyObject *unicode)\r
2173{\r
2174 if (!PyUnicode_Check(unicode)) {\r
2175 PyErr_BadArgument();\r
2176 return NULL;\r
2177 }\r
2178 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),\r
2179 PyUnicode_GET_SIZE(unicode),\r
2180 NULL);\r
2181}\r
2182\r
2183/* --- UTF-32 Codec ------------------------------------------------------- */\r
2184\r
2185PyObject *\r
2186PyUnicode_DecodeUTF32(const char *s,\r
2187 Py_ssize_t size,\r
2188 const char *errors,\r
2189 int *byteorder)\r
2190{\r
2191 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);\r
2192}\r
2193\r
2194PyObject *\r
2195PyUnicode_DecodeUTF32Stateful(const char *s,\r
2196 Py_ssize_t size,\r
2197 const char *errors,\r
2198 int *byteorder,\r
2199 Py_ssize_t *consumed)\r
2200{\r
2201 const char *starts = s;\r
2202 Py_ssize_t startinpos;\r
2203 Py_ssize_t endinpos;\r
2204 Py_ssize_t outpos;\r
2205 PyUnicodeObject *unicode;\r
2206 Py_UNICODE *p;\r
2207#ifndef Py_UNICODE_WIDE\r
2208 int pairs = 0;\r
2209 const unsigned char *qq;\r
2210#else\r
2211 const int pairs = 0;\r
2212#endif\r
2213 const unsigned char *q, *e;\r
2214 int bo = 0; /* assume native ordering by default */\r
2215 const char *errmsg = "";\r
2216 /* Offsets from q for retrieving bytes in the right order. */\r
2217#ifdef BYTEORDER_IS_LITTLE_ENDIAN\r
2218 int iorder[] = {0, 1, 2, 3};\r
2219#else\r
2220 int iorder[] = {3, 2, 1, 0};\r
2221#endif\r
2222 PyObject *errorHandler = NULL;\r
2223 PyObject *exc = NULL;\r
de08c53b 2224\r
4710c53d 2225 q = (unsigned char *)s;\r
2226 e = q + size;\r
2227\r
2228 if (byteorder)\r
2229 bo = *byteorder;\r
2230\r
2231 /* Check for BOM marks (U+FEFF) in the input and adjust current\r
2232 byte order setting accordingly. In native mode, the leading BOM\r
2233 mark is skipped, in all other modes, it is copied to the output\r
2234 stream as-is (giving a ZWNBSP character). */\r
2235 if (bo == 0) {\r
2236 if (size >= 4) {\r
2237 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |\r
2238 (q[iorder[1]] << 8) | q[iorder[0]];\r
2239#ifdef BYTEORDER_IS_LITTLE_ENDIAN\r
2240 if (bom == 0x0000FEFF) {\r
2241 q += 4;\r
2242 bo = -1;\r
2243 }\r
2244 else if (bom == 0xFFFE0000) {\r
2245 q += 4;\r
2246 bo = 1;\r
2247 }\r
2248#else\r
2249 if (bom == 0x0000FEFF) {\r
2250 q += 4;\r
2251 bo = 1;\r
2252 }\r
2253 else if (bom == 0xFFFE0000) {\r
2254 q += 4;\r
2255 bo = -1;\r
2256 }\r
2257#endif\r
2258 }\r
2259 }\r
2260\r
2261 if (bo == -1) {\r
2262 /* force LE */\r
2263 iorder[0] = 0;\r
2264 iorder[1] = 1;\r
2265 iorder[2] = 2;\r
2266 iorder[3] = 3;\r
2267 }\r
2268 else if (bo == 1) {\r
2269 /* force BE */\r
2270 iorder[0] = 3;\r
2271 iorder[1] = 2;\r
2272 iorder[2] = 1;\r
2273 iorder[3] = 0;\r
2274 }\r
2275\r
2276 /* On narrow builds we split characters outside the BMP into two\r
2277 codepoints => count how much extra space we need. */\r
2278#ifndef Py_UNICODE_WIDE\r
2279 for (qq = q; qq < e; qq += 4)\r
2280 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)\r
2281 pairs++;\r
2282#endif\r
2283\r
2284 /* This might be one to much, because of a BOM */\r
2285 unicode = _PyUnicode_New((size+3)/4+pairs);\r
2286 if (!unicode)\r
2287 return NULL;\r
2288 if (size == 0)\r
2289 return (PyObject *)unicode;\r
2290\r
2291 /* Unpack UTF-32 encoded data */\r
2292 p = unicode->str;\r
2293\r
2294 while (q < e) {\r
2295 Py_UCS4 ch;\r
2296 /* remaining bytes at the end? (size should be divisible by 4) */\r
2297 if (e-q<4) {\r
2298 if (consumed)\r
2299 break;\r
2300 errmsg = "truncated data";\r
2301 startinpos = ((const char *)q)-starts;\r
2302 endinpos = ((const char *)e)-starts;\r
2303 goto utf32Error;\r
2304 /* The remaining input chars are ignored if the callback\r
2305 chooses to skip the input */\r
2306 }\r
2307 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |\r
2308 (q[iorder[1]] << 8) | q[iorder[0]];\r
2309\r
2310 if (ch >= 0x110000)\r
2311 {\r
2312 errmsg = "codepoint not in range(0x110000)";\r
2313 startinpos = ((const char *)q)-starts;\r
2314 endinpos = startinpos+4;\r
2315 goto utf32Error;\r
2316 }\r
2317#ifndef Py_UNICODE_WIDE\r
2318 if (ch >= 0x10000)\r
2319 {\r
2320 *p++ = 0xD800 | ((ch-0x10000) >> 10);\r
2321 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);\r
2322 }\r
2323 else\r
2324#endif\r
2325 *p++ = ch;\r
2326 q += 4;\r
2327 continue;\r
2328 utf32Error:\r
2329 outpos = p-PyUnicode_AS_UNICODE(unicode);\r
2330 if (unicode_decode_call_errorhandler(\r
2331 errors, &errorHandler,\r
2332 "utf32", errmsg,\r
2333 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,\r
2334 &unicode, &outpos, &p))\r
2335 goto onError;\r
2336 }\r
2337\r
2338 if (byteorder)\r
2339 *byteorder = bo;\r
2340\r
2341 if (consumed)\r
2342 *consumed = (const char *)q-starts;\r
2343\r
2344 /* Adjust length */\r
2345 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)\r
2346 goto onError;\r
2347\r
2348 Py_XDECREF(errorHandler);\r
2349 Py_XDECREF(exc);\r
2350 return (PyObject *)unicode;\r
2351\r
2352 onError:\r
2353 Py_DECREF(unicode);\r
2354 Py_XDECREF(errorHandler);\r
2355 Py_XDECREF(exc);\r
2356 return NULL;\r
2357}\r
2358\r
2359PyObject *\r
2360PyUnicode_EncodeUTF32(const Py_UNICODE *s,\r
2361 Py_ssize_t size,\r
2362 const char *errors,\r
2363 int byteorder)\r
2364{\r
2365 PyObject *v;\r
2366 unsigned char *p;\r
2367 Py_ssize_t nsize, bytesize;\r
2368#ifndef Py_UNICODE_WIDE\r
2369 Py_ssize_t i, pairs;\r
2370#else\r
2371 const int pairs = 0;\r
2372#endif\r
2373 /* Offsets from p for storing byte pairs in the right order. */\r
2374#ifdef BYTEORDER_IS_LITTLE_ENDIAN\r
2375 int iorder[] = {0, 1, 2, 3};\r
2376#else\r
2377 int iorder[] = {3, 2, 1, 0};\r
2378#endif\r
2379\r
2380#define STORECHAR(CH) \\r
2381 do { \\r
2382 p[iorder[3]] = ((CH) >> 24) & 0xff; \\r
2383 p[iorder[2]] = ((CH) >> 16) & 0xff; \\r
2384 p[iorder[1]] = ((CH) >> 8) & 0xff; \\r
2385 p[iorder[0]] = (CH) & 0xff; \\r
2386 p += 4; \\r
2387 } while(0)\r
2388\r
2389 /* In narrow builds we can output surrogate pairs as one codepoint,\r
2390 so we need less space. */\r
2391#ifndef Py_UNICODE_WIDE\r
2392 for (i = pairs = 0; i < size-1; i++)\r
2393 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&\r
2394 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)\r
2395 pairs++;\r
2396#endif\r
2397 nsize = (size - pairs + (byteorder == 0));\r
2398 bytesize = nsize * 4;\r
2399 if (bytesize / 4 != nsize)\r
2400 return PyErr_NoMemory();\r
2401 v = PyString_FromStringAndSize(NULL, bytesize);\r
2402 if (v == NULL)\r
2403 return NULL;\r
2404\r
2405 p = (unsigned char *)PyString_AS_STRING(v);\r
2406 if (byteorder == 0)\r
2407 STORECHAR(0xFEFF);\r
2408 if (size == 0)\r
2409 return v;\r
2410\r
2411 if (byteorder == -1) {\r
2412 /* force LE */\r
2413 iorder[0] = 0;\r
2414 iorder[1] = 1;\r
2415 iorder[2] = 2;\r
2416 iorder[3] = 3;\r
2417 }\r
2418 else if (byteorder == 1) {\r
2419 /* force BE */\r
2420 iorder[0] = 3;\r
2421 iorder[1] = 2;\r
2422 iorder[2] = 1;\r
2423 iorder[3] = 0;\r
2424 }\r
2425\r
2426 while (size-- > 0) {\r
2427 Py_UCS4 ch = *s++;\r
2428#ifndef Py_UNICODE_WIDE\r
2429 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {\r
2430 Py_UCS4 ch2 = *s;\r
2431 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {\r
2432 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;\r
2433 s++;\r
2434 size--;\r
2435 }\r
2436 }\r
2437#endif\r
2438 STORECHAR(ch);\r
2439 }\r
2440 return v;\r
2441#undef STORECHAR\r
2442}\r
2443\r
2444PyObject *PyUnicode_AsUTF32String(PyObject *unicode)\r
2445{\r
2446 if (!PyUnicode_Check(unicode)) {\r
2447 PyErr_BadArgument();\r
2448 return NULL;\r
2449 }\r
2450 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),\r
2451 PyUnicode_GET_SIZE(unicode),\r
2452 NULL,\r
2453 0);\r
2454}\r
2455\r
2456/* --- UTF-16 Codec ------------------------------------------------------- */\r
2457\r
2458PyObject *\r
2459PyUnicode_DecodeUTF16(const char *s,\r
2460 Py_ssize_t size,\r
2461 const char *errors,\r
2462 int *byteorder)\r
2463{\r
2464 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);\r
2465}\r
2466\r
2467PyObject *\r
2468PyUnicode_DecodeUTF16Stateful(const char *s,\r
2469 Py_ssize_t size,\r
2470 const char *errors,\r
2471 int *byteorder,\r
2472 Py_ssize_t *consumed)\r
2473{\r
2474 const char *starts = s;\r
2475 Py_ssize_t startinpos;\r
2476 Py_ssize_t endinpos;\r
2477 Py_ssize_t outpos;\r
2478 PyUnicodeObject *unicode;\r
2479 Py_UNICODE *p;\r
2480 const unsigned char *q, *e;\r
2481 int bo = 0; /* assume native ordering by default */\r
2482 const char *errmsg = "";\r
2483 /* Offsets from q for retrieving byte pairs in the right order. */\r
2484#ifdef BYTEORDER_IS_LITTLE_ENDIAN\r
2485 int ihi = 1, ilo = 0;\r
2486#else\r
2487 int ihi = 0, ilo = 1;\r
2488#endif\r
2489 PyObject *errorHandler = NULL;\r
2490 PyObject *exc = NULL;\r
2491\r
2492 /* Note: size will always be longer than the resulting Unicode\r
2493 character count */\r
2494 unicode = _PyUnicode_New(size);\r
2495 if (!unicode)\r
2496 return NULL;\r
2497 if (size == 0)\r
2498 return (PyObject *)unicode;\r
2499\r
2500 /* Unpack UTF-16 encoded data */\r
2501 p = unicode->str;\r
2502 q = (unsigned char *)s;\r
2503 e = q + size;\r
2504\r
2505 if (byteorder)\r
2506 bo = *byteorder;\r
2507\r
2508 /* Check for BOM marks (U+FEFF) in the input and adjust current\r
2509 byte order setting accordingly. In native mode, the leading BOM\r
2510 mark is skipped, in all other modes, it is copied to the output\r
2511 stream as-is (giving a ZWNBSP character). */\r
2512 if (bo == 0) {\r
2513 if (size >= 2) {\r
2514 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];\r
2515#ifdef BYTEORDER_IS_LITTLE_ENDIAN\r
2516 if (bom == 0xFEFF) {\r
2517 q += 2;\r
2518 bo = -1;\r
2519 }\r
2520 else if (bom == 0xFFFE) {\r
2521 q += 2;\r
2522 bo = 1;\r
2523 }\r
2524#else\r
2525 if (bom == 0xFEFF) {\r
2526 q += 2;\r
2527 bo = 1;\r
2528 }\r
2529 else if (bom == 0xFFFE) {\r
2530 q += 2;\r
2531 bo = -1;\r
2532 }\r
2533#endif\r
2534 }\r
2535 }\r
2536\r
2537 if (bo == -1) {\r
2538 /* force LE */\r
2539 ihi = 1;\r
2540 ilo = 0;\r
2541 }\r
2542 else if (bo == 1) {\r
2543 /* force BE */\r
2544 ihi = 0;\r
2545 ilo = 1;\r
2546 }\r
2547\r
2548 while (q < e) {\r
2549 Py_UNICODE ch;\r
2550 /* remaining bytes at the end? (size should be even) */\r
2551 if (e-q<2) {\r
2552 if (consumed)\r
2553 break;\r
2554 errmsg = "truncated data";\r
2555 startinpos = ((const char *)q)-starts;\r
2556 endinpos = ((const char *)e)-starts;\r
2557 goto utf16Error;\r
2558 /* The remaining input chars are ignored if the callback\r
2559 chooses to skip the input */\r
2560 }\r
2561 ch = (q[ihi] << 8) | q[ilo];\r
2562\r
2563 q += 2;\r
2564\r
2565 if (ch < 0xD800 || ch > 0xDFFF) {\r
2566 *p++ = ch;\r
2567 continue;\r
2568 }\r
2569\r
2570 /* UTF-16 code pair: */\r
2571 if (q >= e) {\r
2572 errmsg = "unexpected end of data";\r
2573 startinpos = (((const char *)q)-2)-starts;\r
2574 endinpos = ((const char *)e)-starts;\r
2575 goto utf16Error;\r
2576 }\r
2577 if (0xD800 <= ch && ch <= 0xDBFF) {\r
2578 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];\r
2579 q += 2;\r
2580 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {\r
2581#ifndef Py_UNICODE_WIDE\r
2582 *p++ = ch;\r
2583 *p++ = ch2;\r
2584#else\r
2585 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;\r
2586#endif\r
2587 continue;\r
2588 }\r
2589 else {\r
2590 errmsg = "illegal UTF-16 surrogate";\r
2591 startinpos = (((const char *)q)-4)-starts;\r
2592 endinpos = startinpos+2;\r
2593 goto utf16Error;\r
2594 }\r
2595\r
2596 }\r
2597 errmsg = "illegal encoding";\r
2598 startinpos = (((const char *)q)-2)-starts;\r
2599 endinpos = startinpos+2;\r
2600 /* Fall through to report the error */\r
2601\r
2602 utf16Error:\r
2603 outpos = p-PyUnicode_AS_UNICODE(unicode);\r
2604 if (unicode_decode_call_errorhandler(\r
2605 errors, &errorHandler,\r
2606 "utf16", errmsg,\r
2607 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,\r
2608 &unicode, &outpos, &p))\r
2609 goto onError;\r
2610 }\r
2611\r
2612 if (byteorder)\r
2613 *byteorder = bo;\r
2614\r
2615 if (consumed)\r
2616 *consumed = (const char *)q-starts;\r
2617\r
2618 /* Adjust length */\r
2619 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)\r
2620 goto onError;\r
2621\r
2622 Py_XDECREF(errorHandler);\r
2623 Py_XDECREF(exc);\r
2624 return (PyObject *)unicode;\r
2625\r
2626 onError:\r
2627 Py_DECREF(unicode);\r
2628 Py_XDECREF(errorHandler);\r
2629 Py_XDECREF(exc);\r
2630 return NULL;\r
2631}\r
2632\r
2633PyObject *\r
2634PyUnicode_EncodeUTF16(const Py_UNICODE *s,\r
2635 Py_ssize_t size,\r
2636 const char *errors,\r
2637 int byteorder)\r
2638{\r
2639 PyObject *v;\r
2640 unsigned char *p;\r
2641 Py_ssize_t nsize, bytesize;\r
2642#ifdef Py_UNICODE_WIDE\r
2643 Py_ssize_t i, pairs;\r
2644#else\r
2645 const int pairs = 0;\r
2646#endif\r
2647 /* Offsets from p for storing byte pairs in the right order. */\r
2648#ifdef BYTEORDER_IS_LITTLE_ENDIAN\r
2649 int ihi = 1, ilo = 0;\r
2650#else\r
2651 int ihi = 0, ilo = 1;\r
2652#endif\r
2653\r
2654#define STORECHAR(CH) \\r
2655 do { \\r
2656 p[ihi] = ((CH) >> 8) & 0xff; \\r
2657 p[ilo] = (CH) & 0xff; \\r
2658 p += 2; \\r
2659 } while(0)\r
2660\r
2661#ifdef Py_UNICODE_WIDE\r
2662 for (i = pairs = 0; i < size; i++)\r
2663 if (s[i] >= 0x10000)\r
2664 pairs++;\r
2665#endif\r
2666 /* 2 * (size + pairs + (byteorder == 0)) */\r
2667 if (size > PY_SSIZE_T_MAX ||\r
2668 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))\r
2669 return PyErr_NoMemory();\r
2670 nsize = size + pairs + (byteorder == 0);\r
2671 bytesize = nsize * 2;\r
2672 if (bytesize / 2 != nsize)\r
2673 return PyErr_NoMemory();\r
2674 v = PyString_FromStringAndSize(NULL, bytesize);\r
2675 if (v == NULL)\r
2676 return NULL;\r
2677\r
2678 p = (unsigned char *)PyString_AS_STRING(v);\r
2679 if (byteorder == 0)\r
2680 STORECHAR(0xFEFF);\r
2681 if (size == 0)\r
2682 return v;\r
2683\r
2684 if (byteorder == -1) {\r
2685 /* force LE */\r
2686 ihi = 1;\r
2687 ilo = 0;\r
2688 }\r
2689 else if (byteorder == 1) {\r
2690 /* force BE */\r
2691 ihi = 0;\r
2692 ilo = 1;\r
2693 }\r
2694\r
2695 while (size-- > 0) {\r
2696 Py_UNICODE ch = *s++;\r
2697 Py_UNICODE ch2 = 0;\r
2698#ifdef Py_UNICODE_WIDE\r
2699 if (ch >= 0x10000) {\r
2700 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);\r
2701 ch = 0xD800 | ((ch-0x10000) >> 10);\r
2702 }\r
2703#endif\r
2704 STORECHAR(ch);\r
2705 if (ch2)\r
2706 STORECHAR(ch2);\r
2707 }\r
2708 return v;\r
2709#undef STORECHAR\r
2710}\r
2711\r
2712PyObject *PyUnicode_AsUTF16String(PyObject *unicode)\r
2713{\r
2714 if (!PyUnicode_Check(unicode)) {\r
2715 PyErr_BadArgument();\r
2716 return NULL;\r
2717 }\r
2718 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),\r
2719 PyUnicode_GET_SIZE(unicode),\r
2720 NULL,\r
2721 0);\r
2722}\r
2723\r
2724/* --- Unicode Escape Codec ----------------------------------------------- */\r
2725\r
2726static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;\r
2727\r
2728PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,\r
2729 Py_ssize_t size,\r
2730 const char *errors)\r
2731{\r
2732 const char *starts = s;\r
2733 Py_ssize_t startinpos;\r
2734 Py_ssize_t endinpos;\r
2735 Py_ssize_t outpos;\r
2736 int i;\r
2737 PyUnicodeObject *v;\r
2738 Py_UNICODE *p;\r
2739 const char *end;\r
2740 char* message;\r
2741 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */\r
2742 PyObject *errorHandler = NULL;\r
2743 PyObject *exc = NULL;\r
2744\r
2745 /* Escaped strings will always be longer than the resulting\r
2746 Unicode string, so we start with size here and then reduce the\r
2747 length after conversion to the true value.\r
2748 (but if the error callback returns a long replacement string\r
2749 we'll have to allocate more space) */\r
2750 v = _PyUnicode_New(size);\r
2751 if (v == NULL)\r
2752 goto onError;\r
2753 if (size == 0)\r
2754 return (PyObject *)v;\r
2755\r
2756 p = PyUnicode_AS_UNICODE(v);\r
2757 end = s + size;\r
2758\r
2759 while (s < end) {\r
2760 unsigned char c;\r
2761 Py_UNICODE x;\r
2762 int digits;\r
2763\r
2764 /* Non-escape characters are interpreted as Unicode ordinals */\r
2765 if (*s != '\\') {\r
2766 *p++ = (unsigned char) *s++;\r
2767 continue;\r
2768 }\r
2769\r
2770 startinpos = s-starts;\r
2771 /* \ - Escapes */\r
2772 s++;\r
2773 c = *s++;\r
2774 if (s > end)\r
2775 c = '\0'; /* Invalid after \ */\r
2776 switch (c) {\r
2777\r
2778 /* \x escapes */\r
2779 case '\n': break;\r
2780 case '\\': *p++ = '\\'; break;\r
2781 case '\'': *p++ = '\''; break;\r
2782 case '\"': *p++ = '\"'; break;\r
2783 case 'b': *p++ = '\b'; break;\r
2784 case 'f': *p++ = '\014'; break; /* FF */\r
2785 case 't': *p++ = '\t'; break;\r
2786 case 'n': *p++ = '\n'; break;\r
2787 case 'r': *p++ = '\r'; break;\r
2788 case 'v': *p++ = '\013'; break; /* VT */\r
2789 case 'a': *p++ = '\007'; break; /* BEL, not classic C */\r
2790\r
2791 /* \OOO (octal) escapes */\r
2792 case '0': case '1': case '2': case '3':\r
2793 case '4': case '5': case '6': case '7':\r
2794 x = s[-1] - '0';\r
2795 if (s < end && '0' <= *s && *s <= '7') {\r
2796 x = (x<<3) + *s++ - '0';\r
2797 if (s < end && '0' <= *s && *s <= '7')\r
2798 x = (x<<3) + *s++ - '0';\r
2799 }\r
2800 *p++ = x;\r
2801 break;\r
2802\r
2803 /* hex escapes */\r
2804 /* \xXX */\r
2805 case 'x':\r
2806 digits = 2;\r
2807 message = "truncated \\xXX escape";\r
2808 goto hexescape;\r
2809\r
2810 /* \uXXXX */\r
2811 case 'u':\r
2812 digits = 4;\r
2813 message = "truncated \\uXXXX escape";\r
2814 goto hexescape;\r
2815\r
2816 /* \UXXXXXXXX */\r
2817 case 'U':\r
2818 digits = 8;\r
2819 message = "truncated \\UXXXXXXXX escape";\r
2820 hexescape:\r
2821 chr = 0;\r
2822 outpos = p-PyUnicode_AS_UNICODE(v);\r
2823 if (s+digits>end) {\r
2824 endinpos = size;\r
2825 if (unicode_decode_call_errorhandler(\r
2826 errors, &errorHandler,\r
2827 "unicodeescape", "end of string in escape sequence",\r
2828 starts, size, &startinpos, &endinpos, &exc, &s,\r
2829 &v, &outpos, &p))\r
2830 goto onError;\r
2831 goto nextByte;\r
2832 }\r
2833 for (i = 0; i < digits; ++i) {\r
2834 c = (unsigned char) s[i];\r
2835 if (!isxdigit(c)) {\r
2836 endinpos = (s+i+1)-starts;\r
2837 if (unicode_decode_call_errorhandler(\r
2838 errors, &errorHandler,\r
2839 "unicodeescape", message,\r
2840 starts, size, &startinpos, &endinpos, &exc, &s,\r
2841 &v, &outpos, &p))\r
2842 goto onError;\r
2843 goto nextByte;\r
2844 }\r
2845 chr = (chr<<4) & ~0xF;\r
2846 if (c >= '0' && c <= '9')\r
2847 chr += c - '0';\r
2848 else if (c >= 'a' && c <= 'f')\r
2849 chr += 10 + c - 'a';\r
2850 else\r
2851 chr += 10 + c - 'A';\r
2852 }\r
2853 s += i;\r
2854 if (chr == 0xffffffff && PyErr_Occurred())\r
2855 /* _decoding_error will have already written into the\r
2856 target buffer. */\r
2857 break;\r
2858 store:\r
2859 /* when we get here, chr is a 32-bit unicode character */\r
2860 if (chr <= 0xffff)\r
2861 /* UCS-2 character */\r
2862 *p++ = (Py_UNICODE) chr;\r
2863 else if (chr <= 0x10ffff) {\r
2864 /* UCS-4 character. Either store directly, or as\r
2865 surrogate pair. */\r
2866#ifdef Py_UNICODE_WIDE\r
2867 *p++ = chr;\r
2868#else\r
2869 chr -= 0x10000L;\r
2870 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);\r
2871 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);\r
2872#endif\r
2873 } else {\r
2874 endinpos = s-starts;\r
2875 outpos = p-PyUnicode_AS_UNICODE(v);\r
2876 if (unicode_decode_call_errorhandler(\r
2877 errors, &errorHandler,\r
2878 "unicodeescape", "illegal Unicode character",\r
2879 starts, size, &startinpos, &endinpos, &exc, &s,\r
2880 &v, &outpos, &p))\r
2881 goto onError;\r
2882 }\r
2883 break;\r
2884\r
2885 /* \N{name} */\r
2886 case 'N':\r
2887 message = "malformed \\N character escape";\r
2888 if (ucnhash_CAPI == NULL) {\r
2889 /* load the unicode data module */\r
2890 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);\r
2891 if (ucnhash_CAPI == NULL)\r
2892 goto ucnhashError;\r
2893 }\r
2894 if (*s == '{') {\r
2895 const char *start = s+1;\r
2896 /* look for the closing brace */\r
2897 while (*s != '}' && s < end)\r
2898 s++;\r
2899 if (s > start && s < end && *s == '}') {\r
2900 /* found a name. look it up in the unicode database */\r
2901 message = "unknown Unicode character name";\r
2902 s++;\r
2903 if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))\r
2904 goto store;\r
2905 }\r
2906 }\r
2907 endinpos = s-starts;\r
2908 outpos = p-PyUnicode_AS_UNICODE(v);\r
2909 if (unicode_decode_call_errorhandler(\r
2910 errors, &errorHandler,\r
2911 "unicodeescape", message,\r
2912 starts, size, &startinpos, &endinpos, &exc, &s,\r
2913 &v, &outpos, &p))\r
2914 goto onError;\r
2915 break;\r
2916\r
2917 default:\r
2918 if (s > end) {\r
2919 message = "\\ at end of string";\r
2920 s--;\r
2921 endinpos = s-starts;\r
2922 outpos = p-PyUnicode_AS_UNICODE(v);\r
2923 if (unicode_decode_call_errorhandler(\r
2924 errors, &errorHandler,\r
2925 "unicodeescape", message,\r
2926 starts, size, &startinpos, &endinpos, &exc, &s,\r
2927 &v, &outpos, &p))\r
2928 goto onError;\r
2929 }\r
2930 else {\r
2931 *p++ = '\\';\r
2932 *p++ = (unsigned char)s[-1];\r
2933 }\r
2934 break;\r
2935 }\r
2936 nextByte:\r
2937 ;\r
2938 }\r
2939 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)\r
2940 goto onError;\r
2941 Py_XDECREF(errorHandler);\r
2942 Py_XDECREF(exc);\r
2943 return (PyObject *)v;\r
2944\r
2945 ucnhashError:\r
2946 PyErr_SetString(\r
2947 PyExc_UnicodeError,\r
2948 "\\N escapes not supported (can't load unicodedata module)"\r
2949 );\r
2950 Py_XDECREF(v);\r
2951 Py_XDECREF(errorHandler);\r
2952 Py_XDECREF(exc);\r
2953 return NULL;\r
2954\r
2955 onError:\r
2956 Py_XDECREF(v);\r
2957 Py_XDECREF(errorHandler);\r
2958 Py_XDECREF(exc);\r
2959 return NULL;\r
2960}\r
2961\r
2962/* Return a Unicode-Escape string version of the Unicode object.\r
2963\r
2964 If quotes is true, the string is enclosed in u"" or u'' quotes as\r
2965 appropriate.\r
2966\r
2967*/\r
2968\r
2969Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,\r
2970 Py_ssize_t size,\r
2971 Py_UNICODE ch)\r
2972{\r
2973 /* like wcschr, but doesn't stop at NULL characters */\r
2974\r
2975 while (size-- > 0) {\r
2976 if (*s == ch)\r
2977 return s;\r
2978 s++;\r
2979 }\r
2980\r
2981 return NULL;\r
2982}\r
2983\r
2984static\r
2985PyObject *unicodeescape_string(const Py_UNICODE *s,\r
2986 Py_ssize_t size,\r
2987 int quotes)\r
2988{\r
2989 PyObject *repr;\r
2990 char *p;\r
2991\r
2992 static const char *hexdigit = "0123456789abcdef";\r
2993#ifdef Py_UNICODE_WIDE\r
2994 const Py_ssize_t expandsize = 10;\r
2995#else\r
2996 const Py_ssize_t expandsize = 6;\r
2997#endif\r
2998\r
2999 /* XXX(nnorwitz): rather than over-allocating, it would be\r
3000 better to choose a different scheme. Perhaps scan the\r
3001 first N-chars of the string and allocate based on that size.\r
3002 */\r
3003 /* Initial allocation is based on the longest-possible unichr\r
3004 escape.\r
3005\r
3006 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source\r
3007 unichr, so in this case it's the longest unichr escape. In\r
3008 narrow (UTF-16) builds this is five chars per source unichr\r
3009 since there are two unichrs in the surrogate pair, so in narrow\r
3010 (UTF-16) builds it's not the longest unichr escape.\r
3011\r
3012 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,\r
3013 so in the narrow (UTF-16) build case it's the longest unichr\r
3014 escape.\r
3015 */\r
3016\r
3017 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)\r
3018 return PyErr_NoMemory();\r
3019\r
3020 repr = PyString_FromStringAndSize(NULL,\r
3021 2\r
3022 + expandsize*size\r
3023 + 1);\r
3024 if (repr == NULL)\r
3025 return NULL;\r
3026\r
3027 p = PyString_AS_STRING(repr);\r
3028\r
3029 if (quotes) {\r
3030 *p++ = 'u';\r
3031 *p++ = (findchar(s, size, '\'') &&\r
3032 !findchar(s, size, '"')) ? '"' : '\'';\r
3033 }\r
3034 while (size-- > 0) {\r
3035 Py_UNICODE ch = *s++;\r
3036\r
3037 /* Escape quotes and backslashes */\r
3038 if ((quotes &&\r
3039 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {\r
3040 *p++ = '\\';\r
3041 *p++ = (char) ch;\r
3042 continue;\r
3043 }\r
3044\r
3045#ifdef Py_UNICODE_WIDE\r
3046 /* Map 21-bit characters to '\U00xxxxxx' */\r
3047 else if (ch >= 0x10000) {\r
3048 *p++ = '\\';\r
3049 *p++ = 'U';\r
3050 *p++ = hexdigit[(ch >> 28) & 0x0000000F];\r
3051 *p++ = hexdigit[(ch >> 24) & 0x0000000F];\r
3052 *p++ = hexdigit[(ch >> 20) & 0x0000000F];\r
3053 *p++ = hexdigit[(ch >> 16) & 0x0000000F];\r
3054 *p++ = hexdigit[(ch >> 12) & 0x0000000F];\r
3055 *p++ = hexdigit[(ch >> 8) & 0x0000000F];\r
3056 *p++ = hexdigit[(ch >> 4) & 0x0000000F];\r
3057 *p++ = hexdigit[ch & 0x0000000F];\r
3058 continue;\r
3059 }\r
3060#else\r
3061 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */\r
3062 else if (ch >= 0xD800 && ch < 0xDC00) {\r
3063 Py_UNICODE ch2;\r
3064 Py_UCS4 ucs;\r
3065\r
3066 ch2 = *s++;\r
3067 size--;\r
3068 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {\r
3069 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;\r
3070 *p++ = '\\';\r
3071 *p++ = 'U';\r
3072 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];\r
3073 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];\r
3074 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];\r
3075 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];\r
3076 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];\r
3077 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];\r
3078 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];\r
3079 *p++ = hexdigit[ucs & 0x0000000F];\r
3080 continue;\r
3081 }\r
3082 /* Fall through: isolated surrogates are copied as-is */\r
3083 s--;\r
3084 size++;\r
3085 }\r
3086#endif\r
3087\r
3088 /* Map 16-bit characters to '\uxxxx' */\r
3089 if (ch >= 256) {\r
3090 *p++ = '\\';\r
3091 *p++ = 'u';\r
3092 *p++ = hexdigit[(ch >> 12) & 0x000F];\r
3093 *p++ = hexdigit[(ch >> 8) & 0x000F];\r
3094 *p++ = hexdigit[(ch >> 4) & 0x000F];\r
3095 *p++ = hexdigit[ch & 0x000F];\r
3096 }\r
3097\r
3098 /* Map special whitespace to '\t', \n', '\r' */\r
3099 else if (ch == '\t') {\r
3100 *p++ = '\\';\r
3101 *p++ = 't';\r
3102 }\r
3103 else if (ch == '\n') {\r
3104 *p++ = '\\';\r
3105 *p++ = 'n';\r
3106 }\r
3107 else if (ch == '\r') {\r
3108 *p++ = '\\';\r
3109 *p++ = 'r';\r
3110 }\r
3111\r
3112 /* Map non-printable US ASCII to '\xhh' */\r
3113 else if (ch < ' ' || ch >= 0x7F) {\r
3114 *p++ = '\\';\r
3115 *p++ = 'x';\r
3116 *p++ = hexdigit[(ch >> 4) & 0x000F];\r
3117 *p++ = hexdigit[ch & 0x000F];\r
3118 }\r
3119\r
3120 /* Copy everything else as-is */\r
3121 else\r
3122 *p++ = (char) ch;\r
3123 }\r
3124 if (quotes)\r
3125 *p++ = PyString_AS_STRING(repr)[1];\r
3126\r
3127 *p = '\0';\r
3128 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))\r
3129 return NULL;\r
3130 return repr;\r
3131}\r
3132\r
3133PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,\r
3134 Py_ssize_t size)\r
3135{\r
3136 return unicodeescape_string(s, size, 0);\r
3137}\r
3138\r
3139PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)\r
3140{\r
3141 if (!PyUnicode_Check(unicode)) {\r
3142 PyErr_BadArgument();\r
3143 return NULL;\r
3144 }\r
3145 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),\r
3146 PyUnicode_GET_SIZE(unicode));\r
3147}\r
3148\r
3149/* --- Raw Unicode Escape Codec ------------------------------------------- */\r
3150\r
3151PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,\r
3152 Py_ssize_t size,\r
3153 const char *errors)\r
3154{\r
3155 const char *starts = s;\r
3156 Py_ssize_t startinpos;\r
3157 Py_ssize_t endinpos;\r
3158 Py_ssize_t outpos;\r
3159 PyUnicodeObject *v;\r
3160 Py_UNICODE *p;\r
3161 const char *end;\r
3162 const char *bs;\r
3163 PyObject *errorHandler = NULL;\r
3164 PyObject *exc = NULL;\r
3165\r
3166 /* Escaped strings will always be longer than the resulting\r
3167 Unicode string, so we start with size here and then reduce the\r
3168 length after conversion to the true value. (But decoding error\r
3169 handler might have to resize the string) */\r
3170 v = _PyUnicode_New(size);\r
3171 if (v == NULL)\r
3172 goto onError;\r
3173 if (size == 0)\r
3174 return (PyObject *)v;\r
3175 p = PyUnicode_AS_UNICODE(v);\r
3176 end = s + size;\r
3177 while (s < end) {\r
3178 unsigned char c;\r
3179 Py_UCS4 x;\r
3180 int i;\r
3181 int count;\r
3182\r
3183 /* Non-escape characters are interpreted as Unicode ordinals */\r
3184 if (*s != '\\') {\r
3185 *p++ = (unsigned char)*s++;\r
3186 continue;\r
3187 }\r
3188 startinpos = s-starts;\r
3189\r
3190 /* \u-escapes are only interpreted iff the number of leading\r
3191 backslashes if odd */\r
3192 bs = s;\r
3193 for (;s < end;) {\r
3194 if (*s != '\\')\r
3195 break;\r
3196 *p++ = (unsigned char)*s++;\r
3197 }\r
3198 if (((s - bs) & 1) == 0 ||\r
3199 s >= end ||\r
3200 (*s != 'u' && *s != 'U')) {\r
3201 continue;\r
3202 }\r
3203 p--;\r
3204 count = *s=='u' ? 4 : 8;\r
3205 s++;\r
3206\r
3207 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */\r
3208 outpos = p-PyUnicode_AS_UNICODE(v);\r
3209 for (x = 0, i = 0; i < count; ++i, ++s) {\r
3210 c = (unsigned char)*s;\r
3211 if (!isxdigit(c)) {\r
3212 endinpos = s-starts;\r
3213 if (unicode_decode_call_errorhandler(\r
3214 errors, &errorHandler,\r
3215 "rawunicodeescape", "truncated \\uXXXX",\r
3216 starts, size, &startinpos, &endinpos, &exc, &s,\r
3217 &v, &outpos, &p))\r
3218 goto onError;\r
3219 goto nextByte;\r
3220 }\r
3221 x = (x<<4) & ~0xF;\r
3222 if (c >= '0' && c <= '9')\r
3223 x += c - '0';\r
3224 else if (c >= 'a' && c <= 'f')\r
3225 x += 10 + c - 'a';\r
3226 else\r
3227 x += 10 + c - 'A';\r
3228 }\r
3229 if (x <= 0xffff)\r
3230 /* UCS-2 character */\r
3231 *p++ = (Py_UNICODE) x;\r
3232 else if (x <= 0x10ffff) {\r
3233 /* UCS-4 character. Either store directly, or as\r
3234 surrogate pair. */\r
3235#ifdef Py_UNICODE_WIDE\r
3236 *p++ = (Py_UNICODE) x;\r
3237#else\r
3238 x -= 0x10000L;\r
3239 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);\r
3240 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);\r
3241#endif\r
3242 } else {\r
3243 endinpos = s-starts;\r
3244 outpos = p-PyUnicode_AS_UNICODE(v);\r
3245 if (unicode_decode_call_errorhandler(\r
3246 errors, &errorHandler,\r
3247 "rawunicodeescape", "\\Uxxxxxxxx out of range",\r
3248 starts, size, &startinpos, &endinpos, &exc, &s,\r
3249 &v, &outpos, &p))\r
3250 goto onError;\r
3251 }\r
3252 nextByte:\r
3253 ;\r
3254 }\r
3255 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)\r
3256 goto onError;\r
3257 Py_XDECREF(errorHandler);\r
3258 Py_XDECREF(exc);\r
3259 return (PyObject *)v;\r
3260\r
3261 onError:\r
3262 Py_XDECREF(v);\r
3263 Py_XDECREF(errorHandler);\r
3264 Py_XDECREF(exc);\r
3265 return NULL;\r
3266}\r
3267\r
3268PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,\r
3269 Py_ssize_t size)\r
3270{\r
3271 PyObject *repr;\r
3272 char *p;\r
3273 char *q;\r
3274\r
3275 static const char *hexdigit = "0123456789abcdef";\r
3276#ifdef Py_UNICODE_WIDE\r
3277 const Py_ssize_t expandsize = 10;\r
3278#else\r
3279 const Py_ssize_t expandsize = 6;\r
3280#endif\r
3281\r
3282 if (size > PY_SSIZE_T_MAX / expandsize)\r
3283 return PyErr_NoMemory();\r
3284\r
3285 repr = PyString_FromStringAndSize(NULL, expandsize * size);\r
3286 if (repr == NULL)\r
3287 return NULL;\r
3288 if (size == 0)\r
3289 return repr;\r
3290\r
3291 p = q = PyString_AS_STRING(repr);\r
3292 while (size-- > 0) {\r
3293 Py_UNICODE ch = *s++;\r
3294#ifdef Py_UNICODE_WIDE\r
3295 /* Map 32-bit characters to '\Uxxxxxxxx' */\r
3296 if (ch >= 0x10000) {\r
3297 *p++ = '\\';\r
3298 *p++ = 'U';\r
3299 *p++ = hexdigit[(ch >> 28) & 0xf];\r
3300 *p++ = hexdigit[(ch >> 24) & 0xf];\r
3301 *p++ = hexdigit[(ch >> 20) & 0xf];\r
3302 *p++ = hexdigit[(ch >> 16) & 0xf];\r
3303 *p++ = hexdigit[(ch >> 12) & 0xf];\r
3304 *p++ = hexdigit[(ch >> 8) & 0xf];\r
3305 *p++ = hexdigit[(ch >> 4) & 0xf];\r
3306 *p++ = hexdigit[ch & 15];\r
3307 }\r
3308 else\r
3309#else\r
3310 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */\r
3311 if (ch >= 0xD800 && ch < 0xDC00) {\r
3312 Py_UNICODE ch2;\r
3313 Py_UCS4 ucs;\r
3314\r
3315 ch2 = *s++;\r
3316 size--;\r
3317 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {\r
3318 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;\r
3319 *p++ = '\\';\r
3320 *p++ = 'U';\r
3321 *p++ = hexdigit[(ucs >> 28) & 0xf];\r
3322 *p++ = hexdigit[(ucs >> 24) & 0xf];\r
3323 *p++ = hexdigit[(ucs >> 20) & 0xf];\r
3324 *p++ = hexdigit[(ucs >> 16) & 0xf];\r
3325 *p++ = hexdigit[(ucs >> 12) & 0xf];\r
3326 *p++ = hexdigit[(ucs >> 8) & 0xf];\r
3327 *p++ = hexdigit[(ucs >> 4) & 0xf];\r
3328 *p++ = hexdigit[ucs & 0xf];\r
3329 continue;\r
3330 }\r
3331 /* Fall through: isolated surrogates are copied as-is */\r
3332 s--;\r
3333 size++;\r
3334 }\r
3335#endif\r
3336 /* Map 16-bit characters to '\uxxxx' */\r
3337 if (ch >= 256) {\r
3338 *p++ = '\\';\r
3339 *p++ = 'u';\r
3340 *p++ = hexdigit[(ch >> 12) & 0xf];\r
3341 *p++ = hexdigit[(ch >> 8) & 0xf];\r
3342 *p++ = hexdigit[(ch >> 4) & 0xf];\r
3343 *p++ = hexdigit[ch & 15];\r
3344 }\r
3345 /* Copy everything else as-is */\r
3346 else\r
3347 *p++ = (char) ch;\r
3348 }\r
3349 *p = '\0';\r
3350 if (_PyString_Resize(&repr, p - q))\r
3351 return NULL;\r
3352 return repr;\r
3353}\r
3354\r
3355PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)\r
3356{\r
3357 if (!PyUnicode_Check(unicode)) {\r
3358 PyErr_BadArgument();\r
3359 return NULL;\r
3360 }\r
3361 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),\r
3362 PyUnicode_GET_SIZE(unicode));\r
3363}\r
3364\r
3365/* --- Unicode Internal Codec ------------------------------------------- */\r
3366\r
3367PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,\r
3368 Py_ssize_t size,\r
3369 const char *errors)\r
3370{\r
3371 const char *starts = s;\r
3372 Py_ssize_t startinpos;\r
3373 Py_ssize_t endinpos;\r
3374 Py_ssize_t outpos;\r
3375 PyUnicodeObject *v;\r
3376 Py_UNICODE *p;\r
3377 const char *end;\r
3378 const char *reason;\r
3379 PyObject *errorHandler = NULL;\r
3380 PyObject *exc = NULL;\r
3381\r
3382#ifdef Py_UNICODE_WIDE\r
3383 Py_UNICODE unimax = PyUnicode_GetMax();\r
3384#endif\r
3385\r
3386 /* XXX overflow detection missing */\r
3387 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);\r
3388 if (v == NULL)\r
3389 goto onError;\r
3390 if (PyUnicode_GetSize((PyObject *)v) == 0)\r
3391 return (PyObject *)v;\r
3392 p = PyUnicode_AS_UNICODE(v);\r
3393 end = s + size;\r
3394\r
3395 while (s < end) {\r
3396 memcpy(p, s, sizeof(Py_UNICODE));\r
3397 /* We have to sanity check the raw data, otherwise doom looms for\r
3398 some malformed UCS-4 data. */\r
3399 if (\r
3400#ifdef Py_UNICODE_WIDE\r
3401 *p > unimax || *p < 0 ||\r
3402#endif\r
3403 end-s < Py_UNICODE_SIZE\r
3404 )\r
3405 {\r
3406 startinpos = s - starts;\r
3407 if (end-s < Py_UNICODE_SIZE) {\r
3408 endinpos = end-starts;\r
3409 reason = "truncated input";\r
3410 }\r
3411 else {\r
3412 endinpos = s - starts + Py_UNICODE_SIZE;\r
3413 reason = "illegal code point (> 0x10FFFF)";\r
3414 }\r
3415 outpos = p - PyUnicode_AS_UNICODE(v);\r
3416 if (unicode_decode_call_errorhandler(\r
3417 errors, &errorHandler,\r
3418 "unicode_internal", reason,\r
3419 starts, size, &startinpos, &endinpos, &exc, &s,\r
3420 &v, &outpos, &p)) {\r
3421 goto onError;\r
3422 }\r
3423 }\r
3424 else {\r
3425 p++;\r
3426 s += Py_UNICODE_SIZE;\r
3427 }\r
3428 }\r
3429\r
3430 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)\r
3431 goto onError;\r
3432 Py_XDECREF(errorHandler);\r
3433 Py_XDECREF(exc);\r
3434 return (PyObject *)v;\r
3435\r
3436 onError:\r
3437 Py_XDECREF(v);\r
3438 Py_XDECREF(errorHandler);\r
3439 Py_XDECREF(exc);\r
3440 return NULL;\r
3441}\r
3442\r
3443/* --- Latin-1 Codec ------------------------------------------------------ */\r
3444\r
3445PyObject *PyUnicode_DecodeLatin1(const char *s,\r
3446 Py_ssize_t size,\r
3447 const char *errors)\r
3448{\r
3449 PyUnicodeObject *v;\r
3450 Py_UNICODE *p;\r
3451\r
3452 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */\r
3453 if (size == 1) {\r
3454 Py_UNICODE r = *(unsigned char*)s;\r
3455 return PyUnicode_FromUnicode(&r, 1);\r
3456 }\r
3457\r
3458 v = _PyUnicode_New(size);\r
3459 if (v == NULL)\r
3460 goto onError;\r
3461 if (size == 0)\r
3462 return (PyObject *)v;\r
3463 p = PyUnicode_AS_UNICODE(v);\r
3464 while (size-- > 0)\r
3465 *p++ = (unsigned char)*s++;\r
3466 return (PyObject *)v;\r
3467\r
3468 onError:\r
3469 Py_XDECREF(v);\r
3470 return NULL;\r
3471}\r
3472\r
3473/* create or adjust a UnicodeEncodeError */\r
3474static void make_encode_exception(PyObject **exceptionObject,\r
3475 const char *encoding,\r
3476 const Py_UNICODE *unicode, Py_ssize_t size,\r
3477 Py_ssize_t startpos, Py_ssize_t endpos,\r
3478 const char *reason)\r
3479{\r
3480 if (*exceptionObject == NULL) {\r
3481 *exceptionObject = PyUnicodeEncodeError_Create(\r
3482 encoding, unicode, size, startpos, endpos, reason);\r
3483 }\r
3484 else {\r
3485 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))\r
3486 goto onError;\r
3487 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))\r
3488 goto onError;\r
3489 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))\r
3490 goto onError;\r
3491 return;\r
3492 onError:\r
3493 Py_DECREF(*exceptionObject);\r
3494 *exceptionObject = NULL;\r
3495 }\r
3496}\r
3497\r
3498/* raises a UnicodeEncodeError */\r
3499static void raise_encode_exception(PyObject **exceptionObject,\r
3500 const char *encoding,\r
3501 const Py_UNICODE *unicode, Py_ssize_t size,\r
3502 Py_ssize_t startpos, Py_ssize_t endpos,\r
3503 const char *reason)\r
3504{\r
3505 make_encode_exception(exceptionObject,\r
3506 encoding, unicode, size, startpos, endpos, reason);\r
3507 if (*exceptionObject != NULL)\r
3508 PyCodec_StrictErrors(*exceptionObject);\r
3509}\r
3510\r
3511/* error handling callback helper:\r
3512 build arguments, call the callback and check the arguments,\r
3513 put the result into newpos and return the replacement string, which\r
3514 has to be freed by the caller */\r
3515static PyObject *unicode_encode_call_errorhandler(const char *errors,\r
3516 PyObject **errorHandler,\r
3517 const char *encoding, const char *reason,\r
3518 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,\r
3519 Py_ssize_t startpos, Py_ssize_t endpos,\r
3520 Py_ssize_t *newpos)\r
3521{\r
3522 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";\r
3523\r
3524 PyObject *restuple;\r
3525 PyObject *resunicode;\r
3526\r
3527 if (*errorHandler == NULL) {\r
3528 *errorHandler = PyCodec_LookupError(errors);\r
3529 if (*errorHandler == NULL)\r
3530 return NULL;\r
3531 }\r
3532\r
3533 make_encode_exception(exceptionObject,\r
3534 encoding, unicode, size, startpos, endpos, reason);\r
3535 if (*exceptionObject == NULL)\r
3536 return NULL;\r
3537\r
3538 restuple = PyObject_CallFunctionObjArgs(\r
3539 *errorHandler, *exceptionObject, NULL);\r
3540 if (restuple == NULL)\r
3541 return NULL;\r
3542 if (!PyTuple_Check(restuple)) {\r
3543 PyErr_SetString(PyExc_TypeError, &argparse[4]);\r
3544 Py_DECREF(restuple);\r
3545 return NULL;\r
3546 }\r
3547 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,\r
3548 &resunicode, newpos)) {\r
3549 Py_DECREF(restuple);\r
3550 return NULL;\r
3551 }\r
3552 if (*newpos<0)\r
3553 *newpos = size+*newpos;\r
3554 if (*newpos<0 || *newpos>size) {\r
3555 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);\r
3556 Py_DECREF(restuple);\r
3557 return NULL;\r
3558 }\r
3559 Py_INCREF(resunicode);\r
3560 Py_DECREF(restuple);\r
3561 return resunicode;\r
3562}\r
3563\r
3564static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,\r
3565 Py_ssize_t size,\r
3566 const char *errors,\r
3567 int limit)\r
3568{\r
3569 /* output object */\r
3570 PyObject *res;\r
3571 /* pointers to the beginning and end+1 of input */\r
3572 const Py_UNICODE *startp = p;\r
3573 const Py_UNICODE *endp = p + size;\r
3574 /* pointer to the beginning of the unencodable characters */\r
3575 /* const Py_UNICODE *badp = NULL; */\r
3576 /* pointer into the output */\r
3577 char *str;\r
3578 /* current output position */\r
3579 Py_ssize_t respos = 0;\r
3580 Py_ssize_t ressize;\r
3581 const char *encoding = (limit == 256) ? "latin-1" : "ascii";\r
3582 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";\r
3583 PyObject *errorHandler = NULL;\r
3584 PyObject *exc = NULL;\r
3585 /* the following variable is used for caching string comparisons\r
3586 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */\r
3587 int known_errorHandler = -1;\r
3588\r
3589 /* allocate enough for a simple encoding without\r
3590 replacements, if we need more, we'll resize */\r
3591 res = PyString_FromStringAndSize(NULL, size);\r
3592 if (res == NULL)\r
3593 goto onError;\r
3594 if (size == 0)\r
3595 return res;\r
3596 str = PyString_AS_STRING(res);\r
3597 ressize = size;\r
3598\r
3599 while (p<endp) {\r
3600 Py_UNICODE c = *p;\r
3601\r
3602 /* can we encode this? */\r
3603 if (c<limit) {\r
3604 /* no overflow check, because we know that the space is enough */\r
3605 *str++ = (char)c;\r
3606 ++p;\r
3607 }\r
3608 else {\r
3609 Py_ssize_t unicodepos = p-startp;\r
3610 Py_ssize_t requiredsize;\r
3611 PyObject *repunicode;\r
3612 Py_ssize_t repsize;\r
3613 Py_ssize_t newpos;\r
3614 Py_ssize_t respos;\r
3615 Py_UNICODE *uni2;\r
3616 /* startpos for collecting unencodable chars */\r
3617 const Py_UNICODE *collstart = p;\r
3618 const Py_UNICODE *collend = p;\r
3619 /* find all unecodable characters */\r
3620 while ((collend < endp) && ((*collend)>=limit))\r
3621 ++collend;\r
3622 /* cache callback name lookup (if not done yet, i.e. it's the first error) */\r
3623 if (known_errorHandler==-1) {\r
3624 if ((errors==NULL) || (!strcmp(errors, "strict")))\r
3625 known_errorHandler = 1;\r
3626 else if (!strcmp(errors, "replace"))\r
3627 known_errorHandler = 2;\r
3628 else if (!strcmp(errors, "ignore"))\r
3629 known_errorHandler = 3;\r
3630 else if (!strcmp(errors, "xmlcharrefreplace"))\r
3631 known_errorHandler = 4;\r
3632 else\r
3633 known_errorHandler = 0;\r
3634 }\r
3635 switch (known_errorHandler) {\r
3636 case 1: /* strict */\r
3637 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);\r
3638 goto onError;\r
3639 case 2: /* replace */\r
3640 while (collstart++<collend)\r
3641 *str++ = '?'; /* fall through */\r
3642 case 3: /* ignore */\r
3643 p = collend;\r
3644 break;\r
3645 case 4: /* xmlcharrefreplace */\r
3646 respos = str-PyString_AS_STRING(res);\r
3647 /* determine replacement size (temporarily (mis)uses p) */\r
3648 for (p = collstart, repsize = 0; p < collend; ++p) {\r
3649 if (*p<10)\r
3650 repsize += 2+1+1;\r
3651 else if (*p<100)\r
3652 repsize += 2+2+1;\r
3653 else if (*p<1000)\r
3654 repsize += 2+3+1;\r
3655 else if (*p<10000)\r
3656 repsize += 2+4+1;\r
3657#ifndef Py_UNICODE_WIDE\r
3658 else\r
3659 repsize += 2+5+1;\r
3660#else\r
3661 else if (*p<100000)\r
3662 repsize += 2+5+1;\r
3663 else if (*p<1000000)\r
3664 repsize += 2+6+1;\r
3665 else\r
3666 repsize += 2+7+1;\r
3667#endif\r
3668 }\r
3669 requiredsize = respos+repsize+(endp-collend);\r
3670 if (requiredsize > ressize) {\r
3671 if (requiredsize<2*ressize)\r
3672 requiredsize = 2*ressize;\r
3673 if (_PyString_Resize(&res, requiredsize))\r
3674 goto onError;\r
3675 str = PyString_AS_STRING(res) + respos;\r
3676 ressize = requiredsize;\r
3677 }\r
3678 /* generate replacement (temporarily (mis)uses p) */\r
3679 for (p = collstart; p < collend; ++p) {\r
3680 str += sprintf(str, "&#%d;", (int)*p);\r
3681 }\r
3682 p = collend;\r
3683 break;\r
3684 default:\r
3685 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,\r
3686 encoding, reason, startp, size, &exc,\r
3687 collstart-startp, collend-startp, &newpos);\r
3688 if (repunicode == NULL)\r
3689 goto onError;\r
3690 /* need more space? (at least enough for what we have+the\r
3691 replacement+the rest of the string, so we won't have to\r
3692 check space for encodable characters) */\r
3693 respos = str-PyString_AS_STRING(res);\r
3694 repsize = PyUnicode_GET_SIZE(repunicode);\r
3695 requiredsize = respos+repsize+(endp-collend);\r
3696 if (requiredsize > ressize) {\r
3697 if (requiredsize<2*ressize)\r
3698 requiredsize = 2*ressize;\r
3699 if (_PyString_Resize(&res, requiredsize)) {\r
3700 Py_DECREF(repunicode);\r
3701 goto onError;\r
3702 }\r
3703 str = PyString_AS_STRING(res) + respos;\r
3704 ressize = requiredsize;\r
3705 }\r
3706 /* check if there is anything unencodable in the replacement\r
3707 and copy it to the output */\r
3708 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {\r
3709 c = *uni2;\r
3710 if (c >= limit) {\r
3711 raise_encode_exception(&exc, encoding, startp, size,\r
3712 unicodepos, unicodepos+1, reason);\r
3713 Py_DECREF(repunicode);\r
3714 goto onError;\r
3715 }\r
3716 *str = (char)c;\r
3717 }\r
3718 p = startp + newpos;\r
3719 Py_DECREF(repunicode);\r
3720 }\r
3721 }\r
3722 }\r
3723 /* Resize if we allocated to much */\r
3724 respos = str-PyString_AS_STRING(res);\r
3725 if (respos<ressize)\r
3726 /* If this falls res will be NULL */\r
3727 _PyString_Resize(&res, respos);\r
3728 Py_XDECREF(errorHandler);\r
3729 Py_XDECREF(exc);\r
3730 return res;\r
3731\r
3732 onError:\r
3733 Py_XDECREF(res);\r
3734 Py_XDECREF(errorHandler);\r
3735 Py_XDECREF(exc);\r
3736 return NULL;\r
3737}\r
3738\r
3739PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,\r
3740 Py_ssize_t size,\r
3741 const char *errors)\r
3742{\r
3743 return unicode_encode_ucs1(p, size, errors, 256);\r
3744}\r
3745\r
3746PyObject *PyUnicode_AsLatin1String(PyObject *unicode)\r
3747{\r
3748 if (!PyUnicode_Check(unicode)) {\r
3749 PyErr_BadArgument();\r
3750 return NULL;\r
3751 }\r
3752 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),\r
3753 PyUnicode_GET_SIZE(unicode),\r
3754 NULL);\r
3755}\r
3756\r
3757/* --- 7-bit ASCII Codec -------------------------------------------------- */\r
3758\r
3759PyObject *PyUnicode_DecodeASCII(const char *s,\r
3760 Py_ssize_t size,\r
3761 const char *errors)\r
3762{\r
3763 const char *starts = s;\r
3764 PyUnicodeObject *v;\r
3765 Py_UNICODE *p;\r
3766 Py_ssize_t startinpos;\r
3767 Py_ssize_t endinpos;\r
3768 Py_ssize_t outpos;\r
3769 const char *e;\r
3770 PyObject *errorHandler = NULL;\r
3771 PyObject *exc = NULL;\r
3772\r
3773 /* ASCII is equivalent to the first 128 ordinals in Unicode. */\r
3774 if (size == 1 && *(unsigned char*)s < 128) {\r
3775 Py_UNICODE r = *(unsigned char*)s;\r
3776 return PyUnicode_FromUnicode(&r, 1);\r
3777 }\r
3778\r
3779 v = _PyUnicode_New(size);\r
3780 if (v == NULL)\r
3781 goto onError;\r
3782 if (size == 0)\r
3783 return (PyObject *)v;\r
3784 p = PyUnicode_AS_UNICODE(v);\r
3785 e = s + size;\r
3786 while (s < e) {\r
3787 register unsigned char c = (unsigned char)*s;\r
3788 if (c < 128) {\r
3789 *p++ = c;\r
3790 ++s;\r
3791 }\r
3792 else {\r
3793 startinpos = s-starts;\r
3794 endinpos = startinpos + 1;\r
3795 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);\r
3796 if (unicode_decode_call_errorhandler(\r
3797 errors, &errorHandler,\r
3798 "ascii", "ordinal not in range(128)",\r
3799 starts, size, &startinpos, &endinpos, &exc, &s,\r
3800 &v, &outpos, &p))\r
3801 goto onError;\r
3802 }\r
3803 }\r
3804 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))\r
3805 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)\r
3806 goto onError;\r
3807 Py_XDECREF(errorHandler);\r
3808 Py_XDECREF(exc);\r
3809 return (PyObject *)v;\r
3810\r
3811 onError:\r
3812 Py_XDECREF(v);\r
3813 Py_XDECREF(errorHandler);\r
3814 Py_XDECREF(exc);\r
3815 return NULL;\r
3816}\r
3817\r
3818PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,\r
3819 Py_ssize_t size,\r
3820 const char *errors)\r
3821{\r
3822 return unicode_encode_ucs1(p, size, errors, 128);\r
3823}\r
3824\r
3825PyObject *PyUnicode_AsASCIIString(PyObject *unicode)\r
3826{\r
3827 if (!PyUnicode_Check(unicode)) {\r
3828 PyErr_BadArgument();\r
3829 return NULL;\r
3830 }\r
3831 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),\r
3832 PyUnicode_GET_SIZE(unicode),\r
3833 NULL);\r
3834}\r
3835\r
3836#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)\r
3837\r
3838/* --- MBCS codecs for Windows -------------------------------------------- */\r
3839\r
3840#if SIZEOF_INT < SIZEOF_SIZE_T\r
3841#define NEED_RETRY\r
3842#endif\r
3843\r
3844/* XXX This code is limited to "true" double-byte encodings, as\r
3845 a) it assumes an incomplete character consists of a single byte, and\r
3846 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte\r
3847 encodings, see IsDBCSLeadByteEx documentation. */\r
3848\r
/* Return 1 when the byte at s[offset] is treated as a DBCS lead byte
   in context, 0 otherwise.  IsDBCSLeadByte() alone is not trusted: the
   preceding character, located with CharPrev(), is examined as well. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
3859\r
3860/*\r
3861 * Decode MBCS string into unicode object. If 'final' is set, converts\r
3862 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.\r
3863 */\r
3864static int decode_mbcs(PyUnicodeObject **v,\r
3865 const char *s, /* MBCS string */\r
3866 int size, /* sizeof MBCS string */\r
3867 int final)\r
3868{\r
3869 Py_UNICODE *p;\r
3870 Py_ssize_t n = 0;\r
3871 int usize = 0;\r
3872\r
3873 assert(size >= 0);\r
3874\r
3875 /* Skip trailing lead-byte unless 'final' is set */\r
3876 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))\r
3877 --size;\r
3878\r
3879 /* First get the size of the result */\r
3880 if (size > 0) {\r
3881 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);\r
3882 if (usize == 0) {\r
3883 PyErr_SetFromWindowsErrWithFilename(0, NULL);\r
3884 return -1;\r
3885 }\r
3886 }\r
3887\r
3888 if (*v == NULL) {\r
3889 /* Create unicode object */\r
3890 *v = _PyUnicode_New(usize);\r
3891 if (*v == NULL)\r
3892 return -1;\r
3893 }\r
3894 else {\r
3895 /* Extend unicode object */\r
3896 n = PyUnicode_GET_SIZE(*v);\r
3897 if (_PyUnicode_Resize(v, n + usize) < 0)\r
3898 return -1;\r
3899 }\r
3900\r
3901 /* Do the conversion */\r
3902 if (size > 0) {\r
3903 p = PyUnicode_AS_UNICODE(*v) + n;\r
3904 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {\r
3905 PyErr_SetFromWindowsErrWithFilename(0, NULL);\r
3906 return -1;\r
3907 }\r
3908 }\r
3909\r
3910 return size;\r
3911}\r
3912\r
3913PyObject *PyUnicode_DecodeMBCSStateful(const char *s,\r
3914 Py_ssize_t size,\r
3915 const char *errors,\r
3916 Py_ssize_t *consumed)\r
3917{\r
3918 PyUnicodeObject *v = NULL;\r
3919 int done;\r
3920\r
3921 if (consumed)\r
3922 *consumed = 0;\r
3923\r
3924#ifdef NEED_RETRY\r
3925 retry:\r
3926 if (size > INT_MAX)\r
3927 done = decode_mbcs(&v, s, INT_MAX, 0);\r
3928 else\r
3929#endif\r
3930 done = decode_mbcs(&v, s, (int)size, !consumed);\r
3931\r
3932 if (done < 0) {\r
3933 Py_XDECREF(v);\r
3934 return NULL;\r
3935 }\r
3936\r
3937 if (consumed)\r
3938 *consumed += done;\r
3939\r
3940#ifdef NEED_RETRY\r
3941 if (size > INT_MAX) {\r
3942 s += done;\r
3943 size -= done;\r
3944 goto retry;\r
3945 }\r
3946#endif\r
3947\r
3948 return (PyObject *)v;\r
3949}\r
3950\r
3951PyObject *PyUnicode_DecodeMBCS(const char *s,\r
3952 Py_ssize_t size,\r
3953 const char *errors)\r
3954{\r
3955 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);\r
3956}\r
3957\r
3958/*\r
3959 * Convert unicode into string object (MBCS).\r
3960 * Returns 0 if succeed, -1 otherwise.\r
3961 */\r
3962static int encode_mbcs(PyObject **repr,\r
3963 const Py_UNICODE *p, /* unicode */\r
3964 int size) /* size of unicode */\r
3965{\r
3966 int mbcssize = 0;\r
3967 Py_ssize_t n = 0;\r
3968\r
3969 assert(size >= 0);\r
3970\r
3971 /* First get the size of the result */\r
3972 if (size > 0) {\r
3973 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);\r
3974 if (mbcssize == 0) {\r
3975 PyErr_SetFromWindowsErrWithFilename(0, NULL);\r
3976 return -1;\r
3977 }\r
3978 }\r
3979\r
3980 if (*repr == NULL) {\r
3981 /* Create string object */\r
3982 *repr = PyString_FromStringAndSize(NULL, mbcssize);\r
3983 if (*repr == NULL)\r
3984 return -1;\r
3985 }\r
3986 else {\r
3987 /* Extend string object */\r
3988 n = PyString_Size(*repr);\r
3989 if (_PyString_Resize(repr, n + mbcssize) < 0)\r
3990 return -1;\r
3991 }\r
3992\r
3993 /* Do the conversion */\r
3994 if (size > 0) {\r
3995 char *s = PyString_AS_STRING(*repr) + n;\r
3996 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {\r
3997 PyErr_SetFromWindowsErrWithFilename(0, NULL);\r
3998 return -1;\r
3999 }\r
4000 }\r
4001\r
4002 return 0;\r
4003}\r
4004\r
4005PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,\r
4006 Py_ssize_t size,\r
4007 const char *errors)\r
4008{\r
4009 PyObject *repr = NULL;\r
4010 int ret;\r
4011\r
4012#ifdef NEED_RETRY\r
4013 retry:\r
4014 if (size > INT_MAX)\r
4015 ret = encode_mbcs(&repr, p, INT_MAX);\r
4016 else\r
4017#endif\r
4018 ret = encode_mbcs(&repr, p, (int)size);\r
4019\r
4020 if (ret < 0) {\r
4021 Py_XDECREF(repr);\r
4022 return NULL;\r
4023 }\r
4024\r
4025#ifdef NEED_RETRY\r
4026 if (size > INT_MAX) {\r
4027 p += INT_MAX;\r
4028 size -= INT_MAX;\r
4029 goto retry;\r
4030 }\r
4031#endif\r
4032\r
4033 return repr;\r
4034}\r
4035\r
4036PyObject *PyUnicode_AsMBCSString(PyObject *unicode)\r
4037{\r
4038 if (!PyUnicode_Check(unicode)) {\r
4039 PyErr_BadArgument();\r
4040 return NULL;\r
4041 }\r
4042 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),\r
4043 PyUnicode_GET_SIZE(unicode),\r
4044 NULL);\r
4045}\r
4046\r
4047#undef NEED_RETRY\r
4048\r
4049#endif /* MS_WINDOWS */\r
4050\r
4051/* --- Character Mapping Codec -------------------------------------------- */\r
4052\r
/* Decode `size` bytes at `s` through a character mapping.

   mapping == NULL: the input is decoded as Latin-1.
   mapping is an exact unicode object: it is used as a lookup table
   indexed by byte value; a byte beyond the table's length, or a table
   entry of 0xfffe, counts as undefined.
   any other object: mapping[byte] is looked up for every byte and must
   yield an integer in range(65536), None (undefined; a LookupError is
   treated the same way) or a unicode string of any length.

   Undefined bytes are reported through the error handler selected by
   `errors`.  Returns a new unicode object, or NULL with an exception
   set. */
PyObject *PyUnicode_DecodeCharmap(const char *s,
                                  Py_ssize_t size,
                                  PyObject *mapping,
                                  const char *errors)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *v;
    Py_UNICODE *p;
    /* spare output slots already allocated for 1-n mappings */
    Py_ssize_t extrachars = 0;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    Py_UNICODE *mapstring = NULL;
    Py_ssize_t maplen = 0;

    /* Default to Latin-1 */
    if (mapping == NULL)
        return PyUnicode_DecodeLatin1(s, size, errors);

    /* start with one output slot per input byte */
    v = _PyUnicode_New(size);
    if (v == NULL)
        goto onError;
    if (size == 0)
        return (PyObject *)v;
    p = PyUnicode_AS_UNICODE(v);
    e = s + size;
    if (PyUnicode_CheckExact(mapping)) {
        /* fast path: index straight into the unicode table */
        mapstring = PyUnicode_AS_UNICODE(mapping);
        maplen = PyUnicode_GET_SIZE(mapping);
        while (s < e) {
            unsigned char ch = *s;
            Py_UNICODE x = 0xfffe; /* illegal value */

            if (ch < maplen)
                x = mapstring[ch];

            if (x == 0xfffe) {
                /* undefined mapping */
                outpos = p-PyUnicode_AS_UNICODE(v);
                startinpos = s-starts;
                endinpos = startinpos+1;
                if (unicode_decode_call_errorhandler(
                        errors, &errorHandler,
                        "charmap", "character maps to <undefined>",
                        starts, size, &startinpos, &endinpos, &exc, &s,
                        &v, &outpos, &p)) {
                    goto onError;
                }
                /* s is passed by address to the handler call above,
                   so it is not advanced here */
                continue;
            }
            *p++ = x;
            ++s;
        }
    }
    else {
        while (s < e) {
            unsigned char ch = *s;
            PyObject *w, *x;

            /* Get mapping (char ordinal -> integer, Unicode char or None) */
            w = PyInt_FromLong((long)ch);
            if (w == NULL)
                goto onError;
            x = PyObject_GetItem(mapping, w);
            Py_DECREF(w);
            if (x == NULL) {
                if (PyErr_ExceptionMatches(PyExc_LookupError)) {
                    /* No mapping found means: mapping is undefined. */
                    PyErr_Clear();
                    x = Py_None;
                    Py_INCREF(x);
                } else
                    goto onError;
            }

            /* Apply mapping */
            if (PyInt_Check(x)) {
                long value = PyInt_AS_LONG(x);
                if (value < 0 || value > 65535) {
                    PyErr_SetString(PyExc_TypeError,
                                    "character mapping must be in range(65536)");
                    Py_DECREF(x);
                    goto onError;
                }
                *p++ = (Py_UNICODE)value;
            }
            else if (x == Py_None) {
                /* undefined mapping */
                outpos = p-PyUnicode_AS_UNICODE(v);
                startinpos = s-starts;
                endinpos = startinpos+1;
                if (unicode_decode_call_errorhandler(
                        errors, &errorHandler,
                        "charmap", "character maps to <undefined>",
                        starts, size, &startinpos, &endinpos, &exc, &s,
                        &v, &outpos, &p)) {
                    Py_DECREF(x);
                    goto onError;
                }
                Py_DECREF(x);
                continue;
            }
            else if (PyUnicode_Check(x)) {
                Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);

                if (targetsize == 1)
                    /* 1-1 mapping */
                    *p++ = *PyUnicode_AS_UNICODE(x);

                else if (targetsize > 1) {
                    /* 1-n mapping */
                    if (targetsize > extrachars) {
                        /* resize first */
                        /* remember the write offset: the resize may move
                           the buffer, so p is recomputed afterwards */
                        Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
                        /* over-allocate (by 4 * targetsize) so later 1-n
                           mappings need fewer resizes */
                        Py_ssize_t needed = (targetsize - extrachars) + \
                            (targetsize << 2);
                        extrachars += needed;
                        /* XXX overflow detection missing */
                        if (_PyUnicode_Resize(&v,
                                              PyUnicode_GET_SIZE(v) + needed) < 0) {
                            Py_DECREF(x);
                            goto onError;
                        }
                        p = PyUnicode_AS_UNICODE(v) + oldpos;
                    }
                    Py_UNICODE_COPY(p,
                                    PyUnicode_AS_UNICODE(x),
                                    targetsize);
                    p += targetsize;
                    extrachars -= targetsize;
                }
                /* 1-0 mapping: skip the character */
            }
            else {
                /* wrong return value */
                PyErr_SetString(PyExc_TypeError,
                                "character mapping must return integer, None or unicode");
                Py_DECREF(x);
                goto onError;
            }
            Py_DECREF(x);
            ++s;
        }
    }
    /* trim unused slots left over from the initial or spare allocation */
    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
        if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
            goto onError;
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)v;

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(v);
    return NULL;
}
4213\r
4214/* Charmap encoding: the lookup table */\r
4215\r
/* Three-level trie mapping a 16-bit code point to a byte.
   The object is allocated with extra room after the struct, so
   level23 actually holds 16*count2 level-2 bytes followed by
   128*count3 level-3 bytes. */
struct encoding_map{
    PyObject_HEAD
    /* indexed by the top 5 bits of the code point (c >> 11);
       0xFF marks an unused slot */
    unsigned char level1[32];
    /* number of level-2 blocks (16 bytes each) and level-3 blocks
       (128 bytes each) stored in level23 */
    int count2, count3;
    /* first byte of the variable-length level-2 + level-3 storage */
    unsigned char level23[1];
};
4222\r
4223static PyObject*\r
4224encoding_map_size(PyObject *obj, PyObject* args)\r
4225{\r
4226 struct encoding_map *map = (struct encoding_map*)obj;\r
4227 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +\r
4228 128*map->count3);\r
4229}\r
4230\r
/* Method table for EncodingMap objects; the all-zero entry terminates
   the list. */
static PyMethodDef encoding_map_methods[] = {
    {"size", encoding_map_size, METH_NOARGS,
     PyDoc_STR("Return the size (in bytes) of this object") },
    { 0 }
};
4236\r
/* tp_dealloc for EncodingMap: the object owns no references, so
   releasing its memory block is all that is needed. */
static void
encoding_map_dealloc(PyObject* o)
{
    PyObject_FREE(o);
}
4242\r
/* Type object for the trie built by PyUnicode_BuildEncodingMap().
   Only deallocation and the method table are provided; every other
   slot is left empty. */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_compare*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
4286\r
/* Build an encoding map from a 256-character decoding table.

   `string` must be a unicode object of length 256 whose i-th character
   is the code point that byte i decodes to (0xFFFE marks an unmapped
   byte).  The result is an EncodingMap trie when possible; otherwise a
   plain dict mapping code point -> byte value is returned.

   Returns NULL with an exception set on failure. */
PyObject*
PyUnicode_BuildEncodingMap(PyObject* string)
{
    Py_UNICODE *decode;
    PyObject *result;
    struct encoding_map *mresult;
    int i;
    int need_dict = 0;
    /* scratch tables used only to count the level-2 and level-3 blocks */
    unsigned char level1[32];
    unsigned char level2[512];
    unsigned char *mlevel1, *mlevel2, *mlevel3;
    int count2 = 0, count3 = 0;

    if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
        PyErr_BadArgument();
        return NULL;
    }
    decode = PyUnicode_AS_UNICODE(string);
    /* 0xFF marks a slot that has not been assigned a block yet */
    memset(level1, 0xFF, sizeof level1);
    memset(level2, 0xFF, sizeof level2);

    /* If there isn't a one-to-one mapping of NULL to \0,
       or if there are non-BMP characters, we need to use
       a mapping dictionary. */
    if (decode[0] != 0)
        need_dict = 1;
    /* first pass: count how many level-2 and level-3 blocks are needed */
    for (i = 1; i < 256; i++) {
        int l1, l2;
        if (decode[i] == 0
#ifdef Py_UNICODE_WIDE
            || decode[i] > 0xFFFF
#endif
            ) {
            need_dict = 1;
            break;
        }
        if (decode[i] == 0xFFFE)
            /* unmapped character */
            continue;
        /* l1: top 5 bits select the level-2 block;
           l2: top 9 bits select the level-3 block */
        l1 = decode[i] >> 11;
        l2 = decode[i] >> 7;
        if (level1[l1] == 0xFF)
            level1[l1] = count2++;
        if (level2[l2] == 0xFF)
            level2[l2] = count3++;
    }

    /* block numbers are stored in a byte and 0xFF is the "unused"
       marker, so that many blocks cannot be represented */
    if (count2 >= 0xFF || count3 >= 0xFF)
        need_dict = 1;

    if (need_dict) {
        /* fallback: plain dict {code point: byte value} */
        PyObject *result = PyDict_New();
        PyObject *key, *value;
        if (!result)
            return NULL;
        for (i = 0; i < 256; i++) {
            value = NULL;
            key = PyInt_FromLong(decode[i]);
            value = PyInt_FromLong(i);
            if (!key || !value)
                goto failed1;
            if (PyDict_SetItem(result, key, value) == -1)
                goto failed1;
            Py_DECREF(key);
            Py_DECREF(value);
        }
        return result;
      failed1:
        Py_XDECREF(key);
        Py_XDECREF(value);
        Py_DECREF(result);
        return NULL;
    }

    /* Create a three-level trie */
    /* the -1 accounts for the one-byte level23 placeholder already
       included in sizeof(struct encoding_map) */
    result = PyObject_MALLOC(sizeof(struct encoding_map) +
                             16*count2 + 128*count3 - 1);
    if (!result)
        return PyErr_NoMemory();
    PyObject_Init(result, &EncodingMapType);
    mresult = (struct encoding_map*)result;
    mresult->count2 = count2;
    mresult->count3 = count3;
    mlevel1 = mresult->level1;
    mlevel2 = mresult->level23;
    mlevel3 = mresult->level23 + 16*count2;
    memcpy(mlevel1, level1, 32);
    memset(mlevel2, 0xFF, 16*count2);
    /* a level-3 entry of 0 means "no mapping" */
    memset(mlevel3, 0, 128*count3);
    /* second pass: level-3 block numbers are assigned afresh while the
       real tables are filled in */
    count3 = 0;
    for (i = 1; i < 256; i++) {
        int o1, o2, o3, i2, i3;
        if (decode[i] == 0xFFFE)
            /* unmapped character */
            continue;
        o1 = decode[i]>>11;
        o2 = (decode[i]>>7) & 0xF;
        i2 = 16*mlevel1[o1] + o2;
        if (mlevel2[i2] == 0xFF)
            mlevel2[i2] = count3++;
        o3 = decode[i] & 0x7F;
        i3 = 128*mlevel2[i2] + o3;
        mlevel3[i3] = i;
    }
    return result;
}
4393\r
4394static int\r
4395encoding_map_lookup(Py_UNICODE c, PyObject *mapping)\r
4396{\r
4397 struct encoding_map *map = (struct encoding_map*)mapping;\r
4398 int l1 = c>>11;\r
4399 int l2 = (c>>7) & 0xF;\r
4400 int l3 = c & 0x7F;\r
4401 int i;\r
4402\r
4403#ifdef Py_UNICODE_WIDE\r
4404 if (c > 0xFFFF) {\r
4405 return -1;\r
4406 }\r
4407#endif\r
4408 if (c == 0)\r
4409 return 0;\r
4410 /* level 1*/\r
4411 i = map->level1[l1];\r
4412 if (i == 0xFF) {\r
4413 return -1;\r
4414 }\r
4415 /* level 2*/\r
4416 i = map->level23[16*i+l2];\r
4417 if (i == 0xFF) {\r
4418 return -1;\r
4419 }\r
4420 /* level 3 */\r
4421 i = map->level23[16*map->count2 + 128*i + l3];\r
4422 if (i == 0) {\r
4423 return -1;\r
4424 }\r
4425 return i;\r
4426}\r
4427\r
4428/* Lookup the character ch in the mapping. If the character\r
4429 can't be found, Py_None is returned (or NULL, if another\r
4430 error occurred). */\r
4431static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)\r
4432{\r
4433 PyObject *w = PyInt_FromLong((long)c);\r
4434 PyObject *x;\r
4435\r
4436 if (w == NULL)\r
4437 return NULL;\r
4438 x = PyObject_GetItem(mapping, w);\r
4439 Py_DECREF(w);\r
4440 if (x == NULL) {\r
4441 if (PyErr_ExceptionMatches(PyExc_LookupError)) {\r
4442 /* No mapping found means: mapping is undefined. */\r
4443 PyErr_Clear();\r
4444 x = Py_None;\r
4445 Py_INCREF(x);\r
4446 return x;\r
4447 } else\r
4448 return NULL;\r
4449 }\r
4450 else if (x == Py_None)\r
4451 return x;\r
4452 else if (PyInt_Check(x)) {\r
4453 long value = PyInt_AS_LONG(x);\r
4454 if (value < 0 || value > 255) {\r
4455 PyErr_SetString(PyExc_TypeError,\r
4456 "character mapping must be in range(256)");\r
4457 Py_DECREF(x);\r
4458 return NULL;\r
4459 }\r
4460 return x;\r
4461 }\r
4462 else if (PyString_Check(x))\r
4463 return x;\r
4464 else {\r
4465 /* wrong return value */\r
4466 PyErr_SetString(PyExc_TypeError,\r
4467 "character mapping must return integer, None or str");\r
4468 Py_DECREF(x);\r
4469 return NULL;\r
4470 }\r
4471}\r
4472\r
4473static int\r
4474charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)\r
4475{\r
4476 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);\r
4477 /* exponentially overallocate to minimize reallocations */\r
4478 if (requiredsize < 2*outsize)\r
4479 requiredsize = 2*outsize;\r
4480 if (_PyString_Resize(outobj, requiredsize)) {\r
4481 return 0;\r
4482 }\r
4483 return 1;\r
4484}\r
4485\r
/* Result codes used by the charmap encoder helpers. */
typedef enum charmapencode_result {
    enc_SUCCESS,
    enc_FAILED,
    enc_EXCEPTION
} charmapencode_result;
4489/* lookup the character, put the result in the output string and adjust\r
4490 various state variables. Reallocate the output string if not enough\r
4491 space is available. Return a new reference to the object that\r
4492 was put in the output buffer, or Py_None, if the mapping was undefined\r
4493 (in which case no character was written) or NULL, if a\r
4494 reallocation error occurred. The caller must decref the result */\r
4495static\r
4496charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,\r
4497 PyObject **outobj, Py_ssize_t *outpos)\r
4498{\r
4499 PyObject *rep;\r
4500 char *outstart;\r
4501 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);\r
4502\r
4503 if (Py_TYPE(mapping) == &EncodingMapType) {\r
4504 int res = encoding_map_lookup(c, mapping);\r
4505 Py_ssize_t requiredsize = *outpos+1;\r
4506 if (res == -1)\r
4507 return enc_FAILED;\r
4508 if (outsize<requiredsize)\r
4509 if (!charmapencode_resize(outobj, outpos, requiredsize))\r
4510 return enc_EXCEPTION;\r
4511 outstart = PyString_AS_STRING(*outobj);\r
4512 outstart[(*outpos)++] = (char)res;\r
4513 return enc_SUCCESS;\r
4514 }\r
4515\r
4516 rep = charmapencode_lookup(c, mapping);\r
4517 if (rep==NULL)\r
4518 return enc_EXCEPTION;\r
4519 else if (rep==Py_None) {\r
4520 Py_DECREF(rep);\r
4521 return enc_FAILED;\r
4522 } else {\r
4523 if (PyInt_Check(rep)) {\r
4524 Py_ssize_t requiredsize = *outpos+1;\r
4525 if (outsize<requiredsize)\r
4526 if (!charmapencode_resize(outobj, outpos, requiredsize)) {\r
4527 Py_DECREF(rep);\r
4528 return enc_EXCEPTION;\r
4529 }\r
4530 outstart = PyString_AS_STRING(*outobj);\r
4531 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);\r
4532 }\r
4533 else {\r
4534 const char *repchars = PyString_AS_STRING(rep);\r
4535 Py_ssize_t repsize = PyString_GET_SIZE(rep);\r
4536 Py_ssize_t requiredsize = *outpos+repsize;\r
4537 if (outsize<requiredsize)\r
4538 if (!charmapencode_resize(outobj, outpos, requiredsize)) {\r
4539 Py_DECREF(rep);\r
4540 return enc_EXCEPTION;\r
4541 }\r
4542 outstart = PyString_AS_STRING(*outobj);\r
4543 memcpy(outstart + *outpos, repchars, repsize);\r
4544 *outpos += repsize;\r
4545 }\r
4546 }\r
4547 Py_DECREF(rep);\r
4548 return enc_SUCCESS;\r
4549}\r
4550\r
4551/* handle an error in PyUnicode_EncodeCharmap\r
4552 Return 0 on success, -1 on error */\r
4553static\r
4554int charmap_encoding_error(\r
4555 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,\r
4556 PyObject **exceptionObject,\r
4557 int *known_errorHandler, PyObject **errorHandler, const char *errors,\r
4558 PyObject **res, Py_ssize_t *respos)\r
4559{\r
4560 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */\r
4561 Py_ssize_t repsize;\r
4562 Py_ssize_t newpos;\r
4563 Py_UNICODE *uni2;\r
4564 /* startpos for collecting unencodable chars */\r
4565 Py_ssize_t collstartpos = *inpos;\r
4566 Py_ssize_t collendpos = *inpos+1;\r
4567 Py_ssize_t collpos;\r
4568 char *encoding = "charmap";\r
4569 char *reason = "character maps to <undefined>";\r
4570 charmapencode_result x;\r
4571\r
4572 /* find all unencodable characters */\r
4573 while (collendpos < size) {\r
4574 PyObject *rep;\r
4575 if (Py_TYPE(mapping) == &EncodingMapType) {\r
4576 int res = encoding_map_lookup(p[collendpos], mapping);\r
4577 if (res != -1)\r
4578 break;\r
4579 ++collendpos;\r
4580 continue;\r
4581 }\r
4582\r
4583 rep = charmapencode_lookup(p[collendpos], mapping);\r
4584 if (rep==NULL)\r
4585 return -1;\r
4586 else if (rep!=Py_None) {\r
4587 Py_DECREF(rep);\r
4588 break;\r
4589 }\r
4590 Py_DECREF(rep);\r
4591 ++collendpos;\r
4592 }\r
4593 /* cache callback name lookup\r
4594 * (if not done yet, i.e. it's the first error) */\r
4595 if (*known_errorHandler==-1) {\r
4596 if ((errors==NULL) || (!strcmp(errors, "strict")))\r
4597 *known_errorHandler = 1;\r
4598 else if (!strcmp(errors, "replace"))\r
4599 *known_errorHandler = 2;\r
4600 else if (!strcmp(errors, "ignore"))\r
4601 *known_errorHandler = 3;\r
4602 else if (!strcmp(errors, "xmlcharrefreplace"))\r
4603 *known_errorHandler = 4;\r
4604 else\r
4605 *known_errorHandler = 0;\r
4606 }\r
4607 switch (*known_errorHandler) {\r
4608 case 1: /* strict */\r
4609 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);\r
4610 return -1;\r
4611 case 2: /* replace */\r
4612 for (collpos = collstartpos; collpos<collendpos; ++collpos) {\r
4613 x = charmapencode_output('?', mapping, res, respos);\r
4614 if (x==enc_EXCEPTION) {\r
4615 return -1;\r
4616 }\r
4617 else if (x==enc_FAILED) {\r
4618 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);\r
4619 return -1;\r
4620 }\r
4621 }\r
4622 /* fall through */\r
4623 case 3: /* ignore */\r
4624 *inpos = collendpos;\r
4625 break;\r
4626 case 4: /* xmlcharrefreplace */\r
4627 /* generate replacement (temporarily (mis)uses p) */\r
4628 for (collpos = collstartpos; collpos < collendpos; ++collpos) {\r
4629 char buffer[2+29+1+1];\r
4630 char *cp;\r
4631 sprintf(buffer, "&#%d;", (int)p[collpos]);\r
4632 for (cp = buffer; *cp; ++cp) {\r
4633 x = charmapencode_output(*cp, mapping, res, respos);\r
4634 if (x==enc_EXCEPTION)\r
4635 return -1;\r
4636 else if (x==enc_FAILED) {\r
4637 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);\r
4638 return -1;\r
4639 }\r
4640 }\r
4641 }\r
4642 *inpos = collendpos;\r
4643 break;\r
4644 default:\r
4645 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,\r
4646 encoding, reason, p, size, exceptionObject,\r
4647 collstartpos, collendpos, &newpos);\r
4648 if (repunicode == NULL)\r
4649 return -1;\r
4650 /* generate replacement */\r
4651 repsize = PyUnicode_GET_SIZE(repunicode);\r
4652 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {\r
4653 x = charmapencode_output(*uni2, mapping, res, respos);\r
4654 if (x==enc_EXCEPTION) {\r
4655 return -1;\r
4656 }\r
4657 else if (x==enc_FAILED) {\r
4658 Py_DECREF(repunicode);\r
4659 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);\r
4660 return -1;\r
4661 }\r
4662 }\r
4663 *inpos = newpos;\r
4664 Py_DECREF(repunicode);\r
4665 }\r
4666 return 0;\r
4667}\r
4668\r
4669PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,\r
4670 Py_ssize_t size,\r
4671 PyObject *mapping,\r
4672 const char *errors)\r
4673{\r
4674 /* output object */\r
4675 PyObject *res = NULL;\r
4676 /* current input position */\r
4677 Py_ssize_t inpos = 0;\r
4678 /* current output position */\r
4679 Py_ssize_t respos = 0;\r
4680 PyObject *errorHandler = NULL;\r
4681 PyObject *exc = NULL;\r
4682 /* the following variable is used for caching string comparisons\r
4683 * -1=not initialized, 0=unknown, 1=strict, 2=replace,\r
4684 * 3=ignore, 4=xmlcharrefreplace */\r
4685 int known_errorHandler = -1;\r
4686\r
4687 /* Default to Latin-1 */\r
4688 if (mapping == NULL)\r
4689 return PyUnicode_EncodeLatin1(p, size, errors);\r
4690\r
4691 /* allocate enough for a simple encoding without\r
4692 replacements, if we need more, we'll resize */\r
4693 res = PyString_FromStringAndSize(NULL, size);\r
4694 if (res == NULL)\r
4695 goto onError;\r
4696 if (size == 0)\r
4697 return res;\r
4698\r
4699 while (inpos<size) {\r
4700 /* try to encode it */\r
4701 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);\r
4702 if (x==enc_EXCEPTION) /* error */\r
4703 goto onError;\r
4704 if (x==enc_FAILED) { /* unencodable character */\r
4705 if (charmap_encoding_error(p, size, &inpos, mapping,\r
4706 &exc,\r
4707 &known_errorHandler, &errorHandler, errors,\r
4708 &res, &respos)) {\r
4709 goto onError;\r
4710 }\r
4711 }\r
4712 else\r
4713 /* done with this character => adjust input position */\r
4714 ++inpos;\r
4715 }\r
4716\r
4717 /* Resize if we allocated to much */\r
4718 if (respos<PyString_GET_SIZE(res)) {\r
4719 if (_PyString_Resize(&res, respos))\r
4720 goto onError;\r
4721 }\r
4722 Py_XDECREF(exc);\r
4723 Py_XDECREF(errorHandler);\r
4724 return res;\r
4725\r
4726 onError:\r
4727 Py_XDECREF(res);\r
4728 Py_XDECREF(exc);\r
4729 Py_XDECREF(errorHandler);\r
4730 return NULL;\r
4731}\r
4732\r
4733PyObject *PyUnicode_AsCharmapString(PyObject *unicode,\r
4734 PyObject *mapping)\r
4735{\r
4736 if (!PyUnicode_Check(unicode) || mapping == NULL) {\r
4737 PyErr_BadArgument();\r
4738 return NULL;\r
4739 }\r
4740 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),\r
4741 PyUnicode_GET_SIZE(unicode),\r
4742 mapping,\r
4743 NULL);\r
4744}\r
4745\r
4746/* create or adjust a UnicodeTranslateError */\r
4747static void make_translate_exception(PyObject **exceptionObject,\r
4748 const Py_UNICODE *unicode, Py_ssize_t size,\r
4749 Py_ssize_t startpos, Py_ssize_t endpos,\r
4750 const char *reason)\r
4751{\r
4752 if (*exceptionObject == NULL) {\r
4753 *exceptionObject = PyUnicodeTranslateError_Create(\r
4754 unicode, size, startpos, endpos, reason);\r
4755 }\r
4756 else {\r
4757 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))\r
4758 goto onError;\r
4759 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))\r
4760 goto onError;\r
4761 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))\r
4762 goto onError;\r
4763 return;\r
4764 onError:\r
4765 Py_DECREF(*exceptionObject);\r
4766 *exceptionObject = NULL;\r
4767 }\r
4768}\r
4769\r
4770/* raises a UnicodeTranslateError */\r
4771static void raise_translate_exception(PyObject **exceptionObject,\r
4772 const Py_UNICODE *unicode, Py_ssize_t size,\r
4773 Py_ssize_t startpos, Py_ssize_t endpos,\r
4774 const char *reason)\r
4775{\r
4776 make_translate_exception(exceptionObject,\r
4777 unicode, size, startpos, endpos, reason);\r
4778 if (*exceptionObject != NULL)\r
4779 PyCodec_StrictErrors(*exceptionObject);\r
4780}\r
4781\r
4782/* error handling callback helper:\r
4783 build arguments, call the callback and check the arguments,\r
4784 put the result into newpos and return the replacement string, which\r
4785 has to be freed by the caller */\r
4786static PyObject *unicode_translate_call_errorhandler(const char *errors,\r
4787 PyObject **errorHandler,\r
4788 const char *reason,\r
4789 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,\r
4790 Py_ssize_t startpos, Py_ssize_t endpos,\r
4791 Py_ssize_t *newpos)\r
4792{\r
4793 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";\r
4794\r
4795 Py_ssize_t i_newpos;\r
4796 PyObject *restuple;\r
4797 PyObject *resunicode;\r
4798\r
4799 if (*errorHandler == NULL) {\r
4800 *errorHandler = PyCodec_LookupError(errors);\r
4801 if (*errorHandler == NULL)\r
4802 return NULL;\r
4803 }\r
4804\r
4805 make_translate_exception(exceptionObject,\r
4806 unicode, size, startpos, endpos, reason);\r
4807 if (*exceptionObject == NULL)\r
4808 return NULL;\r
4809\r
4810 restuple = PyObject_CallFunctionObjArgs(\r
4811 *errorHandler, *exceptionObject, NULL);\r
4812 if (restuple == NULL)\r
4813 return NULL;\r
4814 if (!PyTuple_Check(restuple)) {\r
4815 PyErr_SetString(PyExc_TypeError, &argparse[4]);\r
4816 Py_DECREF(restuple);\r
4817 return NULL;\r
4818 }\r
4819 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,\r
4820 &resunicode, &i_newpos)) {\r
4821 Py_DECREF(restuple);\r
4822 return NULL;\r
4823 }\r
4824 if (i_newpos<0)\r
4825 *newpos = size+i_newpos;\r
4826 else\r
4827 *newpos = i_newpos;\r
4828 if (*newpos<0 || *newpos>size) {\r
4829 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);\r
4830 Py_DECREF(restuple);\r
4831 return NULL;\r
4832 }\r
4833 Py_INCREF(resunicode);\r
4834 Py_DECREF(restuple);\r
4835 return resunicode;\r
4836}\r
4837\r
4838/* Lookup the character ch in the mapping and put the result in result,\r
4839 which must be decrefed by the caller.\r
4840 Return 0 on success, -1 on error */\r
4841static\r
4842int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)\r
4843{\r
4844 PyObject *w = PyInt_FromLong((long)c);\r
4845 PyObject *x;\r
4846\r
4847 if (w == NULL)\r
4848 return -1;\r
4849 x = PyObject_GetItem(mapping, w);\r
4850 Py_DECREF(w);\r
4851 if (x == NULL) {\r
4852 if (PyErr_ExceptionMatches(PyExc_LookupError)) {\r
4853 /* No mapping found means: use 1:1 mapping. */\r
4854 PyErr_Clear();\r
4855 *result = NULL;\r
4856 return 0;\r
4857 } else\r
4858 return -1;\r
4859 }\r
4860 else if (x == Py_None) {\r
4861 *result = x;\r
4862 return 0;\r
4863 }\r
4864 else if (PyInt_Check(x)) {\r
4865 long value = PyInt_AS_LONG(x);\r
4866 long max = PyUnicode_GetMax();\r
4867 if (value < 0 || value > max) {\r
4868 PyErr_Format(PyExc_TypeError,\r
4869 "character mapping must be in range(0x%lx)", max+1);\r
4870 Py_DECREF(x);\r
4871 return -1;\r
4872 }\r
4873 *result = x;\r
4874 return 0;\r
4875 }\r
4876 else if (PyUnicode_Check(x)) {\r
4877 *result = x;\r
4878 return 0;\r
4879 }\r
4880 else {\r
4881 /* wrong return value */\r
4882 PyErr_SetString(PyExc_TypeError,\r
4883 "character mapping must return integer, None or unicode");\r
4884 Py_DECREF(x);\r
4885 return -1;\r
4886 }\r
4887}\r
4888/* ensure that *outobj is at least requiredsize characters long,\r
4889 if not reallocate and adjust various state variables.\r
4890 Return 0 on success, -1 on error */\r
4891static\r
4892int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,\r
4893 Py_ssize_t requiredsize)\r
4894{\r
4895 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);\r
4896 if (requiredsize > oldsize) {\r
4897 /* remember old output position */\r
4898 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);\r
4899 /* exponentially overallocate to minimize reallocations */\r
4900 if (requiredsize < 2 * oldsize)\r
4901 requiredsize = 2 * oldsize;\r
4902 if (PyUnicode_Resize(outobj, requiredsize) < 0)\r
4903 return -1;\r
4904 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;\r
4905 }\r
4906 return 0;\r
4907}\r
4908/* lookup the character, put the result in the output string and adjust\r
4909 various state variables. Return a new reference to the object that\r
4910 was put in the output buffer in *result, or Py_None, if the mapping was\r
4911 undefined (in which case no character was written).\r
4912 The called must decref result.\r
4913 Return 0 on success, -1 on error. */\r
4914static\r
4915int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,\r
4916 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,\r
4917 PyObject **res)\r
4918{\r
4919 if (charmaptranslate_lookup(*curinp, mapping, res))\r
4920 return -1;\r
4921 if (*res==NULL) {\r
4922 /* not found => default to 1:1 mapping */\r
4923 *(*outp)++ = *curinp;\r
4924 }\r
4925 else if (*res==Py_None)\r
4926 ;\r
4927 else if (PyInt_Check(*res)) {\r
4928 /* no overflow check, because we know that the space is enough */\r
4929 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);\r
4930 }\r
4931 else if (PyUnicode_Check(*res)) {\r
4932 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);\r
4933 if (repsize==1) {\r
4934 /* no overflow check, because we know that the space is enough */\r
4935 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);\r
4936 }\r
4937 else if (repsize!=0) {\r
4938 /* more than one character */\r
4939 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +\r
4940 (insize - (curinp-startinp)) +\r
4941 repsize - 1;\r
4942 if (charmaptranslate_makespace(outobj, outp, requiredsize))\r
4943 return -1;\r
4944 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);\r
4945 *outp += repsize;\r
4946 }\r
4947 }\r
4948 else\r
4949 return -1;\r
4950 return 0;\r
4951}\r
4952\r
4953PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,\r
4954 Py_ssize_t size,\r
4955 PyObject *mapping,\r
4956 const char *errors)\r
4957{\r
4958 /* output object */\r
4959 PyObject *res = NULL;\r
4960 /* pointers to the beginning and end+1 of input */\r
4961 const Py_UNICODE *startp = p;\r
4962 const Py_UNICODE *endp = p + size;\r
4963 /* pointer into the output */\r
4964 Py_UNICODE *str;\r
4965 /* current output position */\r
4966 Py_ssize_t respos = 0;\r
4967 char *reason = "character maps to <undefined>";\r
4968 PyObject *errorHandler = NULL;\r
4969 PyObject *exc = NULL;\r
4970 /* the following variable is used for caching string comparisons\r
4971 * -1=not initialized, 0=unknown, 1=strict, 2=replace,\r
4972 * 3=ignore, 4=xmlcharrefreplace */\r
4973 int known_errorHandler = -1;\r
4974\r
4975 if (mapping == NULL) {\r
4976 PyErr_BadArgument();\r
4977 return NULL;\r
4978 }\r
4979\r
4980 /* allocate enough for a simple 1:1 translation without\r
4981 replacements, if we need more, we'll resize */\r
4982 res = PyUnicode_FromUnicode(NULL, size);\r
4983 if (res == NULL)\r
4984 goto onError;\r
4985 if (size == 0)\r
4986 return res;\r
4987 str = PyUnicode_AS_UNICODE(res);\r
4988\r
4989 while (p<endp) {\r
4990 /* try to encode it */\r
4991 PyObject *x = NULL;\r
4992 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {\r
4993 Py_XDECREF(x);\r
4994 goto onError;\r
4995 }\r
4996 Py_XDECREF(x);\r
4997 if (x!=Py_None) /* it worked => adjust input pointer */\r
4998 ++p;\r
4999 else { /* untranslatable character */\r
5000 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */\r
5001 Py_ssize_t repsize;\r
5002 Py_ssize_t newpos;\r
5003 Py_UNICODE *uni2;\r
5004 /* startpos for collecting untranslatable chars */\r
5005 const Py_UNICODE *collstart = p;\r
5006 const Py_UNICODE *collend = p+1;\r
5007 const Py_UNICODE *coll;\r
5008\r
5009 /* find all untranslatable characters */\r
5010 while (collend < endp) {\r
5011 if (charmaptranslate_lookup(*collend, mapping, &x))\r
5012 goto onError;\r
5013 Py_XDECREF(x);\r
5014 if (x!=Py_None)\r
5015 break;\r
5016 ++collend;\r
5017 }\r
5018 /* cache callback name lookup\r
5019 * (if not done yet, i.e. it's the first error) */\r
5020 if (known_errorHandler==-1) {\r
5021 if ((errors==NULL) || (!strcmp(errors, "strict")))\r
5022 known_errorHandler = 1;\r
5023 else if (!strcmp(errors, "replace"))\r
5024 known_errorHandler = 2;\r
5025 else if (!strcmp(errors, "ignore"))\r
5026 known_errorHandler = 3;\r
5027 else if (!strcmp(errors, "xmlcharrefreplace"))\r
5028 known_errorHandler = 4;\r
5029 else\r
5030 known_errorHandler = 0;\r
5031 }\r
5032 switch (known_errorHandler) {\r
5033 case 1: /* strict */\r
5034 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);\r
5035 goto onError;\r
5036 case 2: /* replace */\r
5037 /* No need to check for space, this is a 1:1 replacement */\r
5038 for (coll = collstart; coll<collend; ++coll)\r
5039 *str++ = '?';\r
5040 /* fall through */\r
5041 case 3: /* ignore */\r
5042 p = collend;\r
5043 break;\r
5044 case 4: /* xmlcharrefreplace */\r
5045 /* generate replacement (temporarily (mis)uses p) */\r
5046 for (p = collstart; p < collend; ++p) {\r
5047 char buffer[2+29+1+1];\r
5048 char *cp;\r
5049 sprintf(buffer, "&#%d;", (int)*p);\r
5050 if (charmaptranslate_makespace(&res, &str,\r
5051 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))\r
5052 goto onError;\r
5053 for (cp = buffer; *cp; ++cp)\r
5054 *str++ = *cp;\r
5055 }\r
5056 p = collend;\r
5057 break;\r
5058 default:\r
5059 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,\r
5060 reason, startp, size, &exc,\r
5061 collstart-startp, collend-startp, &newpos);\r
5062 if (repunicode == NULL)\r
5063 goto onError;\r
5064 /* generate replacement */\r
5065 repsize = PyUnicode_GET_SIZE(repunicode);\r
5066 if (charmaptranslate_makespace(&res, &str,\r
5067 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {\r
5068 Py_DECREF(repunicode);\r
5069 goto onError;\r
5070 }\r
5071 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)\r
5072 *str++ = *uni2;\r
5073 p = startp + newpos;\r
5074 Py_DECREF(repunicode);\r
5075 }\r
5076 }\r
5077 }\r
5078 /* Resize if we allocated to much */\r
5079 respos = str-PyUnicode_AS_UNICODE(res);\r
5080 if (respos<PyUnicode_GET_SIZE(res)) {\r
5081 if (PyUnicode_Resize(&res, respos) < 0)\r
5082 goto onError;\r
5083 }\r
5084 Py_XDECREF(exc);\r
5085 Py_XDECREF(errorHandler);\r
5086 return res;\r
5087\r
5088 onError:\r
5089 Py_XDECREF(res);\r
5090 Py_XDECREF(exc);\r
5091 Py_XDECREF(errorHandler);\r
5092 return NULL;\r
5093}\r
5094\r
5095PyObject *PyUnicode_Translate(PyObject *str,\r
5096 PyObject *mapping,\r
5097 const char *errors)\r
5098{\r
5099 PyObject *result;\r
5100\r
5101 str = PyUnicode_FromObject(str);\r
5102 if (str == NULL)\r
5103 goto onError;\r
5104 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),\r
5105 PyUnicode_GET_SIZE(str),\r
5106 mapping,\r
5107 errors);\r
5108 Py_DECREF(str);\r
5109 return result;\r
5110\r
5111 onError:\r
5112 Py_XDECREF(str);\r
5113 return NULL;\r
5114}\r
5115\r
5116/* --- Decimal Encoder ---------------------------------------------------- */\r
5117\r
5118int PyUnicode_EncodeDecimal(Py_UNICODE *s,\r
5119 Py_ssize_t length,\r
5120 char *output,\r
5121 const char *errors)\r
5122{\r
5123 Py_UNICODE *p, *end;\r
5124 PyObject *errorHandler = NULL;\r
5125 PyObject *exc = NULL;\r
5126 const char *encoding = "decimal";\r
5127 const char *reason = "invalid decimal Unicode string";\r
5128 /* the following variable is used for caching string comparisons\r
5129 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */\r
5130 int known_errorHandler = -1;\r
5131\r
5132 if (output == NULL) {\r
5133 PyErr_BadArgument();\r
5134 return -1;\r
5135 }\r
5136\r
5137 p = s;\r
5138 end = s + length;\r
5139 while (p < end) {\r
5140 register Py_UNICODE ch = *p;\r
5141 int decimal;\r
5142 PyObject *repunicode;\r
5143 Py_ssize_t repsize;\r
5144 Py_ssize_t newpos;\r
5145 Py_UNICODE *uni2;\r
5146 Py_UNICODE *collstart;\r
5147 Py_UNICODE *collend;\r
5148\r
5149 if (Py_UNICODE_ISSPACE(ch)) {\r
5150 *output++ = ' ';\r
5151 ++p;\r
5152 continue;\r
5153 }\r
5154 decimal = Py_UNICODE_TODECIMAL(ch);\r
5155 if (decimal >= 0) {\r
5156 *output++ = '0' + decimal;\r
5157 ++p;\r
5158 continue;\r
5159 }\r
5160 if (0 < ch && ch < 256) {\r
5161 *output++ = (char)ch;\r
5162 ++p;\r
5163 continue;\r
5164 }\r
5165 /* All other characters are considered unencodable */\r
5166 collstart = p;\r
5167 collend = p+1;\r
5168 while (collend < end) {\r
5169 if ((0 < *collend && *collend < 256) ||\r
5170 !Py_UNICODE_ISSPACE(*collend) ||\r
5171 Py_UNICODE_TODECIMAL(*collend))\r
5172 break;\r
5173 }\r
5174 /* cache callback name lookup\r
5175 * (if not done yet, i.e. it's the first error) */\r
5176 if (known_errorHandler==-1) {\r
5177 if ((errors==NULL) || (!strcmp(errors, "strict")))\r
5178 known_errorHandler = 1;\r
5179 else if (!strcmp(errors, "replace"))\r
5180 known_errorHandler = 2;\r
5181 else if (!strcmp(errors, "ignore"))\r
5182 known_errorHandler = 3;\r
5183 else if (!strcmp(errors, "xmlcharrefreplace"))\r
5184 known_errorHandler = 4;\r
5185 else\r
5186 known_errorHandler = 0;\r
5187 }\r
5188 switch (known_errorHandler) {\r
5189 case 1: /* strict */\r
5190 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);\r
5191 goto onError;\r
5192 case 2: /* replace */\r
5193 for (p = collstart; p < collend; ++p)\r
5194 *output++ = '?';\r
5195 /* fall through */\r
5196 case 3: /* ignore */\r
5197 p = collend;\r
5198 break;\r
5199 case 4: /* xmlcharrefreplace */\r
5200 /* generate replacement (temporarily (mis)uses p) */\r
5201 for (p = collstart; p < collend; ++p)\r
5202 output += sprintf(output, "&#%d;", (int)*p);\r
5203 p = collend;\r
5204 break;\r
5205 default:\r
5206 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,\r
5207 encoding, reason, s, length, &exc,\r
5208 collstart-s, collend-s, &newpos);\r
5209 if (repunicode == NULL)\r
5210 goto onError;\r
5211 /* generate replacement */\r
5212 repsize = PyUnicode_GET_SIZE(repunicode);\r
5213 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {\r
5214 Py_UNICODE ch = *uni2;\r
5215 if (Py_UNICODE_ISSPACE(ch))\r
5216 *output++ = ' ';\r
5217 else {\r
5218 decimal = Py_UNICODE_TODECIMAL(ch);\r
5219 if (decimal >= 0)\r
5220 *output++ = '0' + decimal;\r
5221 else if (0 < ch && ch < 256)\r
5222 *output++ = (char)ch;\r
5223 else {\r
5224 Py_DECREF(repunicode);\r
5225 raise_encode_exception(&exc, encoding,\r
5226 s, length, collstart-s, collend-s, reason);\r
5227 goto onError;\r
5228 }\r
5229 }\r
5230 }\r
5231 p = s + newpos;\r
5232 Py_DECREF(repunicode);\r
5233 }\r
5234 }\r
5235 /* 0-terminate the output string */\r
5236 *output++ = '\0';\r
5237 Py_XDECREF(exc);\r
5238 Py_XDECREF(errorHandler);\r
5239 return 0;\r
5240\r
5241 onError:\r
5242 Py_XDECREF(exc);\r
5243 Py_XDECREF(errorHandler);\r
5244 return -1;\r
5245}\r
5246\r
5247/* --- Helpers ------------------------------------------------------------ */\r
5248\r
5249#include "stringlib/unicodedefs.h"\r
5250#include "stringlib/fastsearch.h"\r
5251\r
5252#include "stringlib/count.h"\r
5253#include "stringlib/find.h"\r
5254#include "stringlib/partition.h"\r
5255#include "stringlib/split.h"\r
5256\r
/* helper macro to fixup start/end slice values:
   `end` is clipped to `len`; a negative `start` or `end` is taken
   relative to `len` and clipped to 0 if it is still negative.
   `start` and `end` must be modifiable lvalues.  The body is wrapped in
   do { } while (0) so the macro behaves as one statement (e.g. in an
   unbraced if/else); invoke it with a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
5271\r
5272Py_ssize_t PyUnicode_Count(PyObject *str,\r
5273 PyObject *substr,\r
5274 Py_ssize_t start,\r
5275 Py_ssize_t end)\r
5276{\r
5277 Py_ssize_t result;\r
5278 PyUnicodeObject* str_obj;\r
5279 PyUnicodeObject* sub_obj;\r
5280\r
5281 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);\r
5282 if (!str_obj)\r
5283 return -1;\r
5284 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);\r
5285 if (!sub_obj) {\r
5286 Py_DECREF(str_obj);\r
5287 return -1;\r
5288 }\r
5289\r
5290 ADJUST_INDICES(start, end, str_obj->length);\r
5291 result = stringlib_count(\r
5292 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,\r
5293 PY_SSIZE_T_MAX\r
5294 );\r
5295\r
5296 Py_DECREF(sub_obj);\r
5297 Py_DECREF(str_obj);\r
5298\r
5299 return result;\r
5300}\r
5301\r
5302Py_ssize_t PyUnicode_Find(PyObject *str,\r
5303 PyObject *sub,\r
5304 Py_ssize_t start,\r
5305 Py_ssize_t end,\r
5306 int direction)\r
5307{\r
5308 Py_ssize_t result;\r
5309\r
5310 str = PyUnicode_FromObject(str);\r
5311 if (!str)\r
5312 return -2;\r
5313 sub = PyUnicode_FromObject(sub);\r
5314 if (!sub) {\r
5315 Py_DECREF(str);\r
5316 return -2;\r
5317 }\r
5318\r
5319 if (direction > 0)\r
5320 result = stringlib_find_slice(\r
5321 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),\r
5322 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),\r
5323 start, end\r
5324 );\r
5325 else\r
5326 result = stringlib_rfind_slice(\r
5327 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),\r
5328 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),\r
5329 start, end\r
5330 );\r
5331\r
5332 Py_DECREF(str);\r
5333 Py_DECREF(sub);\r
5334\r
5335 return result;\r
5336}\r
5337\r
5338static\r
5339int tailmatch(PyUnicodeObject *self,\r
5340 PyUnicodeObject *substring,\r
5341 Py_ssize_t start,\r
5342 Py_ssize_t end,\r
5343 int direction)\r
5344{\r
5345 if (substring->length == 0)\r
5346 return 1;\r
5347\r
5348 ADJUST_INDICES(start, end, self->length);\r
5349 end -= substring->length;\r
5350 if (end < start)\r
5351 return 0;\r
5352\r
5353 if (direction > 0) {\r
5354 if (Py_UNICODE_MATCH(self, end, substring))\r
5355 return 1;\r
5356 } else {\r
5357 if (Py_UNICODE_MATCH(self, start, substring))\r
5358 return 1;\r
5359 }\r
5360\r
5361 return 0;\r
5362}\r
5363\r
5364Py_ssize_t PyUnicode_Tailmatch(PyObject *str,\r
5365 PyObject *substr,\r
5366 Py_ssize_t start,\r
5367 Py_ssize_t end,\r
5368 int direction)\r
5369{\r
5370 Py_ssize_t result;\r
5371\r
5372 str = PyUnicode_FromObject(str);\r
5373 if (str == NULL)\r
5374 return -1;\r
5375 substr = PyUnicode_FromObject(substr);\r
5376 if (substr == NULL) {\r
5377 Py_DECREF(str);\r
5378 return -1;\r
5379 }\r
5380\r
5381 result = tailmatch((PyUnicodeObject *)str,\r
5382 (PyUnicodeObject *)substr,\r
5383 start, end, direction);\r
5384 Py_DECREF(str);\r
5385 Py_DECREF(substr);\r
5386 return result;\r
5387}\r
5388\r
5389/* Apply fixfct filter to the Unicode object self and return a\r
5390 reference to the modified object */\r
5391\r
5392static\r
5393PyObject *fixup(PyUnicodeObject *self,\r
5394 int (*fixfct)(PyUnicodeObject *s))\r
5395{\r
5396\r
5397 PyUnicodeObject *u;\r
5398\r
5399 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);\r
5400 if (u == NULL)\r
5401 return NULL;\r
5402\r
5403 Py_UNICODE_COPY(u->str, self->str, self->length);\r
5404\r
5405 if (!fixfct(u) && PyUnicode_CheckExact(self)) {\r
5406 /* fixfct should return TRUE if it modified the buffer. If\r
5407 FALSE, return a reference to the original buffer instead\r
5408 (to save space, not time) */\r
5409 Py_INCREF(self);\r
5410 Py_DECREF(u);\r
5411 return (PyObject*) self;\r
5412 }\r
5413 return (PyObject*) u;\r
5414}\r
5415\r
5416static\r
5417int fixupper(PyUnicodeObject *self)\r
5418{\r
5419 Py_ssize_t len = self->length;\r
5420 Py_UNICODE *s = self->str;\r
5421 int status = 0;\r
5422\r
5423 while (len-- > 0) {\r
5424 register Py_UNICODE ch;\r
5425\r
5426 ch = Py_UNICODE_TOUPPER(*s);\r
5427 if (ch != *s) {\r
5428 status = 1;\r
5429 *s = ch;\r
5430 }\r
5431 s++;\r
5432 }\r
5433\r
5434 return status;\r
5435}\r
5436\r
5437static\r
5438int fixlower(PyUnicodeObject *self)\r
5439{\r
5440 Py_ssize_t len = self->length;\r
5441 Py_UNICODE *s = self->str;\r
5442 int status = 0;\r
5443\r
5444 while (len-- > 0) {\r
5445 register Py_UNICODE ch;\r
5446\r
5447 ch = Py_UNICODE_TOLOWER(*s);\r
5448 if (ch != *s) {\r
5449 status = 1;\r
5450 *s = ch;\r
5451 }\r
5452 s++;\r
5453 }\r
5454\r
5455 return status;\r
5456}\r
5457\r
5458static\r
5459int fixswapcase(PyUnicodeObject *self)\r
5460{\r
5461 Py_ssize_t len = self->length;\r
5462 Py_UNICODE *s = self->str;\r
5463 int status = 0;\r
5464\r
5465 while (len-- > 0) {\r
5466 if (Py_UNICODE_ISUPPER(*s)) {\r
5467 *s = Py_UNICODE_TOLOWER(*s);\r
5468 status = 1;\r
5469 } else if (Py_UNICODE_ISLOWER(*s)) {\r
5470 *s = Py_UNICODE_TOUPPER(*s);\r
5471 status = 1;\r
5472 }\r
5473 s++;\r
5474 }\r
5475\r
5476 return status;\r
5477}\r
5478\r
5479static\r
5480int fixcapitalize(PyUnicodeObject *self)\r
5481{\r
5482 Py_ssize_t len = self->length;\r
5483 Py_UNICODE *s = self->str;\r
5484 int status = 0;\r
5485\r
5486 if (len == 0)\r
5487 return 0;\r
5488 if (Py_UNICODE_ISLOWER(*s)) {\r
5489 *s = Py_UNICODE_TOUPPER(*s);\r
5490 status = 1;\r
5491 }\r
5492 s++;\r
5493 while (--len > 0) {\r
5494 if (Py_UNICODE_ISUPPER(*s)) {\r
5495 *s = Py_UNICODE_TOLOWER(*s);\r
5496 status = 1;\r
5497 }\r
5498 s++;\r
5499 }\r
5500 return status;\r
5501}\r
5502\r
5503static\r
5504int fixtitle(PyUnicodeObject *self)\r
5505{\r
5506 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);\r
5507 register Py_UNICODE *e;\r
5508 int previous_is_cased;\r
5509\r
5510 /* Shortcut for single character strings */\r
5511 if (PyUnicode_GET_SIZE(self) == 1) {\r
5512 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);\r
5513 if (*p != ch) {\r
5514 *p = ch;\r
5515 return 1;\r
5516 }\r
5517 else\r
5518 return 0;\r
5519 }\r
5520\r
5521 e = p + PyUnicode_GET_SIZE(self);\r
5522 previous_is_cased = 0;\r
5523 for (; p < e; p++) {\r
5524 register const Py_UNICODE ch = *p;\r
5525\r
5526 if (previous_is_cased)\r
5527 *p = Py_UNICODE_TOLOWER(ch);\r
5528 else\r
5529 *p = Py_UNICODE_TOTITLE(ch);\r
5530\r
5531 if (Py_UNICODE_ISLOWER(ch) ||\r
5532 Py_UNICODE_ISUPPER(ch) ||\r
5533 Py_UNICODE_ISTITLE(ch))\r
5534 previous_is_cased = 1;\r
5535 else\r
5536 previous_is_cased = 0;\r
5537 }\r
5538 return 1;\r
5539}\r
5540\r
5541PyObject *\r
5542PyUnicode_Join(PyObject *separator, PyObject *seq)\r
5543{\r
5544 PyObject *internal_separator = NULL;\r
5545 const Py_UNICODE blank = ' ';\r
5546 const Py_UNICODE *sep = &blank;\r
5547 Py_ssize_t seplen = 1;\r
5548 PyUnicodeObject *res = NULL; /* the result */\r
5549 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */\r
5550 Py_ssize_t res_used; /* # used bytes */\r
5551 Py_UNICODE *res_p; /* pointer to free byte in res's string area */\r
5552 PyObject *fseq; /* PySequence_Fast(seq) */\r
5553 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */\r
5554 PyObject *item;\r
5555 Py_ssize_t i;\r
5556\r
5557 fseq = PySequence_Fast(seq, "");\r
5558 if (fseq == NULL) {\r
5559 return NULL;\r
5560 }\r
5561\r
5562 /* Grrrr. A codec may be invoked to convert str objects to\r
5563 * Unicode, and so it's possible to call back into Python code\r
5564 * during PyUnicode_FromObject(), and so it's possible for a sick\r
5565 * codec to change the size of fseq (if seq is a list). Therefore\r
5566 * we have to keep refetching the size -- can't assume seqlen\r
5567 * is invariant.\r
5568 */\r
5569 seqlen = PySequence_Fast_GET_SIZE(fseq);\r
5570 /* If empty sequence, return u"". */\r
5571 if (seqlen == 0) {\r
5572 res = _PyUnicode_New(0); /* empty sequence; return u"" */\r
5573 goto Done;\r
5574 }\r
5575 /* If singleton sequence with an exact Unicode, return that. */\r
5576 if (seqlen == 1) {\r
5577 item = PySequence_Fast_GET_ITEM(fseq, 0);\r
5578 if (PyUnicode_CheckExact(item)) {\r
5579 Py_INCREF(item);\r
5580 res = (PyUnicodeObject *)item;\r
5581 goto Done;\r
5582 }\r
5583 }\r
5584\r
5585 /* At least two items to join, or one that isn't exact Unicode. */\r
5586 if (seqlen > 1) {\r
5587 /* Set up sep and seplen -- they're needed. */\r
5588 if (separator == NULL) {\r
5589 sep = &blank;\r
5590 seplen = 1;\r
5591 }\r
5592 else {\r
5593 internal_separator = PyUnicode_FromObject(separator);\r
5594 if (internal_separator == NULL)\r
5595 goto onError;\r
5596 sep = PyUnicode_AS_UNICODE(internal_separator);\r
5597 seplen = PyUnicode_GET_SIZE(internal_separator);\r
5598 /* In case PyUnicode_FromObject() mutated seq. */\r
5599 seqlen = PySequence_Fast_GET_SIZE(fseq);\r
5600 }\r
5601 }\r
5602\r
5603 /* Get space. */\r
5604 res = _PyUnicode_New(res_alloc);\r
5605 if (res == NULL)\r
5606 goto onError;\r
5607 res_p = PyUnicode_AS_UNICODE(res);\r
5608 res_used = 0;\r
5609\r
5610 for (i = 0; i < seqlen; ++i) {\r
5611 Py_ssize_t itemlen;\r
5612 Py_ssize_t new_res_used;\r
5613\r
5614 item = PySequence_Fast_GET_ITEM(fseq, i);\r
5615 /* Convert item to Unicode. */\r
5616 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {\r
5617 PyErr_Format(PyExc_TypeError,\r
5618 "sequence item %zd: expected string or Unicode,"\r
5619 " %.80s found",\r
5620 i, Py_TYPE(item)->tp_name);\r
5621 goto onError;\r
5622 }\r
5623 item = PyUnicode_FromObject(item);\r
5624 if (item == NULL)\r
5625 goto onError;\r
5626 /* We own a reference to item from here on. */\r
5627\r
5628 /* In case PyUnicode_FromObject() mutated seq. */\r
5629 seqlen = PySequence_Fast_GET_SIZE(fseq);\r
5630\r
5631 /* Make sure we have enough space for the separator and the item. */\r
5632 itemlen = PyUnicode_GET_SIZE(item);\r
5633 new_res_used = res_used + itemlen;\r
5634 if (new_res_used < 0)\r
5635 goto Overflow;\r
5636 if (i < seqlen - 1) {\r
5637 new_res_used += seplen;\r
5638 if (new_res_used < 0)\r
5639 goto Overflow;\r
5640 }\r
5641 if (new_res_used > res_alloc) {\r
5642 /* double allocated size until it's big enough */\r
5643 do {\r
5644 res_alloc += res_alloc;\r
5645 if (res_alloc <= 0)\r
5646 goto Overflow;\r
5647 } while (new_res_used > res_alloc);\r
5648 if (_PyUnicode_Resize(&res, res_alloc) < 0) {\r
5649 Py_DECREF(item);\r
5650 goto onError;\r
5651 }\r
5652 res_p = PyUnicode_AS_UNICODE(res) + res_used;\r
5653 }\r
5654\r
5655 /* Copy item, and maybe the separator. */\r
5656 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);\r
5657 res_p += itemlen;\r
5658 if (i < seqlen - 1) {\r
5659 Py_UNICODE_COPY(res_p, sep, seplen);\r
5660 res_p += seplen;\r
5661 }\r
5662 Py_DECREF(item);\r
5663 res_used = new_res_used;\r
5664 }\r
5665\r
5666 /* Shrink res to match the used area; this probably can't fail,\r
5667 * but it's cheap to check.\r
5668 */\r
5669 if (_PyUnicode_Resize(&res, res_used) < 0)\r
5670 goto onError;\r
5671\r
5672 Done:\r
5673 Py_XDECREF(internal_separator);\r
5674 Py_DECREF(fseq);\r
5675 return (PyObject *)res;\r
5676\r
5677 Overflow:\r
5678 PyErr_SetString(PyExc_OverflowError,\r
5679 "join() result is too long for a Python string");\r
5680 Py_DECREF(item);\r
5681 /* fall through */\r
5682\r
5683 onError:\r
5684 Py_XDECREF(internal_separator);\r
5685 Py_DECREF(fseq);\r
5686 Py_XDECREF(res);\r
5687 return NULL;\r
5688}\r
5689\r
5690static\r
5691PyUnicodeObject *pad(PyUnicodeObject *self,\r
5692 Py_ssize_t left,\r
5693 Py_ssize_t right,\r
5694 Py_UNICODE fill)\r
5695{\r
5696 PyUnicodeObject *u;\r
5697\r
5698 if (left < 0)\r
5699 left = 0;\r
5700 if (right < 0)\r
5701 right = 0;\r
5702\r
5703 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {\r
5704 Py_INCREF(self);\r
5705 return self;\r
5706 }\r
5707\r
5708 if (left > PY_SSIZE_T_MAX - self->length ||\r
5709 right > PY_SSIZE_T_MAX - (left + self->length)) {\r
5710 PyErr_SetString(PyExc_OverflowError, "padded string is too long");\r
5711 return NULL;\r
5712 }\r
5713 u = _PyUnicode_New(left + self->length + right);\r
5714 if (u) {\r
5715 if (left)\r
5716 Py_UNICODE_FILL(u->str, fill, left);\r
5717 Py_UNICODE_COPY(u->str + left, self->str, self->length);\r
5718 if (right)\r
5719 Py_UNICODE_FILL(u->str + left + self->length, fill, right);\r
5720 }\r
5721\r
5722 return u;\r
5723}\r
5724\r
5725PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)\r
5726{\r
5727 PyObject *list;\r
5728\r
5729 string = PyUnicode_FromObject(string);\r
5730 if (string == NULL)\r
5731 return NULL;\r
5732\r
5733 list = stringlib_splitlines(\r
5734 (PyObject*) string, PyUnicode_AS_UNICODE(string),\r
5735 PyUnicode_GET_SIZE(string), keepends);\r
5736\r
5737 Py_DECREF(string);\r
5738 return list;\r
5739}\r
5740\r
5741static\r
5742PyObject *split(PyUnicodeObject *self,\r
5743 PyUnicodeObject *substring,\r
5744 Py_ssize_t maxcount)\r
5745{\r
5746 if (maxcount < 0)\r
5747 maxcount = PY_SSIZE_T_MAX;\r
5748\r
5749 if (substring == NULL)\r
5750 return stringlib_split_whitespace(\r
5751 (PyObject*) self, self->str, self->length, maxcount\r
5752 );\r
5753\r
5754 return stringlib_split(\r
5755 (PyObject*) self, self->str, self->length,\r
5756 substring->str, substring->length,\r
5757 maxcount\r
5758 );\r
5759}\r
5760\r
5761static\r
5762PyObject *rsplit(PyUnicodeObject *self,\r
5763 PyUnicodeObject *substring,\r
5764 Py_ssize_t maxcount)\r
5765{\r
5766 if (maxcount < 0)\r
5767 maxcount = PY_SSIZE_T_MAX;\r
5768\r
5769 if (substring == NULL)\r
5770 return stringlib_rsplit_whitespace(\r
5771 (PyObject*) self, self->str, self->length, maxcount\r
5772 );\r
5773\r
5774 return stringlib_rsplit(\r
5775 (PyObject*) self, self->str, self->length,\r
5776 substring->str, substring->length,\r
5777 maxcount\r
5778 );\r
5779}\r
5780\r
5781static\r
5782PyObject *replace(PyUnicodeObject *self,\r
5783 PyUnicodeObject *str1,\r
5784 PyUnicodeObject *str2,\r
5785 Py_ssize_t maxcount)\r
5786{\r
5787 PyUnicodeObject *u;\r
5788\r
5789 if (maxcount < 0)\r
5790 maxcount = PY_SSIZE_T_MAX;\r
5791 else if (maxcount == 0 || self->length == 0)\r
5792 goto nothing;\r
5793\r
5794 if (str1->length == str2->length) {\r
5795 Py_ssize_t i;\r
5796 /* same length */\r
5797 if (str1->length == 0)\r
5798 goto nothing;\r
5799 if (str1->length == 1) {\r
5800 /* replace characters */\r
5801 Py_UNICODE u1, u2;\r
5802 if (!findchar(self->str, self->length, str1->str[0]))\r
5803 goto nothing;\r
5804 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);\r
5805 if (!u)\r
5806 return NULL;\r
5807 Py_UNICODE_COPY(u->str, self->str, self->length);\r
5808 u1 = str1->str[0];\r
5809 u2 = str2->str[0];\r
5810 for (i = 0; i < u->length; i++)\r
5811 if (u->str[i] == u1) {\r
5812 if (--maxcount < 0)\r
5813 break;\r
5814 u->str[i] = u2;\r
5815 }\r
5816 } else {\r
5817 i = stringlib_find(\r
5818 self->str, self->length, str1->str, str1->length, 0\r
5819 );\r
5820 if (i < 0)\r
5821 goto nothing;\r
5822 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);\r
5823 if (!u)\r
5824 return NULL;\r
5825 Py_UNICODE_COPY(u->str, self->str, self->length);\r
5826\r
5827 /* change everything in-place, starting with this one */\r
5828 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);\r
5829 i += str1->length;\r
5830\r
5831 while ( --maxcount > 0) {\r
5832 i = stringlib_find(self->str+i, self->length-i,\r
5833 str1->str, str1->length,\r
5834 i);\r
5835 if (i == -1)\r
5836 break;\r
5837 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);\r
5838 i += str1->length;\r
5839 }\r
5840 }\r
5841 } else {\r
5842\r
5843 Py_ssize_t n, i, j;\r
5844 Py_ssize_t product, new_size, delta;\r
5845 Py_UNICODE *p;\r
5846\r
5847 /* replace strings */\r
5848 n = stringlib_count(self->str, self->length, str1->str, str1->length,\r
5849 maxcount);\r
5850 if (n == 0)\r
5851 goto nothing;\r
5852 /* new_size = self->length + n * (str2->length - str1->length)); */\r
5853 delta = (str2->length - str1->length);\r
5854 if (delta == 0) {\r
5855 new_size = self->length;\r
5856 } else {\r
5857 product = n * (str2->length - str1->length);\r
5858 if ((product / (str2->length - str1->length)) != n) {\r
5859 PyErr_SetString(PyExc_OverflowError,\r
5860 "replace string is too long");\r
5861 return NULL;\r
5862 }\r
5863 new_size = self->length + product;\r
5864 if (new_size < 0) {\r
5865 PyErr_SetString(PyExc_OverflowError,\r
5866 "replace string is too long");\r
5867 return NULL;\r
5868 }\r
5869 }\r
5870 u = _PyUnicode_New(new_size);\r
5871 if (!u)\r
5872 return NULL;\r
5873 i = 0;\r
5874 p = u->str;\r
5875 if (str1->length > 0) {\r
5876 while (n-- > 0) {\r
5877 /* look for next match */\r
5878 j = stringlib_find(self->str+i, self->length-i,\r
5879 str1->str, str1->length,\r
5880 i);\r
5881 if (j == -1)\r
5882 break;\r
5883 else if (j > i) {\r
5884 /* copy unchanged part [i:j] */\r
5885 Py_UNICODE_COPY(p, self->str+i, j-i);\r
5886 p += j - i;\r
5887 }\r
5888 /* copy substitution string */\r
5889 if (str2->length > 0) {\r
5890 Py_UNICODE_COPY(p, str2->str, str2->length);\r
5891 p += str2->length;\r
5892 }\r
5893 i = j + str1->length;\r
5894 }\r
5895 if (i < self->length)\r
5896 /* copy tail [i:] */\r
5897 Py_UNICODE_COPY(p, self->str+i, self->length-i);\r
5898 } else {\r
5899 /* interleave */\r
5900 while (n > 0) {\r
5901 Py_UNICODE_COPY(p, str2->str, str2->length);\r
5902 p += str2->length;\r
5903 if (--n <= 0)\r
5904 break;\r
5905 *p++ = self->str[i++];\r
5906 }\r
5907 Py_UNICODE_COPY(p, self->str+i, self->length-i);\r
5908 }\r
5909 }\r
5910 return (PyObject *) u;\r
5911\r
5912 nothing:\r
5913 /* nothing to replace; return original string (when possible) */\r
5914 if (PyUnicode_CheckExact(self)) {\r
5915 Py_INCREF(self);\r
5916 return (PyObject *) self;\r
5917 }\r
5918 return PyUnicode_FromUnicode(self->str, self->length);\r
5919}\r
5920\r
5921/* --- Unicode Object Methods --------------------------------------------- */\r
5922\r
5923PyDoc_STRVAR(title__doc__,\r
5924 "S.title() -> unicode\n\\r
5925\n\\r
5926Return a titlecased version of S, i.e. words start with title case\n\\r
5927characters, all remaining cased characters have lower case.");\r
5928\r
5929static PyObject*\r
5930unicode_title(PyUnicodeObject *self)\r
5931{\r
5932 return fixup(self, fixtitle);\r
5933}\r
5934\r
5935PyDoc_STRVAR(capitalize__doc__,\r
5936 "S.capitalize() -> unicode\n\\r
5937\n\\r
5938Return a capitalized version of S, i.e. make the first character\n\\r
5939have upper case and the rest lower case.");\r
5940\r
5941static PyObject*\r
5942unicode_capitalize(PyUnicodeObject *self)\r
5943{\r
5944 return fixup(self, fixcapitalize);\r
5945}\r
5946\r
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled method S.capwords(): split on whitespace, capitalize every
   word via fixup()/fixcapitalize, then re-join the words with
   PyUnicode_Join() and a NULL separator. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result;
    Py_ssize_t idx;

    /* Break the string into words. */
    words = split(self, NULL, -1);
    if (!words)
        return NULL;

    /* Swap every word for its capitalized version. */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *old_word = PyList_GET_ITEM(words, idx);

        result = fixup((PyUnicodeObject *)old_word, fixcapitalize);
        if (result == NULL)
            goto finish;
        Py_DECREF(old_word);
        PyList_SET_ITEM(words, idx, result);
    }

    /* Glue the words back together. */
    result = PyUnicode_Join(NULL, words);

  finish:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
5984\r
5985/* Argument converter. Coerces to a single unicode character */\r
5986\r
5987static int\r
5988convert_uc(PyObject *obj, void *addr)\r
5989{\r
5990 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;\r
5991 PyObject *uniobj;\r
5992 Py_UNICODE *unistr;\r
5993\r
5994 uniobj = PyUnicode_FromObject(obj);\r
5995 if (uniobj == NULL) {\r
5996 PyErr_SetString(PyExc_TypeError,\r
5997 "The fill character cannot be converted to Unicode");\r
5998 return 0;\r
5999 }\r
6000 if (PyUnicode_GET_SIZE(uniobj) != 1) {\r
6001 PyErr_SetString(PyExc_TypeError,\r
6002 "The fill character must be exactly one character long");\r
6003 Py_DECREF(uniobj);\r
6004 return 0;\r
6005 }\r
6006 unistr = PyUnicode_AS_UNICODE(uniobj);\r
6007 *fillcharloc = unistr[0];\r
6008 Py_DECREF(uniobj);\r
6009 return 1;\r
6010}\r
6011\r
6012PyDoc_STRVAR(center__doc__,\r
6013 "S.center(width[, fillchar]) -> unicode\n\\r
6014\n\\r
6015Return S centered in a Unicode string of length width. Padding is\n\\r
6016done using the specified fill character (default is a space)");\r
6017\r
6018static PyObject *\r
6019unicode_center(PyUnicodeObject *self, PyObject *args)\r
6020{\r
6021 Py_ssize_t marg, left;\r
6022 Py_ssize_t width;\r
6023 Py_UNICODE fillchar = ' ';\r
6024\r
6025 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))\r
6026 return NULL;\r
6027\r
6028 if (self->length >= width && PyUnicode_CheckExact(self)) {\r
6029 Py_INCREF(self);\r
6030 return (PyObject*) self;\r
6031 }\r
6032\r
6033 marg = width - self->length;\r
6034 left = marg / 2 + (marg & width & 1);\r
6035\r
6036 return (PyObject*) pad(self, left, marg - left, fillchar);\r
6037}\r
6038\r
6039#if 0\r
6040\r
6041/* This code should go into some future Unicode collation support\r
6042 module. The basic comparison should compare ordinals on a naive\r
6043 basis (this is what Java does and thus Jython too). */\r
6044\r
6045/* speedy UTF-16 code point order comparison */\r
6046/* gleaned from: */\r
6047/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */\r
6048\r
6049static short utf16Fixup[32] =\r
6050{\r
6051 0, 0, 0, 0, 0, 0, 0, 0,\r
6052 0, 0, 0, 0, 0, 0, 0, 0,\r
6053 0, 0, 0, 0, 0, 0, 0, 0,\r
6054 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800\r
6055};\r
6056\r
6057static int\r
6058unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)\r
6059{\r
6060 Py_ssize_t len1, len2;\r
6061\r
6062 Py_UNICODE *s1 = str1->str;\r
6063 Py_UNICODE *s2 = str2->str;\r
6064\r
6065 len1 = str1->length;\r
6066 len2 = str2->length;\r
6067\r
6068 while (len1 > 0 && len2 > 0) {\r
6069 Py_UNICODE c1, c2;\r
6070\r
6071 c1 = *s1++;\r
6072 c2 = *s2++;\r
6073\r
6074 if (c1 > (1<<11) * 26)\r
6075 c1 += utf16Fixup[c1>>11];\r
6076 if (c2 > (1<<11) * 26)\r
6077 c2 += utf16Fixup[c2>>11];\r
6078 /* now c1 and c2 are in UTF-32-compatible order */\r
6079\r
6080 if (c1 != c2)\r
6081 return (c1 < c2) ? -1 : 1;\r
6082\r
6083 len1--; len2--;\r
6084 }\r
6085\r
6086 return (len1 < len2) ? -1 : (len1 != len2);\r
6087}\r
6088\r
6089#else\r
6090\r
6091static int\r
6092unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)\r
6093{\r
6094 register Py_ssize_t len1, len2;\r
6095\r
6096 Py_UNICODE *s1 = str1->str;\r
6097 Py_UNICODE *s2 = str2->str;\r
6098\r
6099 len1 = str1->length;\r
6100 len2 = str2->length;\r
6101\r
6102 while (len1 > 0 && len2 > 0) {\r
6103 Py_UNICODE c1, c2;\r
6104\r
6105 c1 = *s1++;\r
6106 c2 = *s2++;\r
6107\r
6108 if (c1 != c2)\r
6109 return (c1 < c2) ? -1 : 1;\r
6110\r
6111 len1--; len2--;\r
6112 }\r
6113\r
6114 return (len1 < len2) ? -1 : (len1 != len2);\r
6115}\r
6116\r
6117#endif\r
6118\r
6119int PyUnicode_Compare(PyObject *left,\r
6120 PyObject *right)\r
6121{\r
6122 PyUnicodeObject *u = NULL, *v = NULL;\r
6123 int result;\r
6124\r
6125 /* Coerce the two arguments */\r
6126 u = (PyUnicodeObject *)PyUnicode_FromObject(left);\r
6127 if (u == NULL)\r
6128 goto onError;\r
6129 v = (PyUnicodeObject *)PyUnicode_FromObject(right);\r
6130 if (v == NULL)\r
6131 goto onError;\r
6132\r
6133 /* Shortcut for empty or interned objects */\r
6134 if (v == u) {\r
6135 Py_DECREF(u);\r
6136 Py_DECREF(v);\r
6137 return 0;\r
6138 }\r
6139\r
6140 result = unicode_compare(u, v);\r
6141\r
6142 Py_DECREF(u);\r
6143 Py_DECREF(v);\r
6144 return result;\r
6145\r
6146 onError:\r
6147 Py_XDECREF(u);\r
6148 Py_XDECREF(v);\r
6149 return -1;\r
6150}\r
6151\r
6152PyObject *PyUnicode_RichCompare(PyObject *left,\r
6153 PyObject *right,\r
6154 int op)\r
6155{\r
6156 int result;\r
6157\r
6158 result = PyUnicode_Compare(left, right);\r
6159 if (result == -1 && PyErr_Occurred())\r
6160 goto onError;\r
6161\r
6162 /* Convert the return value to a Boolean */\r
6163 switch (op) {\r
6164 case Py_EQ:\r
6165 result = (result == 0);\r
6166 break;\r
6167 case Py_NE:\r
6168 result = (result != 0);\r
6169 break;\r
6170 case Py_LE:\r
6171 result = (result <= 0);\r
6172 break;\r
6173 case Py_GE:\r
6174 result = (result >= 0);\r
6175 break;\r
6176 case Py_LT:\r
6177 result = (result == -1);\r
6178 break;\r
6179 case Py_GT:\r
6180 result = (result == 1);\r
6181 break;\r
6182 }\r
6183 return PyBool_FromLong(result);\r
6184\r
6185 onError:\r
6186\r
6187 /* Standard case\r
6188\r
6189 Type errors mean that PyUnicode_FromObject() could not convert\r
6190 one of the arguments (usually the right hand side) to Unicode,\r
6191 ie. we can't handle the comparison request. However, it is\r
6192 possible that the other object knows a comparison method, which\r
6193 is why we return Py_NotImplemented to give the other object a\r
6194 chance.\r
6195\r
6196 */\r
6197 if (PyErr_ExceptionMatches(PyExc_TypeError)) {\r
6198 PyErr_Clear();\r
6199 Py_INCREF(Py_NotImplemented);\r
6200 return Py_NotImplemented;\r
6201 }\r
6202 if (op != Py_EQ && op != Py_NE)\r
6203 return NULL;\r
6204\r
6205 /* Equality comparison.\r
6206\r
6207 This is a special case: we silence any PyExc_UnicodeDecodeError\r
6208 and instead turn it into a PyErr_UnicodeWarning.\r
6209\r
6210 */\r
6211 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))\r
6212 return NULL;\r
6213 PyErr_Clear();\r
6214 if (PyErr_Warn(PyExc_UnicodeWarning,\r
6215 (op == Py_EQ) ?\r
6216 "Unicode equal comparison "\r
6217 "failed to convert both arguments to Unicode - "\r
6218 "interpreting them as being unequal" :\r
6219 "Unicode unequal comparison "\r
6220 "failed to convert both arguments to Unicode - "\r
6221 "interpreting them as being unequal"\r
6222 ) < 0)\r
6223 return NULL;\r
6224 result = (op == Py_NE);\r
6225 return PyBool_FromLong(result);\r
6226}\r
6227\r
6228int PyUnicode_Contains(PyObject *container,\r
6229 PyObject *element)\r
6230{\r
6231 PyObject *str, *sub;\r
6232 int result;\r
6233\r
6234 /* Coerce the two arguments */\r
6235 sub = PyUnicode_FromObject(element);\r
6236 if (!sub) {\r
6237 return -1;\r
6238 }\r
6239\r
6240 str = PyUnicode_FromObject(container);\r
6241 if (!str) {\r
6242 Py_DECREF(sub);\r
6243 return -1;\r
6244 }\r
6245\r
6246 result = stringlib_contains_obj(str, sub);\r
6247\r
6248 Py_DECREF(str);\r
6249 Py_DECREF(sub);\r
6250\r
6251 return result;\r
6252}\r
6253\r
6254/* Concat to string or Unicode object giving a new Unicode object. */\r
6255\r
6256PyObject *PyUnicode_Concat(PyObject *left,\r
6257 PyObject *right)\r
6258{\r
6259 PyUnicodeObject *u = NULL, *v = NULL, *w;\r
6260\r
6261 /* Coerce the two arguments */\r
6262 u = (PyUnicodeObject *)PyUnicode_FromObject(left);\r
6263 if (u == NULL)\r
6264 goto onError;\r
6265 v = (PyUnicodeObject *)PyUnicode_FromObject(right);\r
6266 if (v == NULL)\r
6267 goto onError;\r
6268\r
6269 /* Shortcuts */\r
6270 if (v == unicode_empty) {\r
6271 Py_DECREF(v);\r
6272 return (PyObject *)u;\r
6273 }\r
6274 if (u == unicode_empty) {\r
6275 Py_DECREF(u);\r
6276 return (PyObject *)v;\r
6277 }\r
6278\r
6279 /* Concat the two Unicode strings */\r
6280 w = _PyUnicode_New(u->length + v->length);\r
6281 if (w == NULL)\r
6282 goto onError;\r
6283 Py_UNICODE_COPY(w->str, u->str, u->length);\r
6284 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);\r
6285\r
6286 Py_DECREF(u);\r
6287 Py_DECREF(v);\r
6288 return (PyObject *)w;\r
6289\r
6290 onError:\r
6291 Py_XDECREF(u);\r
6292 Py_XDECREF(v);\r
6293 return NULL;\r
6294}\r
6295\r
6296PyDoc_STRVAR(count__doc__,\r
6297 "S.count(sub[, start[, end]]) -> int\n\\r
6298\n\\r
6299Return the number of non-overlapping occurrences of substring sub in\n\\r
6300Unicode string S[start:end]. Optional arguments start and end are\n\\r
6301interpreted as in slice notation.");\r
6302\r
6303static PyObject *\r
6304unicode_count(PyUnicodeObject *self, PyObject *args)\r
6305{\r
6306 PyUnicodeObject *substring;\r
6307 Py_ssize_t start = 0;\r
6308 Py_ssize_t end = PY_SSIZE_T_MAX;\r
6309 PyObject *result;\r
6310\r
6311 if (!stringlib_parse_args_finds_unicode("count", args, &substring,\r
6312 &start, &end))\r
6313 return NULL;\r
6314\r
6315 ADJUST_INDICES(start, end, self->length);\r
6316 result = PyInt_FromSsize_t(\r
6317 stringlib_count(self->str + start, end - start,\r
6318 substring->str, substring->length,\r
6319 PY_SSIZE_T_MAX)\r
6320 );\r
6321\r
6322 Py_DECREF(substring);\r
6323\r
6324 return result;\r
6325}\r
6326\r
6327PyDoc_STRVAR(encode__doc__,\r
6328 "S.encode([encoding[,errors]]) -> string or unicode\n\\r
6329\n\\r
6330Encodes S using the codec registered for encoding. encoding defaults\n\\r
6331to the default encoding. errors may be given to set a different error\n\\r
6332handling scheme. Default is 'strict' meaning that encoding errors raise\n\\r
6333a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\\r
6334'xmlcharrefreplace' as well as any other name registered with\n\\r
6335codecs.register_error that can handle UnicodeEncodeErrors.");\r
6336\r
6337static PyObject *\r
6338unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)\r
6339{\r
6340 static char *kwlist[] = {"encoding", "errors", 0};\r
6341 char *encoding = NULL;\r
6342 char *errors = NULL;\r
6343 PyObject *v;\r
6344\r
6345 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",\r
6346 kwlist, &encoding, &errors))\r
6347 return NULL;\r
6348 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);\r
6349 if (v == NULL)\r
6350 goto onError;\r
6351 if (!PyString_Check(v) && !PyUnicode_Check(v)) {\r
6352 PyErr_Format(PyExc_TypeError,\r
6353 "encoder did not return a string/unicode object "\r
6354 "(type=%.400s)",\r
6355 Py_TYPE(v)->tp_name);\r
6356 Py_DECREF(v);\r
6357 return NULL;\r
6358 }\r
6359 return v;\r
6360\r
6361 onError:\r
6362 return NULL;\r
6363}\r
6364\r
6365PyDoc_STRVAR(decode__doc__,\r
6366 "S.decode([encoding[,errors]]) -> string or unicode\n\\r
6367\n\\r
6368Decodes S using the codec registered for encoding. encoding defaults\n\\r
6369to the default encoding. errors may be given to set a different error\n\\r
6370handling scheme. Default is 'strict' meaning that encoding errors raise\n\\r
6371a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\\r
6372as well as any other name registerd with codecs.register_error that is\n\\r
6373able to handle UnicodeDecodeErrors.");\r
6374\r
6375static PyObject *\r
6376unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)\r
6377{\r
6378 static char *kwlist[] = {"encoding", "errors", 0};\r
6379 char *encoding = NULL;\r
6380 char *errors = NULL;\r
6381 PyObject *v;\r
6382\r
6383 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",\r
6384 kwlist, &encoding, &errors))\r
6385 return NULL;\r
6386 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);\r
6387 if (v == NULL)\r
6388 goto onError;\r
6389 if (!PyString_Check(v) && !PyUnicode_Check(v)) {\r
6390 PyErr_Format(PyExc_TypeError,\r
6391 "decoder did not return a string/unicode object "\r
6392 "(type=%.400s)",\r
6393 Py_TYPE(v)->tp_name);\r
6394 Py_DECREF(v);\r
6395 return NULL;\r
6396 }\r
6397 return v;\r
6398\r
6399 onError:\r
6400 return NULL;\r
6401}\r
6402\r
6403PyDoc_STRVAR(expandtabs__doc__,\r
6404 "S.expandtabs([tabsize]) -> unicode\n\\r
6405\n\\r
6406Return a copy of S where all tab characters are expanded using spaces.\n\\r
6407If tabsize is not given, a tab size of 8 characters is assumed.");\r
6408\r
6409static PyObject*\r
6410unicode_expandtabs(PyUnicodeObject *self, PyObject *args)\r
6411{\r
6412 Py_UNICODE *e;\r
6413 Py_UNICODE *p;\r
6414 Py_UNICODE *q;\r
6415 Py_UNICODE *qe;\r
6416 Py_ssize_t i, j, incr;\r
6417 PyUnicodeObject *u;\r
6418 int tabsize = 8;\r
6419\r
6420 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))\r
6421 return NULL;\r
6422\r
6423 /* First pass: determine size of output string */\r
6424 i = 0; /* chars up to and including most recent \n or \r */\r
6425 j = 0; /* chars since most recent \n or \r (use in tab calculations) */\r
6426 e = self->str + self->length; /* end of input */\r
6427 for (p = self->str; p < e; p++)\r
6428 if (*p == '\t') {\r
6429 if (tabsize > 0) {\r
6430 incr = tabsize - (j % tabsize); /* cannot overflow */\r
6431 if (j > PY_SSIZE_T_MAX - incr)\r
6432 goto overflow1;\r
6433 j += incr;\r
6434 }\r
6435 }\r
6436 else {\r
6437 if (j > PY_SSIZE_T_MAX - 1)\r
6438 goto overflow1;\r
6439 j++;\r
6440 if (*p == '\n' || *p == '\r') {\r
6441 if (i > PY_SSIZE_T_MAX - j)\r
6442 goto overflow1;\r
6443 i += j;\r
6444 j = 0;\r
6445 }\r
6446 }\r
6447\r
6448 if (i > PY_SSIZE_T_MAX - j)\r
6449 goto overflow1;\r
6450\r
6451 /* Second pass: create output string and fill it */\r
6452 u = _PyUnicode_New(i + j);\r
6453 if (!u)\r
6454 return NULL;\r
6455\r
6456 j = 0; /* same as in first pass */\r
6457 q = u->str; /* next output char */\r
6458 qe = u->str + u->length; /* end of output */\r
6459\r
6460 for (p = self->str; p < e; p++)\r
6461 if (*p == '\t') {\r
6462 if (tabsize > 0) {\r
6463 i = tabsize - (j % tabsize);\r
6464 j += i;\r
6465 while (i--) {\r
6466 if (q >= qe)\r
6467 goto overflow2;\r
6468 *q++ = ' ';\r
6469 }\r
6470 }\r
6471 }\r
6472 else {\r
6473 if (q >= qe)\r
6474 goto overflow2;\r
6475 *q++ = *p;\r
6476 j++;\r
6477 if (*p == '\n' || *p == '\r')\r
6478 j = 0;\r
6479 }\r
6480\r
6481 return (PyObject*) u;\r
6482\r
6483 overflow2:\r
6484 Py_DECREF(u);\r
6485 overflow1:\r
6486 PyErr_SetString(PyExc_OverflowError, "new string is too long");\r
6487 return NULL;\r
6488}\r
6489\r
6490PyDoc_STRVAR(find__doc__,\r
6491 "S.find(sub [,start [,end]]) -> int\n\\r
6492\n\\r
6493Return the lowest index in S where substring sub is found,\n\\r
6494such that sub is contained within s[start:end]. Optional\n\\r
6495arguments start and end are interpreted as in slice notation.\n\\r
6496\n\\r
6497Return -1 on failure.");\r
6498\r
6499static PyObject *\r
6500unicode_find(PyUnicodeObject *self, PyObject *args)\r
6501{\r
6502 PyUnicodeObject *substring;\r
6503 Py_ssize_t start;\r
6504 Py_ssize_t end;\r
6505 Py_ssize_t result;\r
6506\r
6507 if (!stringlib_parse_args_finds_unicode("find", args, &substring,\r
6508 &start, &end))\r
6509 return NULL;\r
6510\r
6511 result = stringlib_find_slice(\r
6512 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),\r
6513 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),\r
6514 start, end\r
6515 );\r
6516\r
6517 Py_DECREF(substring);\r
6518\r
6519 return PyInt_FromSsize_t(result);\r
6520}\r
6521\r
6522static PyObject *\r
6523unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)\r
6524{\r
6525 if (index < 0 || index >= self->length) {\r
6526 PyErr_SetString(PyExc_IndexError, "string index out of range");\r
6527 return NULL;\r
6528 }\r
6529\r
6530 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);\r
6531}\r
6532\r
6533static long\r
6534unicode_hash(PyUnicodeObject *self)\r
6535{\r
6536 /* Since Unicode objects compare equal to their ASCII string\r
6537 counterparts, they should use the individual character values\r
6538 as basis for their hash value. This is needed to assure that\r
6539 strings and Unicode objects behave in the same way as\r
6540 dictionary keys. */\r
6541\r
6542 register Py_ssize_t len;\r
6543 register Py_UNICODE *p;\r
6544 register long x;\r
6545\r
6546 if (self->hash != -1)\r
6547 return self->hash;\r
6548 len = PyUnicode_GET_SIZE(self);\r
6549 p = PyUnicode_AS_UNICODE(self);\r
6550 x = *p << 7;\r
6551 while (--len >= 0)\r
6552 x = (1000003*x) ^ *p++;\r
6553 x ^= PyUnicode_GET_SIZE(self);\r
6554 if (x == -1)\r
6555 x = -2;\r
6556 self->hash = x;\r
6557 return x;\r
6558}\r
6559\r
6560PyDoc_STRVAR(index__doc__,\r
6561 "S.index(sub [,start [,end]]) -> int\n\\r
6562\n\\r
6563Like S.find() but raise ValueError when the substring is not found.");\r
6564\r
6565static PyObject *\r
6566unicode_index(PyUnicodeObject *self, PyObject *args)\r
6567{\r
6568 Py_ssize_t result;\r
6569 PyUnicodeObject *substring;\r
6570 Py_ssize_t start;\r
6571 Py_ssize_t end;\r
6572\r
6573 if (!stringlib_parse_args_finds_unicode("index", args, &substring,\r
6574 &start, &end))\r
6575 return NULL;\r
6576\r
6577 result = stringlib_find_slice(\r
6578 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),\r
6579 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),\r
6580 start, end\r
6581 );\r
6582\r
6583 Py_DECREF(substring);\r
6584\r
6585 if (result < 0) {\r
6586 PyErr_SetString(PyExc_ValueError, "substring not found");\r
6587 return NULL;\r
6588 }\r
6589\r
6590 return PyInt_FromSsize_t(result);\r
6591}\r
6592\r
6593PyDoc_STRVAR(islower__doc__,\r
6594 "S.islower() -> bool\n\\r
6595\n\\r
6596Return True if all cased characters in S are lowercase and there is\n\\r
6597at least one cased character in S, False otherwise.");\r
6598\r
6599static PyObject*\r
6600unicode_islower(PyUnicodeObject *self)\r
6601{\r
6602 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);\r
6603 register const Py_UNICODE *e;\r
6604 int cased;\r
6605\r
6606 /* Shortcut for single character strings */\r
6607 if (PyUnicode_GET_SIZE(self) == 1)\r
6608 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));\r
6609\r
6610 /* Special case for empty strings */\r
6611 if (PyUnicode_GET_SIZE(self) == 0)\r
6612 return PyBool_FromLong(0);\r
6613\r
6614 e = p + PyUnicode_GET_SIZE(self);\r
6615 cased = 0;\r
6616 for (; p < e; p++) {\r
6617 register const Py_UNICODE ch = *p;\r
6618\r
6619 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))\r
6620 return PyBool_FromLong(0);\r
6621 else if (!cased && Py_UNICODE_ISLOWER(ch))\r
6622 cased = 1;\r
6623 }\r
6624 return PyBool_FromLong(cased);\r
6625}\r
6626\r
6627PyDoc_STRVAR(isupper__doc__,\r
6628 "S.isupper() -> bool\n\\r
6629\n\\r
6630Return True if all cased characters in S are uppercase and there is\n\\r
6631at least one cased character in S, False otherwise.");\r
6632\r
6633static PyObject*\r
6634unicode_isupper(PyUnicodeObject *self)\r
6635{\r
6636 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);\r
6637 register const Py_UNICODE *e;\r
6638 int cased;\r
6639\r
6640 /* Shortcut for single character strings */\r
6641 if (PyUnicode_GET_SIZE(self) == 1)\r
6642 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);\r
6643\r
6644 /* Special case for empty strings */\r
6645 if (PyUnicode_GET_SIZE(self) == 0)\r
6646 return PyBool_FromLong(0);\r
6647\r
6648 e = p + PyUnicode_GET_SIZE(self);\r
6649 cased = 0;\r
6650 for (; p < e; p++) {\r
6651 register const Py_UNICODE ch = *p;\r
6652\r
6653 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))\r
6654 return PyBool_FromLong(0);\r
6655 else if (!cased && Py_UNICODE_ISUPPER(ch))\r
6656 cased = 1;\r
6657 }\r
6658 return PyBool_FromLong(cased);\r
6659}\r
6660\r
6661PyDoc_STRVAR(istitle__doc__,\r
6662 "S.istitle() -> bool\n\\r
6663\n\\r
6664Return True if S is a titlecased string and there is at least one\n\\r
6665character in S, i.e. upper- and titlecase characters may only\n\\r
6666follow uncased characters and lowercase characters only cased ones.\n\\r
6667Return False otherwise.");\r
6668\r
6669static PyObject*\r
6670unicode_istitle(PyUnicodeObject *self)\r
6671{\r
6672 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);\r
6673 register const Py_UNICODE *e;\r
6674 int cased, previous_is_cased;\r
6675\r
6676 /* Shortcut for single character strings */\r
6677 if (PyUnicode_GET_SIZE(self) == 1)\r
6678 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||\r
6679 (Py_UNICODE_ISUPPER(*p) != 0));\r
6680\r
6681 /* Special case for empty strings */\r
6682 if (PyUnicode_GET_SIZE(self) == 0)\r
6683 return PyBool_FromLong(0);\r
6684\r
6685 e = p + PyUnicode_GET_SIZE(self);\r
6686 cased = 0;\r
6687 previous_is_cased = 0;\r
6688 for (; p < e; p++) {\r
6689 register const Py_UNICODE ch = *p;\r
6690\r
6691 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {\r
6692 if (previous_is_cased)\r
6693 return PyBool_FromLong(0);\r
6694 previous_is_cased = 1;\r
6695 cased = 1;\r
6696 }\r
6697 else if (Py_UNICODE_ISLOWER(ch)) {\r
6698 if (!previous_is_cased)\r
6699 return PyBool_FromLong(0);\r
6700 previous_is_cased = 1;\r
6701 cased = 1;\r
6702 }\r
6703 else\r
6704 previous_is_cased = 0;\r
6705 }\r
6706 return PyBool_FromLong(cased);\r
6707}\r
6708\r
6709PyDoc_STRVAR(isspace__doc__,\r
6710 "S.isspace() -> bool\n\\r
6711\n\\r
6712Return True if all characters in S are whitespace\n\\r
6713and there is at least one character in S, False otherwise.");\r
6714\r
6715static PyObject*\r
6716unicode_isspace(PyUnicodeObject *self)\r
6717{\r
6718 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);\r
6719 register const Py_UNICODE *e;\r
6720\r
6721 /* Shortcut for single character strings */\r
6722 if (PyUnicode_GET_SIZE(self) == 1 &&\r
6723 Py_UNICODE_ISSPACE(*p))\r
6724 return PyBool_FromLong(1);\r
6725\r
6726 /* Special case for empty strings */\r
6727 if (PyUnicode_GET_SIZE(self) == 0)\r
6728 return PyBool_FromLong(0);\r
6729\r
6730 e = p + PyUnicode_GET_SIZE(self);\r
6731 for (; p < e; p++) {\r
6732 if (!Py_UNICODE_ISSPACE(*p))\r
6733 return PyBool_FromLong(0);\r
6734 }\r
6735 return PyBool_FromLong(1);\r
6736}\r
6737\r
6738PyDoc_STRVAR(isalpha__doc__,\r
6739 "S.isalpha() -> bool\n\\r
6740\n\\r
6741Return True if all characters in S are alphabetic\n\\r
6742and there is at least one character in S, False otherwise.");\r
6743\r
6744static PyObject*\r
6745unicode_isalpha(PyUnicodeObject *self)\r
6746{\r
6747 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);\r
6748 register const Py_UNICODE *e;\r
6749\r
6750 /* Shortcut for single character strings */\r
6751 if (PyUnicode_GET_SIZE(self) == 1 &&\r
6752 Py_UNICODE_ISALPHA(*p))\r
6753 return PyBool_FromLong(1);\r
6754\r
6755 /* Special case for empty strings */\r
6756 if (PyUnicode_GET_SIZE(self) == 0)\r
6757 return PyBool_FromLong(0);\r
6758\r
6759 e = p + PyUnicode_GET_SIZE(self);\r
6760 for (; p < e; p++) {\r
6761 if (!Py_UNICODE_ISALPHA(*p))\r
6762 return PyBool_FromLong(0);\r
6763 }\r
6764 return PyBool_FromLong(1);\r
6765}\r
6766\r
6767PyDoc_STRVAR(isalnum__doc__,\r
6768 "S.isalnum() -> bool\n\\r
6769\n\\r
6770Return True if all characters in S are alphanumeric\n\\r
6771and there is at least one character in S, False otherwise.");\r
6772\r
6773static PyObject*\r
6774unicode_isalnum(PyUnicodeObject *self)\r
6775{\r
6776 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);\r
6777 register const Py_UNICODE *e;\r
6778\r
6779 /* Shortcut for single character strings */\r
6780 if (PyUnicode_GET_SIZE(self) == 1 &&\r
6781 Py_UNICODE_ISALNUM(*p))\r
6782 return PyBool_FromLong(1);\r
6783\r
6784 /* Special case for empty strings */\r
6785 if (PyUnicode_GET_SIZE(self) == 0)\r
6786 return PyBool_FromLong(0);\r
6787\r
6788 e = p + PyUnicode_GET_SIZE(self);\r
6789 for (; p < e; p++) {\r
6790 if (!Py_UNICODE_ISALNUM(*p))\r
6791 return PyBool_FromLong(0);\r
6792 }\r
6793 return PyBool_FromLong(1);\r
6794}\r
6795\r
6796PyDoc_STRVAR(isdecimal__doc__,\r
6797 "S.isdecimal() -> bool\n\\r
6798\n\\r
6799Return True if there are only decimal characters in S,\n\\r
6800False otherwise.");\r
6801\r
6802static PyObject*\r
6803unicode_isdecimal(PyUnicodeObject *self)\r
6804{\r
6805 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);\r
6806 register const Py_UNICODE *e;\r
6807\r
6808 /* Shortcut for single character strings */\r
6809 if (PyUnicode_GET_SIZE(self) == 1 &&\r
6810 Py_UNICODE_ISDECIMAL(*p))\r
6811 return PyBool_FromLong(1);\r
6812\r
6813 /* Special case for empty strings */\r
6814 if (PyUnicode_GET_SIZE(self) == 0)\r
6815 return PyBool_FromLong(0);\r
6816\r
6817 e = p + PyUnicode_GET_SIZE(self);\r
6818 for (; p < e; p++) {\r
6819 if (!Py_UNICODE_ISDECIMAL(*p))\r
6820 return PyBool_FromLong(0);\r
6821 }\r
6822 return PyBool_FromLong(1);\r
6823}\r
6824\r
6825PyDoc_STRVAR(isdigit__doc__,\r
6826 "S.isdigit() -> bool\n\\r
6827\n\\r
6828Return True if all characters in S are digits\n\\r
6829and there is at least one character in S, False otherwise.");\r
6830\r
6831static PyObject*\r
6832unicode_isdigit(PyUnicodeObject *self)\r
6833{\r
6834 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);\r
6835 register const Py_UNICODE *e;\r
6836\r
6837 /* Shortcut for single character strings */\r
6838 if (PyUnicode_GET_SIZE(self) == 1 &&\r
6839 Py_UNICODE_ISDIGIT(*p))\r
6840 return PyBool_FromLong(1);\r
6841\r
6842 /* Special case for empty strings */\r
6843 if (PyUnicode_GET_SIZE(self) == 0)\r
6844 return PyBool_FromLong(0);\r
6845\r
6846 e = p + PyUnicode_GET_SIZE(self);\r
6847 for (; p < e; p++) {\r
6848 if (!Py_UNICODE_ISDIGIT(*p))\r
6849 return PyBool_FromLong(0);\r
6850 }\r
6851 return PyBool_FromLong(1);\r
6852}\r
6853\r
6854PyDoc_STRVAR(isnumeric__doc__,\r
6855 "S.isnumeric() -> bool\n\\r
6856\n\\r
6857Return True if there are only numeric characters in S,\n\\r
6858False otherwise.");\r
6859\r
6860static PyObject*\r
6861unicode_isnumeric(PyUnicodeObject *self)\r
6862{\r
6863 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);\r
6864 register const Py_UNICODE *e;\r
6865\r
6866 /* Shortcut for single character strings */\r
6867 if (PyUnicode_GET_SIZE(self) == 1 &&\r
6868 Py_UNICODE_ISNUMERIC(*p))\r
6869 return PyBool_FromLong(1);\r
6870\r
6871 /* Special case for empty strings */\r
6872 if (PyUnicode_GET_SIZE(self) == 0)\r
6873 return PyBool_FromLong(0);\r
6874\r
6875 e = p + PyUnicode_GET_SIZE(self);\r
6876 for (; p < e; p++) {\r
6877 if (!Py_UNICODE_ISNUMERIC(*p))\r
6878 return PyBool_FromLong(0);\r
6879 }\r
6880 return PyBool_FromLong(1);\r
6881}\r
6882\r
6883PyDoc_STRVAR(join__doc__,\r
6884 "S.join(iterable) -> unicode\n\\r
6885\n\\r
6886Return a string which is the concatenation of the strings in the\n\\r
6887iterable. The separator between elements is S.");\r
6888\r
6889static PyObject*\r
6890unicode_join(PyObject *self, PyObject *data)\r
6891{\r
6892 return PyUnicode_Join(self, data);\r
6893}\r
6894\r
6895static Py_ssize_t\r
6896unicode_length(PyUnicodeObject *self)\r
6897{\r
6898 return self->length;\r
6899}\r
6900\r
6901PyDoc_STRVAR(ljust__doc__,\r
6902 "S.ljust(width[, fillchar]) -> int\n\\r
6903\n\\r
6904Return S left-justified in a Unicode string of length width. Padding is\n\\r
6905done using the specified fill character (default is a space).");\r
6906\r
6907static PyObject *\r
6908unicode_ljust(PyUnicodeObject *self, PyObject *args)\r
6909{\r
6910 Py_ssize_t width;\r
6911 Py_UNICODE fillchar = ' ';\r
6912\r
6913 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))\r
6914 return NULL;\r
6915\r
6916 if (self->length >= width && PyUnicode_CheckExact(self)) {\r
6917 Py_INCREF(self);\r
6918 return (PyObject*) self;\r
6919 }\r
6920\r
6921 return (PyObject*) pad(self, 0, width - self->length, fillchar);\r
6922}\r
6923\r
6924PyDoc_STRVAR(lower__doc__,\r
6925 "S.lower() -> unicode\n\\r
6926\n\\r
6927Return a copy of the string S converted to lowercase.");\r
6928\r
6929static PyObject*\r
6930unicode_lower(PyUnicodeObject *self)\r
6931{\r
6932 return fixup(self, fixlower);\r
6933}\r
6934\r
6935#define LEFTSTRIP 0\r
6936#define RIGHTSTRIP 1\r
6937#define BOTHSTRIP 2\r
6938\r
6939/* Arrays indexed by above */\r
6940static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};\r
6941\r
6942#define STRIPNAME(i) (stripformat[i]+3)\r
6943\r
6944/* externally visible for str.strip(unicode) */\r
6945PyObject *\r
6946_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)\r
6947{\r
6948 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);\r
6949 Py_ssize_t len = PyUnicode_GET_SIZE(self);\r
6950 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);\r
6951 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);\r
6952 Py_ssize_t i, j;\r
6953\r
6954 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);\r
6955\r
6956 i = 0;\r
6957 if (striptype != RIGHTSTRIP) {\r
6958 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {\r
6959 i++;\r
6960 }\r
6961 }\r
6962\r
6963 j = len;\r
6964 if (striptype != LEFTSTRIP) {\r
6965 do {\r
6966 j--;\r
6967 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));\r
6968 j++;\r
6969 }\r
6970\r
6971 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {\r
6972 Py_INCREF(self);\r
6973 return (PyObject*)self;\r
6974 }\r
6975 else\r
6976 return PyUnicode_FromUnicode(s+i, j-i);\r
6977}\r
6978\r
6979\r
6980static PyObject *\r
6981do_strip(PyUnicodeObject *self, int striptype)\r
6982{\r
6983 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);\r
6984 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;\r
6985\r
6986 i = 0;\r
6987 if (striptype != RIGHTSTRIP) {\r
6988 while (i < len && Py_UNICODE_ISSPACE(s[i])) {\r
6989 i++;\r
6990 }\r
6991 }\r
6992\r
6993 j = len;\r
6994 if (striptype != LEFTSTRIP) {\r
6995 do {\r
6996 j--;\r
6997 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));\r
6998 j++;\r
6999 }\r
7000\r
7001 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {\r
7002 Py_INCREF(self);\r
7003 return (PyObject*)self;\r
7004 }\r
7005 else\r
7006 return PyUnicode_FromUnicode(s+i, j-i);\r
7007}\r
7008\r
7009\r
7010static PyObject *\r
7011do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)\r
7012{\r
7013 PyObject *sep = NULL;\r
7014\r
7015 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))\r
7016 return NULL;\r
7017\r
7018 if (sep != NULL && sep != Py_None) {\r
7019 if (PyUnicode_Check(sep))\r
7020 return _PyUnicode_XStrip(self, striptype, sep);\r
7021 else if (PyString_Check(sep)) {\r
7022 PyObject *res;\r
7023 sep = PyUnicode_FromObject(sep);\r
7024 if (sep==NULL)\r
7025 return NULL;\r
7026 res = _PyUnicode_XStrip(self, striptype, sep);\r
7027 Py_DECREF(sep);\r
7028 return res;\r
7029 }\r
7030 else {\r
7031 PyErr_Format(PyExc_TypeError,\r
7032 "%s arg must be None, unicode or str",\r
7033 STRIPNAME(striptype));\r
7034 return NULL;\r
7035 }\r
7036 }\r
7037\r
7038 return do_strip(self, striptype);\r
7039}\r
7040\r
7041\r
7042PyDoc_STRVAR(strip__doc__,\r
7043 "S.strip([chars]) -> unicode\n\\r
7044\n\\r
7045Return a copy of the string S with leading and trailing\n\\r
7046whitespace removed.\n\\r
7047If chars is given and not None, remove characters in chars instead.\n\\r
7048If chars is a str, it will be converted to unicode before stripping");\r
7049\r
7050static PyObject *\r
7051unicode_strip(PyUnicodeObject *self, PyObject *args)\r
7052{\r
7053 if (PyTuple_GET_SIZE(args) == 0)\r
7054 return do_strip(self, BOTHSTRIP); /* Common case */\r
7055 else\r
7056 return do_argstrip(self, BOTHSTRIP, args);\r
7057}\r
7058\r
7059\r
7060PyDoc_STRVAR(lstrip__doc__,\r
7061 "S.lstrip([chars]) -> unicode\n\\r
7062\n\\r
7063Return a copy of the string S with leading whitespace removed.\n\\r
7064If chars is given and not None, remove characters in chars instead.\n\\r
7065If chars is a str, it will be converted to unicode before stripping");\r
7066\r
7067static PyObject *\r
7068unicode_lstrip(PyUnicodeObject *self, PyObject *args)\r
7069{\r
7070 if (PyTuple_GET_SIZE(args) == 0)\r
7071 return do_strip(self, LEFTSTRIP); /* Common case */\r
7072 else\r
7073 return do_argstrip(self, LEFTSTRIP, args);\r
7074}\r
7075\r
7076\r
7077PyDoc_STRVAR(rstrip__doc__,\r
7078 "S.rstrip([chars]) -> unicode\n\\r
7079\n\\r
7080Return a copy of the string S with trailing whitespace removed.\n\\r
7081If chars is given and not None, remove characters in chars instead.\n\\r
7082If chars is a str, it will be converted to unicode before stripping");\r
7083\r
7084static PyObject *\r
7085unicode_rstrip(PyUnicodeObject *self, PyObject *args)\r
7086{\r
7087 if (PyTuple_GET_SIZE(args) == 0)\r
7088 return do_strip(self, RIGHTSTRIP); /* Common case */\r
7089 else\r
7090 return do_argstrip(self, RIGHTSTRIP, args);\r
7091}\r
7092\r
7093\r
7094static PyObject*\r
7095unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)\r
7096{\r
7097 PyUnicodeObject *u;\r
7098 Py_UNICODE *p;\r
7099 Py_ssize_t nchars;\r
7100 size_t nbytes;\r
7101\r
7102 if (len < 0)\r
7103 len = 0;\r
7104\r
7105 if (len == 1 && PyUnicode_CheckExact(str)) {\r
7106 /* no repeat, return original string */\r
7107 Py_INCREF(str);\r
7108 return (PyObject*) str;\r
7109 }\r
7110\r
7111 /* ensure # of chars needed doesn't overflow int and # of bytes\r
7112 * needed doesn't overflow size_t\r
7113 */\r
7114 nchars = len * str->length;\r
7115 if (len && nchars / len != str->length) {\r
7116 PyErr_SetString(PyExc_OverflowError,\r
7117 "repeated string is too long");\r
7118 return NULL;\r
7119 }\r
7120 nbytes = (nchars + 1) * sizeof(Py_UNICODE);\r
7121 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {\r
7122 PyErr_SetString(PyExc_OverflowError,\r
7123 "repeated string is too long");\r
7124 return NULL;\r
7125 }\r
7126 u = _PyUnicode_New(nchars);\r
7127 if (!u)\r
7128 return NULL;\r
7129\r
7130 p = u->str;\r
7131\r
7132 if (str->length == 1 && len > 0) {\r
7133 Py_UNICODE_FILL(p, str->str[0], len);\r
7134 } else {\r
7135 Py_ssize_t done = 0; /* number of characters copied this far */\r
7136 if (done < nchars) {\r
7137 Py_UNICODE_COPY(p, str->str, str->length);\r
7138 done = str->length;\r
7139 }\r
7140 while (done < nchars) {\r
7141 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;\r
7142 Py_UNICODE_COPY(p+done, p, n);\r
7143 done += n;\r
7144 }\r
7145 }\r
7146\r
7147 return (PyObject*) u;\r
7148}\r
7149\r
7150PyObject *PyUnicode_Replace(PyObject *obj,\r
7151 PyObject *subobj,\r
7152 PyObject *replobj,\r
7153 Py_ssize_t maxcount)\r
7154{\r
7155 PyObject *self;\r
7156 PyObject *str1;\r
7157 PyObject *str2;\r
7158 PyObject *result;\r
7159\r
7160 self = PyUnicode_FromObject(obj);\r
7161 if (self == NULL)\r
7162 return NULL;\r
7163 str1 = PyUnicode_FromObject(subobj);\r
7164 if (str1 == NULL) {\r
7165 Py_DECREF(self);\r
7166 return NULL;\r
7167 }\r
7168 str2 = PyUnicode_FromObject(replobj);\r
7169 if (str2 == NULL) {\r
7170 Py_DECREF(self);\r
7171 Py_DECREF(str1);\r
7172 return NULL;\r
7173 }\r
7174 result = replace((PyUnicodeObject *)self,\r
7175 (PyUnicodeObject *)str1,\r
7176 (PyUnicodeObject *)str2,\r
7177 maxcount);\r
7178 Py_DECREF(self);\r
7179 Py_DECREF(str1);\r
7180 Py_DECREF(str2);\r
7181 return result;\r
7182}\r
7183\r
7184PyDoc_STRVAR(replace__doc__,\r
7185 "S.replace(old, new[, count]) -> unicode\n\\r
7186\n\\r
7187Return a copy of S with all occurrences of substring\n\\r
7188old replaced by new. If the optional argument count is\n\\r
7189given, only the first count occurrences are replaced.");\r
7190\r
7191static PyObject*\r
7192unicode_replace(PyUnicodeObject *self, PyObject *args)\r
7193{\r
7194 PyUnicodeObject *str1;\r
7195 PyUnicodeObject *str2;\r
7196 Py_ssize_t maxcount = -1;\r
7197 PyObject *result;\r
7198\r
7199 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))\r
7200 return NULL;\r
7201 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);\r
7202 if (str1 == NULL)\r
7203 return NULL;\r
7204 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);\r
7205 if (str2 == NULL) {\r
7206 Py_DECREF(str1);\r
7207 return NULL;\r
7208 }\r
7209\r
7210 result = replace(self, str1, str2, maxcount);\r
7211\r
7212 Py_DECREF(str1);\r
7213 Py_DECREF(str2);\r
7214 return result;\r
7215}\r
7216\r
7217static\r
7218PyObject *unicode_repr(PyObject *unicode)\r
7219{\r
7220 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),\r
7221 PyUnicode_GET_SIZE(unicode),\r
7222 1);\r
7223}\r
7224\r
7225PyDoc_STRVAR(rfind__doc__,\r
7226 "S.rfind(sub [,start [,end]]) -> int\n\\r
7227\n\\r
7228Return the highest index in S where substring sub is found,\n\\r
7229such that sub is contained within s[start:end]. Optional\n\\r
7230arguments start and end are interpreted as in slice notation.\n\\r
7231\n\\r
7232Return -1 on failure.");\r
7233\r
7234static PyObject *\r
7235unicode_rfind(PyUnicodeObject *self, PyObject *args)\r
7236{\r
7237 PyUnicodeObject *substring;\r
7238 Py_ssize_t start;\r
7239 Py_ssize_t end;\r
7240 Py_ssize_t result;\r
7241\r
7242 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,\r
7243 &start, &end))\r
7244 return NULL;\r
7245\r
7246 result = stringlib_rfind_slice(\r
7247 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),\r
7248 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),\r
7249 start, end\r
7250 );\r
7251\r
7252 Py_DECREF(substring);\r
7253\r
7254 return PyInt_FromSsize_t(result);\r
7255}\r
7256\r
7257PyDoc_STRVAR(rindex__doc__,\r
7258 "S.rindex(sub [,start [,end]]) -> int\n\\r
7259\n\\r
7260Like S.rfind() but raise ValueError when the substring is not found.");\r
7261\r
7262static PyObject *\r
7263unicode_rindex(PyUnicodeObject *self, PyObject *args)\r
7264{\r
7265 PyUnicodeObject *substring;\r
7266 Py_ssize_t start;\r
7267 Py_ssize_t end;\r
7268 Py_ssize_t result;\r
7269\r
7270 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,\r
7271 &start, &end))\r
7272 return NULL;\r
7273\r
7274 result = stringlib_rfind_slice(\r
7275 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),\r
7276 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),\r
7277 start, end\r
7278 );\r
7279\r
7280 Py_DECREF(substring);\r
7281\r
7282 if (result < 0) {\r
7283 PyErr_SetString(PyExc_ValueError, "substring not found");\r
7284 return NULL;\r
7285 }\r
7286 return PyInt_FromSsize_t(result);\r
7287}\r
7288\r
7289PyDoc_STRVAR(rjust__doc__,\r
7290 "S.rjust(width[, fillchar]) -> unicode\n\\r
7291\n\\r
7292Return S right-justified in a Unicode string of length width. Padding is\n\\r
7293done using the specified fill character (default is a space).");\r
7294\r
7295static PyObject *\r
7296unicode_rjust(PyUnicodeObject *self, PyObject *args)\r
7297{\r
7298 Py_ssize_t width;\r
7299 Py_UNICODE fillchar = ' ';\r
7300\r
7301 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))\r
7302 return NULL;\r
7303\r
7304 if (self->length >= width && PyUnicode_CheckExact(self)) {\r
7305 Py_INCREF(self);\r
7306 return (PyObject*) self;\r
7307 }\r
7308\r
7309 return (PyObject*) pad(self, width - self->length, 0, fillchar);\r
7310}\r
7311\r
7312static PyObject*\r
7313unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)\r
7314{\r
7315 /* standard clamping */\r
7316 if (start < 0)\r
7317 start = 0;\r
7318 if (end < 0)\r
7319 end = 0;\r
7320 if (end > self->length)\r
7321 end = self->length;\r
7322 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {\r
7323 /* full slice, return original string */\r
7324 Py_INCREF(self);\r
7325 return (PyObject*) self;\r
7326 }\r
7327 if (start > end)\r
7328 start = end;\r
7329 /* copy slice */\r
7330 return (PyObject*) PyUnicode_FromUnicode(self->str + start,\r
7331 end - start);\r
7332}\r
7333\r
7334PyObject *PyUnicode_Split(PyObject *s,\r
7335 PyObject *sep,\r
7336 Py_ssize_t maxsplit)\r
7337{\r
7338 PyObject *result;\r
7339\r
7340 s = PyUnicode_FromObject(s);\r
7341 if (s == NULL)\r
7342 return NULL;\r
7343 if (sep != NULL) {\r
7344 sep = PyUnicode_FromObject(sep);\r
7345 if (sep == NULL) {\r
7346 Py_DECREF(s);\r
7347 return NULL;\r
7348 }\r
7349 }\r
7350\r
7351 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);\r
7352\r
7353 Py_DECREF(s);\r
7354 Py_XDECREF(sep);\r
7355 return result;\r
7356}\r
7357\r
7358PyDoc_STRVAR(split__doc__,\r
7359 "S.split([sep [,maxsplit]]) -> list of strings\n\\r
7360\n\\r
7361Return a list of the words in S, using sep as the\n\\r
7362delimiter string. If maxsplit is given, at most maxsplit\n\\r
7363splits are done. If sep is not specified or is None, any\n\\r
7364whitespace string is a separator and empty strings are\n\\r
7365removed from the result.");\r
7366\r
7367static PyObject*\r
7368unicode_split(PyUnicodeObject *self, PyObject *args)\r
7369{\r
7370 PyObject *substring = Py_None;\r
7371 Py_ssize_t maxcount = -1;\r
7372\r
7373 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))\r
7374 return NULL;\r
7375\r
7376 if (substring == Py_None)\r
7377 return split(self, NULL, maxcount);\r
7378 else if (PyUnicode_Check(substring))\r
7379 return split(self, (PyUnicodeObject *)substring, maxcount);\r
7380 else\r
7381 return PyUnicode_Split((PyObject *)self, substring, maxcount);\r
7382}\r
7383\r
7384PyObject *\r
7385PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)\r
7386{\r
7387 PyObject* str_obj;\r
7388 PyObject* sep_obj;\r
7389 PyObject* out;\r
7390\r
7391 str_obj = PyUnicode_FromObject(str_in);\r
7392 if (!str_obj)\r
7393 return NULL;\r
7394 sep_obj = PyUnicode_FromObject(sep_in);\r
7395 if (!sep_obj) {\r
7396 Py_DECREF(str_obj);\r
7397 return NULL;\r
7398 }\r
7399\r
7400 out = stringlib_partition(\r
7401 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),\r
7402 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)\r
7403 );\r
7404\r
7405 Py_DECREF(sep_obj);\r
7406 Py_DECREF(str_obj);\r
7407\r
7408 return out;\r
7409}\r
7410\r
7411\r
7412PyObject *\r
7413PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)\r
7414{\r
7415 PyObject* str_obj;\r
7416 PyObject* sep_obj;\r
7417 PyObject* out;\r
7418\r
7419 str_obj = PyUnicode_FromObject(str_in);\r
7420 if (!str_obj)\r
7421 return NULL;\r
7422 sep_obj = PyUnicode_FromObject(sep_in);\r
7423 if (!sep_obj) {\r
7424 Py_DECREF(str_obj);\r
7425 return NULL;\r
7426 }\r
7427\r
7428 out = stringlib_rpartition(\r
7429 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),\r
7430 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)\r
7431 );\r
7432\r
7433 Py_DECREF(sep_obj);\r
7434 Py_DECREF(str_obj);\r
7435\r
7436 return out;\r
7437}\r
7438\r
7439PyDoc_STRVAR(partition__doc__,\r
7440 "S.partition(sep) -> (head, sep, tail)\n\\r
7441\n\\r
7442Search for the separator sep in S, and return the part before it,\n\\r
7443the separator itself, and the part after it. If the separator is not\n\\r
7444found, return S and two empty strings.");\r
7445\r
7446static PyObject*\r
7447unicode_partition(PyUnicodeObject *self, PyObject *separator)\r
7448{\r
7449 return PyUnicode_Partition((PyObject *)self, separator);\r
7450}\r
7451\r
7452PyDoc_STRVAR(rpartition__doc__,\r
7453 "S.rpartition(sep) -> (head, sep, tail)\n\\r
7454\n\\r
7455Search for the separator sep in S, starting at the end of S, and return\n\\r
7456the part before it, the separator itself, and the part after it. If the\n\\r
7457separator is not found, return two empty strings and S.");\r
7458\r
7459static PyObject*\r
7460unicode_rpartition(PyUnicodeObject *self, PyObject *separator)\r
7461{\r
7462 return PyUnicode_RPartition((PyObject *)self, separator);\r
7463}\r
7464\r
7465PyObject *PyUnicode_RSplit(PyObject *s,\r
7466 PyObject *sep,\r
7467 Py_ssize_t maxsplit)\r
7468{\r
7469 PyObject *result;\r
7470\r
7471 s = PyUnicode_FromObject(s);\r
7472 if (s == NULL)\r
7473 return NULL;\r
7474 if (sep != NULL) {\r
7475 sep = PyUnicode_FromObject(sep);\r
7476 if (sep == NULL) {\r
7477 Py_DECREF(s);\r
7478 return NULL;\r
7479 }\r
7480 }\r
7481\r
7482 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);\r
7483\r
7484 Py_DECREF(s);\r
7485 Py_XDECREF(sep);\r
7486 return result;\r
7487}\r
7488\r
7489PyDoc_STRVAR(rsplit__doc__,\r
7490 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\\r
7491\n\\r
7492Return a list of the words in S, using sep as the\n\\r
7493delimiter string, starting at the end of the string and\n\\r
7494working to the front. If maxsplit is given, at most maxsplit\n\\r
7495splits are done. If sep is not specified, any whitespace string\n\\r
7496is a separator.");\r
7497\r
7498static PyObject*\r
7499unicode_rsplit(PyUnicodeObject *self, PyObject *args)\r
7500{\r
7501 PyObject *substring = Py_None;\r
7502 Py_ssize_t maxcount = -1;\r
7503\r
7504 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))\r
7505 return NULL;\r
7506\r
7507 if (substring == Py_None)\r
7508 return rsplit(self, NULL, maxcount);\r
7509 else if (PyUnicode_Check(substring))\r
7510 return rsplit(self, (PyUnicodeObject *)substring, maxcount);\r
7511 else\r
7512 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);\r
7513}\r
7514\r
7515PyDoc_STRVAR(splitlines__doc__,\r
7516 "S.splitlines([keepends]) -> list of strings\n\\r
7517\n\\r
7518Return a list of the lines in S, breaking at line boundaries.\n\\r
7519Line breaks are not included in the resulting list unless keepends\n\\r
7520is given and true.");\r
7521\r
7522static PyObject*\r
7523unicode_splitlines(PyUnicodeObject *self, PyObject *args)\r
7524{\r
7525 int keepends = 0;\r
7526\r
7527 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))\r
7528 return NULL;\r
7529\r
7530 return PyUnicode_Splitlines((PyObject *)self, keepends);\r
7531}\r
7532\r
7533static\r
7534PyObject *unicode_str(PyUnicodeObject *self)\r
7535{\r
7536 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);\r
7537}\r
7538\r
7539PyDoc_STRVAR(swapcase__doc__,\r
7540 "S.swapcase() -> unicode\n\\r
7541\n\\r
7542Return a copy of S with uppercase characters converted to lowercase\n\\r
7543and vice versa.");\r
7544\r
7545static PyObject*\r
7546unicode_swapcase(PyUnicodeObject *self)\r
7547{\r
7548 return fixup(self, fixswapcase);\r
7549}\r
7550\r
7551PyDoc_STRVAR(translate__doc__,\r
7552 "S.translate(table) -> unicode\n\\r
7553\n\\r
7554Return a copy of the string S, where all characters have been mapped\n\\r
7555through the given translation table, which must be a mapping of\n\\r
7556Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\\r
7557Unmapped characters are left untouched. Characters mapped to None\n\\r
7558are deleted.");\r
7559\r
7560static PyObject*\r
7561unicode_translate(PyUnicodeObject *self, PyObject *table)\r
7562{\r
7563 return PyUnicode_TranslateCharmap(self->str,\r
7564 self->length,\r
7565 table,\r
7566 "ignore");\r
7567}\r
7568\r
7569PyDoc_STRVAR(upper__doc__,\r
7570 "S.upper() -> unicode\n\\r
7571\n\\r
7572Return a copy of S converted to uppercase.");\r
7573\r
7574static PyObject*\r
7575unicode_upper(PyUnicodeObject *self)\r
7576{\r
7577 return fixup(self, fixupper);\r
7578}\r
7579\r
7580PyDoc_STRVAR(zfill__doc__,\r
7581 "S.zfill(width) -> unicode\n\\r
7582\n\\r
7583Pad a numeric string S with zeros on the left, to fill a field\n\\r
7584of the specified width. The string S is never truncated.");\r
7585\r
7586static PyObject *\r
7587unicode_zfill(PyUnicodeObject *self, PyObject *args)\r
7588{\r
7589 Py_ssize_t fill;\r
7590 PyUnicodeObject *u;\r
7591\r
7592 Py_ssize_t width;\r
7593 if (!PyArg_ParseTuple(args, "n:zfill", &width))\r
7594 return NULL;\r
7595\r
7596 if (self->length >= width) {\r
7597 if (PyUnicode_CheckExact(self)) {\r
7598 Py_INCREF(self);\r
7599 return (PyObject*) self;\r
7600 }\r
7601 else\r
7602 return PyUnicode_FromUnicode(\r
7603 PyUnicode_AS_UNICODE(self),\r
7604 PyUnicode_GET_SIZE(self)\r
7605 );\r
7606 }\r
7607\r
7608 fill = width - self->length;\r
7609\r
7610 u = pad(self, fill, 0, '0');\r
7611\r
7612 if (u == NULL)\r
7613 return NULL;\r
7614\r
7615 if (u->str[fill] == '+' || u->str[fill] == '-') {\r
7616 /* move sign to beginning of string */\r
7617 u->str[0] = u->str[fill];\r
7618 u->str[fill] = '0';\r
7619 }\r
7620\r
7621 return (PyObject*) u;\r
7622}\r
7623\r
#if 0
/* Compiled out by the surrounding "#if 0": returns the value of numfree
   as an int object. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyInt_FromLong(count);
}
#endif
7631\r
7632PyDoc_STRVAR(startswith__doc__,\r
7633 "S.startswith(prefix[, start[, end]]) -> bool\n\\r
7634\n\\r
7635Return True if S starts with the specified prefix, False otherwise.\n\\r
7636With optional start, test S beginning at that position.\n\\r
7637With optional end, stop comparing S at that position.\n\\r
7638prefix can also be a tuple of strings to try.");\r
7639\r
7640static PyObject *\r
7641unicode_startswith(PyUnicodeObject *self,\r
7642 PyObject *args)\r
7643{\r
7644 PyObject *subobj;\r
7645 PyUnicodeObject *substring;\r
7646 Py_ssize_t start = 0;\r
7647 Py_ssize_t end = PY_SSIZE_T_MAX;\r
7648 int result;\r
7649\r
7650 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))\r
7651 return NULL;\r
7652 if (PyTuple_Check(subobj)) {\r
7653 Py_ssize_t i;\r
7654 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {\r
7655 substring = (PyUnicodeObject *)PyUnicode_FromObject(\r
7656 PyTuple_GET_ITEM(subobj, i));\r
7657 if (substring == NULL)\r
7658 return NULL;\r
7659 result = tailmatch(self, substring, start, end, -1);\r
7660 Py_DECREF(substring);\r
7661 if (result) {\r
7662 Py_RETURN_TRUE;\r
7663 }\r
7664 }\r
7665 /* nothing matched */\r
7666 Py_RETURN_FALSE;\r
7667 }\r
7668 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);\r
7669 if (substring == NULL) {\r
7670 if (PyErr_ExceptionMatches(PyExc_TypeError))\r
7671 PyErr_Format(PyExc_TypeError, "startswith first arg must be str, "\r
7672 "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);\r
7673 return NULL;\r
7674 }\r
7675 result = tailmatch(self, substring, start, end, -1);\r
7676 Py_DECREF(substring);\r
7677 return PyBool_FromLong(result);\r
7678}\r
7679\r
7680\r
7681PyDoc_STRVAR(endswith__doc__,\r
7682 "S.endswith(suffix[, start[, end]]) -> bool\n\\r
7683\n\\r
7684Return True if S ends with the specified suffix, False otherwise.\n\\r
7685With optional start, test S beginning at that position.\n\\r
7686With optional end, stop comparing S at that position.\n\\r
7687suffix can also be a tuple of strings to try.");\r
7688\r
7689static PyObject *\r
7690unicode_endswith(PyUnicodeObject *self,\r
7691 PyObject *args)\r
7692{\r
7693 PyObject *subobj;\r
7694 PyUnicodeObject *substring;\r
7695 Py_ssize_t start = 0;\r
7696 Py_ssize_t end = PY_SSIZE_T_MAX;\r
7697 int result;\r
7698\r
7699 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))\r
7700 return NULL;\r
7701 if (PyTuple_Check(subobj)) {\r
7702 Py_ssize_t i;\r
7703 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {\r
7704 substring = (PyUnicodeObject *)PyUnicode_FromObject(\r
7705 PyTuple_GET_ITEM(subobj, i));\r
7706 if (substring == NULL)\r
7707 return NULL;\r
7708 result = tailmatch(self, substring, start, end, +1);\r
7709 Py_DECREF(substring);\r
7710 if (result) {\r
7711 Py_RETURN_TRUE;\r
7712 }\r
7713 }\r
7714 Py_RETURN_FALSE;\r
7715 }\r
7716 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);\r
7717 if (substring == NULL) {\r
7718 if (PyErr_ExceptionMatches(PyExc_TypeError))\r
7719 PyErr_Format(PyExc_TypeError, "endswith first arg must be str, "\r
7720 "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);\r
7721 return NULL;\r
7722 }\r
7723 result = tailmatch(self, substring, start, end, +1);\r
7724 Py_DECREF(substring);\r
7725 return PyBool_FromLong(result);\r
7726}\r
7727\r
7728\r
7729/* Implements do_string_format, which is unicode because of stringlib */\r
7730#include "stringlib/string_format.h"\r
7731\r
7732PyDoc_STRVAR(format__doc__,\r
7733 "S.format(*args, **kwargs) -> unicode\n\\r
7734\n\\r
7735Return a formatted version of S, using substitutions from args and kwargs.\n\\r
7736The substitutions are identified by braces ('{' and '}').");\r
7737\r
7738static PyObject *\r
7739unicode__format__(PyObject *self, PyObject *args)\r
7740{\r
7741 PyObject *format_spec;\r
7742 PyObject *result = NULL;\r
7743 PyObject *tmp = NULL;\r
7744\r
7745 /* If 2.x, convert format_spec to the same type as value */\r
7746 /* This is to allow things like u''.format('') */\r
7747 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))\r
7748 goto done;\r
7749 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {\r
7750 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "\r
7751 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);\r
7752 goto done;\r
7753 }\r
7754 tmp = PyObject_Unicode(format_spec);\r
7755 if (tmp == NULL)\r
7756 goto done;\r
7757 format_spec = tmp;\r
7758\r
7759 result = _PyUnicode_FormatAdvanced(self,\r
7760 PyUnicode_AS_UNICODE(format_spec),\r
7761 PyUnicode_GET_SIZE(format_spec));\r
7762 done:\r
7763 Py_XDECREF(tmp);\r
7764 return result;\r
7765}\r
7766\r
7767PyDoc_STRVAR(p_format__doc__,\r
7768 "S.__format__(format_spec) -> unicode\n\\r
7769\n\\r
7770Return a formatted version of S as described by format_spec.");\r
7771\r
7772static PyObject *\r
7773unicode__sizeof__(PyUnicodeObject *v)\r
7774{\r
7775 return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +\r
7776 sizeof(Py_UNICODE) * (v->length + 1));\r
7777}\r
7778\r
7779PyDoc_STRVAR(sizeof__doc__,\r
7780 "S.__sizeof__() -> size of S in memory, in bytes\n\\r
7781\n\\r
7782");\r
7783\r
7784static PyObject *\r
7785unicode_getnewargs(PyUnicodeObject *v)\r
7786{\r
7787 return Py_BuildValue("(u#)", v->str, v->length);\r
7788}\r
7789\r
7790\r
7791static PyMethodDef unicode_methods[] = {\r
7792\r
7793 /* Order is according to common usage: often used methods should\r
7794 appear first, since lookup is done sequentially. */\r
7795\r
7796 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},\r
7797 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},\r
7798 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},\r
7799 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},\r
7800 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},\r
7801 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},\r
7802 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},\r
7803 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},\r
7804 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},\r
7805 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},\r
7806 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},\r
7807 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},\r
7808 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},\r
7809 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},\r
7810 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},\r
7811 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},\r
7812 {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},\r
7813/* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */\r
7814 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},\r
7815 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},\r
7816 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},\r
7817 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},\r
7818 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},\r
7819 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},\r
7820 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},\r
7821 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},\r
7822 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},\r
7823 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},\r
7824 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},\r
7825 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},\r
7826 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},\r
7827 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},\r
7828 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},\r
7829 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},\r
7830 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},\r
7831 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},\r
7832 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},\r
7833 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},\r
7834 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},\r
7835 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},\r
7836 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},\r
7837 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},\r
7838 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},\r
7839 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},\r
7840 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},\r
7841#if 0\r
7842 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},\r
7843#endif\r
7844\r
7845#if 0\r
7846 /* This one is just used for debugging the implementation. */\r
7847 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},\r
7848#endif\r
7849\r
7850 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},\r
7851 {NULL, NULL}\r
7852};\r
7853\r
7854static PyObject *\r
7855unicode_mod(PyObject *v, PyObject *w)\r
7856{\r
7857 if (!PyUnicode_Check(v)) {\r
7858 Py_INCREF(Py_NotImplemented);\r
7859 return Py_NotImplemented;\r
7860 }\r
7861 return PyUnicode_Format(v, w);\r
7862}\r
7863\r
7864static PyNumberMethods unicode_as_number = {\r
7865 0, /*nb_add*/\r
7866 0, /*nb_subtract*/\r
7867 0, /*nb_multiply*/\r
7868 0, /*nb_divide*/\r
7869 unicode_mod, /*nb_remainder*/\r
7870};\r
7871\r
7872static PySequenceMethods unicode_as_sequence = {\r
7873 (lenfunc) unicode_length, /* sq_length */\r
7874 PyUnicode_Concat, /* sq_concat */\r
7875 (ssizeargfunc) unicode_repeat, /* sq_repeat */\r
7876 (ssizeargfunc) unicode_getitem, /* sq_item */\r
7877 (ssizessizeargfunc) unicode_slice, /* sq_slice */\r
7878 0, /* sq_ass_item */\r
7879 0, /* sq_ass_slice */\r
7880 PyUnicode_Contains, /* sq_contains */\r
7881};\r
7882\r
7883static PyObject*\r
7884unicode_subscript(PyUnicodeObject* self, PyObject* item)\r
7885{\r
7886 if (PyIndex_Check(item)) {\r
7887 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);\r
7888 if (i == -1 && PyErr_Occurred())\r
7889 return NULL;\r
7890 if (i < 0)\r
7891 i += PyUnicode_GET_SIZE(self);\r
7892 return unicode_getitem(self, i);\r
7893 } else if (PySlice_Check(item)) {\r
7894 Py_ssize_t start, stop, step, slicelength, cur, i;\r
7895 Py_UNICODE* source_buf;\r
7896 Py_UNICODE* result_buf;\r
7897 PyObject* result;\r
7898\r
7899 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),\r
7900 &start, &stop, &step, &slicelength) < 0) {\r
7901 return NULL;\r
7902 }\r
7903\r
7904 if (slicelength <= 0) {\r
7905 return PyUnicode_FromUnicode(NULL, 0);\r
7906 } else if (start == 0 && step == 1 && slicelength == self->length &&\r
7907 PyUnicode_CheckExact(self)) {\r
7908 Py_INCREF(self);\r
7909 return (PyObject *)self;\r
7910 } else if (step == 1) {\r
7911 return PyUnicode_FromUnicode(self->str + start, slicelength);\r
7912 } else {\r
7913 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);\r
7914 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*\r
7915 sizeof(Py_UNICODE));\r
7916\r
7917 if (result_buf == NULL)\r
7918 return PyErr_NoMemory();\r
7919\r
7920 for (cur = start, i = 0; i < slicelength; cur += step, i++) {\r
7921 result_buf[i] = source_buf[cur];\r
7922 }\r
7923\r
7924 result = PyUnicode_FromUnicode(result_buf, slicelength);\r
7925 PyObject_FREE(result_buf);\r
7926 return result;\r
7927 }\r
7928 } else {\r
7929 PyErr_SetString(PyExc_TypeError, "string indices must be integers");\r
7930 return NULL;\r
7931 }\r
7932}\r
7933\r
7934static PyMappingMethods unicode_as_mapping = {\r
7935 (lenfunc)unicode_length, /* mp_length */\r
7936 (binaryfunc)unicode_subscript, /* mp_subscript */\r
7937 (objobjargproc)0, /* mp_ass_subscript */\r
7938};\r
7939\r
7940static Py_ssize_t\r
7941unicode_buffer_getreadbuf(PyUnicodeObject *self,\r
7942 Py_ssize_t index,\r
7943 const void **ptr)\r
7944{\r
7945 if (index != 0) {\r
7946 PyErr_SetString(PyExc_SystemError,\r
7947 "accessing non-existent unicode segment");\r
7948 return -1;\r
7949 }\r
7950 *ptr = (void *) self->str;\r
7951 return PyUnicode_GET_DATA_SIZE(self);\r
7952}\r
7953\r
7954static Py_ssize_t\r
7955unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,\r
7956 const void **ptr)\r
7957{\r
7958 PyErr_SetString(PyExc_TypeError,\r
7959 "cannot use unicode as modifiable buffer");\r
7960 return -1;\r
7961}\r
7962\r
7963static int\r
7964unicode_buffer_getsegcount(PyUnicodeObject *self,\r
7965 Py_ssize_t *lenp)\r
7966{\r
7967 if (lenp)\r
7968 *lenp = PyUnicode_GET_DATA_SIZE(self);\r
7969 return 1;\r
7970}\r
7971\r
7972static Py_ssize_t\r
7973unicode_buffer_getcharbuf(PyUnicodeObject *self,\r
7974 Py_ssize_t index,\r
7975 const void **ptr)\r
7976{\r
7977 PyObject *str;\r
7978\r
7979 if (index != 0) {\r
7980 PyErr_SetString(PyExc_SystemError,\r
7981 "accessing non-existent unicode segment");\r
7982 return -1;\r
7983 }\r
7984 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);\r
7985 if (str == NULL)\r
7986 return -1;\r
7987 *ptr = (void *) PyString_AS_STRING(str);\r
7988 return PyString_GET_SIZE(str);\r
7989}\r
7990\r
7991/* Helpers for PyUnicode_Format() */\r
7992\r
7993static PyObject *\r
7994getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)\r
7995{\r
7996 Py_ssize_t argidx = *p_argidx;\r
7997 if (argidx < arglen) {\r
7998 (*p_argidx)++;\r
7999 if (arglen < 0)\r
8000 return args;\r
8001 else\r
8002 return PyTuple_GetItem(args, argidx);\r
8003 }\r
8004 PyErr_SetString(PyExc_TypeError,\r
8005 "not enough arguments for format string");\r
8006 return NULL;\r
8007}\r
8008\r
/* Conversion flags collected while parsing a '%' format specifier;
   each one is a distinct bit so they can be OR-ed together. */
#define F_LJUST 0x01    /* '-' */
#define F_SIGN  0x02    /* '+' */
#define F_BLANK 0x04    /* ' ' */
#define F_ALT   0x08    /* '#' */
#define F_ZERO  0x10    /* '0' */
8014\r
8015static Py_ssize_t\r
8016strtounicode(Py_UNICODE *buffer, const char *charbuffer)\r
8017{\r
8018 register Py_ssize_t i;\r
8019 Py_ssize_t len = strlen(charbuffer);\r
8020 for (i = len - 1; i >= 0; i--)\r
8021 buffer[i] = (Py_UNICODE) charbuffer[i];\r
8022\r
8023 return len;\r
8024}\r
8025\r
8026static int\r
8027longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)\r
8028{\r
8029 Py_ssize_t result;\r
8030\r
8031 PyOS_snprintf((char *)buffer, len, format, x);\r
8032 result = strtounicode(buffer, (char *)buffer);\r
8033 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);\r
8034}\r
8035\r
8036/* XXX To save some code duplication, formatfloat/long/int could have been\r
8037 shared with stringobject.c, converting from 8-bit to Unicode after the\r
8038 formatting is done. */\r
8039\r
8040/* Returns a new reference to a PyUnicode object, or NULL on failure. */\r
8041\r
8042static PyObject *\r
8043formatfloat(PyObject *v, int flags, int prec, int type)\r
8044{\r
8045 char *p;\r
8046 PyObject *result;\r
8047 double x;\r
8048\r
8049 x = PyFloat_AsDouble(v);\r
8050 if (x == -1.0 && PyErr_Occurred())\r
8051 return NULL;\r
8052\r
8053 if (prec < 0)\r
8054 prec = 6;\r
8055\r
8056 p = PyOS_double_to_string(x, type, prec,\r
8057 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);\r
8058 if (p == NULL)\r
8059 return NULL;\r
8060 result = PyUnicode_FromStringAndSize(p, strlen(p));\r
8061 PyMem_Free(p);\r
8062 return result;\r
8063}\r
8064\r
8065static PyObject*\r
8066formatlong(PyObject *val, int flags, int prec, int type)\r
8067{\r
8068 char *buf;\r
8069 int i, len;\r
8070 PyObject *str; /* temporary string object. */\r
8071 PyUnicodeObject *result;\r
8072\r
8073 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);\r
8074 if (!str)\r
8075 return NULL;\r
8076 result = _PyUnicode_New(len);\r
8077 if (!result) {\r
8078 Py_DECREF(str);\r
8079 return NULL;\r
8080 }\r
8081 for (i = 0; i < len; i++)\r
8082 result->str[i] = buf[i];\r
8083 result->str[len] = 0;\r
8084 Py_DECREF(str);\r
8085 return (PyObject*)result;\r
8086}\r
8087\r
8088static int\r
8089formatint(Py_UNICODE *buf,\r
8090 size_t buflen,\r
8091 int flags,\r
8092 int prec,\r
8093 int type,\r
8094 PyObject *v)\r
8095{\r
8096 /* fmt = '%#.' + `prec` + 'l' + `type`\r
8097 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)\r
8098 * + 1 + 1\r
8099 * = 24\r
8100 */\r
8101 char fmt[64]; /* plenty big enough! */\r
8102 char *sign;\r
8103 long x;\r
8104\r
8105 x = PyInt_AsLong(v);\r
8106 if (x == -1 && PyErr_Occurred())\r
8107 return -1;\r
8108 if (x < 0 && type == 'u') {\r
8109 type = 'd';\r
8110 }\r
8111 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))\r
8112 sign = "-";\r
8113 else\r
8114 sign = "";\r
8115 if (prec < 0)\r
8116 prec = 1;\r
8117\r
8118 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))\r
8119 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11\r
8120 */\r
8121 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {\r
8122 PyErr_SetString(PyExc_OverflowError,\r
8123 "formatted integer is too long (precision too large?)");\r
8124 return -1;\r
8125 }\r
8126\r
8127 if ((flags & F_ALT) &&\r
8128 (type == 'x' || type == 'X')) {\r
8129 /* When converting under %#x or %#X, there are a number\r
8130 * of issues that cause pain:\r
8131 * - when 0 is being converted, the C standard leaves off\r
8132 * the '0x' or '0X', which is inconsistent with other\r
8133 * %#x/%#X conversions and inconsistent with Python's\r
8134 * hex() function\r
8135 * - there are platforms that violate the standard and\r
8136 * convert 0 with the '0x' or '0X'\r
8137 * (Metrowerks, Compaq Tru64)\r
8138 * - there are platforms that give '0x' when converting\r
8139 * under %#X, but convert 0 in accordance with the\r
8140 * standard (OS/2 EMX)\r
8141 *\r
8142 * We can achieve the desired consistency by inserting our\r
8143 * own '0x' or '0X' prefix, and substituting %x/%X in place\r
8144 * of %#x/%#X.\r
8145 *\r
8146 * Note that this is the same approach as used in\r
8147 * formatint() in stringobject.c\r
8148 */\r
8149 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",\r
8150 sign, type, prec, type);\r
8151 }\r
8152 else {\r
8153 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",\r
8154 sign, (flags&F_ALT) ? "#" : "",\r
8155 prec, type);\r
8156 }\r
8157 if (sign[0])\r
8158 return longtounicode(buf, buflen, fmt, -x);\r
8159 else\r
8160 return longtounicode(buf, buflen, fmt, x);\r
8161}\r
8162\r
8163static int\r
8164formatchar(Py_UNICODE *buf,\r
8165 size_t buflen,\r
8166 PyObject *v)\r
8167{\r
8168 PyObject *unistr;\r
8169 char *str;\r
8170 /* presume that the buffer is at least 2 characters long */\r
8171 if (PyUnicode_Check(v)) {\r
8172 if (PyUnicode_GET_SIZE(v) != 1)\r
8173 goto onError;\r
8174 buf[0] = PyUnicode_AS_UNICODE(v)[0];\r
8175 }\r
8176\r
8177 else if (PyString_Check(v)) {\r
8178 if (PyString_GET_SIZE(v) != 1)\r
8179 goto onError;\r
8180 /* #7649: "u'%c' % char" should behave like "u'%s' % char" and fail\r
8181 with a UnicodeDecodeError if 'char' is not decodable with the\r
8182 default encoding (usually ASCII, but it might be something else) */\r
8183 str = PyString_AS_STRING(v);\r
8184 if ((unsigned char)str[0] > 0x7F) {\r
8185 /* the char is not ASCII; try to decode the string using the\r
8186 default encoding and return -1 to let the UnicodeDecodeError\r
8187 be raised if the string can't be decoded */\r
8188 unistr = PyUnicode_Decode(str, 1, NULL, "strict");\r
8189 if (unistr == NULL)\r
8190 return -1;\r
8191 buf[0] = PyUnicode_AS_UNICODE(unistr)[0];\r
8192 Py_DECREF(unistr);\r
8193 }\r
8194 else\r
8195 buf[0] = (Py_UNICODE)str[0];\r
8196 }\r
8197\r
8198 else {\r
8199 /* Integer input truncated to a character */\r
8200 long x;\r
8201 x = PyInt_AsLong(v);\r
8202 if (x == -1 && PyErr_Occurred())\r
8203 goto onError;\r
8204#ifdef Py_UNICODE_WIDE\r
8205 if (x < 0 || x > 0x10ffff) {\r
8206 PyErr_SetString(PyExc_OverflowError,\r
8207 "%c arg not in range(0x110000) "\r
8208 "(wide Python build)");\r
8209 return -1;\r
8210 }\r
8211#else\r
8212 if (x < 0 || x > 0xffff) {\r
8213 PyErr_SetString(PyExc_OverflowError,\r
8214 "%c arg not in range(0x10000) "\r
8215 "(narrow Python build)");\r
8216 return -1;\r
8217 }\r
8218#endif\r
8219 buf[0] = (Py_UNICODE) x;\r
8220 }\r
8221 buf[1] = '\0';\r
8222 return 1;\r
8223\r
8224 onError:\r
8225 PyErr_SetString(PyExc_TypeError,\r
8226 "%c requires int or char");\r
8227 return -1;\r
8228}\r
8229\r
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length of the buffer in which the ints &
   chars are formatted. XXX This is a magic number. Each formatting
   routine does bounds checking to ensure no overflow, but a better
   solution may be to malloc a buffer of appropriate size for each
   format. For now, the current solution is sufficient.

   The expansion is fully parenthesised so that the macro behaves as a
   single operand wherever it is used (for example after sizeof).
*/
#define FORMATBUFLEN ((size_t)120)
8239\r
8240PyObject *PyUnicode_Format(PyObject *format,\r
8241 PyObject *args)\r
8242{\r
8243 Py_UNICODE *fmt, *res;\r
8244 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;\r
8245 int args_owned = 0;\r
8246 PyUnicodeObject *result = NULL;\r
8247 PyObject *dict = NULL;\r
8248 PyObject *uformat;\r
8249\r
8250 if (format == NULL || args == NULL) {\r
8251 PyErr_BadInternalCall();\r
8252 return NULL;\r
8253 }\r
8254 uformat = PyUnicode_FromObject(format);\r
8255 if (uformat == NULL)\r
8256 return NULL;\r
8257 fmt = PyUnicode_AS_UNICODE(uformat);\r
8258 fmtcnt = PyUnicode_GET_SIZE(uformat);\r
8259\r
8260 reslen = rescnt = fmtcnt + 100;\r
8261 result = _PyUnicode_New(reslen);\r
8262 if (result == NULL)\r
8263 goto onError;\r
8264 res = PyUnicode_AS_UNICODE(result);\r
8265\r
8266 if (PyTuple_Check(args)) {\r
8267 arglen = PyTuple_Size(args);\r
8268 argidx = 0;\r
8269 }\r
8270 else {\r
8271 arglen = -1;\r
8272 argidx = -2;\r
8273 }\r
8274 if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&\r
8275 !PyObject_TypeCheck(args, &PyBaseString_Type))\r
8276 dict = args;\r
8277\r
8278 while (--fmtcnt >= 0) {\r
8279 if (*fmt != '%') {\r
8280 if (--rescnt < 0) {\r
8281 rescnt = fmtcnt + 100;\r
8282 reslen += rescnt;\r
8283 if (_PyUnicode_Resize(&result, reslen) < 0)\r
8284 goto onError;\r
8285 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;\r
8286 --rescnt;\r
8287 }\r
8288 *res++ = *fmt++;\r
8289 }\r
8290 else {\r
8291 /* Got a format specifier */\r
8292 int flags = 0;\r
8293 Py_ssize_t width = -1;\r
8294 int prec = -1;\r
8295 Py_UNICODE c = '\0';\r
8296 Py_UNICODE fill;\r
8297 int isnumok;\r
de08c53b
DM
8298 PyObject *v = NULL;\r
8299 PyObject *temp = NULL;\r
8300 Py_UNICODE *pbuf = NULL;\r
4710c53d 8301 Py_UNICODE sign;\r
8302 Py_ssize_t len;\r
8303 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */\r
8304\r
8305 fmt++;\r
8306 if (*fmt == '(') {\r
8307 Py_UNICODE *keystart;\r
8308 Py_ssize_t keylen;\r
8309 PyObject *key;\r
8310 int pcount = 1;\r
8311\r
8312 if (dict == NULL) {\r
8313 PyErr_SetString(PyExc_TypeError,\r
8314 "format requires a mapping");\r
8315 goto onError;\r
8316 }\r
8317 ++fmt;\r
8318 --fmtcnt;\r
8319 keystart = fmt;\r
8320 /* Skip over balanced parentheses */\r
8321 while (pcount > 0 && --fmtcnt >= 0) {\r
8322 if (*fmt == ')')\r
8323 --pcount;\r
8324 else if (*fmt == '(')\r
8325 ++pcount;\r
8326 fmt++;\r
8327 }\r
8328 keylen = fmt - keystart - 1;\r
8329 if (fmtcnt < 0 || pcount > 0) {\r
8330 PyErr_SetString(PyExc_ValueError,\r
8331 "incomplete format key");\r
8332 goto onError;\r
8333 }\r
8334#if 0\r
8335 /* keys are converted to strings using UTF-8 and\r
8336 then looked up since Python uses strings to hold\r
8337 variables names etc. in its namespaces and we\r
8338 wouldn't want to break common idioms. */\r
8339 key = PyUnicode_EncodeUTF8(keystart,\r
8340 keylen,\r
8341 NULL);\r
8342#else\r
8343 key = PyUnicode_FromUnicode(keystart, keylen);\r
8344#endif\r
8345 if (key == NULL)\r
8346 goto onError;\r
8347 if (args_owned) {\r
8348 Py_DECREF(args);\r
8349 args_owned = 0;\r
8350 }\r
8351 args = PyObject_GetItem(dict, key);\r
8352 Py_DECREF(key);\r
8353 if (args == NULL) {\r
8354 goto onError;\r
8355 }\r
8356 args_owned = 1;\r
8357 arglen = -1;\r
8358 argidx = -2;\r
8359 }\r
8360 while (--fmtcnt >= 0) {\r
8361 switch (c = *fmt++) {\r
8362 case '-': flags |= F_LJUST; continue;\r
8363 case '+': flags |= F_SIGN; continue;\r
8364 case ' ': flags |= F_BLANK; continue;\r
8365 case '#': flags |= F_ALT; continue;\r
8366 case '0': flags |= F_ZERO; continue;\r
8367 }\r
8368 break;\r
8369 }\r
8370 if (c == '*') {\r
8371 v = getnextarg(args, arglen, &argidx);\r
8372 if (v == NULL)\r
8373 goto onError;\r
8374 if (!PyInt_Check(v)) {\r
8375 PyErr_SetString(PyExc_TypeError,\r
8376 "* wants int");\r
8377 goto onError;\r
8378 }\r
8379 width = PyInt_AsLong(v);\r
8380 if (width < 0) {\r
8381 flags |= F_LJUST;\r
8382 width = -width;\r
8383 }\r
8384 if (--fmtcnt >= 0)\r
8385 c = *fmt++;\r
8386 }\r
8387 else if (c >= '0' && c <= '9') {\r
8388 width = c - '0';\r
8389 while (--fmtcnt >= 0) {\r
8390 c = *fmt++;\r
8391 if (c < '0' || c > '9')\r
8392 break;\r
8393 if ((width*10) / 10 != width) {\r
8394 PyErr_SetString(PyExc_ValueError,\r
8395 "width too big");\r
8396 goto onError;\r
8397 }\r
8398 width = width*10 + (c - '0');\r
8399 }\r
8400 }\r
8401 if (c == '.') {\r
8402 prec = 0;\r
8403 if (--fmtcnt >= 0)\r
8404 c = *fmt++;\r
8405 if (c == '*') {\r
8406 v = getnextarg(args, arglen, &argidx);\r
8407 if (v == NULL)\r
8408 goto onError;\r
8409 if (!PyInt_Check(v)) {\r
8410 PyErr_SetString(PyExc_TypeError,\r
8411 "* wants int");\r
8412 goto onError;\r
8413 }\r
8414 prec = PyInt_AsLong(v);\r
8415 if (prec < 0)\r
8416 prec = 0;\r
8417 if (--fmtcnt >= 0)\r
8418 c = *fmt++;\r
8419 }\r
8420 else if (c >= '0' && c <= '9') {\r
8421 prec = c - '0';\r
8422 while (--fmtcnt >= 0) {\r
8423 c = *fmt++;\r
8424 if (c < '0' || c > '9')\r
8425 break;\r
8426 if ((prec*10) / 10 != prec) {\r
8427 PyErr_SetString(PyExc_ValueError,\r
8428 "prec too big");\r
8429 goto onError;\r
8430 }\r
8431 prec = prec*10 + (c - '0');\r
8432 }\r
8433 }\r
8434 } /* prec */\r
8435 if (fmtcnt >= 0) {\r
8436 if (c == 'h' || c == 'l' || c == 'L') {\r
8437 if (--fmtcnt >= 0)\r
8438 c = *fmt++;\r
8439 }\r
8440 }\r
8441 if (fmtcnt < 0) {\r
8442 PyErr_SetString(PyExc_ValueError,\r
8443 "incomplete format");\r
8444 goto onError;\r
8445 }\r
8446 if (c != '%') {\r
8447 v = getnextarg(args, arglen, &argidx);\r
8448 if (v == NULL)\r
8449 goto onError;\r
8450 }\r
8451 sign = 0;\r
8452 fill = ' ';\r
8453 switch (c) {\r
8454\r
8455 case '%':\r
8456 pbuf = formatbuf;\r
8457 /* presume that buffer length is at least 1 */\r
8458 pbuf[0] = '%';\r
8459 len = 1;\r
8460 break;\r
8461\r
8462 case 's':\r
8463 case 'r':\r
8464 if (PyUnicode_CheckExact(v) && c == 's') {\r
8465 temp = v;\r
8466 Py_INCREF(temp);\r
8467 }\r
8468 else {\r
8469 PyObject *unicode;\r
8470 if (c == 's')\r
8471 temp = PyObject_Unicode(v);\r
8472 else\r
8473 temp = PyObject_Repr(v);\r
8474 if (temp == NULL)\r
8475 goto onError;\r
8476 if (PyUnicode_Check(temp))\r
8477 /* nothing to do */;\r
8478 else if (PyString_Check(temp)) {\r
8479 /* convert to string to Unicode */\r
8480 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),\r
8481 PyString_GET_SIZE(temp),\r
8482 NULL,\r
8483 "strict");\r
8484 Py_DECREF(temp);\r
8485 temp = unicode;\r
8486 if (temp == NULL)\r
8487 goto onError;\r
8488 }\r
8489 else {\r
8490 Py_DECREF(temp);\r
8491 PyErr_SetString(PyExc_TypeError,\r
8492 "%s argument has non-string str()");\r
8493 goto onError;\r
8494 }\r
8495 }\r
8496 pbuf = PyUnicode_AS_UNICODE(temp);\r
8497 len = PyUnicode_GET_SIZE(temp);\r
8498 if (prec >= 0 && len > prec)\r
8499 len = prec;\r
8500 break;\r
8501\r
8502 case 'i':\r
8503 case 'd':\r
8504 case 'u':\r
8505 case 'o':\r
8506 case 'x':\r
8507 case 'X':\r
8508 if (c == 'i')\r
8509 c = 'd';\r
8510 isnumok = 0;\r
8511 if (PyNumber_Check(v)) {\r
8512 PyObject *iobj=NULL;\r
8513\r
8514 if (PyInt_Check(v) || (PyLong_Check(v))) {\r
8515 iobj = v;\r
8516 Py_INCREF(iobj);\r
8517 }\r
8518 else {\r
8519 iobj = PyNumber_Int(v);\r
8520 if (iobj==NULL) iobj = PyNumber_Long(v);\r
8521 }\r
8522 if (iobj!=NULL) {\r
8523 if (PyInt_Check(iobj)) {\r
8524 isnumok = 1;\r
8525 pbuf = formatbuf;\r
8526 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),\r
8527 flags, prec, c, iobj);\r
8528 Py_DECREF(iobj);\r
8529 if (len < 0)\r
8530 goto onError;\r
8531 sign = 1;\r
8532 }\r
8533 else if (PyLong_Check(iobj)) {\r
8534 isnumok = 1;\r
8535 temp = formatlong(iobj, flags, prec, c);\r
8536 Py_DECREF(iobj);\r
8537 if (!temp)\r
8538 goto onError;\r
8539 pbuf = PyUnicode_AS_UNICODE(temp);\r
8540 len = PyUnicode_GET_SIZE(temp);\r
8541 sign = 1;\r
8542 }\r
8543 else {\r
8544 Py_DECREF(iobj);\r
8545 }\r
8546 }\r
8547 }\r
8548 if (!isnumok) {\r
8549 PyErr_Format(PyExc_TypeError,\r
8550 "%%%c format: a number is required, "\r
8551 "not %.200s", (char)c, Py_TYPE(v)->tp_name);\r
8552 goto onError;\r
8553 }\r
8554 if (flags & F_ZERO)\r
8555 fill = '0';\r
8556 break;\r
8557\r
8558 case 'e':\r
8559 case 'E':\r
8560 case 'f':\r
8561 case 'F':\r
8562 case 'g':\r
8563 case 'G':\r
8564 temp = formatfloat(v, flags, prec, c);\r
8565 if (temp == NULL)\r
8566 goto onError;\r
8567 pbuf = PyUnicode_AS_UNICODE(temp);\r
8568 len = PyUnicode_GET_SIZE(temp);\r
8569 sign = 1;\r
8570 if (flags & F_ZERO)\r
8571 fill = '0';\r
8572 break;\r
8573\r
8574 case 'c':\r
8575 pbuf = formatbuf;\r
8576 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);\r
8577 if (len < 0)\r
8578 goto onError;\r
8579 break;\r
8580\r
8581 default:\r
8582 PyErr_Format(PyExc_ValueError,\r
8583 "unsupported format character '%c' (0x%x) "\r
8584 "at index %zd",\r
8585 (31<=c && c<=126) ? (char)c : '?',\r
8586 (int)c,\r
8587 (Py_ssize_t)(fmt - 1 -\r
8588 PyUnicode_AS_UNICODE(uformat)));\r
8589 goto onError;\r
8590 }\r
8591 if (sign) {\r
8592 if (*pbuf == '-' || *pbuf == '+') {\r
8593 sign = *pbuf++;\r
8594 len--;\r
8595 }\r
8596 else if (flags & F_SIGN)\r
8597 sign = '+';\r
8598 else if (flags & F_BLANK)\r
8599 sign = ' ';\r
8600 else\r
8601 sign = 0;\r
8602 }\r
8603 if (width < len)\r
8604 width = len;\r
8605 if (rescnt - (sign != 0) < width) {\r
8606 reslen -= rescnt;\r
8607 rescnt = width + fmtcnt + 100;\r
8608 reslen += rescnt;\r
8609 if (reslen < 0) {\r
8610 Py_XDECREF(temp);\r
8611 PyErr_NoMemory();\r
8612 goto onError;\r
8613 }\r
8614 if (_PyUnicode_Resize(&result, reslen) < 0) {\r
8615 Py_XDECREF(temp);\r
8616 goto onError;\r
8617 }\r
8618 res = PyUnicode_AS_UNICODE(result)\r
8619 + reslen - rescnt;\r
8620 }\r
8621 if (sign) {\r
8622 if (fill != ' ')\r
8623 *res++ = sign;\r
8624 rescnt--;\r
8625 if (width > len)\r
8626 width--;\r
8627 }\r
8628 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {\r
8629 assert(pbuf[0] == '0');\r
8630 assert(pbuf[1] == c);\r
8631 if (fill != ' ') {\r
8632 *res++ = *pbuf++;\r
8633 *res++ = *pbuf++;\r
8634 }\r
8635 rescnt -= 2;\r
8636 width -= 2;\r
8637 if (width < 0)\r
8638 width = 0;\r
8639 len -= 2;\r
8640 }\r
8641 if (width > len && !(flags & F_LJUST)) {\r
8642 do {\r
8643 --rescnt;\r
8644 *res++ = fill;\r
8645 } while (--width > len);\r
8646 }\r
8647 if (fill == ' ') {\r
8648 if (sign)\r
8649 *res++ = sign;\r
8650 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {\r
8651 assert(pbuf[0] == '0');\r
8652 assert(pbuf[1] == c);\r
8653 *res++ = *pbuf++;\r
8654 *res++ = *pbuf++;\r
8655 }\r
8656 }\r
8657 Py_UNICODE_COPY(res, pbuf, len);\r
8658 res += len;\r
8659 rescnt -= len;\r
8660 while (--width >= len) {\r
8661 --rescnt;\r
8662 *res++ = ' ';\r
8663 }\r
8664 if (dict && (argidx < arglen) && c != '%') {\r
8665 PyErr_SetString(PyExc_TypeError,\r
8666 "not all arguments converted during string formatting");\r
8667 Py_XDECREF(temp);\r
8668 goto onError;\r
8669 }\r
8670 Py_XDECREF(temp);\r
8671 } /* '%' */\r
8672 } /* until end */\r
8673 if (argidx < arglen && !dict) {\r
8674 PyErr_SetString(PyExc_TypeError,\r
8675 "not all arguments converted during string formatting");\r
8676 goto onError;\r
8677 }\r
8678\r
8679 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)\r
8680 goto onError;\r
8681 if (args_owned) {\r
8682 Py_DECREF(args);\r
8683 }\r
8684 Py_DECREF(uformat);\r
8685 return (PyObject *)result;\r
8686\r
8687 onError:\r
8688 Py_XDECREF(result);\r
8689 Py_DECREF(uformat);\r
8690 if (args_owned) {\r
8691 Py_DECREF(args);\r
8692 }\r
8693 return NULL;\r
8694}\r
8695\r
8696static PyBufferProcs unicode_as_buffer = {\r
8697 (readbufferproc) unicode_buffer_getreadbuf,\r
8698 (writebufferproc) unicode_buffer_getwritebuf,\r
8699 (segcountproc) unicode_buffer_getsegcount,\r
8700 (charbufferproc) unicode_buffer_getcharbuf,\r
8701};\r
8702\r
8703static PyObject *\r
8704unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);\r
8705\r
8706static PyObject *\r
8707unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)\r
8708{\r
8709 PyObject *x = NULL;\r
8710 static char *kwlist[] = {"string", "encoding", "errors", 0};\r
8711 char *encoding = NULL;\r
8712 char *errors = NULL;\r
8713\r
8714 if (type != &PyUnicode_Type)\r
8715 return unicode_subtype_new(type, args, kwds);\r
8716 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",\r
8717 kwlist, &x, &encoding, &errors))\r
8718 return NULL;\r
8719 if (x == NULL)\r
8720 return (PyObject *)_PyUnicode_New(0);\r
8721 if (encoding == NULL && errors == NULL)\r
8722 return PyObject_Unicode(x);\r
8723 else\r
8724 return PyUnicode_FromEncodedObject(x, encoding, errors);\r
8725}\r
8726\r
8727static PyObject *\r
8728unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)\r
8729{\r
8730 PyUnicodeObject *tmp, *pnew;\r
8731 Py_ssize_t n;\r
8732\r
8733 assert(PyType_IsSubtype(type, &PyUnicode_Type));\r
8734 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);\r
8735 if (tmp == NULL)\r
8736 return NULL;\r
8737 assert(PyUnicode_Check(tmp));\r
8738 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);\r
8739 if (pnew == NULL) {\r
8740 Py_DECREF(tmp);\r
8741 return NULL;\r
8742 }\r
8743 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));\r
8744 if (pnew->str == NULL) {\r
8745 _Py_ForgetReference((PyObject *)pnew);\r
8746 PyObject_Del(pnew);\r
8747 Py_DECREF(tmp);\r
8748 return PyErr_NoMemory();\r
8749 }\r
8750 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);\r
8751 pnew->length = n;\r
8752 pnew->hash = tmp->hash;\r
8753 Py_DECREF(tmp);\r
8754 return (PyObject *)pnew;\r
8755}\r
8756\r
8757PyDoc_STRVAR(unicode_doc,\r
8758 "unicode(string [, encoding[, errors]]) -> object\n\\r
8759\n\\r
8760Create a new Unicode object from the given encoded string.\n\\r
8761encoding defaults to the current default string encoding.\n\\r
8762errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");\r
8763\r
8764PyTypeObject PyUnicode_Type = {\r
8765 PyVarObject_HEAD_INIT(&PyType_Type, 0)\r
8766 "unicode", /* tp_name */\r
8767 sizeof(PyUnicodeObject), /* tp_size */\r
8768 0, /* tp_itemsize */\r
8769 /* Slots */\r
8770 (destructor)unicode_dealloc, /* tp_dealloc */\r
8771 0, /* tp_print */\r
8772 0, /* tp_getattr */\r
8773 0, /* tp_setattr */\r
8774 0, /* tp_compare */\r
8775 unicode_repr, /* tp_repr */\r
8776 &unicode_as_number, /* tp_as_number */\r
8777 &unicode_as_sequence, /* tp_as_sequence */\r
8778 &unicode_as_mapping, /* tp_as_mapping */\r
8779 (hashfunc) unicode_hash, /* tp_hash*/\r
8780 0, /* tp_call*/\r
8781 (reprfunc) unicode_str, /* tp_str */\r
8782 PyObject_GenericGetAttr, /* tp_getattro */\r
8783 0, /* tp_setattro */\r
8784 &unicode_as_buffer, /* tp_as_buffer */\r
8785 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |\r
8786 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */\r
8787 unicode_doc, /* tp_doc */\r
8788 0, /* tp_traverse */\r
8789 0, /* tp_clear */\r
8790 PyUnicode_RichCompare, /* tp_richcompare */\r
8791 0, /* tp_weaklistoffset */\r
8792 0, /* tp_iter */\r
8793 0, /* tp_iternext */\r
8794 unicode_methods, /* tp_methods */\r
8795 0, /* tp_members */\r
8796 0, /* tp_getset */\r
8797 &PyBaseString_Type, /* tp_base */\r
8798 0, /* tp_dict */\r
8799 0, /* tp_descr_get */\r
8800 0, /* tp_descr_set */\r
8801 0, /* tp_dictoffset */\r
8802 0, /* tp_init */\r
8803 0, /* tp_alloc */\r
8804 unicode_new, /* tp_new */\r
8805 PyObject_Del, /* tp_free */\r
8806};\r
8807\r
8808/* Initialize the Unicode implementation */\r
8809\r
8810void _PyUnicode_Init(void)\r
8811{\r
8812 int i;\r
8813\r
8814 /* XXX - move this array to unicodectype.c ? */\r
8815 Py_UNICODE linebreak[] = {\r
8816 0x000A, /* LINE FEED */\r
8817 0x000D, /* CARRIAGE RETURN */\r
8818 0x001C, /* FILE SEPARATOR */\r
8819 0x001D, /* GROUP SEPARATOR */\r
8820 0x001E, /* RECORD SEPARATOR */\r
8821 0x0085, /* NEXT LINE */\r
8822 0x2028, /* LINE SEPARATOR */\r
8823 0x2029, /* PARAGRAPH SEPARATOR */\r
8824 };\r
8825\r
8826 /* Init the implementation */\r
8827 free_list = NULL;\r
8828 numfree = 0;\r
8829 unicode_empty = _PyUnicode_New(0);\r
8830 if (!unicode_empty)\r
8831 return;\r
8832\r
8833 strcpy(unicode_default_encoding, "ascii");\r
8834 for (i = 0; i < 256; i++)\r
8835 unicode_latin1[i] = NULL;\r
8836 if (PyType_Ready(&PyUnicode_Type) < 0)\r
8837 Py_FatalError("Can't initialize 'unicode'");\r
8838\r
8839 /* initialize the linebreak bloom filter */\r
8840 bloom_linebreak = make_bloom_mask(\r
8841 linebreak, sizeof(linebreak) / sizeof(linebreak[0])\r
8842 );\r
8843\r
8844 PyType_Ready(&EncodingMapType);\r
8845}\r
8846\r
8847/* Finalize the Unicode implementation */\r
8848\r
8849int\r
8850PyUnicode_ClearFreeList(void)\r
8851{\r
8852 int freelist_size = numfree;\r
8853 PyUnicodeObject *u;\r
8854\r
8855 for (u = free_list; u != NULL;) {\r
8856 PyUnicodeObject *v = u;\r
8857 u = *(PyUnicodeObject **)u;\r
8858 if (v->str)\r
8859 PyObject_DEL(v->str);\r
8860 Py_XDECREF(v->defenc);\r
8861 PyObject_Del(v);\r
8862 numfree--;\r
8863 }\r
8864 free_list = NULL;\r
8865 assert(numfree == 0);\r
8866 return freelist_size;\r
8867}\r
8868\r
8869void\r
8870_PyUnicode_Fini(void)\r
8871{\r
8872 int i;\r
8873\r
8874 Py_XDECREF(unicode_empty);\r
8875 unicode_empty = NULL;\r
8876\r
8877 for (i = 0; i < 256; i++) {\r
8878 if (unicode_latin1[i]) {\r
8879 Py_DECREF(unicode_latin1[i]);\r
8880 unicode_latin1[i] = NULL;\r
8881 }\r
8882 }\r
8883 (void)PyUnicode_ClearFreeList();\r
8884}\r
8885\r
8886#ifdef __cplusplus\r
8887}\r
8888#endif\r