]> git.proxmox.com Git - mirror_edk2.git/blob - AppPkg/Applications/Python/Python-2.7.10/Objects/unicodeobject.c
EmbeddedPkg: Extend NvVarStoreFormattedLib LIBRARY_CLASS
[mirror_edk2.git] / AppPkg / Applications / Python / Python-2.7.10 / Objects / unicodeobject.c
1 /*
2
3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5 Unicode Integration Proposal (see file Misc/unicode.txt).
6
7 Major speed upgrades to the method implementations at the Reykjavik
8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
10 Copyright (c) Corporation for National Research Initiatives.
11
12 --------------------------------------------------------------------
13 The original string type implementation is:
14
15 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
17
18 By obtaining, using, and/or copying this software and/or its
19 associated documentation, you agree that you have read, understood,
20 and will comply with the following terms and conditions:
21
22 Permission to use, copy, modify, and distribute this software and its
23 associated documentation for any purpose and without fee is hereby
24 granted, provided that the above copyright notice appears in all
25 copies, and that both that copyright notice and this permission notice
26 appear in supporting documentation, and that the name of Secret Labs
27 AB or the author not be used in advertising or publicity pertaining to
28 distribution of the software without specific, written prior
29 permission.
30
31 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38 --------------------------------------------------------------------
39
40 */
41
42 #define PY_SSIZE_T_CLEAN
43 #include "Python.h"
44
45 #include "unicodeobject.h"
46 #include "ucnhash.h"
47
48 #ifdef MS_WINDOWS
49 #include <windows.h>
50 #endif
51
/* Maximum number of deallocated Unicode objects parked on the free list. */

#define PyUnicode_MAXFREELIST       1024

/* Threshold for the free list "stay alive" optimization.

   An object that is put on the free list keeps its character buffer
   allocated as long as its length is below this limit; longer buffers
   are released right away.  This saves malloc() calls for small
   Unicode objects.

   Worst case, this leaves PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT + malloc()-overhead)
   bytes of unused memory around.

   A limit of 0 effectively switches the feature off.

   Note: This is an experimental feature !  If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Byte order selection; little endian unless WORDS_BIGENDIAN is defined. */

#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
82
83 /* --- Globals ------------------------------------------------------------
84
85 NOTE: In the interpreter's initialization phase, some globals are currently
86 initialized dynamically as needed. In the process Unicode objects may
87 be created before the Unicode type is ready.
88
89 */
90
91
92 #ifdef __cplusplus
93 extern "C" {
94 #endif
95
96 /* Free list for Unicode objects */
97 static PyUnicodeObject *free_list = NULL;
98 static int numfree = 0;
99
100 /* The empty Unicode object is shared to improve performance. */
101 static PyUnicodeObject *unicode_empty = NULL;
102
103 #define _Py_RETURN_UNICODE_EMPTY() \
104 do { \
105 if (unicode_empty != NULL) \
106 Py_INCREF(unicode_empty); \
107 else { \
108 unicode_empty = _PyUnicode_New(0); \
109 if (unicode_empty != NULL) \
110 Py_INCREF(unicode_empty); \
111 } \
112 return (PyObject *)unicode_empty; \
113 } while (0)
114
115 /* Single character Unicode strings in the Latin-1 range are being
116 shared as well. */
117 static PyUnicodeObject *unicode_latin1[256] = {NULL};
118
119 /* Default encoding to use and assume when NULL is passed as encoding
120 parameter; it is initialized by _PyUnicode_Init().
121
122 Always use the PyUnicode_SetDefaultEncoding() and
123 PyUnicode_GetDefaultEncoding() APIs to access this global.
124
125 */
126 static char unicode_default_encoding[100 + 1] = "ascii";
127
/* Fast detection of the most frequent whitespace characters: one entry
   per ASCII code point (0x00-0x7F), non-zero when the code point is
   whitespace. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
158
/* Same kind of table for line breaks: one entry per ASCII code point
   (0x00-0x7F), non-zero when the code point is a line break. */
static unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
186
187
188 Py_UNICODE
189 PyUnicode_GetMax(void)
190 {
191 #ifdef Py_UNICODE_WIDE
192 return 0x10FFFF;
193 #else
194 /* This is actually an illegal character, so it should
195 not be passed to unichr. */
196 return 0xFFFF;
197 #endif
198 }
199
200 /* --- Bloom Filters ----------------------------------------------------- */
201
202 /* stuff to implement simple "bloom filters" for Unicode characters.
203 to keep things simple, we use a single bitmask, using the least 5
204 bits from each unicode characters as the bit index. */
205
206 /* the linebreak mask is set up by Unicode_Init below */
207
208 #if LONG_BIT >= 128
209 #define BLOOM_WIDTH 128
210 #elif LONG_BIT >= 64
211 #define BLOOM_WIDTH 64
212 #elif LONG_BIT >= 32
213 #define BLOOM_WIDTH 32
214 #else
215 #error "LONG_BIT is smaller than 32"
216 #endif
217
218 #define BLOOM_MASK unsigned long
219
220 static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
221
222 #define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
223 #define BLOOM(mask, ch) ((mask & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
224
225 #define BLOOM_LINEBREAK(ch) \
226 ((ch) < 128U ? ascii_linebreak[(ch)] : \
227 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
228
229 Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
230 {
231 /* calculate simple bloom-style bitmask for a given unicode string */
232
233 BLOOM_MASK mask;
234 Py_ssize_t i;
235
236 mask = 0;
237 for (i = 0; i < len; i++)
238 BLOOM_ADD(mask, ptr[i]);
239
240 return mask;
241 }
242
243 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
244 {
245 Py_ssize_t i;
246
247 for (i = 0; i < setlen; i++)
248 if (set[i] == chr)
249 return 1;
250
251 return 0;
252 }
253
/* Non-zero if `chr` is one of the `setlen` characters in `set`.  `mask`
   is the bloom mask of `set`; it is tested first so that the linear
   search in unicode_member() only runs for likely candidates.  The whole
   expansion is parenthesized so the macro can be combined safely with
   other operators (e.g. negated). */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
256
257 /* --- Unicode Object ----------------------------------------------------- */
258
259 static
260 int unicode_resize(register PyUnicodeObject *unicode,
261 Py_ssize_t length)
262 {
263 void *oldstr;
264
265 /* Shortcut if there's nothing much to do. */
266 if (unicode->length == length)
267 goto reset;
268
269 /* Resizing shared object (unicode_empty or single character
270 objects) in-place is not allowed. Use PyUnicode_Resize()
271 instead ! */
272
273 if (unicode == unicode_empty ||
274 (unicode->length == 1 &&
275 unicode->str[0] < 256U &&
276 unicode_latin1[unicode->str[0]] == unicode)) {
277 PyErr_SetString(PyExc_SystemError,
278 "can't resize shared unicode objects");
279 return -1;
280 }
281
282 /* We allocate one more byte to make sure the string is Ux0000 terminated.
283 The overallocation is also used by fastsearch, which assumes that it's
284 safe to look at str[length] (without making any assumptions about what
285 it contains). */
286
287 oldstr = unicode->str;
288 unicode->str = PyObject_REALLOC(unicode->str,
289 sizeof(Py_UNICODE) * (length + 1));
290 if (!unicode->str) {
291 unicode->str = (Py_UNICODE *)oldstr;
292 PyErr_NoMemory();
293 return -1;
294 }
295 unicode->str[length] = 0;
296 unicode->length = length;
297
298 reset:
299 /* Reset the object caches */
300 if (unicode->defenc) {
301 Py_CLEAR(unicode->defenc);
302 }
303 unicode->hash = -1;
304
305 return 0;
306 }
307
308 /* We allocate one more byte to make sure the string is
309 Ux0000 terminated; some code relies on that.
310
311 XXX This allocator could further be enhanced by assuring that the
312 free list never reduces its size below 1.
313
314 */
315
316 static
317 PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
318 {
319 register PyUnicodeObject *unicode;
320
321 /* Optimization for empty strings */
322 if (length == 0 && unicode_empty != NULL) {
323 Py_INCREF(unicode_empty);
324 return unicode_empty;
325 }
326
327 /* Ensure we won't overflow the size. */
328 if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
329 return (PyUnicodeObject *)PyErr_NoMemory();
330 }
331
332 /* Unicode freelist & memory allocation */
333 if (free_list) {
334 unicode = free_list;
335 free_list = *(PyUnicodeObject **)unicode;
336 numfree--;
337 if (unicode->str) {
338 /* Keep-Alive optimization: we only upsize the buffer,
339 never downsize it. */
340 if ((unicode->length < length) &&
341 unicode_resize(unicode, length) < 0) {
342 PyObject_DEL(unicode->str);
343 unicode->str = NULL;
344 }
345 }
346 else {
347 size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
348 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
349 }
350 PyObject_INIT(unicode, &PyUnicode_Type);
351 }
352 else {
353 size_t new_size;
354 unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
355 if (unicode == NULL)
356 return NULL;
357 new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
358 unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
359 }
360
361 if (!unicode->str) {
362 PyErr_NoMemory();
363 goto onError;
364 }
365 /* Initialize the first element to guard against cases where
366 * the caller fails before initializing str -- unicode_resize()
367 * reads str[0], and the Keep-Alive optimization can keep memory
368 * allocated for str alive across a call to unicode_dealloc(unicode).
369 * We don't want unicode_resize to read uninitialized memory in
370 * that case.
371 */
372 unicode->str[0] = 0;
373 unicode->str[length] = 0;
374 unicode->length = length;
375 unicode->hash = -1;
376 unicode->defenc = NULL;
377 return unicode;
378
379 onError:
380 /* XXX UNREF/NEWREF interface should be more symmetrical */
381 _Py_DEC_REFTOTAL;
382 _Py_ForgetReference((PyObject *)unicode);
383 PyObject_Del(unicode);
384 return NULL;
385 }
386
387 static
388 void unicode_dealloc(register PyUnicodeObject *unicode)
389 {
390 if (PyUnicode_CheckExact(unicode) &&
391 numfree < PyUnicode_MAXFREELIST) {
392 /* Keep-Alive optimization */
393 if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
394 PyObject_DEL(unicode->str);
395 unicode->str = NULL;
396 unicode->length = 0;
397 }
398 if (unicode->defenc) {
399 Py_CLEAR(unicode->defenc);
400 }
401 /* Add to free list */
402 *(PyUnicodeObject **)unicode = free_list;
403 free_list = unicode;
404 numfree++;
405 }
406 else {
407 PyObject_DEL(unicode->str);
408 Py_XDECREF(unicode->defenc);
409 Py_TYPE(unicode)->tp_free((PyObject *)unicode);
410 }
411 }
412
413 static
414 int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
415 {
416 register PyUnicodeObject *v;
417
418 /* Argument checks */
419 if (unicode == NULL) {
420 PyErr_BadInternalCall();
421 return -1;
422 }
423 v = *unicode;
424 if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
425 PyErr_BadInternalCall();
426 return -1;
427 }
428
429 /* Resizing unicode_empty and single character objects is not
430 possible since these are being shared. We simply return a fresh
431 copy with the same Unicode content. */
432 if (v->length != length &&
433 (v == unicode_empty || v->length == 1)) {
434 PyUnicodeObject *w = _PyUnicode_New(length);
435 if (w == NULL)
436 return -1;
437 Py_UNICODE_COPY(w->str, v->str,
438 length < v->length ? length : v->length);
439 Py_DECREF(*unicode);
440 *unicode = w;
441 return 0;
442 }
443
444 /* Note that we don't have to modify *unicode for unshared Unicode
445 objects, since we can modify them in-place. */
446 return unicode_resize(v, length);
447 }
448
449 int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
450 {
451 return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
452 }
453
454 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
455 Py_ssize_t size)
456 {
457 PyUnicodeObject *unicode;
458
459 /* If the Unicode data is known at construction time, we can apply
460 some optimizations which share commonly used objects. */
461 if (u != NULL) {
462
463 /* Optimization for empty strings */
464 if (size == 0)
465 _Py_RETURN_UNICODE_EMPTY();
466
467 /* Single character Unicode objects in the Latin-1 range are
468 shared when using this constructor */
469 if (size == 1 && *u < 256) {
470 unicode = unicode_latin1[*u];
471 if (!unicode) {
472 unicode = _PyUnicode_New(1);
473 if (!unicode)
474 return NULL;
475 unicode->str[0] = *u;
476 unicode_latin1[*u] = unicode;
477 }
478 Py_INCREF(unicode);
479 return (PyObject *)unicode;
480 }
481 }
482
483 unicode = _PyUnicode_New(size);
484 if (!unicode)
485 return NULL;
486
487 /* Copy the Unicode data into the new object */
488 if (u != NULL)
489 Py_UNICODE_COPY(unicode->str, u, size);
490
491 return (PyObject *)unicode;
492 }
493
494 PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
495 {
496 PyUnicodeObject *unicode;
497
498 if (size < 0) {
499 PyErr_SetString(PyExc_SystemError,
500 "Negative size passed to PyUnicode_FromStringAndSize");
501 return NULL;
502 }
503
504 /* If the Unicode data is known at construction time, we can apply
505 some optimizations which share commonly used objects.
506 Also, this means the input must be UTF-8, so fall back to the
507 UTF-8 decoder at the end. */
508 if (u != NULL) {
509
510 /* Optimization for empty strings */
511 if (size == 0)
512 _Py_RETURN_UNICODE_EMPTY();
513
514 /* Single characters are shared when using this constructor.
515 Restrict to ASCII, since the input must be UTF-8. */
516 if (size == 1 && Py_CHARMASK(*u) < 128) {
517 unicode = unicode_latin1[Py_CHARMASK(*u)];
518 if (!unicode) {
519 unicode = _PyUnicode_New(1);
520 if (!unicode)
521 return NULL;
522 unicode->str[0] = Py_CHARMASK(*u);
523 unicode_latin1[Py_CHARMASK(*u)] = unicode;
524 }
525 Py_INCREF(unicode);
526 return (PyObject *)unicode;
527 }
528
529 return PyUnicode_DecodeUTF8(u, size, NULL);
530 }
531
532 unicode = _PyUnicode_New(size);
533 if (!unicode)
534 return NULL;
535
536 return (PyObject *)unicode;
537 }
538
539 PyObject *PyUnicode_FromString(const char *u)
540 {
541 size_t size = strlen(u);
542 if (size > PY_SSIZE_T_MAX) {
543 PyErr_SetString(PyExc_OverflowError, "input too long");
544 return NULL;
545 }
546
547 return PyUnicode_FromStringAndSize(u, size);
548 }
549
/* _Py_UNICODE_NEXT is a private macro used to retrieve the character pointed
 * by 'ptr', possibly combining surrogate pairs on narrow builds.
 * 'ptr' and 'end' must be Py_UNICODE*, with 'ptr' pointing at the character
 * that should be returned and 'end' pointing to the end of the buffer.
 * ('end' is used on narrow builds to detect a lone surrogate at the
 * end of the buffer that should be returned unchanged: a pair is only
 * joined when both of its halves lie before 'end'.)
 * The ptr and end arguments should be side-effect free and ptr must be an
 * lvalue.  On narrow builds the type of the returned char is Py_UCS4.
 *
 * Note: the macro advances ptr to next char, so it might have side-effects
 * (especially if used with other macros).
 */

/* helper macros used by _Py_UNICODE_NEXT */
#define _Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define _Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value. */
#define _Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)

#ifdef Py_UNICODE_WIDE
#define _Py_UNICODE_NEXT(ptr, end) (*(ptr)++)
#else
/* The bounds test comes first so that (ptr)[1] is only examined when it
   is still inside [ptr, end). */
#define _Py_UNICODE_NEXT(ptr, end)                                      \
     ((((ptr) + 1 < (end)) &&                                           \
       _Py_UNICODE_IS_HIGH_SURROGATE(*(ptr)) &&                         \
       _Py_UNICODE_IS_LOW_SURROGATE((ptr)[1])) ?                        \
      ((ptr) += 2,_Py_UNICODE_JOIN_SURROGATES((ptr)[-2], (ptr)[-1])) :  \
      (Py_UCS4)*(ptr)++)
#endif
580
581 #ifdef HAVE_WCHAR_H
582
583 #if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
584 # define CONVERT_WCHAR_TO_SURROGATES
585 #endif
586
587 #ifdef CONVERT_WCHAR_TO_SURROGATES
588
589 /* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
590 to convert from UTF32 to UTF16. */
591
592 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
593 Py_ssize_t size)
594 {
595 PyUnicodeObject *unicode;
596 register Py_ssize_t i;
597 Py_ssize_t alloc;
598 const wchar_t *orig_w;
599
600 if (w == NULL) {
601 PyErr_BadInternalCall();
602 return NULL;
603 }
604
605 alloc = size;
606 orig_w = w;
607 for (i = size; i > 0; i--) {
608 if (*w > 0xFFFF)
609 alloc++;
610 w++;
611 }
612 w = orig_w;
613 unicode = _PyUnicode_New(alloc);
614 if (!unicode)
615 return NULL;
616
617 /* Copy the wchar_t data into the new object */
618 {
619 register Py_UNICODE *u;
620 u = PyUnicode_AS_UNICODE(unicode);
621 for (i = size; i > 0; i--) {
622 if (*w > 0xFFFF) {
623 wchar_t ordinal = *w++;
624 ordinal -= 0x10000;
625 *u++ = 0xD800 | (ordinal >> 10);
626 *u++ = 0xDC00 | (ordinal & 0x3FF);
627 }
628 else
629 *u++ = *w++;
630 }
631 }
632 return (PyObject *)unicode;
633 }
634
635 #else
636
637 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
638 Py_ssize_t size)
639 {
640 PyUnicodeObject *unicode;
641
642 if (w == NULL) {
643 PyErr_BadInternalCall();
644 return NULL;
645 }
646
647 unicode = _PyUnicode_New(size);
648 if (!unicode)
649 return NULL;
650
651 /* Copy the wchar_t data into the new object */
652 #ifdef HAVE_USABLE_WCHAR_T
653 memcpy(unicode->str, w, size * sizeof(wchar_t));
654 #else
655 {
656 register Py_UNICODE *u;
657 register Py_ssize_t i;
658 u = PyUnicode_AS_UNICODE(unicode);
659 for (i = size; i > 0; i--)
660 *u++ = *w++;
661 }
662 #endif
663
664 return (PyObject *)unicode;
665 }
666
667 #endif /* CONVERT_WCHAR_TO_SURROGATES */
668
669 #undef CONVERT_WCHAR_TO_SURROGATES
670
671 static void
672 makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
673 {
674 *fmt++ = '%';
675 if (width) {
676 if (zeropad)
677 *fmt++ = '0';
678 fmt += sprintf(fmt, "%d", width);
679 }
680 if (precision)
681 fmt += sprintf(fmt, ".%d", precision);
682 if (longflag)
683 *fmt++ = 'l';
684 else if (size_tflag) {
685 char *f = PY_FORMAT_SIZE_T;
686 while (*f)
687 *fmt++ = *f++;
688 }
689 *fmt++ = c;
690 *fmt = '\0';
691 }
692
693 #define appendstring(string) \
694 do { \
695 for (copy = string;*copy; copy++) { \
696 *s++ = (unsigned char)*copy; \
697 } \
698 } while (0)
699
700 PyObject *
701 PyUnicode_FromFormatV(const char *format, va_list vargs)
702 {
703 va_list count;
704 Py_ssize_t callcount = 0;
705 PyObject **callresults = NULL;
706 PyObject **callresult = NULL;
707 Py_ssize_t n = 0;
708 int width = 0;
709 int precision = 0;
710 int zeropad;
711 const char* f;
712 Py_UNICODE *s;
713 PyObject *string;
714 /* used by sprintf */
715 char buffer[21];
716 /* use abuffer instead of buffer, if we need more space
717 * (which can happen if there's a format specifier with width). */
718 char *abuffer = NULL;
719 char *realbuffer;
720 Py_ssize_t abuffersize = 0;
721 char fmt[60]; /* should be enough for %0width.precisionld */
722 const char *copy;
723
724 #ifdef VA_LIST_IS_ARRAY
725 Py_MEMCPY(count, vargs, sizeof(va_list));
726 #else
727 #ifdef __va_copy
728 __va_copy(count, vargs);
729 #else
730 count = vargs;
731 #endif
732 #endif
733 /* step 1: count the number of %S/%R/%s format specifications
734 * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
735 * objects once during step 3 and put the result in an array) */
736 for (f = format; *f; f++) {
737 if (*f == '%') {
738 f++;
739 while (*f && *f != '%' && !isalpha((unsigned)*f))
740 f++;
741 if (!*f)
742 break;
743 if (*f == 's' || *f=='S' || *f=='R')
744 ++callcount;
745 }
746 }
747 /* step 2: allocate memory for the results of
748 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
749 if (callcount) {
750 callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
751 if (!callresults) {
752 PyErr_NoMemory();
753 return NULL;
754 }
755 callresult = callresults;
756 }
757 /* step 3: figure out how large a buffer we need */
758 for (f = format; *f; f++) {
759 if (*f == '%') {
760 const char* p = f++;
761 width = 0;
762 while (isdigit((unsigned)*f))
763 width = (width*10) + *f++ - '0';
764 precision = 0;
765 if (*f == '.') {
766 f++;
767 while (isdigit((unsigned)*f))
768 precision = (precision*10) + *f++ - '0';
769 }
770
771 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
772 * they don't affect the amount of space we reserve.
773 */
774 if ((*f == 'l' || *f == 'z') &&
775 (f[1] == 'd' || f[1] == 'u'))
776 ++f;
777
778 switch (*f) {
779 case 'c':
780 {
781 int ordinal = va_arg(count, int);
782 #ifdef Py_UNICODE_WIDE
783 if (ordinal < 0 || ordinal > 0x10ffff) {
784 PyErr_SetString(PyExc_OverflowError,
785 "%c arg not in range(0x110000) "
786 "(wide Python build)");
787 goto fail;
788 }
789 #else
790 if (ordinal < 0 || ordinal > 0xffff) {
791 PyErr_SetString(PyExc_OverflowError,
792 "%c arg not in range(0x10000) "
793 "(narrow Python build)");
794 goto fail;
795 }
796 #endif
797 /* fall through... */
798 }
799 case '%':
800 n++;
801 break;
802 case 'd': case 'u': case 'i': case 'x':
803 (void) va_arg(count, int);
804 if (width < precision)
805 width = precision;
806 /* 20 bytes is enough to hold a 64-bit
807 integer. Decimal takes the most space.
808 This isn't enough for octal.
809 If a width is specified we need more
810 (which we allocate later). */
811 if (width < 20)
812 width = 20;
813 n += width;
814 if (abuffersize < width)
815 abuffersize = width;
816 break;
817 case 's':
818 {
819 /* UTF-8 */
820 const char *s = va_arg(count, const char*);
821 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
822 if (!str)
823 goto fail;
824 n += PyUnicode_GET_SIZE(str);
825 /* Remember the str and switch to the next slot */
826 *callresult++ = str;
827 break;
828 }
829 case 'U':
830 {
831 PyObject *obj = va_arg(count, PyObject *);
832 assert(obj && PyUnicode_Check(obj));
833 n += PyUnicode_GET_SIZE(obj);
834 break;
835 }
836 case 'V':
837 {
838 PyObject *obj = va_arg(count, PyObject *);
839 const char *str = va_arg(count, const char *);
840 assert(obj || str);
841 assert(!obj || PyUnicode_Check(obj));
842 if (obj)
843 n += PyUnicode_GET_SIZE(obj);
844 else
845 n += strlen(str);
846 break;
847 }
848 case 'S':
849 {
850 PyObject *obj = va_arg(count, PyObject *);
851 PyObject *str;
852 assert(obj);
853 str = PyObject_Str(obj);
854 if (!str)
855 goto fail;
856 n += PyString_GET_SIZE(str);
857 /* Remember the str and switch to the next slot */
858 *callresult++ = str;
859 break;
860 }
861 case 'R':
862 {
863 PyObject *obj = va_arg(count, PyObject *);
864 PyObject *repr;
865 assert(obj);
866 repr = PyObject_Repr(obj);
867 if (!repr)
868 goto fail;
869 n += PyUnicode_GET_SIZE(repr);
870 /* Remember the repr and switch to the next slot */
871 *callresult++ = repr;
872 break;
873 }
874 case 'p':
875 (void) va_arg(count, int);
876 /* maximum 64-bit pointer representation:
877 * 0xffffffffffffffff
878 * so 19 characters is enough.
879 * XXX I count 18 -- what's the extra for?
880 */
881 n += 19;
882 break;
883 default:
884 /* if we stumble upon an unknown
885 formatting code, copy the rest of
886 the format string to the output
887 string. (we cannot just skip the
888 code, since there's no way to know
889 what's in the argument list) */
890 n += strlen(p);
891 goto expand;
892 }
893 } else
894 n++;
895 }
896 expand:
897 if (abuffersize > 20) {
898 /* add 1 for sprintf's trailing null byte */
899 abuffer = PyObject_Malloc(abuffersize + 1);
900 if (!abuffer) {
901 PyErr_NoMemory();
902 goto fail;
903 }
904 realbuffer = abuffer;
905 }
906 else
907 realbuffer = buffer;
908 /* step 4: fill the buffer */
909 /* Since we've analyzed how much space we need for the worst case,
910 we don't have to resize the string.
911 There can be no errors beyond this point. */
912 string = PyUnicode_FromUnicode(NULL, n);
913 if (!string)
914 goto fail;
915
916 s = PyUnicode_AS_UNICODE(string);
917 callresult = callresults;
918
919 for (f = format; *f; f++) {
920 if (*f == '%') {
921 const char* p = f++;
922 int longflag = 0;
923 int size_tflag = 0;
924 zeropad = (*f == '0');
925 /* parse the width.precision part */
926 width = 0;
927 while (isdigit((unsigned)*f))
928 width = (width*10) + *f++ - '0';
929 precision = 0;
930 if (*f == '.') {
931 f++;
932 while (isdigit((unsigned)*f))
933 precision = (precision*10) + *f++ - '0';
934 }
935 /* handle the long flag, but only for %ld and %lu.
936 others can be added when necessary. */
937 if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
938 longflag = 1;
939 ++f;
940 }
941 /* handle the size_t flag. */
942 if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
943 size_tflag = 1;
944 ++f;
945 }
946
947 switch (*f) {
948 case 'c':
949 *s++ = va_arg(vargs, int);
950 break;
951 case 'd':
952 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
953 if (longflag)
954 sprintf(realbuffer, fmt, va_arg(vargs, long));
955 else if (size_tflag)
956 sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
957 else
958 sprintf(realbuffer, fmt, va_arg(vargs, int));
959 appendstring(realbuffer);
960 break;
961 case 'u':
962 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
963 if (longflag)
964 sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
965 else if (size_tflag)
966 sprintf(realbuffer, fmt, va_arg(vargs, size_t));
967 else
968 sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
969 appendstring(realbuffer);
970 break;
971 case 'i':
972 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
973 sprintf(realbuffer, fmt, va_arg(vargs, int));
974 appendstring(realbuffer);
975 break;
976 case 'x':
977 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
978 sprintf(realbuffer, fmt, va_arg(vargs, int));
979 appendstring(realbuffer);
980 break;
981 case 's':
982 {
983 /* unused, since we already have the result */
984 (void) va_arg(vargs, char *);
985 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
986 PyUnicode_GET_SIZE(*callresult));
987 s += PyUnicode_GET_SIZE(*callresult);
988 /* We're done with the unicode()/repr() => forget it */
989 Py_DECREF(*callresult);
990 /* switch to next unicode()/repr() result */
991 ++callresult;
992 break;
993 }
994 case 'U':
995 {
996 PyObject *obj = va_arg(vargs, PyObject *);
997 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
998 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
999 s += size;
1000 break;
1001 }
1002 case 'V':
1003 {
1004 PyObject *obj = va_arg(vargs, PyObject *);
1005 const char *str = va_arg(vargs, const char *);
1006 if (obj) {
1007 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1008 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1009 s += size;
1010 } else {
1011 appendstring(str);
1012 }
1013 break;
1014 }
1015 case 'S':
1016 case 'R':
1017 {
1018 const char *str = PyString_AS_STRING(*callresult);
1019 /* unused, since we already have the result */
1020 (void) va_arg(vargs, PyObject *);
1021 appendstring(str);
1022 /* We're done with the unicode()/repr() => forget it */
1023 Py_DECREF(*callresult);
1024 /* switch to next unicode()/repr() result */
1025 ++callresult;
1026 break;
1027 }
1028 case 'p':
1029 sprintf(buffer, "%p", va_arg(vargs, void*));
1030 /* %p is ill-defined: ensure leading 0x. */
1031 if (buffer[1] == 'X')
1032 buffer[1] = 'x';
1033 else if (buffer[1] != 'x') {
1034 memmove(buffer+2, buffer, strlen(buffer)+1);
1035 buffer[0] = '0';
1036 buffer[1] = 'x';
1037 }
1038 appendstring(buffer);
1039 break;
1040 case '%':
1041 *s++ = '%';
1042 break;
1043 default:
1044 appendstring(p);
1045 goto end;
1046 }
1047 } else
1048 *s++ = *f;
1049 }
1050
1051 end:
1052 if (callresults)
1053 PyObject_Free(callresults);
1054 if (abuffer)
1055 PyObject_Free(abuffer);
1056 PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1057 return string;
1058 fail:
1059 if (callresults) {
1060 PyObject **callresult2 = callresults;
1061 while (callresult2 < callresult) {
1062 Py_DECREF(*callresult2);
1063 ++callresult2;
1064 }
1065 PyObject_Free(callresults);
1066 }
1067 if (abuffer)
1068 PyObject_Free(abuffer);
1069 return NULL;
1070 }
1071
1072 #undef appendstring
1073
1074 PyObject *
1075 PyUnicode_FromFormat(const char *format, ...)
1076 {
1077 PyObject* ret;
1078 va_list vargs;
1079
1080 #ifdef HAVE_STDARG_PROTOTYPES
1081 va_start(vargs, format);
1082 #else
1083 va_start(vargs);
1084 #endif
1085 ret = PyUnicode_FromFormatV(format, vargs);
1086 va_end(vargs);
1087 return ret;
1088 }
1089
1090 Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
1091 wchar_t *w,
1092 Py_ssize_t size)
1093 {
1094 if (unicode == NULL) {
1095 PyErr_BadInternalCall();
1096 return -1;
1097 }
1098
1099 /* If possible, try to copy the 0-termination as well */
1100 if (size > PyUnicode_GET_SIZE(unicode))
1101 size = PyUnicode_GET_SIZE(unicode) + 1;
1102
1103 #ifdef HAVE_USABLE_WCHAR_T
1104 memcpy(w, unicode->str, size * sizeof(wchar_t));
1105 #else
1106 {
1107 register Py_UNICODE *u;
1108 register Py_ssize_t i;
1109 u = PyUnicode_AS_UNICODE(unicode);
1110 for (i = size; i > 0; i--)
1111 *w++ = *u++;
1112 }
1113 #endif
1114
1115 if (size > PyUnicode_GET_SIZE(unicode))
1116 return PyUnicode_GET_SIZE(unicode);
1117 else
1118 return size;
1119 }
1120
1121 #endif
1122
1123 PyObject *PyUnicode_FromOrdinal(int ordinal)
1124 {
1125 Py_UNICODE s[1];
1126
1127 #ifdef Py_UNICODE_WIDE
1128 if (ordinal < 0 || ordinal > 0x10ffff) {
1129 PyErr_SetString(PyExc_ValueError,
1130 "unichr() arg not in range(0x110000) "
1131 "(wide Python build)");
1132 return NULL;
1133 }
1134 #else
1135 if (ordinal < 0 || ordinal > 0xffff) {
1136 PyErr_SetString(PyExc_ValueError,
1137 "unichr() arg not in range(0x10000) "
1138 "(narrow Python build)");
1139 return NULL;
1140 }
1141 #endif
1142
1143 s[0] = (Py_UNICODE)ordinal;
1144 return PyUnicode_FromUnicode(s, 1);
1145 }
1146
1147 PyObject *PyUnicode_FromObject(register PyObject *obj)
1148 {
1149 /* XXX Perhaps we should make this API an alias of
1150 PyObject_Unicode() instead ?! */
1151 if (PyUnicode_CheckExact(obj)) {
1152 Py_INCREF(obj);
1153 return obj;
1154 }
1155 if (PyUnicode_Check(obj)) {
1156 /* For a Unicode subtype that's not a Unicode object,
1157 return a true Unicode object with the same data. */
1158 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1159 PyUnicode_GET_SIZE(obj));
1160 }
1161 return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1162 }
1163
1164 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1165 const char *encoding,
1166 const char *errors)
1167 {
1168 const char *s = NULL;
1169 Py_ssize_t len;
1170 PyObject *v;
1171
1172 if (obj == NULL) {
1173 PyErr_BadInternalCall();
1174 return NULL;
1175 }
1176
1177 #if 0
1178 /* For b/w compatibility we also accept Unicode objects provided
1179 that no encodings is given and then redirect to
1180 PyObject_Unicode() which then applies the additional logic for
1181 Unicode subclasses.
1182
1183 NOTE: This API should really only be used for object which
1184 represent *encoded* Unicode !
1185
1186 */
1187 if (PyUnicode_Check(obj)) {
1188 if (encoding) {
1189 PyErr_SetString(PyExc_TypeError,
1190 "decoding Unicode is not supported");
1191 return NULL;
1192 }
1193 return PyObject_Unicode(obj);
1194 }
1195 #else
1196 if (PyUnicode_Check(obj)) {
1197 PyErr_SetString(PyExc_TypeError,
1198 "decoding Unicode is not supported");
1199 return NULL;
1200 }
1201 #endif
1202
1203 /* Coerce object */
1204 if (PyString_Check(obj)) {
1205 s = PyString_AS_STRING(obj);
1206 len = PyString_GET_SIZE(obj);
1207 }
1208 else if (PyByteArray_Check(obj)) {
1209 /* Python 2.x specific */
1210 PyErr_Format(PyExc_TypeError,
1211 "decoding bytearray is not supported");
1212 return NULL;
1213 }
1214 else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1215 /* Overwrite the error message with something more useful in
1216 case of a TypeError. */
1217 if (PyErr_ExceptionMatches(PyExc_TypeError))
1218 PyErr_Format(PyExc_TypeError,
1219 "coercing to Unicode: need string or buffer, "
1220 "%.80s found",
1221 Py_TYPE(obj)->tp_name);
1222 goto onError;
1223 }
1224
1225 /* Convert to Unicode */
1226 if (len == 0)
1227 _Py_RETURN_UNICODE_EMPTY();
1228
1229 v = PyUnicode_Decode(s, len, encoding, errors);
1230 return v;
1231
1232 onError:
1233 return NULL;
1234 }
1235
1236 PyObject *PyUnicode_Decode(const char *s,
1237 Py_ssize_t size,
1238 const char *encoding,
1239 const char *errors)
1240 {
1241 PyObject *buffer = NULL, *unicode;
1242
1243 if (encoding == NULL)
1244 encoding = PyUnicode_GetDefaultEncoding();
1245
1246 /* Shortcuts for common default encodings */
1247 if (strcmp(encoding, "utf-8") == 0)
1248 return PyUnicode_DecodeUTF8(s, size, errors);
1249 else if (strcmp(encoding, "latin-1") == 0)
1250 return PyUnicode_DecodeLatin1(s, size, errors);
1251 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1252 else if (strcmp(encoding, "mbcs") == 0)
1253 return PyUnicode_DecodeMBCS(s, size, errors);
1254 #endif
1255 else if (strcmp(encoding, "ascii") == 0)
1256 return PyUnicode_DecodeASCII(s, size, errors);
1257
1258 /* Decode via the codec registry */
1259 buffer = PyBuffer_FromMemory((void *)s, size);
1260 if (buffer == NULL)
1261 goto onError;
1262 unicode = PyCodec_Decode(buffer, encoding, errors);
1263 if (unicode == NULL)
1264 goto onError;
1265 if (!PyUnicode_Check(unicode)) {
1266 PyErr_Format(PyExc_TypeError,
1267 "decoder did not return an unicode object (type=%.400s)",
1268 Py_TYPE(unicode)->tp_name);
1269 Py_DECREF(unicode);
1270 goto onError;
1271 }
1272 Py_DECREF(buffer);
1273 return unicode;
1274
1275 onError:
1276 Py_XDECREF(buffer);
1277 return NULL;
1278 }
1279
1280 PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1281 const char *encoding,
1282 const char *errors)
1283 {
1284 PyObject *v;
1285
1286 if (!PyUnicode_Check(unicode)) {
1287 PyErr_BadArgument();
1288 goto onError;
1289 }
1290
1291 if (encoding == NULL)
1292 encoding = PyUnicode_GetDefaultEncoding();
1293
1294 /* Decode via the codec registry */
1295 v = PyCodec_Decode(unicode, encoding, errors);
1296 if (v == NULL)
1297 goto onError;
1298 return v;
1299
1300 onError:
1301 return NULL;
1302 }
1303
1304 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
1305 Py_ssize_t size,
1306 const char *encoding,
1307 const char *errors)
1308 {
1309 PyObject *v, *unicode;
1310
1311 unicode = PyUnicode_FromUnicode(s, size);
1312 if (unicode == NULL)
1313 return NULL;
1314 v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1315 Py_DECREF(unicode);
1316 return v;
1317 }
1318
1319 PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1320 const char *encoding,
1321 const char *errors)
1322 {
1323 PyObject *v;
1324
1325 if (!PyUnicode_Check(unicode)) {
1326 PyErr_BadArgument();
1327 goto onError;
1328 }
1329
1330 if (encoding == NULL)
1331 encoding = PyUnicode_GetDefaultEncoding();
1332
1333 /* Encode via the codec registry */
1334 v = PyCodec_Encode(unicode, encoding, errors);
1335 if (v == NULL)
1336 goto onError;
1337 return v;
1338
1339 onError:
1340 return NULL;
1341 }
1342
1343 PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1344 const char *encoding,
1345 const char *errors)
1346 {
1347 PyObject *v;
1348
1349 if (!PyUnicode_Check(unicode)) {
1350 PyErr_BadArgument();
1351 goto onError;
1352 }
1353
1354 if (encoding == NULL)
1355 encoding = PyUnicode_GetDefaultEncoding();
1356
1357 /* Shortcuts for common default encodings */
1358 if (errors == NULL) {
1359 if (strcmp(encoding, "utf-8") == 0)
1360 return PyUnicode_AsUTF8String(unicode);
1361 else if (strcmp(encoding, "latin-1") == 0)
1362 return PyUnicode_AsLatin1String(unicode);
1363 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1364 else if (strcmp(encoding, "mbcs") == 0)
1365 return PyUnicode_AsMBCSString(unicode);
1366 #endif
1367 else if (strcmp(encoding, "ascii") == 0)
1368 return PyUnicode_AsASCIIString(unicode);
1369 }
1370
1371 /* Encode via the codec registry */
1372 v = PyCodec_Encode(unicode, encoding, errors);
1373 if (v == NULL)
1374 goto onError;
1375 if (!PyString_Check(v)) {
1376 PyErr_Format(PyExc_TypeError,
1377 "encoder did not return a string object (type=%.400s)",
1378 Py_TYPE(v)->tp_name);
1379 Py_DECREF(v);
1380 goto onError;
1381 }
1382 return v;
1383
1384 onError:
1385 return NULL;
1386 }
1387
1388 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1389 const char *errors)
1390 {
1391 PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1392
1393 if (v)
1394 return v;
1395 v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1396 if (v && errors == NULL)
1397 ((PyUnicodeObject *)unicode)->defenc = v;
1398 return v;
1399 }
1400
1401 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1402 {
1403 if (!PyUnicode_Check(unicode)) {
1404 PyErr_BadArgument();
1405 goto onError;
1406 }
1407 return PyUnicode_AS_UNICODE(unicode);
1408
1409 onError:
1410 return NULL;
1411 }
1412
1413 Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
1414 {
1415 if (!PyUnicode_Check(unicode)) {
1416 PyErr_BadArgument();
1417 goto onError;
1418 }
1419 return PyUnicode_GET_SIZE(unicode);
1420
1421 onError:
1422 return -1;
1423 }
1424
1425 const char *PyUnicode_GetDefaultEncoding(void)
1426 {
1427 return unicode_default_encoding;
1428 }
1429
1430 int PyUnicode_SetDefaultEncoding(const char *encoding)
1431 {
1432 PyObject *v;
1433
1434 /* Make sure the encoding is valid. As side effect, this also
1435 loads the encoding into the codec registry cache. */
1436 v = _PyCodec_Lookup(encoding);
1437 if (v == NULL)
1438 goto onError;
1439 Py_DECREF(v);
1440 strncpy(unicode_default_encoding,
1441 encoding,
1442 sizeof(unicode_default_encoding) - 1);
1443 return 0;
1444
1445 onError:
1446 return -1;
1447 }
1448
1449 /* error handling callback helper:
1450 build arguments, call the callback and check the arguments,
1451 if no exception occurred, copy the replacement to the output
1452 and adjust various state variables.
1453 return 0 on success, -1 on error
1454 */
1455
1456 static
1457 int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1458 const char *encoding, const char *reason,
1459 const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1460 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1461 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
1462 {
1463 static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
1464
1465 PyObject *restuple = NULL;
1466 PyObject *repunicode = NULL;
1467 Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1468 Py_ssize_t requiredsize;
1469 Py_ssize_t newpos;
1470 Py_UNICODE *repptr;
1471 Py_ssize_t repsize;
1472 int res = -1;
1473
1474 if (*errorHandler == NULL) {
1475 *errorHandler = PyCodec_LookupError(errors);
1476 if (*errorHandler == NULL)
1477 goto onError;
1478 }
1479
1480 if (*exceptionObject == NULL) {
1481 *exceptionObject = PyUnicodeDecodeError_Create(
1482 encoding, input, insize, *startinpos, *endinpos, reason);
1483 if (*exceptionObject == NULL)
1484 goto onError;
1485 }
1486 else {
1487 if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1488 goto onError;
1489 if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1490 goto onError;
1491 if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1492 goto onError;
1493 }
1494
1495 restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1496 if (restuple == NULL)
1497 goto onError;
1498 if (!PyTuple_Check(restuple)) {
1499 PyErr_SetString(PyExc_TypeError, &argparse[4]);
1500 goto onError;
1501 }
1502 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1503 goto onError;
1504 if (newpos<0)
1505 newpos = insize+newpos;
1506 if (newpos<0 || newpos>insize) {
1507 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1508 goto onError;
1509 }
1510
1511 /* need more space? (at least enough for what we
1512 have+the replacement+the rest of the string (starting
1513 at the new input position), so we won't have to check space
1514 when there are no errors in the rest of the string) */
1515 repptr = PyUnicode_AS_UNICODE(repunicode);
1516 repsize = PyUnicode_GET_SIZE(repunicode);
1517 requiredsize = *outpos;
1518 if (requiredsize > PY_SSIZE_T_MAX - repsize)
1519 goto overflow;
1520 requiredsize += repsize;
1521 if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
1522 goto overflow;
1523 requiredsize += insize - newpos;
1524 if (requiredsize > outsize) {
1525 if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
1526 requiredsize = 2*outsize;
1527 if (_PyUnicode_Resize(output, requiredsize) < 0)
1528 goto onError;
1529 *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1530 }
1531 *endinpos = newpos;
1532 *inptr = input + newpos;
1533 Py_UNICODE_COPY(*outptr, repptr, repsize);
1534 *outptr += repsize;
1535 *outpos += repsize;
1536 /* we made it! */
1537 res = 0;
1538
1539 onError:
1540 Py_XDECREF(restuple);
1541 return res;
1542
1543 overflow:
1544 PyErr_SetString(PyExc_OverflowError,
1545 "decoded result is too long for a Python string");
1546 goto onError;
1547 }
1548
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Three simple macros defining base-64. */

/* Is c a base-64 character?
 *
 * The test uses explicit ASCII ranges instead of isalnum(): isalnum() is
 * locale dependent, so under some locales bytes in the range 128-255 would
 * be classified as alphanumeric and then wrongly accepted as base-64
 * digits. */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value? */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n? */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')

/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is read-only and only consulted through ENCODE_DIRECT below.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * The range test comes first so that the table is never indexed with a
 * value outside 1..127. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
1626
1627 PyObject *PyUnicode_DecodeUTF7(const char *s,
1628 Py_ssize_t size,
1629 const char *errors)
1630 {
1631 return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1632 }
1633
1634 /* The decoder. The only state we preserve is our read position,
1635 * i.e. how many characters we have consumed. So if we end in the
1636 * middle of a shift sequence we have to back off the read position
1637 * and the output to the beginning of the sequence, otherwise we lose
1638 * all the shift state (seen bits, number of bits seen, high
1639 * surrogate). */
1640
1641 PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1642 Py_ssize_t size,
1643 const char *errors,
1644 Py_ssize_t *consumed)
1645 {
1646 const char *starts = s;
1647 Py_ssize_t startinpos;
1648 Py_ssize_t endinpos;
1649 Py_ssize_t outpos;
1650 const char *e;
1651 PyUnicodeObject *unicode;
1652 Py_UNICODE *p;
1653 const char *errmsg = "";
1654 int inShift = 0;
1655 Py_UNICODE *shiftOutStart;
1656 unsigned int base64bits = 0;
1657 unsigned long base64buffer = 0;
1658 Py_UNICODE surrogate = 0;
1659 PyObject *errorHandler = NULL;
1660 PyObject *exc = NULL;
1661
1662 unicode = _PyUnicode_New(size);
1663 if (!unicode)
1664 return NULL;
1665 if (size == 0) {
1666 if (consumed)
1667 *consumed = 0;
1668 return (PyObject *)unicode;
1669 }
1670
1671 p = unicode->str;
1672 shiftOutStart = p;
1673 e = s + size;
1674
1675 while (s < e) {
1676 Py_UNICODE ch = (unsigned char) *s;
1677
1678 if (inShift) { /* in a base-64 section */
1679 if (IS_BASE64(ch)) { /* consume a base-64 character */
1680 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1681 base64bits += 6;
1682 s++;
1683 if (base64bits >= 16) {
1684 /* we have enough bits for a UTF-16 value */
1685 Py_UNICODE outCh = (Py_UNICODE)
1686 (base64buffer >> (base64bits-16));
1687 base64bits -= 16;
1688 base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1689 assert(outCh <= 0xffff);
1690 if (surrogate) {
1691 /* expecting a second surrogate */
1692 if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1693 #ifdef Py_UNICODE_WIDE
1694 *p++ = (((surrogate & 0x3FF)<<10)
1695 | (outCh & 0x3FF)) + 0x10000;
1696 #else
1697 *p++ = surrogate;
1698 *p++ = outCh;
1699 #endif
1700 surrogate = 0;
1701 continue;
1702 }
1703 else {
1704 *p++ = surrogate;
1705 surrogate = 0;
1706 }
1707 }
1708 if (outCh >= 0xD800 && outCh <= 0xDBFF) {
1709 /* first surrogate */
1710 surrogate = outCh;
1711 }
1712 else {
1713 *p++ = outCh;
1714 }
1715 }
1716 }
1717 else { /* now leaving a base-64 section */
1718 inShift = 0;
1719 s++;
1720 if (surrogate) {
1721 *p++ = surrogate;
1722 surrogate = 0;
1723 }
1724 if (base64bits > 0) { /* left-over bits */
1725 if (base64bits >= 6) {
1726 /* We've seen at least one base-64 character */
1727 errmsg = "partial character in shift sequence";
1728 goto utf7Error;
1729 }
1730 else {
1731 /* Some bits remain; they should be zero */
1732 if (base64buffer != 0) {
1733 errmsg = "non-zero padding bits in shift sequence";
1734 goto utf7Error;
1735 }
1736 }
1737 }
1738 if (ch != '-') {
1739 /* '-' is absorbed; other terminating
1740 characters are preserved */
1741 *p++ = ch;
1742 }
1743 }
1744 }
1745 else if ( ch == '+' ) {
1746 startinpos = s-starts;
1747 s++; /* consume '+' */
1748 if (s < e && *s == '-') { /* '+-' encodes '+' */
1749 s++;
1750 *p++ = '+';
1751 }
1752 else { /* begin base64-encoded section */
1753 inShift = 1;
1754 shiftOutStart = p;
1755 base64bits = 0;
1756 base64buffer = 0;
1757 }
1758 }
1759 else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
1760 *p++ = ch;
1761 s++;
1762 }
1763 else {
1764 startinpos = s-starts;
1765 s++;
1766 errmsg = "unexpected special character";
1767 goto utf7Error;
1768 }
1769 continue;
1770 utf7Error:
1771 outpos = p-PyUnicode_AS_UNICODE(unicode);
1772 endinpos = s-starts;
1773 if (unicode_decode_call_errorhandler(
1774 errors, &errorHandler,
1775 "utf7", errmsg,
1776 starts, size, &startinpos, &endinpos, &exc, &s,
1777 &unicode, &outpos, &p))
1778 goto onError;
1779 }
1780
1781 /* end of string */
1782
1783 if (inShift && !consumed) { /* in shift sequence, no more to follow */
1784 /* if we're in an inconsistent state, that's an error */
1785 if (surrogate ||
1786 (base64bits >= 6) ||
1787 (base64bits > 0 && base64buffer != 0)) {
1788 outpos = p-PyUnicode_AS_UNICODE(unicode);
1789 endinpos = size;
1790 if (unicode_decode_call_errorhandler(
1791 errors, &errorHandler,
1792 "utf7", "unterminated shift sequence",
1793 starts, size, &startinpos, &endinpos, &exc, &s,
1794 &unicode, &outpos, &p))
1795 goto onError;
1796 }
1797 }
1798
1799 /* return state */
1800 if (consumed) {
1801 if (inShift) {
1802 p = shiftOutStart; /* back off output */
1803 *consumed = startinpos;
1804 }
1805 else {
1806 *consumed = s-starts;
1807 }
1808 }
1809
1810 if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
1811 goto onError;
1812
1813 Py_XDECREF(errorHandler);
1814 Py_XDECREF(exc);
1815 return (PyObject *)unicode;
1816
1817 onError:
1818 Py_XDECREF(errorHandler);
1819 Py_XDECREF(exc);
1820 Py_DECREF(unicode);
1821 return NULL;
1822 }
1823
1824
1825 PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1826 Py_ssize_t size,
1827 int base64SetO,
1828 int base64WhiteSpace,
1829 const char *errors)
1830 {
1831 PyObject *v;
1832 /* It might be possible to tighten this worst case */
1833 Py_ssize_t allocated = 8 * size;
1834 int inShift = 0;
1835 Py_ssize_t i = 0;
1836 unsigned int base64bits = 0;
1837 unsigned long base64buffer = 0;
1838 char * out;
1839 char * start;
1840
1841 if (allocated / 8 != size)
1842 return PyErr_NoMemory();
1843
1844 if (size == 0)
1845 return PyString_FromStringAndSize(NULL, 0);
1846
1847 v = PyString_FromStringAndSize(NULL, allocated);
1848 if (v == NULL)
1849 return NULL;
1850
1851 start = out = PyString_AS_STRING(v);
1852 for (;i < size; ++i) {
1853 Py_UNICODE ch = s[i];
1854
1855 if (inShift) {
1856 if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1857 /* shifting out */
1858 if (base64bits) { /* output remaining bits */
1859 *out++ = TO_BASE64(base64buffer << (6-base64bits));
1860 base64buffer = 0;
1861 base64bits = 0;
1862 }
1863 inShift = 0;
1864 /* Characters not in the BASE64 set implicitly unshift the sequence
1865 so no '-' is required, except if the character is itself a '-' */
1866 if (IS_BASE64(ch) || ch == '-') {
1867 *out++ = '-';
1868 }
1869 *out++ = (char) ch;
1870 }
1871 else {
1872 goto encode_char;
1873 }
1874 }
1875 else { /* not in a shift sequence */
1876 if (ch == '+') {
1877 *out++ = '+';
1878 *out++ = '-';
1879 }
1880 else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1881 *out++ = (char) ch;
1882 }
1883 else {
1884 *out++ = '+';
1885 inShift = 1;
1886 goto encode_char;
1887 }
1888 }
1889 continue;
1890 encode_char:
1891 #ifdef Py_UNICODE_WIDE
1892 if (ch >= 0x10000) {
1893 /* code first surrogate */
1894 base64bits += 16;
1895 base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
1896 while (base64bits >= 6) {
1897 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1898 base64bits -= 6;
1899 }
1900 /* prepare second surrogate */
1901 ch = 0xDC00 | ((ch-0x10000) & 0x3FF);
1902 }
1903 #endif
1904 base64bits += 16;
1905 base64buffer = (base64buffer << 16) | ch;
1906 while (base64bits >= 6) {
1907 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1908 base64bits -= 6;
1909 }
1910 }
1911 if (base64bits)
1912 *out++= TO_BASE64(base64buffer << (6-base64bits) );
1913 if (inShift)
1914 *out++ = '-';
1915
1916 if (_PyString_Resize(&v, out - start))
1917 return NULL;
1918 return v;
1919 }
1920
1921 #undef IS_BASE64
1922 #undef FROM_BASE64
1923 #undef TO_BASE64
1924 #undef DECODE_DIRECT
1925 #undef ENCODE_DIRECT
1926
1927 /* --- UTF-8 Codec -------------------------------------------------------- */
1928
/* Lookup table indexed by the first byte of a UTF-8 sequence; the entry
   is the total length of the sequence in bytes.  An entry of 0 marks a
   byte that cannot start a sequence.  See RFC 3629 for details. */
static
char utf8_code_length[256] = {
    /* 0x00-0x7F: single-byte (ASCII) */
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1,  1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80-0xBF: illegal as a prefix */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0xC0-0xC1: illegal; 0xC2-0xDF: two-byte sequences */
    0, 0, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2,  2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xE0-0xEF: three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3,  3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xF0-0xF4: four-byte sequences; 0xF5-0xFF: illegal */
    4, 4, 4, 4, 4, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
1950
1951 PyObject *PyUnicode_DecodeUTF8(const char *s,
1952 Py_ssize_t size,
1953 const char *errors)
1954 {
1955 return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1956 }
1957
1958 PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
1959 Py_ssize_t size,
1960 const char *errors,
1961 Py_ssize_t *consumed)
1962 {
1963 const char *starts = s;
1964 int n;
1965 int k;
1966 Py_ssize_t startinpos;
1967 Py_ssize_t endinpos;
1968 Py_ssize_t outpos;
1969 const char *e;
1970 PyUnicodeObject *unicode;
1971 Py_UNICODE *p;
1972 const char *errmsg = "";
1973 PyObject *errorHandler = NULL;
1974 PyObject *exc = NULL;
1975
1976 /* Note: size will always be longer than the resulting Unicode
1977 character count */
1978 unicode = _PyUnicode_New(size);
1979 if (!unicode)
1980 return NULL;
1981 if (size == 0) {
1982 if (consumed)
1983 *consumed = 0;
1984 return (PyObject *)unicode;
1985 }
1986
1987 /* Unpack UTF-8 encoded data */
1988 p = unicode->str;
1989 e = s + size;
1990
1991 while (s < e) {
1992 Py_UCS4 ch = (unsigned char)*s;
1993
1994 if (ch < 0x80) {
1995 *p++ = (Py_UNICODE)ch;
1996 s++;
1997 continue;
1998 }
1999
2000 n = utf8_code_length[ch];
2001
2002 if (s + n > e) {
2003 if (consumed)
2004 break;
2005 else {
2006 errmsg = "unexpected end of data";
2007 startinpos = s-starts;
2008 endinpos = startinpos+1;
2009 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2010 endinpos++;
2011 goto utf8Error;
2012 }
2013 }
2014
2015 switch (n) {
2016
2017 case 0:
2018 errmsg = "invalid start byte";
2019 startinpos = s-starts;
2020 endinpos = startinpos+1;
2021 goto utf8Error;
2022
2023 case 1:
2024 errmsg = "internal error";
2025 startinpos = s-starts;
2026 endinpos = startinpos+1;
2027 goto utf8Error;
2028
2029 case 2:
2030 if ((s[1] & 0xc0) != 0x80) {
2031 errmsg = "invalid continuation byte";
2032 startinpos = s-starts;
2033 endinpos = startinpos + 1;
2034 goto utf8Error;
2035 }
2036 ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
2037 assert ((ch > 0x007F) && (ch <= 0x07FF));
2038 *p++ = (Py_UNICODE)ch;
2039 break;
2040
2041 case 3:
2042 /* XXX: surrogates shouldn't be valid UTF-8!
2043 see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2044 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
2045 Uncomment the 2 lines below to make them invalid,
2046 code points: d800-dfff; UTF-8: \xed\xa0\x80-\xed\xbf\xbf. */
2047 if ((s[1] & 0xc0) != 0x80 ||
2048 (s[2] & 0xc0) != 0x80 ||
2049 ((unsigned char)s[0] == 0xE0 &&
2050 (unsigned char)s[1] < 0xA0)/* ||
2051 ((unsigned char)s[0] == 0xED &&
2052 (unsigned char)s[1] > 0x9F)*/) {
2053 errmsg = "invalid continuation byte";
2054 startinpos = s-starts;
2055 endinpos = startinpos + 1;
2056
2057 /* if s[1] first two bits are 1 and 0, then the invalid
2058 continuation byte is s[2], so increment endinpos by 1,
2059 if not, s[1] is invalid and endinpos doesn't need to
2060 be incremented. */
2061 if ((s[1] & 0xC0) == 0x80)
2062 endinpos++;
2063 goto utf8Error;
2064 }
2065 ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
2066 assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2067 *p++ = (Py_UNICODE)ch;
2068 break;
2069
2070 case 4:
2071 if ((s[1] & 0xc0) != 0x80 ||
2072 (s[2] & 0xc0) != 0x80 ||
2073 (s[3] & 0xc0) != 0x80 ||
2074 ((unsigned char)s[0] == 0xF0 &&
2075 (unsigned char)s[1] < 0x90) ||
2076 ((unsigned char)s[0] == 0xF4 &&
2077 (unsigned char)s[1] > 0x8F)) {
2078 errmsg = "invalid continuation byte";
2079 startinpos = s-starts;
2080 endinpos = startinpos + 1;
2081 if ((s[1] & 0xC0) == 0x80) {
2082 endinpos++;
2083 if ((s[2] & 0xC0) == 0x80)
2084 endinpos++;
2085 }
2086 goto utf8Error;
2087 }
2088 ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2089 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2090 assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2091
2092 #ifdef Py_UNICODE_WIDE
2093 *p++ = (Py_UNICODE)ch;
2094 #else
2095 /* compute and append the two surrogates: */
2096
2097 /* translate from 10000..10FFFF to 0..FFFF */
2098 ch -= 0x10000;
2099
2100 /* high surrogate = top 10 bits added to D800 */
2101 *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
2102
2103 /* low surrogate = bottom 10 bits added to DC00 */
2104 *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
2105 #endif
2106 break;
2107 }
2108 s += n;
2109 continue;
2110
2111 utf8Error:
2112 outpos = p-PyUnicode_AS_UNICODE(unicode);
2113 if (unicode_decode_call_errorhandler(
2114 errors, &errorHandler,
2115 "utf8", errmsg,
2116 starts, size, &startinpos, &endinpos, &exc, &s,
2117 &unicode, &outpos, &p))
2118 goto onError;
2119 }
2120 if (consumed)
2121 *consumed = s-starts;
2122
2123 /* Adjust length */
2124 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2125 goto onError;
2126
2127 Py_XDECREF(errorHandler);
2128 Py_XDECREF(exc);
2129 return (PyObject *)unicode;
2130
2131 onError:
2132 Py_XDECREF(errorHandler);
2133 Py_XDECREF(exc);
2134 Py_DECREF(unicode);
2135 return NULL;
2136 }
2137
2138 /* Allocation strategy: if the string is short, convert into a stack buffer
2139 and allocate exactly as much space needed at the end. Else allocate the
2140 maximum possible needed (4 result bytes per Unicode character), and return
2141 the excess memory at the end.
2142 */
2143 PyObject *
2144 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
2145 Py_ssize_t size,
2146 const char *errors)
2147 {
2148 #define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
2149
2150 Py_ssize_t i; /* index into s of next input byte */
2151 PyObject *v; /* result string object */
2152 char *p; /* next free byte in output buffer */
2153 Py_ssize_t nallocated; /* number of result bytes allocated */
2154 Py_ssize_t nneeded; /* number of result bytes needed */
2155 char stackbuf[MAX_SHORT_UNICHARS * 4];
2156
2157 assert(s != NULL);
2158 assert(size >= 0);
2159
2160 if (size <= MAX_SHORT_UNICHARS) {
2161 /* Write into the stack buffer; nallocated can't overflow.
2162 * At the end, we'll allocate exactly as much heap space as it
2163 * turns out we need.
2164 */
2165 nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2166 v = NULL; /* will allocate after we're done */
2167 p = stackbuf;
2168 }
2169 else {
2170 /* Overallocate on the heap, and give the excess back at the end. */
2171 nallocated = size * 4;
2172 if (nallocated / 4 != size) /* overflow! */
2173 return PyErr_NoMemory();
2174 v = PyString_FromStringAndSize(NULL, nallocated);
2175 if (v == NULL)
2176 return NULL;
2177 p = PyString_AS_STRING(v);
2178 }
2179
2180 for (i = 0; i < size;) {
2181 Py_UCS4 ch = s[i++];
2182
2183 if (ch < 0x80)
2184 /* Encode ASCII */
2185 *p++ = (char) ch;
2186
2187 else if (ch < 0x0800) {
2188 /* Encode Latin-1 */
2189 *p++ = (char)(0xc0 | (ch >> 6));
2190 *p++ = (char)(0x80 | (ch & 0x3f));
2191 }
2192 else {
2193 /* Encode UCS2 Unicode ordinals */
2194 if (ch < 0x10000) {
2195 /* Special case: check for high surrogate */
2196 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2197 Py_UCS4 ch2 = s[i];
2198 /* Check for low surrogate and combine the two to
2199 form a UCS4 value */
2200 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2201 ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2202 i++;
2203 goto encodeUCS4;
2204 }
2205 /* Fall through: handles isolated high surrogates */
2206 }
2207 *p++ = (char)(0xe0 | (ch >> 12));
2208 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2209 *p++ = (char)(0x80 | (ch & 0x3f));
2210 continue;
2211 }
2212 encodeUCS4:
2213 /* Encode UCS4 Unicode ordinals */
2214 *p++ = (char)(0xf0 | (ch >> 18));
2215 *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2216 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2217 *p++ = (char)(0x80 | (ch & 0x3f));
2218 }
2219 }
2220
2221 if (v == NULL) {
2222 /* This was stack allocated. */
2223 nneeded = p - stackbuf;
2224 assert(nneeded <= nallocated);
2225 v = PyString_FromStringAndSize(stackbuf, nneeded);
2226 }
2227 else {
2228 /* Cut back to size actually needed. */
2229 nneeded = p - PyString_AS_STRING(v);
2230 assert(nneeded <= nallocated);
2231 if (_PyString_Resize(&v, nneeded))
2232 return NULL;
2233 }
2234 return v;
2235
2236 #undef MAX_SHORT_UNICHARS
2237 }
2238
2239 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2240 {
2241 if (!PyUnicode_Check(unicode)) {
2242 PyErr_BadArgument();
2243 return NULL;
2244 }
2245 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2246 PyUnicode_GET_SIZE(unicode),
2247 NULL);
2248 }
2249
2250 /* --- UTF-32 Codec ------------------------------------------------------- */
2251
2252 PyObject *
2253 PyUnicode_DecodeUTF32(const char *s,
2254 Py_ssize_t size,
2255 const char *errors,
2256 int *byteorder)
2257 {
2258 return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2259 }
2260
2261 PyObject *
2262 PyUnicode_DecodeUTF32Stateful(const char *s,
2263 Py_ssize_t size,
2264 const char *errors,
2265 int *byteorder,
2266 Py_ssize_t *consumed)
2267 {
2268 const char *starts = s;
2269 Py_ssize_t startinpos;
2270 Py_ssize_t endinpos;
2271 Py_ssize_t outpos;
2272 PyUnicodeObject *unicode;
2273 Py_UNICODE *p;
2274 #ifndef Py_UNICODE_WIDE
2275 int pairs = 0;
2276 const unsigned char *qq;
2277 #else
2278 const int pairs = 0;
2279 #endif
2280 const unsigned char *q, *e;
2281 int bo = 0; /* assume native ordering by default */
2282 const char *errmsg = "";
2283 /* Offsets from q for retrieving bytes in the right order. */
2284 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2285 int iorder[] = {0, 1, 2, 3};
2286 #else
2287 int iorder[] = {3, 2, 1, 0};
2288 #endif
2289 PyObject *errorHandler = NULL;
2290 PyObject *exc = NULL;
2291
2292 q = (unsigned char *)s;
2293 e = q + size;
2294
2295 if (byteorder)
2296 bo = *byteorder;
2297
2298 /* Check for BOM marks (U+FEFF) in the input and adjust current
2299 byte order setting accordingly. In native mode, the leading BOM
2300 mark is skipped, in all other modes, it is copied to the output
2301 stream as-is (giving a ZWNBSP character). */
2302 if (bo == 0) {
2303 if (size >= 4) {
2304 const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2305 (q[iorder[1]] << 8) | q[iorder[0]];
2306 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2307 if (bom == 0x0000FEFF) {
2308 q += 4;
2309 bo = -1;
2310 }
2311 else if (bom == 0xFFFE0000) {
2312 q += 4;
2313 bo = 1;
2314 }
2315 #else
2316 if (bom == 0x0000FEFF) {
2317 q += 4;
2318 bo = 1;
2319 }
2320 else if (bom == 0xFFFE0000) {
2321 q += 4;
2322 bo = -1;
2323 }
2324 #endif
2325 }
2326 }
2327
2328 if (bo == -1) {
2329 /* force LE */
2330 iorder[0] = 0;
2331 iorder[1] = 1;
2332 iorder[2] = 2;
2333 iorder[3] = 3;
2334 }
2335 else if (bo == 1) {
2336 /* force BE */
2337 iorder[0] = 3;
2338 iorder[1] = 2;
2339 iorder[2] = 1;
2340 iorder[3] = 0;
2341 }
2342
2343 /* On narrow builds we split characters outside the BMP into two
2344 code points => count how much extra space we need. */
2345 #ifndef Py_UNICODE_WIDE
2346 for (qq = q; e - qq >= 4; qq += 4)
2347 if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2348 pairs++;
2349 #endif
2350
2351 /* This might be one to much, because of a BOM */
2352 unicode = _PyUnicode_New((size+3)/4+pairs);
2353 if (!unicode)
2354 return NULL;
2355 if (size == 0)
2356 return (PyObject *)unicode;
2357
2358 /* Unpack UTF-32 encoded data */
2359 p = unicode->str;
2360
2361 while (q < e) {
2362 Py_UCS4 ch;
2363 /* remaining bytes at the end? (size should be divisible by 4) */
2364 if (e-q<4) {
2365 if (consumed)
2366 break;
2367 errmsg = "truncated data";
2368 startinpos = ((const char *)q)-starts;
2369 endinpos = ((const char *)e)-starts;
2370 goto utf32Error;
2371 /* The remaining input chars are ignored if the callback
2372 chooses to skip the input */
2373 }
2374 ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2375 (q[iorder[1]] << 8) | q[iorder[0]];
2376
2377 if (ch >= 0x110000)
2378 {
2379 errmsg = "code point not in range(0x110000)";
2380 startinpos = ((const char *)q)-starts;
2381 endinpos = startinpos+4;
2382 goto utf32Error;
2383 }
2384 #ifndef Py_UNICODE_WIDE
2385 if (ch >= 0x10000)
2386 {
2387 *p++ = 0xD800 | ((ch-0x10000) >> 10);
2388 *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2389 }
2390 else
2391 #endif
2392 *p++ = ch;
2393 q += 4;
2394 continue;
2395 utf32Error:
2396 outpos = p-PyUnicode_AS_UNICODE(unicode);
2397 if (unicode_decode_call_errorhandler(
2398 errors, &errorHandler,
2399 "utf32", errmsg,
2400 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2401 &unicode, &outpos, &p))
2402 goto onError;
2403 }
2404
2405 if (byteorder)
2406 *byteorder = bo;
2407
2408 if (consumed)
2409 *consumed = (const char *)q-starts;
2410
2411 /* Adjust length */
2412 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2413 goto onError;
2414
2415 Py_XDECREF(errorHandler);
2416 Py_XDECREF(exc);
2417 return (PyObject *)unicode;
2418
2419 onError:
2420 Py_DECREF(unicode);
2421 Py_XDECREF(errorHandler);
2422 Py_XDECREF(exc);
2423 return NULL;
2424 }
2425
2426 PyObject *
2427 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2428 Py_ssize_t size,
2429 const char *errors,
2430 int byteorder)
2431 {
2432 PyObject *v;
2433 unsigned char *p;
2434 Py_ssize_t nsize, bytesize;
2435 #ifndef Py_UNICODE_WIDE
2436 Py_ssize_t i, pairs;
2437 #else
2438 const int pairs = 0;
2439 #endif
2440 /* Offsets from p for storing byte pairs in the right order. */
2441 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2442 int iorder[] = {0, 1, 2, 3};
2443 #else
2444 int iorder[] = {3, 2, 1, 0};
2445 #endif
2446
2447 #define STORECHAR(CH) \
2448 do { \
2449 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2450 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2451 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2452 p[iorder[0]] = (CH) & 0xff; \
2453 p += 4; \
2454 } while(0)
2455
2456 /* In narrow builds we can output surrogate pairs as one code point,
2457 so we need less space. */
2458 #ifndef Py_UNICODE_WIDE
2459 for (i = pairs = 0; i < size-1; i++)
2460 if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2461 0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2462 pairs++;
2463 #endif
2464 nsize = (size - pairs + (byteorder == 0));
2465 bytesize = nsize * 4;
2466 if (bytesize / 4 != nsize)
2467 return PyErr_NoMemory();
2468 v = PyString_FromStringAndSize(NULL, bytesize);
2469 if (v == NULL)
2470 return NULL;
2471
2472 p = (unsigned char *)PyString_AS_STRING(v);
2473 if (byteorder == 0)
2474 STORECHAR(0xFEFF);
2475 if (size == 0)
2476 return v;
2477
2478 if (byteorder == -1) {
2479 /* force LE */
2480 iorder[0] = 0;
2481 iorder[1] = 1;
2482 iorder[2] = 2;
2483 iorder[3] = 3;
2484 }
2485 else if (byteorder == 1) {
2486 /* force BE */
2487 iorder[0] = 3;
2488 iorder[1] = 2;
2489 iorder[2] = 1;
2490 iorder[3] = 0;
2491 }
2492
2493 while (size-- > 0) {
2494 Py_UCS4 ch = *s++;
2495 #ifndef Py_UNICODE_WIDE
2496 if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2497 Py_UCS4 ch2 = *s;
2498 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2499 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2500 s++;
2501 size--;
2502 }
2503 }
2504 #endif
2505 STORECHAR(ch);
2506 }
2507 return v;
2508 #undef STORECHAR
2509 }
2510
2511 PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2512 {
2513 if (!PyUnicode_Check(unicode)) {
2514 PyErr_BadArgument();
2515 return NULL;
2516 }
2517 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2518 PyUnicode_GET_SIZE(unicode),
2519 NULL,
2520 0);
2521 }
2522
2523 /* --- UTF-16 Codec ------------------------------------------------------- */
2524
2525 PyObject *
2526 PyUnicode_DecodeUTF16(const char *s,
2527 Py_ssize_t size,
2528 const char *errors,
2529 int *byteorder)
2530 {
2531 return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2532 }
2533
2534 PyObject *
2535 PyUnicode_DecodeUTF16Stateful(const char *s,
2536 Py_ssize_t size,
2537 const char *errors,
2538 int *byteorder,
2539 Py_ssize_t *consumed)
2540 {
2541 const char *starts = s;
2542 Py_ssize_t startinpos;
2543 Py_ssize_t endinpos;
2544 Py_ssize_t outpos;
2545 PyUnicodeObject *unicode;
2546 Py_UNICODE *p;
2547 const unsigned char *q, *e;
2548 int bo = 0; /* assume native ordering by default */
2549 const char *errmsg = "";
2550 /* Offsets from q for retrieving byte pairs in the right order. */
2551 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2552 int ihi = 1, ilo = 0;
2553 #else
2554 int ihi = 0, ilo = 1;
2555 #endif
2556 PyObject *errorHandler = NULL;
2557 PyObject *exc = NULL;
2558
2559 /* Note: size will always be longer than the resulting Unicode
2560 character count */
2561 unicode = _PyUnicode_New(size);
2562 if (!unicode)
2563 return NULL;
2564 if (size == 0)
2565 return (PyObject *)unicode;
2566
2567 /* Unpack UTF-16 encoded data */
2568 p = unicode->str;
2569 q = (unsigned char *)s;
2570 e = q + size;
2571
2572 if (byteorder)
2573 bo = *byteorder;
2574
2575 /* Check for BOM marks (U+FEFF) in the input and adjust current
2576 byte order setting accordingly. In native mode, the leading BOM
2577 mark is skipped, in all other modes, it is copied to the output
2578 stream as-is (giving a ZWNBSP character). */
2579 if (bo == 0) {
2580 if (size >= 2) {
2581 const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
2582 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2583 if (bom == 0xFEFF) {
2584 q += 2;
2585 bo = -1;
2586 }
2587 else if (bom == 0xFFFE) {
2588 q += 2;
2589 bo = 1;
2590 }
2591 #else
2592 if (bom == 0xFEFF) {
2593 q += 2;
2594 bo = 1;
2595 }
2596 else if (bom == 0xFFFE) {
2597 q += 2;
2598 bo = -1;
2599 }
2600 #endif
2601 }
2602 }
2603
2604 if (bo == -1) {
2605 /* force LE */
2606 ihi = 1;
2607 ilo = 0;
2608 }
2609 else if (bo == 1) {
2610 /* force BE */
2611 ihi = 0;
2612 ilo = 1;
2613 }
2614
2615 while (q < e) {
2616 Py_UNICODE ch;
2617 /* remaining bytes at the end? (size should be even) */
2618 if (e-q<2) {
2619 if (consumed)
2620 break;
2621 errmsg = "truncated data";
2622 startinpos = ((const char *)q)-starts;
2623 endinpos = ((const char *)e)-starts;
2624 goto utf16Error;
2625 /* The remaining input chars are ignored if the callback
2626 chooses to skip the input */
2627 }
2628 ch = (q[ihi] << 8) | q[ilo];
2629
2630 q += 2;
2631
2632 if (ch < 0xD800 || ch > 0xDFFF) {
2633 *p++ = ch;
2634 continue;
2635 }
2636
2637 /* UTF-16 code pair: */
2638 if (e - q < 2) {
2639 q -= 2;
2640 if (consumed)
2641 break;
2642 errmsg = "unexpected end of data";
2643 startinpos = ((const char *)q)-starts;
2644 endinpos = ((const char *)e)-starts;
2645 goto utf16Error;
2646 }
2647 if (0xD800 <= ch && ch <= 0xDBFF) {
2648 Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2649 q += 2;
2650 if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2651 #ifndef Py_UNICODE_WIDE
2652 *p++ = ch;
2653 *p++ = ch2;
2654 #else
2655 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2656 #endif
2657 continue;
2658 }
2659 else {
2660 errmsg = "illegal UTF-16 surrogate";
2661 startinpos = (((const char *)q)-4)-starts;
2662 endinpos = startinpos+2;
2663 goto utf16Error;
2664 }
2665
2666 }
2667 errmsg = "illegal encoding";
2668 startinpos = (((const char *)q)-2)-starts;
2669 endinpos = startinpos+2;
2670 /* Fall through to report the error */
2671
2672 utf16Error:
2673 outpos = p-PyUnicode_AS_UNICODE(unicode);
2674 if (unicode_decode_call_errorhandler(
2675 errors, &errorHandler,
2676 "utf16", errmsg,
2677 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2678 &unicode, &outpos, &p))
2679 goto onError;
2680 }
2681
2682 if (byteorder)
2683 *byteorder = bo;
2684
2685 if (consumed)
2686 *consumed = (const char *)q-starts;
2687
2688 /* Adjust length */
2689 if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2690 goto onError;
2691
2692 Py_XDECREF(errorHandler);
2693 Py_XDECREF(exc);
2694 return (PyObject *)unicode;
2695
2696 onError:
2697 Py_DECREF(unicode);
2698 Py_XDECREF(errorHandler);
2699 Py_XDECREF(exc);
2700 return NULL;
2701 }
2702
2703 PyObject *
2704 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
2705 Py_ssize_t size,
2706 const char *errors,
2707 int byteorder)
2708 {
2709 PyObject *v;
2710 unsigned char *p;
2711 Py_ssize_t nsize, bytesize;
2712 #ifdef Py_UNICODE_WIDE
2713 Py_ssize_t i, pairs;
2714 #else
2715 const int pairs = 0;
2716 #endif
2717 /* Offsets from p for storing byte pairs in the right order. */
2718 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2719 int ihi = 1, ilo = 0;
2720 #else
2721 int ihi = 0, ilo = 1;
2722 #endif
2723
2724 #define STORECHAR(CH) \
2725 do { \
2726 p[ihi] = ((CH) >> 8) & 0xff; \
2727 p[ilo] = (CH) & 0xff; \
2728 p += 2; \
2729 } while(0)
2730
2731 #ifdef Py_UNICODE_WIDE
2732 for (i = pairs = 0; i < size; i++)
2733 if (s[i] >= 0x10000)
2734 pairs++;
2735 #endif
2736 /* 2 * (size + pairs + (byteorder == 0)) */
2737 if (size > PY_SSIZE_T_MAX ||
2738 size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
2739 return PyErr_NoMemory();
2740 nsize = size + pairs + (byteorder == 0);
2741 bytesize = nsize * 2;
2742 if (bytesize / 2 != nsize)
2743 return PyErr_NoMemory();
2744 v = PyString_FromStringAndSize(NULL, bytesize);
2745 if (v == NULL)
2746 return NULL;
2747
2748 p = (unsigned char *)PyString_AS_STRING(v);
2749 if (byteorder == 0)
2750 STORECHAR(0xFEFF);
2751 if (size == 0)
2752 return v;
2753
2754 if (byteorder == -1) {
2755 /* force LE */
2756 ihi = 1;
2757 ilo = 0;
2758 }
2759 else if (byteorder == 1) {
2760 /* force BE */
2761 ihi = 0;
2762 ilo = 1;
2763 }
2764
2765 while (size-- > 0) {
2766 Py_UNICODE ch = *s++;
2767 Py_UNICODE ch2 = 0;
2768 #ifdef Py_UNICODE_WIDE
2769 if (ch >= 0x10000) {
2770 ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2771 ch = 0xD800 | ((ch-0x10000) >> 10);
2772 }
2773 #endif
2774 STORECHAR(ch);
2775 if (ch2)
2776 STORECHAR(ch2);
2777 }
2778 return v;
2779 #undef STORECHAR
2780 }
2781
2782 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2783 {
2784 if (!PyUnicode_Check(unicode)) {
2785 PyErr_BadArgument();
2786 return NULL;
2787 }
2788 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2789 PyUnicode_GET_SIZE(unicode),
2790 NULL,
2791 0);
2792 }
2793
2794 /* --- Unicode Escape Codec ----------------------------------------------- */
2795
2796 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
2797
2798 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
2799 Py_ssize_t size,
2800 const char *errors)
2801 {
2802 const char *starts = s;
2803 Py_ssize_t startinpos;
2804 Py_ssize_t endinpos;
2805 Py_ssize_t outpos;
2806 PyUnicodeObject *v;
2807 Py_UNICODE *p;
2808 const char *end;
2809 char* message;
2810 Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
2811 PyObject *errorHandler = NULL;
2812 PyObject *exc = NULL;
2813
2814 /* Escaped strings will always be longer than the resulting
2815 Unicode string, so we start with size here and then reduce the
2816 length after conversion to the true value.
2817 (but if the error callback returns a long replacement string
2818 we'll have to allocate more space) */
2819 v = _PyUnicode_New(size);
2820 if (v == NULL)
2821 goto onError;
2822 if (size == 0)
2823 return (PyObject *)v;
2824
2825 p = PyUnicode_AS_UNICODE(v);
2826 end = s + size;
2827
2828 while (s < end) {
2829 unsigned char c;
2830 Py_UNICODE x;
2831 int digits;
2832
2833 /* Non-escape characters are interpreted as Unicode ordinals */
2834 if (*s != '\\') {
2835 *p++ = (unsigned char) *s++;
2836 continue;
2837 }
2838
2839 startinpos = s-starts;
2840 /* \ - Escapes */
2841 s++;
2842 c = *s++;
2843 if (s > end)
2844 c = '\0'; /* Invalid after \ */
2845 switch (c) {
2846
2847 /* \x escapes */
2848 case '\n': break;
2849 case '\\': *p++ = '\\'; break;
2850 case '\'': *p++ = '\''; break;
2851 case '\"': *p++ = '\"'; break;
2852 case 'b': *p++ = '\b'; break;
2853 case 'f': *p++ = '\014'; break; /* FF */
2854 case 't': *p++ = '\t'; break;
2855 case 'n': *p++ = '\n'; break;
2856 case 'r': *p++ = '\r'; break;
2857 case 'v': *p++ = '\013'; break; /* VT */
2858 case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2859
2860 /* \OOO (octal) escapes */
2861 case '0': case '1': case '2': case '3':
2862 case '4': case '5': case '6': case '7':
2863 x = s[-1] - '0';
2864 if (s < end && '0' <= *s && *s <= '7') {
2865 x = (x<<3) + *s++ - '0';
2866 if (s < end && '0' <= *s && *s <= '7')
2867 x = (x<<3) + *s++ - '0';
2868 }
2869 *p++ = x;
2870 break;
2871
2872 /* hex escapes */
2873 /* \xXX */
2874 case 'x':
2875 digits = 2;
2876 message = "truncated \\xXX escape";
2877 goto hexescape;
2878
2879 /* \uXXXX */
2880 case 'u':
2881 digits = 4;
2882 message = "truncated \\uXXXX escape";
2883 goto hexescape;
2884
2885 /* \UXXXXXXXX */
2886 case 'U':
2887 digits = 8;
2888 message = "truncated \\UXXXXXXXX escape";
2889 hexescape:
2890 chr = 0;
2891 if (end - s < digits) {
2892 /* count only hex digits */
2893 for (; s < end; ++s) {
2894 c = (unsigned char)*s;
2895 if (!Py_ISXDIGIT(c))
2896 goto error;
2897 }
2898 goto error;
2899 }
2900 for (; digits--; ++s) {
2901 c = (unsigned char)*s;
2902 if (!Py_ISXDIGIT(c))
2903 goto error;
2904 chr = (chr<<4) & ~0xF;
2905 if (c >= '0' && c <= '9')
2906 chr += c - '0';
2907 else if (c >= 'a' && c <= 'f')
2908 chr += 10 + c - 'a';
2909 else
2910 chr += 10 + c - 'A';
2911 }
2912 if (chr == 0xffffffff && PyErr_Occurred())
2913 /* _decoding_error will have already written into the
2914 target buffer. */
2915 break;
2916 store:
2917 /* when we get here, chr is a 32-bit unicode character */
2918 if (chr <= 0xffff)
2919 /* UCS-2 character */
2920 *p++ = (Py_UNICODE) chr;
2921 else if (chr <= 0x10ffff) {
2922 /* UCS-4 character. Either store directly, or as
2923 surrogate pair. */
2924 #ifdef Py_UNICODE_WIDE
2925 *p++ = chr;
2926 #else
2927 chr -= 0x10000L;
2928 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
2929 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
2930 #endif
2931 } else {
2932 message = "illegal Unicode character";
2933 goto error;
2934 }
2935 break;
2936
2937 /* \N{name} */
2938 case 'N':
2939 message = "malformed \\N character escape";
2940 if (ucnhash_CAPI == NULL) {
2941 /* load the unicode data module */
2942 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
2943 if (ucnhash_CAPI == NULL)
2944 goto ucnhashError;
2945 }
2946 if (*s == '{') {
2947 const char *start = s+1;
2948 /* look for the closing brace */
2949 while (*s != '}' && s < end)
2950 s++;
2951 if (s > start && s < end && *s == '}') {
2952 /* found a name. look it up in the unicode database */
2953 message = "unknown Unicode character name";
2954 s++;
2955 if (s - start - 1 <= INT_MAX &&
2956 ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
2957 goto store;
2958 }
2959 }
2960 goto error;
2961
2962 default:
2963 if (s > end) {
2964 message = "\\ at end of string";
2965 s--;
2966 goto error;
2967 }
2968 else {
2969 *p++ = '\\';
2970 *p++ = (unsigned char)s[-1];
2971 }
2972 break;
2973 }
2974 continue;
2975
2976 error:
2977 endinpos = s-starts;
2978 outpos = p-PyUnicode_AS_UNICODE(v);
2979 if (unicode_decode_call_errorhandler(
2980 errors, &errorHandler,
2981 "unicodeescape", message,
2982 starts, size, &startinpos, &endinpos, &exc, &s,
2983 &v, &outpos, &p))
2984 goto onError;
2985 continue;
2986 }
2987 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2988 goto onError;
2989 Py_XDECREF(errorHandler);
2990 Py_XDECREF(exc);
2991 return (PyObject *)v;
2992
2993 ucnhashError:
2994 PyErr_SetString(
2995 PyExc_UnicodeError,
2996 "\\N escapes not supported (can't load unicodedata module)"
2997 );
2998 Py_XDECREF(v);
2999 Py_XDECREF(errorHandler);
3000 Py_XDECREF(exc);
3001 return NULL;
3002
3003 onError:
3004 Py_XDECREF(v);
3005 Py_XDECREF(errorHandler);
3006 Py_XDECREF(exc);
3007 return NULL;
3008 }
3009
3010 /* Return a Unicode-Escape string version of the Unicode object.
3011
3012 If quotes is true, the string is enclosed in u"" or u'' quotes as
3013 appropriate.
3014
3015 */
3016
3017 Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
3018 Py_ssize_t size,
3019 Py_UNICODE ch)
3020 {
3021 /* like wcschr, but doesn't stop at NULL characters */
3022
3023 while (size-- > 0) {
3024 if (*s == ch)
3025 return s;
3026 s++;
3027 }
3028
3029 return NULL;
3030 }
3031
3032 static
3033 PyObject *unicodeescape_string(const Py_UNICODE *s,
3034 Py_ssize_t size,
3035 int quotes)
3036 {
3037 PyObject *repr;
3038 char *p;
3039
3040 static const char *hexdigit = "0123456789abcdef";
3041 #ifdef Py_UNICODE_WIDE
3042 const Py_ssize_t expandsize = 10;
3043 #else
3044 const Py_ssize_t expandsize = 6;
3045 #endif
3046
3047 /* XXX(nnorwitz): rather than over-allocating, it would be
3048 better to choose a different scheme. Perhaps scan the
3049 first N-chars of the string and allocate based on that size.
3050 */
3051 /* Initial allocation is based on the longest-possible unichr
3052 escape.
3053
3054 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3055 unichr, so in this case it's the longest unichr escape. In
3056 narrow (UTF-16) builds this is five chars per source unichr
3057 since there are two unichrs in the surrogate pair, so in narrow
3058 (UTF-16) builds it's not the longest unichr escape.
3059
3060 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3061 so in the narrow (UTF-16) build case it's the longest unichr
3062 escape.
3063 */
3064
3065 if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
3066 return PyErr_NoMemory();
3067
3068 repr = PyString_FromStringAndSize(NULL,
3069 2
3070 + expandsize*size
3071 + 1);
3072 if (repr == NULL)
3073 return NULL;
3074
3075 p = PyString_AS_STRING(repr);
3076
3077 if (quotes) {
3078 *p++ = 'u';
3079 *p++ = (findchar(s, size, '\'') &&
3080 !findchar(s, size, '"')) ? '"' : '\'';
3081 }
3082 while (size-- > 0) {
3083 Py_UNICODE ch = *s++;
3084
3085 /* Escape quotes and backslashes */
3086 if ((quotes &&
3087 ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
3088 *p++ = '\\';
3089 *p++ = (char) ch;
3090 continue;
3091 }
3092
3093 #ifdef Py_UNICODE_WIDE
3094 /* Map 21-bit characters to '\U00xxxxxx' */
3095 else if (ch >= 0x10000) {
3096 *p++ = '\\';
3097 *p++ = 'U';
3098 *p++ = hexdigit[(ch >> 28) & 0x0000000F];
3099 *p++ = hexdigit[(ch >> 24) & 0x0000000F];
3100 *p++ = hexdigit[(ch >> 20) & 0x0000000F];
3101 *p++ = hexdigit[(ch >> 16) & 0x0000000F];
3102 *p++ = hexdigit[(ch >> 12) & 0x0000000F];
3103 *p++ = hexdigit[(ch >> 8) & 0x0000000F];
3104 *p++ = hexdigit[(ch >> 4) & 0x0000000F];
3105 *p++ = hexdigit[ch & 0x0000000F];
3106 continue;
3107 }
3108 #else
3109 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3110 else if (ch >= 0xD800 && ch < 0xDC00) {
3111 Py_UNICODE ch2;
3112 Py_UCS4 ucs;
3113
3114 ch2 = *s++;
3115 size--;
3116 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3117 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3118 *p++ = '\\';
3119 *p++ = 'U';
3120 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
3121 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
3122 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
3123 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
3124 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
3125 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
3126 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
3127 *p++ = hexdigit[ucs & 0x0000000F];
3128 continue;
3129 }
3130 /* Fall through: isolated surrogates are copied as-is */
3131 s--;
3132 size++;
3133 }
3134 #endif
3135
3136 /* Map 16-bit characters to '\uxxxx' */
3137 if (ch >= 256) {
3138 *p++ = '\\';
3139 *p++ = 'u';
3140 *p++ = hexdigit[(ch >> 12) & 0x000F];
3141 *p++ = hexdigit[(ch >> 8) & 0x000F];
3142 *p++ = hexdigit[(ch >> 4) & 0x000F];
3143 *p++ = hexdigit[ch & 0x000F];
3144 }
3145
3146 /* Map special whitespace to '\t', \n', '\r' */
3147 else if (ch == '\t') {
3148 *p++ = '\\';
3149 *p++ = 't';
3150 }
3151 else if (ch == '\n') {
3152 *p++ = '\\';
3153 *p++ = 'n';
3154 }
3155 else if (ch == '\r') {
3156 *p++ = '\\';
3157 *p++ = 'r';
3158 }
3159
3160 /* Map non-printable US ASCII to '\xhh' */
3161 else if (ch < ' ' || ch >= 0x7F) {
3162 *p++ = '\\';
3163 *p++ = 'x';
3164 *p++ = hexdigit[(ch >> 4) & 0x000F];
3165 *p++ = hexdigit[ch & 0x000F];
3166 }
3167
3168 /* Copy everything else as-is */
3169 else
3170 *p++ = (char) ch;
3171 }
3172 if (quotes)
3173 *p++ = PyString_AS_STRING(repr)[1];
3174
3175 *p = '\0';
3176 if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
3177 return NULL;
3178 return repr;
3179 }
3180
3181 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
3182 Py_ssize_t size)
3183 {
3184 return unicodeescape_string(s, size, 0);
3185 }
3186
3187 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3188 {
3189 if (!PyUnicode_Check(unicode)) {
3190 PyErr_BadArgument();
3191 return NULL;
3192 }
3193 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3194 PyUnicode_GET_SIZE(unicode));
3195 }
3196
3197 /* --- Raw Unicode Escape Codec ------------------------------------------- */
3198
3199 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
3200 Py_ssize_t size,
3201 const char *errors)
3202 {
3203 const char *starts = s;
3204 Py_ssize_t startinpos;
3205 Py_ssize_t endinpos;
3206 Py_ssize_t outpos;
3207 PyUnicodeObject *v;
3208 Py_UNICODE *p;
3209 const char *end;
3210 const char *bs;
3211 PyObject *errorHandler = NULL;
3212 PyObject *exc = NULL;
3213
3214 /* Escaped strings will always be longer than the resulting
3215 Unicode string, so we start with size here and then reduce the
3216 length after conversion to the true value. (But decoding error
3217 handler might have to resize the string) */
3218 v = _PyUnicode_New(size);
3219 if (v == NULL)
3220 goto onError;
3221 if (size == 0)
3222 return (PyObject *)v;
3223 p = PyUnicode_AS_UNICODE(v);
3224 end = s + size;
3225 while (s < end) {
3226 unsigned char c;
3227 Py_UCS4 x;
3228 int i;
3229 int count;
3230
3231 /* Non-escape characters are interpreted as Unicode ordinals */
3232 if (*s != '\\') {
3233 *p++ = (unsigned char)*s++;
3234 continue;
3235 }
3236 startinpos = s-starts;
3237
3238 /* \u-escapes are only interpreted iff the number of leading
3239 backslashes if odd */
3240 bs = s;
3241 for (;s < end;) {
3242 if (*s != '\\')
3243 break;
3244 *p++ = (unsigned char)*s++;
3245 }
3246 if (((s - bs) & 1) == 0 ||
3247 s >= end ||
3248 (*s != 'u' && *s != 'U')) {
3249 continue;
3250 }
3251 p--;
3252 count = *s=='u' ? 4 : 8;
3253 s++;
3254
3255 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3256 outpos = p-PyUnicode_AS_UNICODE(v);
3257 for (x = 0, i = 0; i < count; ++i, ++s) {
3258 c = (unsigned char)*s;
3259 if (!isxdigit(c)) {
3260 endinpos = s-starts;
3261 if (unicode_decode_call_errorhandler(
3262 errors, &errorHandler,
3263 "rawunicodeescape", "truncated \\uXXXX",
3264 starts, size, &startinpos, &endinpos, &exc, &s,
3265 &v, &outpos, &p))
3266 goto onError;
3267 goto nextByte;
3268 }
3269 x = (x<<4) & ~0xF;
3270 if (c >= '0' && c <= '9')
3271 x += c - '0';
3272 else if (c >= 'a' && c <= 'f')
3273 x += 10 + c - 'a';
3274 else
3275 x += 10 + c - 'A';
3276 }
3277 if (x <= 0xffff)
3278 /* UCS-2 character */
3279 *p++ = (Py_UNICODE) x;
3280 else if (x <= 0x10ffff) {
3281 /* UCS-4 character. Either store directly, or as
3282 surrogate pair. */
3283 #ifdef Py_UNICODE_WIDE
3284 *p++ = (Py_UNICODE) x;
3285 #else
3286 x -= 0x10000L;
3287 *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3288 *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3289 #endif
3290 } else {
3291 endinpos = s-starts;
3292 outpos = p-PyUnicode_AS_UNICODE(v);
3293 if (unicode_decode_call_errorhandler(
3294 errors, &errorHandler,
3295 "rawunicodeescape", "\\Uxxxxxxxx out of range",
3296 starts, size, &startinpos, &endinpos, &exc, &s,
3297 &v, &outpos, &p))
3298 goto onError;
3299 }
3300 nextByte:
3301 ;
3302 }
3303 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3304 goto onError;
3305 Py_XDECREF(errorHandler);
3306 Py_XDECREF(exc);
3307 return (PyObject *)v;
3308
3309 onError:
3310 Py_XDECREF(v);
3311 Py_XDECREF(errorHandler);
3312 Py_XDECREF(exc);
3313 return NULL;
3314 }
3315
3316 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
3317 Py_ssize_t size)
3318 {
3319 PyObject *repr;
3320 char *p;
3321 char *q;
3322
3323 static const char *hexdigit = "0123456789abcdef";
3324 #ifdef Py_UNICODE_WIDE
3325 const Py_ssize_t expandsize = 10;
3326 #else
3327 const Py_ssize_t expandsize = 6;
3328 #endif
3329
3330 if (size > PY_SSIZE_T_MAX / expandsize)
3331 return PyErr_NoMemory();
3332
3333 repr = PyString_FromStringAndSize(NULL, expandsize * size);
3334 if (repr == NULL)
3335 return NULL;
3336 if (size == 0)
3337 return repr;
3338
3339 p = q = PyString_AS_STRING(repr);
3340 while (size-- > 0) {
3341 Py_UNICODE ch = *s++;
3342 #ifdef Py_UNICODE_WIDE
3343 /* Map 32-bit characters to '\Uxxxxxxxx' */
3344 if (ch >= 0x10000) {
3345 *p++ = '\\';
3346 *p++ = 'U';
3347 *p++ = hexdigit[(ch >> 28) & 0xf];
3348 *p++ = hexdigit[(ch >> 24) & 0xf];
3349 *p++ = hexdigit[(ch >> 20) & 0xf];
3350 *p++ = hexdigit[(ch >> 16) & 0xf];
3351 *p++ = hexdigit[(ch >> 12) & 0xf];
3352 *p++ = hexdigit[(ch >> 8) & 0xf];
3353 *p++ = hexdigit[(ch >> 4) & 0xf];
3354 *p++ = hexdigit[ch & 15];
3355 }
3356 else
3357 #else
3358 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3359 if (ch >= 0xD800 && ch < 0xDC00) {
3360 Py_UNICODE ch2;
3361 Py_UCS4 ucs;
3362
3363 ch2 = *s++;
3364 size--;
3365 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3366 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3367 *p++ = '\\';
3368 *p++ = 'U';
3369 *p++ = hexdigit[(ucs >> 28) & 0xf];
3370 *p++ = hexdigit[(ucs >> 24) & 0xf];
3371 *p++ = hexdigit[(ucs >> 20) & 0xf];
3372 *p++ = hexdigit[(ucs >> 16) & 0xf];
3373 *p++ = hexdigit[(ucs >> 12) & 0xf];
3374 *p++ = hexdigit[(ucs >> 8) & 0xf];
3375 *p++ = hexdigit[(ucs >> 4) & 0xf];
3376 *p++ = hexdigit[ucs & 0xf];
3377 continue;
3378 }
3379 /* Fall through: isolated surrogates are copied as-is */
3380 s--;
3381 size++;
3382 }
3383 #endif
3384 /* Map 16-bit characters to '\uxxxx' */
3385 if (ch >= 256) {
3386 *p++ = '\\';
3387 *p++ = 'u';
3388 *p++ = hexdigit[(ch >> 12) & 0xf];
3389 *p++ = hexdigit[(ch >> 8) & 0xf];
3390 *p++ = hexdigit[(ch >> 4) & 0xf];
3391 *p++ = hexdigit[ch & 15];
3392 }
3393 /* Copy everything else as-is */
3394 else
3395 *p++ = (char) ch;
3396 }
3397 *p = '\0';
3398 if (_PyString_Resize(&repr, p - q))
3399 return NULL;
3400 return repr;
3401 }
3402
3403 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3404 {
3405 if (!PyUnicode_Check(unicode)) {
3406 PyErr_BadArgument();
3407 return NULL;
3408 }
3409 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3410 PyUnicode_GET_SIZE(unicode));
3411 }
3412
3413 /* --- Unicode Internal Codec ------------------------------------------- */
3414
3415 PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
3416 Py_ssize_t size,
3417 const char *errors)
3418 {
3419 const char *starts = s;
3420 Py_ssize_t startinpos;
3421 Py_ssize_t endinpos;
3422 Py_ssize_t outpos;
3423 PyUnicodeObject *v;
3424 Py_UNICODE *p;
3425 const char *end;
3426 const char *reason;
3427 PyObject *errorHandler = NULL;
3428 PyObject *exc = NULL;
3429
3430 #ifdef Py_UNICODE_WIDE
3431 Py_UNICODE unimax = PyUnicode_GetMax();
3432 #endif
3433
3434 /* XXX overflow detection missing */
3435 v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3436 if (v == NULL)
3437 goto onError;
3438 if (PyUnicode_GetSize((PyObject *)v) == 0)
3439 return (PyObject *)v;
3440 p = PyUnicode_AS_UNICODE(v);
3441 end = s + size;
3442
3443 while (s < end) {
3444 if (end-s < Py_UNICODE_SIZE) {
3445 endinpos = end-starts;
3446 reason = "truncated input";
3447 goto error;
3448 }
3449 memcpy(p, s, sizeof(Py_UNICODE));
3450 #ifdef Py_UNICODE_WIDE
3451 /* We have to sanity check the raw data, otherwise doom looms for
3452 some malformed UCS-4 data. */
3453 if (*p > unimax || *p < 0) {
3454 endinpos = s - starts + Py_UNICODE_SIZE;
3455 reason = "illegal code point (> 0x10FFFF)";
3456 goto error;
3457 }
3458 #endif
3459 p++;
3460 s += Py_UNICODE_SIZE;
3461 continue;
3462
3463 error:
3464 startinpos = s - starts;
3465 outpos = p - PyUnicode_AS_UNICODE(v);
3466 if (unicode_decode_call_errorhandler(
3467 errors, &errorHandler,
3468 "unicode_internal", reason,
3469 starts, size, &startinpos, &endinpos, &exc, &s,
3470 &v, &outpos, &p)) {
3471 goto onError;
3472 }
3473 }
3474
3475 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3476 goto onError;
3477 Py_XDECREF(errorHandler);
3478 Py_XDECREF(exc);
3479 return (PyObject *)v;
3480
3481 onError:
3482 Py_XDECREF(v);
3483 Py_XDECREF(errorHandler);
3484 Py_XDECREF(exc);
3485 return NULL;
3486 }
3487
3488 /* --- Latin-1 Codec ------------------------------------------------------ */
3489
3490 PyObject *PyUnicode_DecodeLatin1(const char *s,
3491 Py_ssize_t size,
3492 const char *errors)
3493 {
3494 PyUnicodeObject *v;
3495 Py_UNICODE *p;
3496
3497 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
3498 if (size == 1) {
3499 Py_UNICODE r = *(unsigned char*)s;
3500 return PyUnicode_FromUnicode(&r, 1);
3501 }
3502
3503 v = _PyUnicode_New(size);
3504 if (v == NULL)
3505 goto onError;
3506 if (size == 0)
3507 return (PyObject *)v;
3508 p = PyUnicode_AS_UNICODE(v);
3509 while (size-- > 0)
3510 *p++ = (unsigned char)*s++;
3511 return (PyObject *)v;
3512
3513 onError:
3514 Py_XDECREF(v);
3515 return NULL;
3516 }
3517
3518 /* create or adjust a UnicodeEncodeError */
3519 static void make_encode_exception(PyObject **exceptionObject,
3520 const char *encoding,
3521 const Py_UNICODE *unicode, Py_ssize_t size,
3522 Py_ssize_t startpos, Py_ssize_t endpos,
3523 const char *reason)
3524 {
3525 if (*exceptionObject == NULL) {
3526 *exceptionObject = PyUnicodeEncodeError_Create(
3527 encoding, unicode, size, startpos, endpos, reason);
3528 }
3529 else {
3530 if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3531 goto onError;
3532 if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3533 goto onError;
3534 if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3535 goto onError;
3536 return;
3537 onError:
3538 Py_CLEAR(*exceptionObject);
3539 }
3540 }
3541
3542 /* raises a UnicodeEncodeError */
3543 static void raise_encode_exception(PyObject **exceptionObject,
3544 const char *encoding,
3545 const Py_UNICODE *unicode, Py_ssize_t size,
3546 Py_ssize_t startpos, Py_ssize_t endpos,
3547 const char *reason)
3548 {
3549 make_encode_exception(exceptionObject,
3550 encoding, unicode, size, startpos, endpos, reason);
3551 if (*exceptionObject != NULL)
3552 PyCodec_StrictErrors(*exceptionObject);
3553 }
3554
3555 /* error handling callback helper:
3556 build arguments, call the callback and check the arguments,
3557 put the result into newpos and return the replacement string, which
3558 has to be freed by the caller */
3559 static PyObject *unicode_encode_call_errorhandler(const char *errors,
3560 PyObject **errorHandler,
3561 const char *encoding, const char *reason,
3562 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3563 Py_ssize_t startpos, Py_ssize_t endpos,
3564 Py_ssize_t *newpos)
3565 {
3566 static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
3567
3568 PyObject *restuple;
3569 PyObject *resunicode;
3570
3571 if (*errorHandler == NULL) {
3572 *errorHandler = PyCodec_LookupError(errors);
3573 if (*errorHandler == NULL)
3574 return NULL;
3575 }
3576
3577 make_encode_exception(exceptionObject,
3578 encoding, unicode, size, startpos, endpos, reason);
3579 if (*exceptionObject == NULL)
3580 return NULL;
3581
3582 restuple = PyObject_CallFunctionObjArgs(
3583 *errorHandler, *exceptionObject, NULL);
3584 if (restuple == NULL)
3585 return NULL;
3586 if (!PyTuple_Check(restuple)) {
3587 PyErr_SetString(PyExc_TypeError, &argparse[4]);
3588 Py_DECREF(restuple);
3589 return NULL;
3590 }
3591 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3592 &resunicode, newpos)) {
3593 Py_DECREF(restuple);
3594 return NULL;
3595 }
3596 if (*newpos<0)
3597 *newpos = size+*newpos;
3598 if (*newpos<0 || *newpos>size) {
3599 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3600 Py_DECREF(restuple);
3601 return NULL;
3602 }
3603 Py_INCREF(resunicode);
3604 Py_DECREF(restuple);
3605 return resunicode;
3606 }
3607
3608 static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
3609 Py_ssize_t size,
3610 const char *errors,
3611 int limit)
3612 {
3613 /* output object */
3614 PyObject *res;
3615 /* pointers to the beginning and end+1 of input */
3616 const Py_UNICODE *startp = p;
3617 const Py_UNICODE *endp = p + size;
3618 /* pointer to the beginning of the unencodable characters */
3619 /* const Py_UNICODE *badp = NULL; */
3620 /* pointer into the output */
3621 char *str;
3622 /* current output position */
3623 Py_ssize_t respos = 0;
3624 Py_ssize_t ressize;
3625 const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3626 const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
3627 PyObject *errorHandler = NULL;
3628 PyObject *exc = NULL;
3629 /* the following variable is used for caching string comparisons
3630 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3631 int known_errorHandler = -1;
3632
3633 /* allocate enough for a simple encoding without
3634 replacements, if we need more, we'll resize */
3635 res = PyString_FromStringAndSize(NULL, size);
3636 if (res == NULL)
3637 goto onError;
3638 if (size == 0)
3639 return res;
3640 str = PyString_AS_STRING(res);
3641 ressize = size;
3642
3643 while (p<endp) {
3644 Py_UNICODE c = *p;
3645
3646 /* can we encode this? */
3647 if (c<limit) {
3648 /* no overflow check, because we know that the space is enough */
3649 *str++ = (char)c;
3650 ++p;
3651 }
3652 else {
3653 Py_ssize_t unicodepos = p-startp;
3654 Py_ssize_t requiredsize;
3655 PyObject *repunicode;
3656 Py_ssize_t repsize;
3657 Py_ssize_t newpos;
3658 Py_ssize_t respos;
3659 Py_UNICODE *uni2;
3660 /* startpos for collecting unencodable chars */
3661 const Py_UNICODE *collstart = p;
3662 const Py_UNICODE *collend = p;
3663 /* find all unecodable characters */
3664 while ((collend < endp) && ((*collend) >= limit))
3665 ++collend;
3666 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3667 if (known_errorHandler==-1) {
3668 if ((errors==NULL) || (!strcmp(errors, "strict")))
3669 known_errorHandler = 1;
3670 else if (!strcmp(errors, "replace"))
3671 known_errorHandler = 2;
3672 else if (!strcmp(errors, "ignore"))
3673 known_errorHandler = 3;
3674 else if (!strcmp(errors, "xmlcharrefreplace"))
3675 known_errorHandler = 4;
3676 else
3677 known_errorHandler = 0;
3678 }
3679 switch (known_errorHandler) {
3680 case 1: /* strict */
3681 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3682 goto onError;
3683 case 2: /* replace */
3684 while (collstart++ < collend)
3685 *str++ = '?'; /* fall through */
3686 case 3: /* ignore */
3687 p = collend;
3688 break;
3689 case 4: /* xmlcharrefreplace */
3690 respos = str - PyString_AS_STRING(res);
3691 /* determine replacement size (temporarily (mis)uses p) */
3692 requiredsize = respos;
3693 for (p = collstart; p < collend;) {
3694 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
3695 Py_ssize_t incr;
3696 if (ch < 10)
3697 incr = 2+1+1;
3698 else if (ch < 100)
3699 incr = 2+2+1;
3700 else if (ch < 1000)
3701 incr = 2+3+1;
3702 else if (ch < 10000)
3703 incr = 2+4+1;
3704 else if (ch < 100000)
3705 incr = 2+5+1;
3706 else if (ch < 1000000)
3707 incr = 2+6+1;
3708 else
3709 incr = 2+7+1;
3710 if (requiredsize > PY_SSIZE_T_MAX - incr)
3711 goto overflow;
3712 requiredsize += incr;
3713 }
3714 if (requiredsize > PY_SSIZE_T_MAX - (endp - collend))
3715 goto overflow;
3716 requiredsize += endp - collend;
3717 if (requiredsize > ressize) {
3718 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
3719 requiredsize = 2*ressize;
3720 if (_PyString_Resize(&res, requiredsize))
3721 goto onError;
3722 str = PyString_AS_STRING(res) + respos;
3723 ressize = requiredsize;
3724 }
3725 /* generate replacement (temporarily (mis)uses p) */
3726 for (p = collstart; p < collend;) {
3727 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
3728 str += sprintf(str, "&#%d;", (int)ch);
3729 }
3730 p = collend;
3731 break;
3732 default:
3733 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3734 encoding, reason, startp, size, &exc,
3735 collstart-startp, collend-startp, &newpos);
3736 if (repunicode == NULL)
3737 goto onError;
3738 /* need more space? (at least enough for what we have+the
3739 replacement+the rest of the string, so we won't have to
3740 check space for encodable characters) */
3741 respos = str - PyString_AS_STRING(res);
3742 repsize = PyUnicode_GET_SIZE(repunicode);
3743 if (respos > PY_SSIZE_T_MAX - repsize)
3744 goto overflow;
3745 requiredsize = respos + repsize;
3746 if (requiredsize > PY_SSIZE_T_MAX - (endp - collend))
3747 goto overflow;
3748 requiredsize += endp - collend;
3749 if (requiredsize > ressize) {
3750 if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
3751 requiredsize = 2*ressize;
3752 if (_PyString_Resize(&res, requiredsize)) {
3753 Py_DECREF(repunicode);
3754 goto onError;
3755 }
3756 str = PyString_AS_STRING(res) + respos;
3757 ressize = requiredsize;
3758 }
3759 /* check if there is anything unencodable in the replacement
3760 and copy it to the output */
3761 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2, ++str) {
3762 c = *uni2;
3763 if (c >= limit) {
3764 raise_encode_exception(&exc, encoding, startp, size,
3765 unicodepos, unicodepos+1, reason);
3766 Py_DECREF(repunicode);
3767 goto onError;
3768 }
3769 *str = (char)c;
3770 }
3771 p = startp + newpos;
3772 Py_DECREF(repunicode);
3773 }
3774 }
3775 }
3776 /* Resize if we allocated to much */
3777 respos = str - PyString_AS_STRING(res);
3778 if (respos < ressize)
3779 /* If this falls res will be NULL */
3780 _PyString_Resize(&res, respos);
3781 Py_XDECREF(errorHandler);
3782 Py_XDECREF(exc);
3783 return res;
3784
3785 overflow:
3786 PyErr_SetString(PyExc_OverflowError,
3787 "encoded result is too long for a Python string");
3788
3789 onError:
3790 Py_XDECREF(res);
3791 Py_XDECREF(errorHandler);
3792 Py_XDECREF(exc);
3793 return NULL;
3794 }
3795
3796 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
3797 Py_ssize_t size,
3798 const char *errors)
3799 {
3800 return unicode_encode_ucs1(p, size, errors, 256);
3801 }
3802
3803 PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3804 {
3805 if (!PyUnicode_Check(unicode)) {
3806 PyErr_BadArgument();
3807 return NULL;
3808 }
3809 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3810 PyUnicode_GET_SIZE(unicode),
3811 NULL);
3812 }
3813
3814 /* --- 7-bit ASCII Codec -------------------------------------------------- */
3815
3816 PyObject *PyUnicode_DecodeASCII(const char *s,
3817 Py_ssize_t size,
3818 const char *errors)
3819 {
3820 const char *starts = s;
3821 PyUnicodeObject *v;
3822 Py_UNICODE *p;
3823 Py_ssize_t startinpos;
3824 Py_ssize_t endinpos;
3825 Py_ssize_t outpos;
3826 const char *e;
3827 PyObject *errorHandler = NULL;
3828 PyObject *exc = NULL;
3829
3830 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
3831 if (size == 1 && *(unsigned char*)s < 128) {
3832 Py_UNICODE r = *(unsigned char*)s;
3833 return PyUnicode_FromUnicode(&r, 1);
3834 }
3835
3836 v = _PyUnicode_New(size);
3837 if (v == NULL)
3838 goto onError;
3839 if (size == 0)
3840 return (PyObject *)v;
3841 p = PyUnicode_AS_UNICODE(v);
3842 e = s + size;
3843 while (s < e) {
3844 register unsigned char c = (unsigned char)*s;
3845 if (c < 128) {
3846 *p++ = c;
3847 ++s;
3848 }
3849 else {
3850 startinpos = s-starts;
3851 endinpos = startinpos + 1;
3852 outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3853 if (unicode_decode_call_errorhandler(
3854 errors, &errorHandler,
3855 "ascii", "ordinal not in range(128)",
3856 starts, size, &startinpos, &endinpos, &exc, &s,
3857 &v, &outpos, &p))
3858 goto onError;
3859 }
3860 }
3861 if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
3862 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3863 goto onError;
3864 Py_XDECREF(errorHandler);
3865 Py_XDECREF(exc);
3866 return (PyObject *)v;
3867
3868 onError:
3869 Py_XDECREF(v);
3870 Py_XDECREF(errorHandler);
3871 Py_XDECREF(exc);
3872 return NULL;
3873 }
3874
3875 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
3876 Py_ssize_t size,
3877 const char *errors)
3878 {
3879 return unicode_encode_ucs1(p, size, errors, 128);
3880 }
3881
3882 PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3883 {
3884 if (!PyUnicode_Check(unicode)) {
3885 PyErr_BadArgument();
3886 return NULL;
3887 }
3888 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3889 PyUnicode_GET_SIZE(unicode),
3890 NULL);
3891 }
3892
3893 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
3894
3895 /* --- MBCS codecs for Windows -------------------------------------------- */
3896
3897 #if SIZEOF_INT < SIZEOF_SIZE_T
3898 #define NEED_RETRY
3899 #endif
3900
3901 /* XXX This code is limited to "true" double-byte encodings, as
3902 a) it assumes an incomplete character consists of a single byte, and
3903 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3904 encodings, see IsDBCSLeadByteEx documentation. */
3905
/* Return 1 if the byte at s[offset] is judged to be a DBCS lead byte, and
   0 otherwise.  IsDBCSLeadByte() on that byte alone is not trusted: the
   character preceding it (found with CharPrev) is examined as well. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    /* no previous character to consult */
    if (prev == curr)
        return 1;
    /* the previous character does not begin with a lead byte */
    if (!IsDBCSLeadByte(*prev))
        return 1;
    /* the previous character is a complete two-byte character */
    return curr - prev == 2;
}
3916
3917 /*
3918 * Decode MBCS string into unicode object. If 'final' is set, converts
3919 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3920 */
3921 static int decode_mbcs(PyUnicodeObject **v,
3922 const char *s, /* MBCS string */
3923 int size, /* sizeof MBCS string */
3924 int final)
3925 {
3926 Py_UNICODE *p;
3927 Py_ssize_t n = 0;
3928 int usize = 0;
3929
3930 assert(size >= 0);
3931
3932 /* Skip trailing lead-byte unless 'final' is set */
3933 if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3934 --size;
3935
3936 /* First get the size of the result */
3937 if (size > 0) {
3938 usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3939 if (usize == 0) {
3940 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3941 return -1;
3942 }
3943 }
3944
3945 if (*v == NULL) {
3946 /* Create unicode object */
3947 *v = _PyUnicode_New(usize);
3948 if (*v == NULL)
3949 return -1;
3950 }
3951 else {
3952 /* Extend unicode object */
3953 n = PyUnicode_GET_SIZE(*v);
3954 if (_PyUnicode_Resize(v, n + usize) < 0)
3955 return -1;
3956 }
3957
3958 /* Do the conversion */
3959 if (size > 0) {
3960 p = PyUnicode_AS_UNICODE(*v) + n;
3961 if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3962 PyErr_SetFromWindowsErrWithFilename(0, NULL);
3963 return -1;
3964 }
3965 }
3966
3967 return size;
3968 }
3969
3970 PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3971 Py_ssize_t size,
3972 const char *errors,
3973 Py_ssize_t *consumed)
3974 {
3975 PyUnicodeObject *v = NULL;
3976 int done;
3977
3978 if (consumed)
3979 *consumed = 0;
3980
3981 #ifdef NEED_RETRY
3982 retry:
3983 if (size > INT_MAX)
3984 done = decode_mbcs(&v, s, INT_MAX, 0);
3985 else
3986 #endif
3987 done = decode_mbcs(&v, s, (int)size, !consumed);
3988
3989 if (done < 0) {
3990 Py_XDECREF(v);
3991 return NULL;
3992 }
3993
3994 if (consumed)
3995 *consumed += done;
3996
3997 #ifdef NEED_RETRY
3998 if (size > INT_MAX) {
3999 s += done;
4000 size -= done;
4001 goto retry;
4002 }
4003 #endif
4004
4005 return (PyObject *)v;
4006 }
4007
4008 PyObject *PyUnicode_DecodeMBCS(const char *s,
4009 Py_ssize_t size,
4010 const char *errors)
4011 {
4012 return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4013 }
4014
4015 /*
4016 * Convert unicode into string object (MBCS).
4017 * Returns 0 if succeed, -1 otherwise.
4018 */
4019 static int encode_mbcs(PyObject **repr,
4020 const Py_UNICODE *p, /* unicode */
4021 int size) /* size of unicode */
4022 {
4023 int mbcssize = 0;
4024 Py_ssize_t n = 0;
4025
4026 assert(size >= 0);
4027
4028 /* First get the size of the result */
4029 if (size > 0) {
4030 mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4031 if (mbcssize == 0) {
4032 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4033 return -1;
4034 }
4035 }
4036
4037 if (*repr == NULL) {
4038 /* Create string object */
4039 *repr = PyString_FromStringAndSize(NULL, mbcssize);
4040 if (*repr == NULL)
4041 return -1;
4042 }
4043 else {
4044 /* Extend string object */
4045 n = PyString_Size(*repr);
4046 if (_PyString_Resize(repr, n + mbcssize) < 0)
4047 return -1;
4048 }
4049
4050 /* Do the conversion */
4051 if (size > 0) {
4052 char *s = PyString_AS_STRING(*repr) + n;
4053 if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4054 PyErr_SetFromWindowsErrWithFilename(0, NULL);
4055 return -1;
4056 }
4057 }
4058
4059 return 0;
4060 }
4061
4062 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
4063 Py_ssize_t size,
4064 const char *errors)
4065 {
4066 PyObject *repr = NULL;
4067 int ret;
4068
4069 #ifdef NEED_RETRY
4070 retry:
4071 if (size > INT_MAX)
4072 ret = encode_mbcs(&repr, p, INT_MAX);
4073 else
4074 #endif
4075 ret = encode_mbcs(&repr, p, (int)size);
4076
4077 if (ret < 0) {
4078 Py_XDECREF(repr);
4079 return NULL;
4080 }
4081
4082 #ifdef NEED_RETRY
4083 if (size > INT_MAX) {
4084 p += INT_MAX;
4085 size -= INT_MAX;
4086 goto retry;
4087 }
4088 #endif
4089
4090 return repr;
4091 }
4092
4093 PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4094 {
4095 if (!PyUnicode_Check(unicode)) {
4096 PyErr_BadArgument();
4097 return NULL;
4098 }
4099 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
4100 PyUnicode_GET_SIZE(unicode),
4101 NULL);
4102 }
4103
4104 #undef NEED_RETRY
4105
4106 #endif /* MS_WINDOWS */
4107
4108 /* --- Character Mapping Codec -------------------------------------------- */
4109
4110 PyObject *PyUnicode_DecodeCharmap(const char *s,
4111 Py_ssize_t size,
4112 PyObject *mapping,
4113 const char *errors)
4114 {
4115 const char *starts = s;
4116 Py_ssize_t startinpos;
4117 Py_ssize_t endinpos;
4118 Py_ssize_t outpos;
4119 const char *e;
4120 PyUnicodeObject *v;
4121 Py_UNICODE *p;
4122 Py_ssize_t extrachars = 0;
4123 PyObject *errorHandler = NULL;
4124 PyObject *exc = NULL;
4125 Py_UNICODE *mapstring = NULL;
4126 Py_ssize_t maplen = 0;
4127
4128 /* Default to Latin-1 */
4129 if (mapping == NULL)
4130 return PyUnicode_DecodeLatin1(s, size, errors);
4131
4132 v = _PyUnicode_New(size);
4133 if (v == NULL)
4134 goto onError;
4135 if (size == 0)
4136 return (PyObject *)v;
4137 p = PyUnicode_AS_UNICODE(v);
4138 e = s + size;
4139 if (PyUnicode_CheckExact(mapping)) {
4140 mapstring = PyUnicode_AS_UNICODE(mapping);
4141 maplen = PyUnicode_GET_SIZE(mapping);
4142 while (s < e) {
4143 unsigned char ch = *s;
4144 Py_UNICODE x = 0xfffe; /* illegal value */
4145
4146 if (ch < maplen)
4147 x = mapstring[ch];
4148
4149 if (x == 0xfffe) {
4150 /* undefined mapping */
4151 outpos = p-PyUnicode_AS_UNICODE(v);
4152 startinpos = s-starts;
4153 endinpos = startinpos+1;
4154 if (unicode_decode_call_errorhandler(
4155 errors, &errorHandler,
4156 "charmap", "character maps to <undefined>",
4157 starts, size, &startinpos, &endinpos, &exc, &s,
4158 &v, &outpos, &p)) {
4159 goto onError;
4160 }
4161 continue;
4162 }
4163 *p++ = x;
4164 ++s;
4165 }
4166 }
4167 else {
4168 while (s < e) {
4169 unsigned char ch = *s;
4170 PyObject *w, *x;
4171
4172 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4173 w = PyInt_FromLong((long)ch);
4174 if (w == NULL)
4175 goto onError;
4176 x = PyObject_GetItem(mapping, w);
4177 Py_DECREF(w);
4178 if (x == NULL) {
4179 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4180 /* No mapping found means: mapping is undefined. */
4181 PyErr_Clear();
4182 goto Undefined;
4183 } else
4184 goto onError;
4185 }
4186
4187 /* Apply mapping */
4188 if (x == Py_None)
4189 goto Undefined;
4190 if (PyInt_Check(x)) {
4191 long value = PyInt_AS_LONG(x);
4192 if (value == 0xFFFE)
4193 goto Undefined;
4194 if (value < 0 || value > 0x10FFFF) {
4195 PyErr_SetString(PyExc_TypeError,
4196 "character mapping must be in range(0x110000)");
4197 Py_DECREF(x);
4198 goto onError;
4199 }
4200
4201 #ifndef Py_UNICODE_WIDE
4202 if (value > 0xFFFF) {
4203 /* see the code for 1-n mapping below */
4204 if (extrachars < 2) {
4205 /* resize first */
4206 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4207 Py_ssize_t needed = 10 - extrachars;
4208 extrachars += needed;
4209 /* XXX overflow detection missing */
4210 if (_PyUnicode_Resize(&v,
4211 PyUnicode_GET_SIZE(v) + needed) < 0) {
4212 Py_DECREF(x);
4213 goto onError;
4214 }
4215 p = PyUnicode_AS_UNICODE(v) + oldpos;
4216 }
4217 value -= 0x10000;
4218 *p++ = 0xD800 | (value >> 10);
4219 *p++ = 0xDC00 | (value & 0x3FF);
4220 extrachars -= 2;
4221 }
4222 else
4223 #endif
4224 *p++ = (Py_UNICODE)value;
4225 }
4226 else if (PyUnicode_Check(x)) {
4227 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
4228
4229 if (targetsize == 1) {
4230 /* 1-1 mapping */
4231 Py_UNICODE value = *PyUnicode_AS_UNICODE(x);
4232 if (value == 0xFFFE)
4233 goto Undefined;
4234 *p++ = value;
4235 }
4236 else if (targetsize > 1) {
4237 /* 1-n mapping */
4238 if (targetsize > extrachars) {
4239 /* resize first */
4240 Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4241 Py_ssize_t needed = (targetsize - extrachars) + \
4242 (targetsize << 2);
4243 extrachars += needed;
4244 /* XXX overflow detection missing */
4245 if (_PyUnicode_Resize(&v,
4246 PyUnicode_GET_SIZE(v) + needed) < 0) {
4247 Py_DECREF(x);
4248 goto onError;
4249 }
4250 p = PyUnicode_AS_UNICODE(v) + oldpos;
4251 }
4252 Py_UNICODE_COPY(p,
4253 PyUnicode_AS_UNICODE(x),
4254 targetsize);
4255 p += targetsize;
4256 extrachars -= targetsize;
4257 }
4258 /* 1-0 mapping: skip the character */
4259 }
4260 else {
4261 /* wrong return value */
4262 PyErr_SetString(PyExc_TypeError,
4263 "character mapping must return integer, None or unicode");
4264 Py_DECREF(x);
4265 goto onError;
4266 }
4267 Py_DECREF(x);
4268 ++s;
4269 continue;
4270 Undefined:
4271 /* undefined mapping */
4272 Py_XDECREF(x);
4273 outpos = p-PyUnicode_AS_UNICODE(v);
4274 startinpos = s-starts;
4275 endinpos = startinpos+1;
4276 if (unicode_decode_call_errorhandler(
4277 errors, &errorHandler,
4278 "charmap", "character maps to <undefined>",
4279 starts, size, &startinpos, &endinpos, &exc, &s,
4280 &v, &outpos, &p)) {
4281 goto onError;
4282 }
4283 }
4284 }
4285 if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
4286 if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4287 goto onError;
4288 Py_XDECREF(errorHandler);
4289 Py_XDECREF(exc);
4290 return (PyObject *)v;
4291
4292 onError:
4293 Py_XDECREF(errorHandler);
4294 Py_XDECREF(exc);
4295 Py_XDECREF(v);
4296 return NULL;
4297 }
4298
4299 /* Charmap encoding: the lookup table */
4300
4301 struct encoding_map{
4302 PyObject_HEAD
4303 unsigned char level1[32];
4304 int count2, count3;
4305 unsigned char level23[1];
4306 };
4307
4308 static PyObject*
4309 encoding_map_size(PyObject *obj, PyObject* args)
4310 {
4311 struct encoding_map *map = (struct encoding_map*)obj;
4312 return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4313 128*map->count3);
4314 }
4315
4316 static PyMethodDef encoding_map_methods[] = {
4317 {"size", encoding_map_size, METH_NOARGS,
4318 PyDoc_STR("Return the size (in bytes) of this object") },
4319 { 0 }
4320 };
4321
4322 static void
4323 encoding_map_dealloc(PyObject* o)
4324 {
4325 PyObject_FREE(o);
4326 }
4327
4328 static PyTypeObject EncodingMapType = {
4329 PyVarObject_HEAD_INIT(NULL, 0)
4330 "EncodingMap", /*tp_name*/
4331 sizeof(struct encoding_map), /*tp_basicsize*/
4332 0, /*tp_itemsize*/
4333 /* methods */
4334 encoding_map_dealloc, /*tp_dealloc*/
4335 0, /*tp_print*/
4336 0, /*tp_getattr*/
4337 0, /*tp_setattr*/
4338 0, /*tp_compare*/
4339 0, /*tp_repr*/
4340 0, /*tp_as_number*/
4341 0, /*tp_as_sequence*/
4342 0, /*tp_as_mapping*/
4343 0, /*tp_hash*/
4344 0, /*tp_call*/
4345 0, /*tp_str*/
4346 0, /*tp_getattro*/
4347 0, /*tp_setattro*/
4348 0, /*tp_as_buffer*/
4349 Py_TPFLAGS_DEFAULT, /*tp_flags*/
4350 0, /*tp_doc*/
4351 0, /*tp_traverse*/
4352 0, /*tp_clear*/
4353 0, /*tp_richcompare*/
4354 0, /*tp_weaklistoffset*/
4355 0, /*tp_iter*/
4356 0, /*tp_iternext*/
4357 encoding_map_methods, /*tp_methods*/
4358 0, /*tp_members*/
4359 0, /*tp_getset*/
4360 0, /*tp_base*/
4361 0, /*tp_dict*/
4362 0, /*tp_descr_get*/
4363 0, /*tp_descr_set*/
4364 0, /*tp_dictoffset*/
4365 0, /*tp_init*/
4366 0, /*tp_alloc*/
4367 0, /*tp_new*/
4368 0, /*tp_free*/
4369 0, /*tp_is_gc*/
4370 };
4371
4372 PyObject*
4373 PyUnicode_BuildEncodingMap(PyObject* string)
4374 {
4375 Py_UNICODE *decode;
4376 PyObject *result;
4377 struct encoding_map *mresult;
4378 int i;
4379 int need_dict = 0;
4380 unsigned char level1[32];
4381 unsigned char level2[512];
4382 unsigned char *mlevel1, *mlevel2, *mlevel3;
4383 int count2 = 0, count3 = 0;
4384
4385 if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4386 PyErr_BadArgument();
4387 return NULL;
4388 }
4389 decode = PyUnicode_AS_UNICODE(string);
4390 memset(level1, 0xFF, sizeof level1);
4391 memset(level2, 0xFF, sizeof level2);
4392
4393 /* If there isn't a one-to-one mapping of NULL to \0,
4394 or if there are non-BMP characters, we need to use
4395 a mapping dictionary. */
4396 if (decode[0] != 0)
4397 need_dict = 1;
4398 for (i = 1; i < 256; i++) {
4399 int l1, l2;
4400 if (decode[i] == 0
4401 #ifdef Py_UNICODE_WIDE
4402 || decode[i] > 0xFFFF
4403 #endif
4404 ) {
4405 need_dict = 1;
4406 break;
4407 }
4408 if (decode[i] == 0xFFFE)
4409 /* unmapped character */
4410 continue;
4411 l1 = decode[i] >> 11;
4412 l2 = decode[i] >> 7;
4413 if (level1[l1] == 0xFF)
4414 level1[l1] = count2++;
4415 if (level2[l2] == 0xFF)
4416 level2[l2] = count3++;
4417 }
4418
4419 if (count2 >= 0xFF || count3 >= 0xFF)
4420 need_dict = 1;
4421
4422 if (need_dict) {
4423 PyObject *result = PyDict_New();
4424 PyObject *key, *value;
4425 if (!result)
4426 return NULL;
4427 for (i = 0; i < 256; i++) {
4428 value = NULL;
4429 key = PyInt_FromLong(decode[i]);
4430 value = PyInt_FromLong(i);
4431 if (!key || !value)
4432 goto failed1;
4433 if (PyDict_SetItem(result, key, value) == -1)
4434 goto failed1;
4435 Py_DECREF(key);
4436 Py_DECREF(value);
4437 }
4438 return result;
4439 failed1:
4440 Py_XDECREF(key);
4441 Py_XDECREF(value);
4442 Py_DECREF(result);
4443 return NULL;
4444 }
4445
4446 /* Create a three-level trie */
4447 result = PyObject_MALLOC(sizeof(struct encoding_map) +
4448 16*count2 + 128*count3 - 1);
4449 if (!result)
4450 return PyErr_NoMemory();
4451 PyObject_Init(result, &EncodingMapType);
4452 mresult = (struct encoding_map*)result;
4453 mresult->count2 = count2;
4454 mresult->count3 = count3;
4455 mlevel1 = mresult->level1;
4456 mlevel2 = mresult->level23;
4457 mlevel3 = mresult->level23 + 16*count2;
4458 memcpy(mlevel1, level1, 32);
4459 memset(mlevel2, 0xFF, 16*count2);
4460 memset(mlevel3, 0, 128*count3);
4461 count3 = 0;
4462 for (i = 1; i < 256; i++) {
4463 int o1, o2, o3, i2, i3;
4464 if (decode[i] == 0xFFFE)
4465 /* unmapped character */
4466 continue;
4467 o1 = decode[i]>>11;
4468 o2 = (decode[i]>>7) & 0xF;
4469 i2 = 16*mlevel1[o1] + o2;
4470 if (mlevel2[i2] == 0xFF)
4471 mlevel2[i2] = count3++;
4472 o3 = decode[i] & 0x7F;
4473 i3 = 128*mlevel2[i2] + o3;
4474 mlevel3[i3] = i;
4475 }
4476 return result;
4477 }
4478
4479 static int
4480 encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4481 {
4482 struct encoding_map *map = (struct encoding_map*)mapping;
4483 int l1 = c>>11;
4484 int l2 = (c>>7) & 0xF;
4485 int l3 = c & 0x7F;
4486 int i;
4487
4488 #ifdef Py_UNICODE_WIDE
4489 if (c > 0xFFFF) {
4490 return -1;
4491 }
4492 #endif
4493 if (c == 0)
4494 return 0;
4495 /* level 1*/
4496 i = map->level1[l1];
4497 if (i == 0xFF) {
4498 return -1;
4499 }
4500 /* level 2*/
4501 i = map->level23[16*i+l2];
4502 if (i == 0xFF) {
4503 return -1;
4504 }
4505 /* level 3 */
4506 i = map->level23[16*map->count2 + 128*i + l3];
4507 if (i == 0) {
4508 return -1;
4509 }
4510 return i;
4511 }
4512
4513 /* Lookup the character ch in the mapping. If the character
4514 can't be found, Py_None is returned (or NULL, if another
4515 error occurred). */
4516 static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
4517 {
4518 PyObject *w = PyInt_FromLong((long)c);
4519 PyObject *x;
4520
4521 if (w == NULL)
4522 return NULL;
4523 x = PyObject_GetItem(mapping, w);
4524 Py_DECREF(w);
4525 if (x == NULL) {
4526 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4527 /* No mapping found means: mapping is undefined. */
4528 PyErr_Clear();
4529 x = Py_None;
4530 Py_INCREF(x);
4531 return x;
4532 } else
4533 return NULL;
4534 }
4535 else if (x == Py_None)
4536 return x;
4537 else if (PyInt_Check(x)) {
4538 long value = PyInt_AS_LONG(x);
4539 if (value < 0 || value > 255) {
4540 PyErr_SetString(PyExc_TypeError,
4541 "character mapping must be in range(256)");
4542 Py_DECREF(x);
4543 return NULL;
4544 }
4545 return x;
4546 }
4547 else if (PyString_Check(x))
4548 return x;
4549 else {
4550 /* wrong return value */
4551 PyErr_SetString(PyExc_TypeError,
4552 "character mapping must return integer, None or str");
4553 Py_DECREF(x);
4554 return NULL;
4555 }
4556 }
4557
4558 static int
4559 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4560 {
4561 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4562 /* exponentially overallocate to minimize reallocations */
4563 if (requiredsize < 2*outsize)
4564 requiredsize = 2*outsize;
4565 if (_PyString_Resize(outobj, requiredsize)) {
4566 return 0;
4567 }
4568 return 1;
4569 }
4570
/* Result codes of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* an error occurred and an exception is set */
} charmapencode_result;
4574 /* lookup the character, put the result in the output string and adjust
4575 various state variables. Reallocate the output string if not enough
4576 space is available. Return a new reference to the object that
4577 was put in the output buffer, or Py_None, if the mapping was undefined
4578 (in which case no character was written) or NULL, if a
4579 reallocation error occurred. The caller must decref the result */
4580 static
4581 charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
4582 PyObject **outobj, Py_ssize_t *outpos)
4583 {
4584 PyObject *rep;
4585 char *outstart;
4586 Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4587
4588 if (Py_TYPE(mapping) == &EncodingMapType) {
4589 int res = encoding_map_lookup(c, mapping);
4590 Py_ssize_t requiredsize = *outpos+1;
4591 if (res == -1)
4592 return enc_FAILED;
4593 if (outsize<requiredsize)
4594 if (!charmapencode_resize(outobj, outpos, requiredsize))
4595 return enc_EXCEPTION;
4596 outstart = PyString_AS_STRING(*outobj);
4597 outstart[(*outpos)++] = (char)res;
4598 return enc_SUCCESS;
4599 }
4600
4601 rep = charmapencode_lookup(c, mapping);
4602 if (rep==NULL)
4603 return enc_EXCEPTION;
4604 else if (rep==Py_None) {
4605 Py_DECREF(rep);
4606 return enc_FAILED;
4607 } else {
4608 if (PyInt_Check(rep)) {
4609 Py_ssize_t requiredsize = *outpos+1;
4610 if (outsize<requiredsize)
4611 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4612 Py_DECREF(rep);
4613 return enc_EXCEPTION;
4614 }
4615 outstart = PyString_AS_STRING(*outobj);
4616 outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4617 }
4618 else {
4619 const char *repchars = PyString_AS_STRING(rep);
4620 Py_ssize_t repsize = PyString_GET_SIZE(rep);
4621 Py_ssize_t requiredsize = *outpos+repsize;
4622 if (outsize<requiredsize)
4623 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4624 Py_DECREF(rep);
4625 return enc_EXCEPTION;
4626 }
4627 outstart = PyString_AS_STRING(*outobj);
4628 memcpy(outstart + *outpos, repchars, repsize);
4629 *outpos += repsize;
4630 }
4631 }
4632 Py_DECREF(rep);
4633 return enc_SUCCESS;
4634 }
4635
4636 /* handle an error in PyUnicode_EncodeCharmap
4637 Return 0 on success, -1 on error */
4638 static
4639 int charmap_encoding_error(
4640 const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
4641 PyObject **exceptionObject,
4642 int *known_errorHandler, PyObject **errorHandler, const char *errors,
4643 PyObject **res, Py_ssize_t *respos)
4644 {
4645 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4646 Py_ssize_t repsize;
4647 Py_ssize_t newpos;
4648 Py_UNICODE *uni2;
4649 /* startpos for collecting unencodable chars */
4650 Py_ssize_t collstartpos = *inpos;
4651 Py_ssize_t collendpos = *inpos+1;
4652 Py_ssize_t collpos;
4653 char *encoding = "charmap";
4654 char *reason = "character maps to <undefined>";
4655 charmapencode_result x;
4656
4657 /* find all unencodable characters */
4658 while (collendpos < size) {
4659 PyObject *rep;
4660 if (Py_TYPE(mapping) == &EncodingMapType) {
4661 int res = encoding_map_lookup(p[collendpos], mapping);
4662 if (res != -1)
4663 break;
4664 ++collendpos;
4665 continue;
4666 }
4667
4668 rep = charmapencode_lookup(p[collendpos], mapping);
4669 if (rep==NULL)
4670 return -1;
4671 else if (rep!=Py_None) {
4672 Py_DECREF(rep);
4673 break;
4674 }
4675 Py_DECREF(rep);
4676 ++collendpos;
4677 }
4678 /* cache callback name lookup
4679 * (if not done yet, i.e. it's the first error) */
4680 if (*known_errorHandler==-1) {
4681 if ((errors==NULL) || (!strcmp(errors, "strict")))
4682 *known_errorHandler = 1;
4683 else if (!strcmp(errors, "replace"))
4684 *known_errorHandler = 2;
4685 else if (!strcmp(errors, "ignore"))
4686 *known_errorHandler = 3;
4687 else if (!strcmp(errors, "xmlcharrefreplace"))
4688 *known_errorHandler = 4;
4689 else
4690 *known_errorHandler = 0;
4691 }
4692 switch (*known_errorHandler) {
4693 case 1: /* strict */
4694 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4695 return -1;
4696 case 2: /* replace */
4697 for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4698 x = charmapencode_output('?', mapping, res, respos);
4699 if (x==enc_EXCEPTION) {
4700 return -1;
4701 }
4702 else if (x==enc_FAILED) {
4703 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4704 return -1;
4705 }
4706 }
4707 /* fall through */
4708 case 3: /* ignore */
4709 *inpos = collendpos;
4710 break;
4711 case 4: /* xmlcharrefreplace */
4712 /* generate replacement */
4713 for (collpos = collstartpos; collpos < collendpos;) {
4714 char buffer[2+29+1+1];
4715 char *cp;
4716 Py_UCS4 ch = p[collpos++];
4717 #ifndef Py_UNICODE_WIDE
4718 if ((0xD800 <= ch && ch <= 0xDBFF) &&
4719 (collpos < collendpos) &&
4720 (0xDC00 <= p[collpos] && p[collpos] <= 0xDFFF)) {
4721 ch = ((((ch & 0x03FF) << 10) |
4722 ((Py_UCS4)p[collpos++] & 0x03FF)) + 0x10000);
4723 }
4724 #endif
4725 sprintf(buffer, "&#%d;", (int)ch);
4726 for (cp = buffer; *cp; ++cp) {
4727 x = charmapencode_output(*cp, mapping, res, respos);
4728 if (x==enc_EXCEPTION)
4729 return -1;
4730 else if (x==enc_FAILED) {
4731 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4732 return -1;
4733 }
4734 }
4735 }
4736 *inpos = collendpos;
4737 break;
4738 default:
4739 repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
4740 encoding, reason, p, size, exceptionObject,
4741 collstartpos, collendpos, &newpos);
4742 if (repunicode == NULL)
4743 return -1;
4744 /* generate replacement */
4745 repsize = PyUnicode_GET_SIZE(repunicode);
4746 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4747 x = charmapencode_output(*uni2, mapping, res, respos);
4748 if (x==enc_EXCEPTION) {
4749 return -1;
4750 }
4751 else if (x==enc_FAILED) {
4752 Py_DECREF(repunicode);
4753 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4754 return -1;
4755 }
4756 }
4757 *inpos = newpos;
4758 Py_DECREF(repunicode);
4759 }
4760 return 0;
4761 }
4762
4763 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
4764 Py_ssize_t size,
4765 PyObject *mapping,
4766 const char *errors)
4767 {
4768 /* output object */
4769 PyObject *res = NULL;
4770 /* current input position */
4771 Py_ssize_t inpos = 0;
4772 /* current output position */
4773 Py_ssize_t respos = 0;
4774 PyObject *errorHandler = NULL;
4775 PyObject *exc = NULL;
4776 /* the following variable is used for caching string comparisons
4777 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4778 * 3=ignore, 4=xmlcharrefreplace */
4779 int known_errorHandler = -1;
4780
4781 /* Default to Latin-1 */
4782 if (mapping == NULL)
4783 return PyUnicode_EncodeLatin1(p, size, errors);
4784
4785 /* allocate enough for a simple encoding without
4786 replacements, if we need more, we'll resize */
4787 res = PyString_FromStringAndSize(NULL, size);
4788 if (res == NULL)
4789 goto onError;
4790 if (size == 0)
4791 return res;
4792
4793 while (inpos<size) {
4794 /* try to encode it */
4795 charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4796 if (x==enc_EXCEPTION) /* error */
4797 goto onError;
4798 if (x==enc_FAILED) { /* unencodable character */
4799 if (charmap_encoding_error(p, size, &inpos, mapping,
4800 &exc,
4801 &known_errorHandler, &errorHandler, errors,
4802 &res, &respos)) {
4803 goto onError;
4804 }
4805 }
4806 else
4807 /* done with this character => adjust input position */
4808 ++inpos;
4809 }
4810
4811 /* Resize if we allocated to much */
4812 if (respos<PyString_GET_SIZE(res)) {
4813 if (_PyString_Resize(&res, respos))
4814 goto onError;
4815 }
4816 Py_XDECREF(exc);
4817 Py_XDECREF(errorHandler);
4818 return res;
4819
4820 onError:
4821 Py_XDECREF(res);
4822 Py_XDECREF(exc);
4823 Py_XDECREF(errorHandler);
4824 return NULL;
4825 }
4826
4827 PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4828 PyObject *mapping)
4829 {
4830 if (!PyUnicode_Check(unicode) || mapping == NULL) {
4831 PyErr_BadArgument();
4832 return NULL;
4833 }
4834 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4835 PyUnicode_GET_SIZE(unicode),
4836 mapping,
4837 NULL);
4838 }
4839
4840 /* create or adjust a UnicodeTranslateError */
4841 static void make_translate_exception(PyObject **exceptionObject,
4842 const Py_UNICODE *unicode, Py_ssize_t size,
4843 Py_ssize_t startpos, Py_ssize_t endpos,
4844 const char *reason)
4845 {
4846 if (*exceptionObject == NULL) {
4847 *exceptionObject = PyUnicodeTranslateError_Create(
4848 unicode, size, startpos, endpos, reason);
4849 }
4850 else {
4851 if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4852 goto onError;
4853 if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4854 goto onError;
4855 if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4856 goto onError;
4857 return;
4858 onError:
4859 Py_CLEAR(*exceptionObject);
4860 }
4861 }
4862
4863 /* raises a UnicodeTranslateError */
4864 static void raise_translate_exception(PyObject **exceptionObject,
4865 const Py_UNICODE *unicode, Py_ssize_t size,
4866 Py_ssize_t startpos, Py_ssize_t endpos,
4867 const char *reason)
4868 {
4869 make_translate_exception(exceptionObject,
4870 unicode, size, startpos, endpos, reason);
4871 if (*exceptionObject != NULL)
4872 PyCodec_StrictErrors(*exceptionObject);
4873 }
4874
4875 /* error handling callback helper:
4876 build arguments, call the callback and check the arguments,
4877 put the result into newpos and return the replacement string, which
4878 has to be freed by the caller */
4879 static PyObject *unicode_translate_call_errorhandler(const char *errors,
4880 PyObject **errorHandler,
4881 const char *reason,
4882 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4883 Py_ssize_t startpos, Py_ssize_t endpos,
4884 Py_ssize_t *newpos)
4885 {
4886 static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
4887
4888 Py_ssize_t i_newpos;
4889 PyObject *restuple;
4890 PyObject *resunicode;
4891
4892 if (*errorHandler == NULL) {
4893 *errorHandler = PyCodec_LookupError(errors);
4894 if (*errorHandler == NULL)
4895 return NULL;
4896 }
4897
4898 make_translate_exception(exceptionObject,
4899 unicode, size, startpos, endpos, reason);
4900 if (*exceptionObject == NULL)
4901 return NULL;
4902
4903 restuple = PyObject_CallFunctionObjArgs(
4904 *errorHandler, *exceptionObject, NULL);
4905 if (restuple == NULL)
4906 return NULL;
4907 if (!PyTuple_Check(restuple)) {
4908 PyErr_SetString(PyExc_TypeError, &argparse[4]);
4909 Py_DECREF(restuple);
4910 return NULL;
4911 }
4912 if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
4913 &resunicode, &i_newpos)) {
4914 Py_DECREF(restuple);
4915 return NULL;
4916 }
4917 if (i_newpos<0)
4918 *newpos = size+i_newpos;
4919 else
4920 *newpos = i_newpos;
4921 if (*newpos<0 || *newpos>size) {
4922 PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4923 Py_DECREF(restuple);
4924 return NULL;
4925 }
4926 Py_INCREF(resunicode);
4927 Py_DECREF(restuple);
4928 return resunicode;
4929 }
4930
4931 /* Lookup the character ch in the mapping and put the result in result,
4932 which must be decrefed by the caller.
4933 Return 0 on success, -1 on error */
4934 static
4935 int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4936 {
4937 PyObject *w = PyInt_FromLong((long)c);
4938 PyObject *x;
4939
4940 if (w == NULL)
4941 return -1;
4942 x = PyObject_GetItem(mapping, w);
4943 Py_DECREF(w);
4944 if (x == NULL) {
4945 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4946 /* No mapping found means: use 1:1 mapping. */
4947 PyErr_Clear();
4948 *result = NULL;
4949 return 0;
4950 } else
4951 return -1;
4952 }
4953 else if (x == Py_None) {
4954 *result = x;
4955 return 0;
4956 }
4957 else if (PyInt_Check(x)) {
4958 long value = PyInt_AS_LONG(x);
4959 long max = PyUnicode_GetMax();
4960 if (value < 0 || value > max) {
4961 PyErr_Format(PyExc_TypeError,
4962 "character mapping must be in range(0x%lx)", max+1);
4963 Py_DECREF(x);
4964 return -1;
4965 }
4966 *result = x;
4967 return 0;
4968 }
4969 else if (PyUnicode_Check(x)) {
4970 *result = x;
4971 return 0;
4972 }
4973 else {
4974 /* wrong return value */
4975 PyErr_SetString(PyExc_TypeError,
4976 "character mapping must return integer, None or unicode");
4977 Py_DECREF(x);
4978 return -1;
4979 }
4980 }
4981 /* ensure that *outobj is at least requiredsize characters long,
4982 if not reallocate and adjust various state variables.
4983 Return 0 on success, -1 on error */
4984 static
4985 int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
4986 Py_ssize_t requiredsize)
4987 {
4988 Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
4989 if (requiredsize > oldsize) {
4990 /* remember old output position */
4991 Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4992 /* exponentially overallocate to minimize reallocations */
4993 if (requiredsize < 2 * oldsize)
4994 requiredsize = 2 * oldsize;
4995 if (PyUnicode_Resize(outobj, requiredsize) < 0)
4996 return -1;
4997 *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
4998 }
4999 return 0;
5000 }
5001 /* lookup the character, put the result in the output string and adjust
5002 various state variables. Return a new reference to the object that
5003 was put in the output buffer in *result, or Py_None, if the mapping was
5004 undefined (in which case no character was written).
5005 The called must decref result.
5006 Return 0 on success, -1 on error. */
5007 static
5008 int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
5009 Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5010 PyObject **res)
5011 {
5012 if (charmaptranslate_lookup(*curinp, mapping, res))
5013 return -1;
5014 if (*res==NULL) {
5015 /* not found => default to 1:1 mapping */
5016 *(*outp)++ = *curinp;
5017 }
5018 else if (*res==Py_None)
5019 ;
5020 else if (PyInt_Check(*res)) {
5021 /* no overflow check, because we know that the space is enough */
5022 *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
5023 }
5024 else if (PyUnicode_Check(*res)) {
5025 Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5026 if (repsize==1) {
5027 /* no overflow check, because we know that the space is enough */
5028 *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5029 }
5030 else if (repsize!=0) {
5031 /* more than one character */
5032 Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5033 (insize - (curinp-startinp)) +
5034 repsize - 1;
5035 if (charmaptranslate_makespace(outobj, outp, requiredsize))
5036 return -1;
5037 memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5038 *outp += repsize;
5039 }
5040 }
5041 else
5042 return -1;
5043 return 0;
5044 }
5045
5046 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
5047 Py_ssize_t size,
5048 PyObject *mapping,
5049 const char *errors)
5050 {
5051 /* output object */
5052 PyObject *res = NULL;
5053 /* pointers to the beginning and end+1 of input */
5054 const Py_UNICODE *startp = p;
5055 const Py_UNICODE *endp = p + size;
5056 /* pointer into the output */
5057 Py_UNICODE *str;
5058 /* current output position */
5059 Py_ssize_t respos = 0;
5060 char *reason = "character maps to <undefined>";
5061 PyObject *errorHandler = NULL;
5062 PyObject *exc = NULL;
5063 /* the following variable is used for caching string comparisons
5064 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5065 * 3=ignore, 4=xmlcharrefreplace */
5066 int known_errorHandler = -1;
5067
5068 if (mapping == NULL) {
5069 PyErr_BadArgument();
5070 return NULL;
5071 }
5072
5073 /* allocate enough for a simple 1:1 translation without
5074 replacements, if we need more, we'll resize */
5075 res = PyUnicode_FromUnicode(NULL, size);
5076 if (res == NULL)
5077 goto onError;
5078 if (size == 0)
5079 return res;
5080 str = PyUnicode_AS_UNICODE(res);
5081
5082 while (p<endp) {
5083 /* try to encode it */
5084 PyObject *x = NULL;
5085 if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5086 Py_XDECREF(x);
5087 goto onError;
5088 }
5089 Py_XDECREF(x);
5090 if (x!=Py_None) /* it worked => adjust input pointer */
5091 ++p;
5092 else { /* untranslatable character */
5093 PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5094 Py_ssize_t repsize;
5095 Py_ssize_t newpos;
5096 Py_UNICODE *uni2;
5097 /* startpos for collecting untranslatable chars */
5098 const Py_UNICODE *collstart = p;
5099 const Py_UNICODE *collend = p+1;
5100 const Py_UNICODE *coll;
5101
5102 /* find all untranslatable characters */
5103 while (collend < endp) {
5104 if (charmaptranslate_lookup(*collend, mapping, &x))
5105 goto onError;
5106 Py_XDECREF(x);
5107 if (x!=Py_None)
5108 break;
5109 ++collend;
5110 }
5111 /* cache callback name lookup
5112 * (if not done yet, i.e. it's the first error) */
5113 if (known_errorHandler==-1) {
5114 if ((errors==NULL) || (!strcmp(errors, "strict")))
5115 known_errorHandler = 1;
5116 else if (!strcmp(errors, "replace"))
5117 known_errorHandler = 2;
5118 else if (!strcmp(errors, "ignore"))
5119 known_errorHandler = 3;
5120 else if (!strcmp(errors, "xmlcharrefreplace"))
5121 known_errorHandler = 4;
5122 else
5123 known_errorHandler = 0;
5124 }
5125 switch (known_errorHandler) {
5126 case 1: /* strict */
5127 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
5128 goto onError;
5129 case 2: /* replace */
5130 /* No need to check for space, this is a 1:1 replacement */
5131 for (coll = collstart; coll<collend; ++coll)
5132 *str++ = '?';
5133 /* fall through */
5134 case 3: /* ignore */
5135 p = collend;
5136 break;
5137 case 4: /* xmlcharrefreplace */
5138 /* generate replacement (temporarily (mis)uses p) */
5139 for (p = collstart; p < collend;) {
5140 char buffer[2+29+1+1];
5141 char *cp;
5142 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
5143 sprintf(buffer, "&#%d;", (int)ch);
5144 if (charmaptranslate_makespace(&res, &str,
5145 (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5146 goto onError;
5147 for (cp = buffer; *cp; ++cp)
5148 *str++ = *cp;
5149 }
5150 p = collend;
5151 break;
5152 default:
5153 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5154 reason, startp, size, &exc,
5155 collstart-startp, collend-startp, &newpos);
5156 if (repunicode == NULL)
5157 goto onError;
5158 /* generate replacement */
5159 repsize = PyUnicode_GET_SIZE(repunicode);
5160 if (charmaptranslate_makespace(&res, &str,
5161 (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5162 Py_DECREF(repunicode);
5163 goto onError;
5164 }
5165 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5166 *str++ = *uni2;
5167 p = startp + newpos;
5168 Py_DECREF(repunicode);
5169 }
5170 }
5171 }
5172 /* Resize if we allocated to much */
5173 respos = str-PyUnicode_AS_UNICODE(res);
5174 if (respos<PyUnicode_GET_SIZE(res)) {
5175 if (PyUnicode_Resize(&res, respos) < 0)
5176 goto onError;
5177 }
5178 Py_XDECREF(exc);
5179 Py_XDECREF(errorHandler);
5180 return res;
5181
5182 onError:
5183 Py_XDECREF(res);
5184 Py_XDECREF(exc);
5185 Py_XDECREF(errorHandler);
5186 return NULL;
5187 }
5188
5189 PyObject *PyUnicode_Translate(PyObject *str,
5190 PyObject *mapping,
5191 const char *errors)
5192 {
5193 PyObject *result;
5194
5195 str = PyUnicode_FromObject(str);
5196 if (str == NULL)
5197 goto onError;
5198 result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
5199 PyUnicode_GET_SIZE(str),
5200 mapping,
5201 errors);
5202 Py_DECREF(str);
5203 return result;
5204
5205 onError:
5206 Py_XDECREF(str);
5207 return NULL;
5208 }
5209
5210 /* --- Decimal Encoder ---------------------------------------------------- */
5211
5212 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
5213 Py_ssize_t length,
5214 char *output,
5215 const char *errors)
5216 {
5217 Py_UNICODE *p, *end;
5218 PyObject *errorHandler = NULL;
5219 PyObject *exc = NULL;
5220 const char *encoding = "decimal";
5221 const char *reason = "invalid decimal Unicode string";
5222 /* the following variable is used for caching string comparisons
5223 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5224 int known_errorHandler = -1;
5225
5226 if (output == NULL) {
5227 PyErr_BadArgument();
5228 return -1;
5229 }
5230
5231 p = s;
5232 end = s + length;
5233 while (p < end) {
5234 register Py_UNICODE ch = *p;
5235 int decimal;
5236 PyObject *repunicode;
5237 Py_ssize_t repsize;
5238 Py_ssize_t newpos;
5239 Py_UNICODE *uni2;
5240 Py_UNICODE *collstart;
5241 Py_UNICODE *collend;
5242
5243 if (Py_UNICODE_ISSPACE(ch)) {
5244 *output++ = ' ';
5245 ++p;
5246 continue;
5247 }
5248 decimal = Py_UNICODE_TODECIMAL(ch);
5249 if (decimal >= 0) {
5250 *output++ = '0' + decimal;
5251 ++p;
5252 continue;
5253 }
5254 if (0 < ch && ch < 256) {
5255 *output++ = (char)ch;
5256 ++p;
5257 continue;
5258 }
5259 /* All other characters are considered unencodable */
5260 collstart = p;
5261 for (collend = p+1; collend < end; collend++) {
5262 if ((0 < *collend && *collend < 256) ||
5263 Py_UNICODE_ISSPACE(*collend) ||
5264 0 <= Py_UNICODE_TODECIMAL(*collend))
5265 break;
5266 }
5267 /* cache callback name lookup
5268 * (if not done yet, i.e. it's the first error) */
5269 if (known_errorHandler==-1) {
5270 if ((errors==NULL) || (!strcmp(errors, "strict")))
5271 known_errorHandler = 1;
5272 else if (!strcmp(errors, "replace"))
5273 known_errorHandler = 2;
5274 else if (!strcmp(errors, "ignore"))
5275 known_errorHandler = 3;
5276 else if (!strcmp(errors, "xmlcharrefreplace"))
5277 known_errorHandler = 4;
5278 else
5279 known_errorHandler = 0;
5280 }
5281 switch (known_errorHandler) {
5282 case 1: /* strict */
5283 raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5284 goto onError;
5285 case 2: /* replace */
5286 for (p = collstart; p < collend; ++p)
5287 *output++ = '?';
5288 /* fall through */
5289 case 3: /* ignore */
5290 p = collend;
5291 break;
5292 case 4: /* xmlcharrefreplace */
5293 /* generate replacement (temporarily (mis)uses p) */
5294 for (p = collstart; p < collend;) {
5295 Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
5296 output += sprintf(output, "&#%d;", ch);
5297 }
5298 p = collend;
5299 break;
5300 default:
5301 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5302 encoding, reason, s, length, &exc,
5303 collstart-s, collend-s, &newpos);
5304 if (repunicode == NULL)
5305 goto onError;
5306 /* generate replacement */
5307 repsize = PyUnicode_GET_SIZE(repunicode);
5308 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5309 Py_UNICODE ch = *uni2;
5310 if (Py_UNICODE_ISSPACE(ch))
5311 *output++ = ' ';
5312 else {
5313 decimal = Py_UNICODE_TODECIMAL(ch);
5314 if (decimal >= 0)
5315 *output++ = '0' + decimal;
5316 else if (0 < ch && ch < 256)
5317 *output++ = (char)ch;
5318 else {
5319 Py_DECREF(repunicode);
5320 raise_encode_exception(&exc, encoding,
5321 s, length, collstart-s, collend-s, reason);
5322 goto onError;
5323 }
5324 }
5325 }
5326 p = s + newpos;
5327 Py_DECREF(repunicode);
5328 }
5329 }
5330 /* 0-terminate the output string */
5331 *output++ = '\0';
5332 Py_XDECREF(exc);
5333 Py_XDECREF(errorHandler);
5334 return 0;
5335
5336 onError:
5337 Py_XDECREF(exc);
5338 Py_XDECREF(errorHandler);
5339 return -1;
5340 }
5341
5342 /* --- Helpers ------------------------------------------------------------ */
5343
5344 #include "stringlib/unicodedefs.h"
5345 #include "stringlib/fastsearch.h"
5346
5347 #include "stringlib/count.h"
5348 #include "stringlib/find.h"
5349 #include "stringlib/partition.h"
5350 #include "stringlib/split.h"
5351
/* Helper macro to fix up start/end slice values in place.

   `end` is clamped to `len` when too large; a negative `start` or
   `end` has `len` added to it and is then clamped to 0.  `start` and
   `end` must be modifiable lvalues.

   The body is wrapped in do { } while (0) so that the macro acts as a
   single statement (safe in an unbraced if/else), and the arguments
   are parenthesized.  Use it followed by a semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
5366
5367 Py_ssize_t PyUnicode_Count(PyObject *str,
5368 PyObject *substr,
5369 Py_ssize_t start,
5370 Py_ssize_t end)
5371 {
5372 Py_ssize_t result;
5373 PyUnicodeObject* str_obj;
5374 PyUnicodeObject* sub_obj;
5375
5376 str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5377 if (!str_obj)
5378 return -1;
5379 sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5380 if (!sub_obj) {
5381 Py_DECREF(str_obj);
5382 return -1;
5383 }
5384
5385 ADJUST_INDICES(start, end, str_obj->length);
5386 result = stringlib_count(
5387 str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
5388 PY_SSIZE_T_MAX
5389 );
5390
5391 Py_DECREF(sub_obj);
5392 Py_DECREF(str_obj);
5393
5394 return result;
5395 }
5396
5397 Py_ssize_t PyUnicode_Find(PyObject *str,
5398 PyObject *sub,
5399 Py_ssize_t start,
5400 Py_ssize_t end,
5401 int direction)
5402 {
5403 Py_ssize_t result;
5404
5405 str = PyUnicode_FromObject(str);
5406 if (!str)
5407 return -2;
5408 sub = PyUnicode_FromObject(sub);
5409 if (!sub) {
5410 Py_DECREF(str);
5411 return -2;
5412 }
5413
5414 if (direction > 0)
5415 result = stringlib_find_slice(
5416 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5417 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5418 start, end
5419 );
5420 else
5421 result = stringlib_rfind_slice(
5422 PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5423 PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5424 start, end
5425 );
5426
5427 Py_DECREF(str);
5428 Py_DECREF(sub);
5429
5430 return result;
5431 }
5432
5433 static
5434 int tailmatch(PyUnicodeObject *self,
5435 PyUnicodeObject *substring,
5436 Py_ssize_t start,
5437 Py_ssize_t end,
5438 int direction)
5439 {
5440 if (substring->length == 0)
5441 return 1;
5442
5443 ADJUST_INDICES(start, end, self->length);
5444 end -= substring->length;
5445 if (end < start)
5446 return 0;
5447
5448 if (direction > 0) {
5449 if (Py_UNICODE_MATCH(self, end, substring))
5450 return 1;
5451 } else {
5452 if (Py_UNICODE_MATCH(self, start, substring))
5453 return 1;
5454 }
5455
5456 return 0;
5457 }
5458
5459 Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
5460 PyObject *substr,
5461 Py_ssize_t start,
5462 Py_ssize_t end,
5463 int direction)
5464 {
5465 Py_ssize_t result;
5466
5467 str = PyUnicode_FromObject(str);
5468 if (str == NULL)
5469 return -1;
5470 substr = PyUnicode_FromObject(substr);
5471 if (substr == NULL) {
5472 Py_DECREF(str);
5473 return -1;
5474 }
5475
5476 result = tailmatch((PyUnicodeObject *)str,
5477 (PyUnicodeObject *)substr,
5478 start, end, direction);
5479 Py_DECREF(str);
5480 Py_DECREF(substr);
5481 return result;
5482 }
5483
5484 /* Apply fixfct filter to the Unicode object self and return a
5485 reference to the modified object */
5486
5487 static
5488 PyObject *fixup(PyUnicodeObject *self,
5489 int (*fixfct)(PyUnicodeObject *s))
5490 {
5491
5492 PyUnicodeObject *u;
5493
5494 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5495 if (u == NULL)
5496 return NULL;
5497
5498 Py_UNICODE_COPY(u->str, self->str, self->length);
5499
5500 if (!fixfct(u) && PyUnicode_CheckExact(self)) {
5501 /* fixfct should return TRUE if it modified the buffer. If
5502 FALSE, return a reference to the original buffer instead
5503 (to save space, not time) */
5504 Py_INCREF(self);
5505 Py_DECREF(u);
5506 return (PyObject*) self;
5507 }
5508 return (PyObject*) u;
5509 }
5510
5511 static
5512 int fixupper(PyUnicodeObject *self)
5513 {
5514 Py_ssize_t len = self->length;
5515 Py_UNICODE *s = self->str;
5516 int status = 0;
5517
5518 while (len-- > 0) {
5519 register Py_UNICODE ch;
5520
5521 ch = Py_UNICODE_TOUPPER(*s);
5522 if (ch != *s) {
5523 status = 1;
5524 *s = ch;
5525 }
5526 s++;
5527 }
5528
5529 return status;
5530 }
5531
5532 static
5533 int fixlower(PyUnicodeObject *self)
5534 {
5535 Py_ssize_t len = self->length;
5536 Py_UNICODE *s = self->str;
5537 int status = 0;
5538
5539 while (len-- > 0) {
5540 register Py_UNICODE ch;
5541
5542 ch = Py_UNICODE_TOLOWER(*s);
5543 if (ch != *s) {
5544 status = 1;
5545 *s = ch;
5546 }
5547 s++;
5548 }
5549
5550 return status;
5551 }
5552
5553 static
5554 int fixswapcase(PyUnicodeObject *self)
5555 {
5556 Py_ssize_t len = self->length;
5557 Py_UNICODE *s = self->str;
5558 int status = 0;
5559
5560 while (len-- > 0) {
5561 if (Py_UNICODE_ISUPPER(*s)) {
5562 *s = Py_UNICODE_TOLOWER(*s);
5563 status = 1;
5564 } else if (Py_UNICODE_ISLOWER(*s)) {
5565 *s = Py_UNICODE_TOUPPER(*s);
5566 status = 1;
5567 }
5568 s++;
5569 }
5570
5571 return status;
5572 }
5573
5574 static
5575 int fixcapitalize(PyUnicodeObject *self)
5576 {
5577 Py_ssize_t len = self->length;
5578 Py_UNICODE *s = self->str;
5579 int status = 0;
5580
5581 if (len == 0)
5582 return 0;
5583 if (!Py_UNICODE_ISUPPER(*s)) {
5584 *s = Py_UNICODE_TOUPPER(*s);
5585 status = 1;
5586 }
5587 s++;
5588 while (--len > 0) {
5589 if (!Py_UNICODE_ISLOWER(*s)) {
5590 *s = Py_UNICODE_TOLOWER(*s);
5591 status = 1;
5592 }
5593 s++;
5594 }
5595 return status;
5596 }
5597
5598 static
5599 int fixtitle(PyUnicodeObject *self)
5600 {
5601 register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5602 register Py_UNICODE *e;
5603 int previous_is_cased;
5604
5605 /* Shortcut for single character strings */
5606 if (PyUnicode_GET_SIZE(self) == 1) {
5607 Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5608 if (*p != ch) {
5609 *p = ch;
5610 return 1;
5611 }
5612 else
5613 return 0;
5614 }
5615
5616 e = p + PyUnicode_GET_SIZE(self);
5617 previous_is_cased = 0;
5618 for (; p < e; p++) {
5619 register const Py_UNICODE ch = *p;
5620
5621 if (previous_is_cased)
5622 *p = Py_UNICODE_TOLOWER(ch);
5623 else
5624 *p = Py_UNICODE_TOTITLE(ch);
5625
5626 if (Py_UNICODE_ISLOWER(ch) ||
5627 Py_UNICODE_ISUPPER(ch) ||
5628 Py_UNICODE_ISTITLE(ch))
5629 previous_is_cased = 1;
5630 else
5631 previous_is_cased = 0;
5632 }
5633 return 1;
5634 }
5635
5636 PyObject *
5637 PyUnicode_Join(PyObject *separator, PyObject *seq)
5638 {
5639 PyObject *internal_separator = NULL;
5640 const Py_UNICODE blank = ' ';
5641 const Py_UNICODE *sep = &blank;
5642 Py_ssize_t seplen = 1;
5643 PyUnicodeObject *res = NULL; /* the result */
5644 Py_ssize_t res_alloc = 100; /* # allocated bytes for string in res */
5645 Py_ssize_t res_used; /* # used bytes */
5646 Py_UNICODE *res_p; /* pointer to free byte in res's string area */
5647 PyObject *fseq; /* PySequence_Fast(seq) */
5648 Py_ssize_t seqlen; /* len(fseq) -- number of items in sequence */
5649 PyObject *item;
5650 Py_ssize_t i;
5651
5652 fseq = PySequence_Fast(seq, "can only join an iterable");
5653 if (fseq == NULL) {
5654 return NULL;
5655 }
5656
5657 /* Grrrr. A codec may be invoked to convert str objects to
5658 * Unicode, and so it's possible to call back into Python code
5659 * during PyUnicode_FromObject(), and so it's possible for a sick
5660 * codec to change the size of fseq (if seq is a list). Therefore
5661 * we have to keep refetching the size -- can't assume seqlen
5662 * is invariant.
5663 */
5664 seqlen = PySequence_Fast_GET_SIZE(fseq);
5665 /* If empty sequence, return u"". */
5666 if (seqlen == 0) {
5667 res = _PyUnicode_New(0); /* empty sequence; return u"" */
5668 goto Done;
5669 }
5670 /* If singleton sequence with an exact Unicode, return that. */
5671 if (seqlen == 1) {
5672 item = PySequence_Fast_GET_ITEM(fseq, 0);
5673 if (PyUnicode_CheckExact(item)) {
5674 Py_INCREF(item);
5675 res = (PyUnicodeObject *)item;
5676 goto Done;
5677 }
5678 }
5679
5680 /* At least two items to join, or one that isn't exact Unicode. */
5681 if (seqlen > 1) {
5682 /* Set up sep and seplen -- they're needed. */
5683 if (separator == NULL) {
5684 sep = &blank;
5685 seplen = 1;
5686 }
5687 else {
5688 internal_separator = PyUnicode_FromObject(separator);
5689 if (internal_separator == NULL)
5690 goto onError;
5691 sep = PyUnicode_AS_UNICODE(internal_separator);
5692 seplen = PyUnicode_GET_SIZE(internal_separator);
5693 /* In case PyUnicode_FromObject() mutated seq. */
5694 seqlen = PySequence_Fast_GET_SIZE(fseq);
5695 }
5696 }
5697
5698 /* Get space. */
5699 res = _PyUnicode_New(res_alloc);
5700 if (res == NULL)
5701 goto onError;
5702 res_p = PyUnicode_AS_UNICODE(res);
5703 res_used = 0;
5704
5705 for (i = 0; i < seqlen; ++i) {
5706 Py_ssize_t itemlen;
5707 Py_ssize_t new_res_used;
5708
5709 item = PySequence_Fast_GET_ITEM(fseq, i);
5710 /* Convert item to Unicode. */
5711 if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5712 PyErr_Format(PyExc_TypeError,
5713 "sequence item %zd: expected string or Unicode,"
5714 " %.80s found",
5715 i, Py_TYPE(item)->tp_name);
5716 goto onError;
5717 }
5718 item = PyUnicode_FromObject(item);
5719 if (item == NULL)
5720 goto onError;
5721 /* We own a reference to item from here on. */
5722
5723 /* In case PyUnicode_FromObject() mutated seq. */
5724 seqlen = PySequence_Fast_GET_SIZE(fseq);
5725
5726 /* Make sure we have enough space for the separator and the item. */
5727 itemlen = PyUnicode_GET_SIZE(item);
5728 new_res_used = res_used + itemlen;
5729 if (new_res_used < 0)
5730 goto Overflow;
5731 if (i < seqlen - 1) {
5732 new_res_used += seplen;
5733 if (new_res_used < 0)
5734 goto Overflow;
5735 }
5736 if (new_res_used > res_alloc) {
5737 /* double allocated size until it's big enough */
5738 do {
5739 res_alloc += res_alloc;
5740 if (res_alloc <= 0)
5741 goto Overflow;
5742 } while (new_res_used > res_alloc);
5743 if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5744 Py_DECREF(item);
5745 goto onError;
5746 }
5747 res_p = PyUnicode_AS_UNICODE(res) + res_used;
5748 }
5749
5750 /* Copy item, and maybe the separator. */
5751 Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5752 res_p += itemlen;
5753 if (i < seqlen - 1) {
5754 Py_UNICODE_COPY(res_p, sep, seplen);
5755 res_p += seplen;
5756 }
5757 Py_DECREF(item);
5758 res_used = new_res_used;
5759 }
5760
5761 /* Shrink res to match the used area; this probably can't fail,
5762 * but it's cheap to check.
5763 */
5764 if (_PyUnicode_Resize(&res, res_used) < 0)
5765 goto onError;
5766
5767 Done:
5768 Py_XDECREF(internal_separator);
5769 Py_DECREF(fseq);
5770 return (PyObject *)res;
5771
5772 Overflow:
5773 PyErr_SetString(PyExc_OverflowError,
5774 "join() result is too long for a Python string");
5775 Py_DECREF(item);
5776 /* fall through */
5777
5778 onError:
5779 Py_XDECREF(internal_separator);
5780 Py_DECREF(fseq);
5781 Py_XDECREF(res);
5782 return NULL;
5783 }
5784
5785 static
5786 PyUnicodeObject *pad(PyUnicodeObject *self,
5787 Py_ssize_t left,
5788 Py_ssize_t right,
5789 Py_UNICODE fill)
5790 {
5791 PyUnicodeObject *u;
5792
5793 if (left < 0)
5794 left = 0;
5795 if (right < 0)
5796 right = 0;
5797
5798 if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
5799 Py_INCREF(self);
5800 return self;
5801 }
5802
5803 if (left > PY_SSIZE_T_MAX - self->length ||
5804 right > PY_SSIZE_T_MAX - (left + self->length)) {
5805 PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5806 return NULL;
5807 }
5808 u = _PyUnicode_New(left + self->length + right);
5809 if (u) {
5810 if (left)
5811 Py_UNICODE_FILL(u->str, fill, left);
5812 Py_UNICODE_COPY(u->str + left, self->str, self->length);
5813 if (right)
5814 Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5815 }
5816
5817 return u;
5818 }
5819
5820 PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
5821 {
5822 PyObject *list;
5823
5824 string = PyUnicode_FromObject(string);
5825 if (string == NULL)
5826 return NULL;
5827
5828 list = stringlib_splitlines(
5829 (PyObject*) string, PyUnicode_AS_UNICODE(string),
5830 PyUnicode_GET_SIZE(string), keepends);
5831
5832 Py_DECREF(string);
5833 return list;
5834 }
5835
5836 static
5837 PyObject *split(PyUnicodeObject *self,
5838 PyUnicodeObject *substring,
5839 Py_ssize_t maxcount)
5840 {
5841 if (maxcount < 0)
5842 maxcount = PY_SSIZE_T_MAX;
5843
5844 if (substring == NULL)
5845 return stringlib_split_whitespace(
5846 (PyObject*) self, self->str, self->length, maxcount
5847 );
5848
5849 return stringlib_split(
5850 (PyObject*) self, self->str, self->length,
5851 substring->str, substring->length,
5852 maxcount
5853 );
5854 }
5855
5856 static
5857 PyObject *rsplit(PyUnicodeObject *self,
5858 PyUnicodeObject *substring,
5859 Py_ssize_t maxcount)
5860 {
5861 if (maxcount < 0)
5862 maxcount = PY_SSIZE_T_MAX;
5863
5864 if (substring == NULL)
5865 return stringlib_rsplit_whitespace(
5866 (PyObject*) self, self->str, self->length, maxcount
5867 );
5868
5869 return stringlib_rsplit(
5870 (PyObject*) self, self->str, self->length,
5871 substring->str, substring->length,
5872 maxcount
5873 );
5874 }
5875
5876 static
5877 PyObject *replace(PyUnicodeObject *self,
5878 PyUnicodeObject *str1,
5879 PyUnicodeObject *str2,
5880 Py_ssize_t maxcount)
5881 {
5882 PyUnicodeObject *u;
5883
5884 if (maxcount < 0)
5885 maxcount = PY_SSIZE_T_MAX;
5886 else if (maxcount == 0 || self->length == 0)
5887 goto nothing;
5888
5889 if (str1->length == str2->length) {
5890 Py_ssize_t i;
5891 /* same length */
5892 if (str1->length == 0)
5893 goto nothing;
5894 if (str1->length == 1) {
5895 /* replace characters */
5896 Py_UNICODE u1, u2;
5897 if (!findchar(self->str, self->length, str1->str[0]))
5898 goto nothing;
5899 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5900 if (!u)
5901 return NULL;
5902 Py_UNICODE_COPY(u->str, self->str, self->length);
5903 u1 = str1->str[0];
5904 u2 = str2->str[0];
5905 for (i = 0; i < u->length; i++)
5906 if (u->str[i] == u1) {
5907 if (--maxcount < 0)
5908 break;
5909 u->str[i] = u2;
5910 }
5911 } else {
5912 i = stringlib_find(
5913 self->str, self->length, str1->str, str1->length, 0
5914 );
5915 if (i < 0)
5916 goto nothing;
5917 u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5918 if (!u)
5919 return NULL;
5920 Py_UNICODE_COPY(u->str, self->str, self->length);
5921
5922 /* change everything in-place, starting with this one */
5923 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5924 i += str1->length;
5925
5926 while ( --maxcount > 0) {
5927 i = stringlib_find(self->str+i, self->length-i,
5928 str1->str, str1->length,
5929 i);
5930 if (i == -1)
5931 break;
5932 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5933 i += str1->length;
5934 }
5935 }
5936 } else {
5937
5938 Py_ssize_t n, i, j;
5939 Py_ssize_t product, new_size, delta;
5940 Py_UNICODE *p;
5941
5942 /* replace strings */
5943 n = stringlib_count(self->str, self->length, str1->str, str1->length,
5944 maxcount);
5945 if (n == 0)
5946 goto nothing;
5947 /* new_size = self->length + n * (str2->length - str1->length)); */
5948 delta = (str2->length - str1->length);
5949 if (delta == 0) {
5950 new_size = self->length;
5951 } else {
5952 product = n * (str2->length - str1->length);
5953 if ((product / (str2->length - str1->length)) != n) {
5954 PyErr_SetString(PyExc_OverflowError,
5955 "replace string is too long");
5956 return NULL;
5957 }
5958 new_size = self->length + product;
5959 if (new_size < 0) {
5960 PyErr_SetString(PyExc_OverflowError,
5961 "replace string is too long");
5962 return NULL;
5963 }
5964 }
5965 u = _PyUnicode_New(new_size);
5966 if (!u)
5967 return NULL;
5968 i = 0;
5969 p = u->str;
5970 if (str1->length > 0) {
5971 while (n-- > 0) {
5972 /* look for next match */
5973 j = stringlib_find(self->str+i, self->length-i,
5974 str1->str, str1->length,
5975 i);
5976 if (j == -1)
5977 break;
5978 else if (j > i) {
5979 /* copy unchanged part [i:j] */
5980 Py_UNICODE_COPY(p, self->str+i, j-i);
5981 p += j - i;
5982 }
5983 /* copy substitution string */
5984 if (str2->length > 0) {
5985 Py_UNICODE_COPY(p, str2->str, str2->length);
5986 p += str2->length;
5987 }
5988 i = j + str1->length;
5989 }
5990 if (i < self->length)
5991 /* copy tail [i:] */
5992 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5993 } else {
5994 /* interleave */
5995 while (n > 0) {
5996 Py_UNICODE_COPY(p, str2->str, str2->length);
5997 p += str2->length;
5998 if (--n <= 0)
5999 break;
6000 *p++ = self->str[i++];
6001 }
6002 Py_UNICODE_COPY(p, self->str+i, self->length-i);
6003 }
6004 }
6005 return (PyObject *) u;
6006
6007 nothing:
6008 /* nothing to replace; return original string (when possible) */
6009 if (PyUnicode_CheckExact(self)) {
6010 Py_INCREF(self);
6011 return (PyObject *) self;
6012 }
6013 return PyUnicode_FromUnicode(self->str, self->length);
6014 }
6015
6016 /* --- Unicode Object Methods --------------------------------------------- */
6017
6018 PyDoc_STRVAR(title__doc__,
6019 "S.title() -> unicode\n\
6020 \n\
6021 Return a titlecased version of S, i.e. words start with title case\n\
6022 characters, all remaining cased characters have lower case.");
6023
6024 static PyObject*
6025 unicode_title(PyUnicodeObject *self)
6026 {
6027 return fixup(self, fixtitle);
6028 }
6029
6030 PyDoc_STRVAR(capitalize__doc__,
6031 "S.capitalize() -> unicode\n\
6032 \n\
6033 Return a capitalized version of S, i.e. make the first character\n\
6034 have upper case and the rest lower case.");
6035
6036 static PyObject*
6037 unicode_capitalize(PyUnicodeObject *self)
6038 {
6039 return fixup(self, fixcapitalize);
6040 }
6041
#if 0
/* Disabled code: kept out of the build by the surrounding #if 0. */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> unicode\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self with split(self, NULL, -1), replace every list item by its
   fixup(..., fixcapitalize) result, and join the list again with
   PyUnicode_Join(NULL, list).  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping the new object into the list slot */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *old_word = PyList_GET_ITEM(words, pos);
        PyObject *new_word = fixup((PyUnicodeObject *)old_word,
                                   fixcapitalize);

        if (new_word == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(old_word);
        PyList_SET_ITEM(words, pos, new_word);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
6079
6080 /* Argument converter. Coerces to a single unicode character */
6081
6082 static int
6083 convert_uc(PyObject *obj, void *addr)
6084 {
6085 Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6086 PyObject *uniobj;
6087 Py_UNICODE *unistr;
6088
6089 uniobj = PyUnicode_FromObject(obj);
6090 if (uniobj == NULL) {
6091 PyErr_SetString(PyExc_TypeError,
6092 "The fill character cannot be converted to Unicode");
6093 return 0;
6094 }
6095 if (PyUnicode_GET_SIZE(uniobj) != 1) {
6096 PyErr_SetString(PyExc_TypeError,
6097 "The fill character must be exactly one character long");
6098 Py_DECREF(uniobj);
6099 return 0;
6100 }
6101 unistr = PyUnicode_AS_UNICODE(uniobj);
6102 *fillcharloc = unistr[0];
6103 Py_DECREF(uniobj);
6104 return 1;
6105 }
6106
6107 PyDoc_STRVAR(center__doc__,
6108 "S.center(width[, fillchar]) -> unicode\n\
6109 \n\
6110 Return S centered in a Unicode string of length width. Padding is\n\
6111 done using the specified fill character (default is a space)");
6112
6113 static PyObject *
6114 unicode_center(PyUnicodeObject *self, PyObject *args)
6115 {
6116 Py_ssize_t marg, left;
6117 Py_ssize_t width;
6118 Py_UNICODE fillchar = ' ';
6119
6120 if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
6121 return NULL;
6122
6123 if (self->length >= width && PyUnicode_CheckExact(self)) {
6124 Py_INCREF(self);
6125 return (PyObject*) self;
6126 }
6127
6128 marg = width - self->length;
6129 left = marg / 2 + (marg & width & 1);
6130
6131 return (PyObject*) pad(self, left, marg - left, fillchar);
6132 }
6133
6134 #if 0
6135
6136 /* This code should go into some future Unicode collation support
6137 module. The basic comparison should compare ordinals on a naive
6138 basis (this is what Java does and thus Jython too). */
6139
6140 /* speedy UTF-16 code point order comparison */
6141 /* gleaned from: */
6142 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6143
6144 static short utf16Fixup[32] =
6145 {
6146 0, 0, 0, 0, 0, 0, 0, 0,
6147 0, 0, 0, 0, 0, 0, 0, 0,
6148 0, 0, 0, 0, 0, 0, 0, 0,
6149 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
6150 };
6151
6152 static int
6153 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6154 {
6155 Py_ssize_t len1, len2;
6156
6157 Py_UNICODE *s1 = str1->str;
6158 Py_UNICODE *s2 = str2->str;
6159
6160 len1 = str1->length;
6161 len2 = str2->length;
6162
6163 while (len1 > 0 && len2 > 0) {
6164 Py_UNICODE c1, c2;
6165
6166 c1 = *s1++;
6167 c2 = *s2++;
6168
6169 if (c1 > (1<<11) * 26)
6170 c1 += utf16Fixup[c1>>11];
6171 if (c2 > (1<<11) * 26)
6172 c2 += utf16Fixup[c2>>11];
6173 /* now c1 and c2 are in UTF-32-compatible order */
6174
6175 if (c1 != c2)
6176 return (c1 < c2) ? -1 : 1;
6177
6178 len1--; len2--;
6179 }
6180
6181 return (len1 < len2) ? -1 : (len1 != len2);
6182 }
6183
6184 #else
6185
6186 static int
6187 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6188 {
6189 register Py_ssize_t len1, len2;
6190
6191 Py_UNICODE *s1 = str1->str;
6192 Py_UNICODE *s2 = str2->str;
6193
6194 len1 = str1->length;
6195 len2 = str2->length;
6196
6197 while (len1 > 0 && len2 > 0) {
6198 Py_UNICODE c1, c2;
6199
6200 c1 = *s1++;
6201 c2 = *s2++;
6202
6203 if (c1 != c2)
6204 return (c1 < c2) ? -1 : 1;
6205
6206 len1--; len2--;
6207 }
6208
6209 return (len1 < len2) ? -1 : (len1 != len2);
6210 }
6211
6212 #endif
6213
6214 int PyUnicode_Compare(PyObject *left,
6215 PyObject *right)
6216 {
6217 PyUnicodeObject *u = NULL, *v = NULL;
6218 int result;
6219
6220 /* Coerce the two arguments */
6221 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6222 if (u == NULL)
6223 goto onError;
6224 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6225 if (v == NULL)
6226 goto onError;
6227
6228 /* Shortcut for empty or interned objects */
6229 if (v == u) {
6230 Py_DECREF(u);
6231 Py_DECREF(v);
6232 return 0;
6233 }
6234
6235 result = unicode_compare(u, v);
6236
6237 Py_DECREF(u);
6238 Py_DECREF(v);
6239 return result;
6240
6241 onError:
6242 Py_XDECREF(u);
6243 Py_XDECREF(v);
6244 return -1;
6245 }
6246
6247 PyObject *PyUnicode_RichCompare(PyObject *left,
6248 PyObject *right,
6249 int op)
6250 {
6251 int result;
6252
6253 result = PyUnicode_Compare(left, right);
6254 if (result == -1 && PyErr_Occurred())
6255 goto onError;
6256
6257 /* Convert the return value to a Boolean */
6258 switch (op) {
6259 case Py_EQ:
6260 result = (result == 0);
6261 break;
6262 case Py_NE:
6263 result = (result != 0);
6264 break;
6265 case Py_LE:
6266 result = (result <= 0);
6267 break;
6268 case Py_GE:
6269 result = (result >= 0);
6270 break;
6271 case Py_LT:
6272 result = (result == -1);
6273 break;
6274 case Py_GT:
6275 result = (result == 1);
6276 break;
6277 }
6278 return PyBool_FromLong(result);
6279
6280 onError:
6281
6282 /* Standard case
6283
6284 Type errors mean that PyUnicode_FromObject() could not convert
6285 one of the arguments (usually the right hand side) to Unicode,
6286 ie. we can't handle the comparison request. However, it is
6287 possible that the other object knows a comparison method, which
6288 is why we return Py_NotImplemented to give the other object a
6289 chance.
6290
6291 */
6292 if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6293 PyErr_Clear();
6294 Py_INCREF(Py_NotImplemented);
6295 return Py_NotImplemented;
6296 }
6297 if (op != Py_EQ && op != Py_NE)
6298 return NULL;
6299
6300 /* Equality comparison.
6301
6302 This is a special case: we silence any PyExc_UnicodeDecodeError
6303 and instead turn it into a PyErr_UnicodeWarning.
6304
6305 */
6306 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6307 return NULL;
6308 PyErr_Clear();
6309 if (PyErr_Warn(PyExc_UnicodeWarning,
6310 (op == Py_EQ) ?
6311 "Unicode equal comparison "
6312 "failed to convert both arguments to Unicode - "
6313 "interpreting them as being unequal" :
6314 "Unicode unequal comparison "
6315 "failed to convert both arguments to Unicode - "
6316 "interpreting them as being unequal"
6317 ) < 0)
6318 return NULL;
6319 result = (op == Py_NE);
6320 return PyBool_FromLong(result);
6321 }
6322
6323 int PyUnicode_Contains(PyObject *container,
6324 PyObject *element)
6325 {
6326 PyObject *str, *sub;
6327 int result;
6328
6329 /* Coerce the two arguments */
6330 sub = PyUnicode_FromObject(element);
6331 if (!sub) {
6332 return -1;
6333 }
6334
6335 str = PyUnicode_FromObject(container);
6336 if (!str) {
6337 Py_DECREF(sub);
6338 return -1;
6339 }
6340
6341 result = stringlib_contains_obj(str, sub);
6342
6343 Py_DECREF(str);
6344 Py_DECREF(sub);
6345
6346 return result;
6347 }
6348
6349 /* Concat to string or Unicode object giving a new Unicode object. */
6350
6351 PyObject *PyUnicode_Concat(PyObject *left,
6352 PyObject *right)
6353 {
6354 PyUnicodeObject *u = NULL, *v = NULL, *w;
6355
6356 /* Coerce the two arguments */
6357 u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6358 if (u == NULL)
6359 goto onError;
6360 v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6361 if (v == NULL)
6362 goto onError;
6363
6364 /* Shortcuts */
6365 if (v == unicode_empty) {
6366 Py_DECREF(v);
6367 return (PyObject *)u;
6368 }
6369 if (u == unicode_empty) {
6370 Py_DECREF(u);
6371 return (PyObject *)v;
6372 }
6373
6374 /* Concat the two Unicode strings */
6375 w = _PyUnicode_New(u->length + v->length);
6376 if (w == NULL)
6377 goto onError;
6378 Py_UNICODE_COPY(w->str, u->str, u->length);
6379 Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6380
6381 Py_DECREF(u);
6382 Py_DECREF(v);
6383 return (PyObject *)w;
6384
6385 onError:
6386 Py_XDECREF(u);
6387 Py_XDECREF(v);
6388 return NULL;
6389 }
6390
6391 PyDoc_STRVAR(count__doc__,
6392 "S.count(sub[, start[, end]]) -> int\n\
6393 \n\
6394 Return the number of non-overlapping occurrences of substring sub in\n\
6395 Unicode string S[start:end]. Optional arguments start and end are\n\
6396 interpreted as in slice notation.");
6397
6398 static PyObject *
6399 unicode_count(PyUnicodeObject *self, PyObject *args)
6400 {
6401 PyUnicodeObject *substring;
6402 Py_ssize_t start = 0;
6403 Py_ssize_t end = PY_SSIZE_T_MAX;
6404 PyObject *result;
6405
6406 if (!stringlib_parse_args_finds_unicode("count", args, &substring,
6407 &start, &end))
6408 return NULL;
6409
6410 ADJUST_INDICES(start, end, self->length);
6411 result = PyInt_FromSsize_t(
6412 stringlib_count(self->str + start, end - start,
6413 substring->str, substring->length,
6414 PY_SSIZE_T_MAX)
6415 );
6416
6417 Py_DECREF(substring);
6418
6419 return result;
6420 }
6421
6422 PyDoc_STRVAR(encode__doc__,
6423 "S.encode([encoding[,errors]]) -> string or unicode\n\
6424 \n\
6425 Encodes S using the codec registered for encoding. encoding defaults\n\
6426 to the default encoding. errors may be given to set a different error\n\
6427 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6428 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6429 'xmlcharrefreplace' as well as any other name registered with\n\
6430 codecs.register_error that can handle UnicodeEncodeErrors.");
6431
6432 static PyObject *
6433 unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
6434 {
6435 static char *kwlist[] = {"encoding", "errors", 0};
6436 char *encoding = NULL;
6437 char *errors = NULL;
6438 PyObject *v;
6439
6440 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
6441 kwlist, &encoding, &errors))
6442 return NULL;
6443 v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
6444 if (v == NULL)
6445 goto onError;
6446 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6447 PyErr_Format(PyExc_TypeError,
6448 "encoder did not return a string/unicode object "
6449 "(type=%.400s)",
6450 Py_TYPE(v)->tp_name);
6451 Py_DECREF(v);
6452 return NULL;
6453 }
6454 return v;
6455
6456 onError:
6457 return NULL;
6458 }
6459
6460 PyDoc_STRVAR(decode__doc__,
6461 "S.decode([encoding[,errors]]) -> string or unicode\n\
6462 \n\
6463 Decodes S using the codec registered for encoding. encoding defaults\n\
6464 to the default encoding. errors may be given to set a different error\n\
6465 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6466 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6467 as well as any other name registered with codecs.register_error that is\n\
6468 able to handle UnicodeDecodeErrors.");
6469
6470 static PyObject *
6471 unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
6472 {
6473 static char *kwlist[] = {"encoding", "errors", 0};
6474 char *encoding = NULL;
6475 char *errors = NULL;
6476 PyObject *v;
6477
6478 if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
6479 kwlist, &encoding, &errors))
6480 return NULL;
6481 v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
6482 if (v == NULL)
6483 goto onError;
6484 if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6485 PyErr_Format(PyExc_TypeError,
6486 "decoder did not return a string/unicode object "
6487 "(type=%.400s)",
6488 Py_TYPE(v)->tp_name);
6489 Py_DECREF(v);
6490 return NULL;
6491 }
6492 return v;
6493
6494 onError:
6495 return NULL;
6496 }
6497
6498 PyDoc_STRVAR(expandtabs__doc__,
6499 "S.expandtabs([tabsize]) -> unicode\n\
6500 \n\
6501 Return a copy of S where all tab characters are expanded using spaces.\n\
6502 If tabsize is not given, a tab size of 8 characters is assumed.");
6503
6504 static PyObject*
6505 unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6506 {
6507 Py_UNICODE *e;
6508 Py_UNICODE *p;
6509 Py_UNICODE *q;
6510 Py_UNICODE *qe;
6511 Py_ssize_t i, j, incr;
6512 PyUnicodeObject *u;
6513 int tabsize = 8;
6514
6515 if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6516 return NULL;
6517
6518 /* First pass: determine size of output string */
6519 i = 0; /* chars up to and including most recent \n or \r */
6520 j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6521 e = self->str + self->length; /* end of input */
6522 for (p = self->str; p < e; p++)
6523 if (*p == '\t') {
6524 if (tabsize > 0) {
6525 incr = tabsize - (j % tabsize); /* cannot overflow */
6526 if (j > PY_SSIZE_T_MAX - incr)
6527 goto overflow1;
6528 j += incr;
6529 }
6530 }
6531 else {
6532 if (j > PY_SSIZE_T_MAX - 1)
6533 goto overflow1;
6534 j++;
6535 if (*p == '\n' || *p == '\r') {
6536 if (i > PY_SSIZE_T_MAX - j)
6537 goto overflow1;
6538 i += j;
6539 j = 0;
6540 }
6541 }
6542
6543 if (i > PY_SSIZE_T_MAX - j)
6544 goto overflow1;
6545
6546 /* Second pass: create output string and fill it */
6547 u = _PyUnicode_New(i + j);
6548 if (!u)
6549 return NULL;
6550
6551 j = 0; /* same as in first pass */
6552 q = u->str; /* next output char */
6553 qe = u->str + u->length; /* end of output */
6554
6555 for (p = self->str; p < e; p++)
6556 if (*p == '\t') {
6557 if (tabsize > 0) {
6558 i = tabsize - (j % tabsize);
6559 j += i;
6560 while (i--) {
6561 if (q >= qe)
6562 goto overflow2;
6563 *q++ = ' ';
6564 }
6565 }
6566 }
6567 else {
6568 if (q >= qe)
6569 goto overflow2;
6570 *q++ = *p;
6571 j++;
6572 if (*p == '\n' || *p == '\r')
6573 j = 0;
6574 }
6575
6576 return (PyObject*) u;
6577
6578 overflow2:
6579 Py_DECREF(u);
6580 overflow1:
6581 PyErr_SetString(PyExc_OverflowError, "new string is too long");
6582 return NULL;
6583 }
6584
6585 PyDoc_STRVAR(find__doc__,
6586 "S.find(sub [,start [,end]]) -> int\n\
6587 \n\
6588 Return the lowest index in S where substring sub is found,\n\
6589 such that sub is contained within S[start:end]. Optional\n\
6590 arguments start and end are interpreted as in slice notation.\n\
6591 \n\
6592 Return -1 on failure.");
6593
6594 static PyObject *
6595 unicode_find(PyUnicodeObject *self, PyObject *args)
6596 {
6597 PyUnicodeObject *substring;
6598 Py_ssize_t start;
6599 Py_ssize_t end;
6600 Py_ssize_t result;
6601
6602 if (!stringlib_parse_args_finds_unicode("find", args, &substring,
6603 &start, &end))
6604 return NULL;
6605
6606 result = stringlib_find_slice(
6607 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6608 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6609 start, end
6610 );
6611
6612 Py_DECREF(substring);
6613
6614 return PyInt_FromSsize_t(result);
6615 }
6616
6617 static PyObject *
6618 unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
6619 {
6620 if (index < 0 || index >= self->length) {
6621 PyErr_SetString(PyExc_IndexError, "string index out of range");
6622 return NULL;
6623 }
6624
6625 return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6626 }
6627
6628 static long
6629 unicode_hash(PyUnicodeObject *self)
6630 {
6631 /* Since Unicode objects compare equal to their ASCII string
6632 counterparts, they should use the individual character values
6633 as basis for their hash value. This is needed to assure that
6634 strings and Unicode objects behave in the same way as
6635 dictionary keys. */
6636
6637 register Py_ssize_t len;
6638 register Py_UNICODE *p;
6639 register long x;
6640
6641 #ifdef Py_DEBUG
6642 assert(_Py_HashSecret_Initialized);
6643 #endif
6644 if (self->hash != -1)
6645 return self->hash;
6646 len = PyUnicode_GET_SIZE(self);
6647 /*
6648 We make the hash of the empty string be 0, rather than using
6649 (prefix ^ suffix), since this slightly obfuscates the hash secret
6650 */
6651 if (len == 0) {
6652 self->hash = 0;
6653 return 0;
6654 }
6655 p = PyUnicode_AS_UNICODE(self);
6656 x = _Py_HashSecret.prefix;
6657 x ^= *p << 7;
6658 while (--len >= 0)
6659 x = (1000003*x) ^ *p++;
6660 x ^= PyUnicode_GET_SIZE(self);
6661 x ^= _Py_HashSecret.suffix;
6662 if (x == -1)
6663 x = -2;
6664 self->hash = x;
6665 return x;
6666 }
6667
6668 PyDoc_STRVAR(index__doc__,
6669 "S.index(sub [,start [,end]]) -> int\n\
6670 \n\
6671 Like S.find() but raise ValueError when the substring is not found.");
6672
6673 static PyObject *
6674 unicode_index(PyUnicodeObject *self, PyObject *args)
6675 {
6676 Py_ssize_t result;
6677 PyUnicodeObject *substring;
6678 Py_ssize_t start;
6679 Py_ssize_t end;
6680
6681 if (!stringlib_parse_args_finds_unicode("index", args, &substring,
6682 &start, &end))
6683 return NULL;
6684
6685 result = stringlib_find_slice(
6686 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6687 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6688 start, end
6689 );
6690
6691 Py_DECREF(substring);
6692
6693 if (result < 0) {
6694 PyErr_SetString(PyExc_ValueError, "substring not found");
6695 return NULL;
6696 }
6697
6698 return PyInt_FromSsize_t(result);
6699 }
6700
6701 PyDoc_STRVAR(islower__doc__,
6702 "S.islower() -> bool\n\
6703 \n\
6704 Return True if all cased characters in S are lowercase and there is\n\
6705 at least one cased character in S, False otherwise.");
6706
6707 static PyObject*
6708 unicode_islower(PyUnicodeObject *self)
6709 {
6710 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6711 register const Py_UNICODE *e;
6712 int cased;
6713
6714 /* Shortcut for single character strings */
6715 if (PyUnicode_GET_SIZE(self) == 1)
6716 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
6717
6718 /* Special case for empty strings */
6719 if (PyUnicode_GET_SIZE(self) == 0)
6720 return PyBool_FromLong(0);
6721
6722 e = p + PyUnicode_GET_SIZE(self);
6723 cased = 0;
6724 for (; p < e; p++) {
6725 register const Py_UNICODE ch = *p;
6726
6727 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6728 return PyBool_FromLong(0);
6729 else if (!cased && Py_UNICODE_ISLOWER(ch))
6730 cased = 1;
6731 }
6732 return PyBool_FromLong(cased);
6733 }
6734
6735 PyDoc_STRVAR(isupper__doc__,
6736 "S.isupper() -> bool\n\
6737 \n\
6738 Return True if all cased characters in S are uppercase and there is\n\
6739 at least one cased character in S, False otherwise.");
6740
6741 static PyObject*
6742 unicode_isupper(PyUnicodeObject *self)
6743 {
6744 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6745 register const Py_UNICODE *e;
6746 int cased;
6747
6748 /* Shortcut for single character strings */
6749 if (PyUnicode_GET_SIZE(self) == 1)
6750 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
6751
6752 /* Special case for empty strings */
6753 if (PyUnicode_GET_SIZE(self) == 0)
6754 return PyBool_FromLong(0);
6755
6756 e = p + PyUnicode_GET_SIZE(self);
6757 cased = 0;
6758 for (; p < e; p++) {
6759 register const Py_UNICODE ch = *p;
6760
6761 if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6762 return PyBool_FromLong(0);
6763 else if (!cased && Py_UNICODE_ISUPPER(ch))
6764 cased = 1;
6765 }
6766 return PyBool_FromLong(cased);
6767 }
6768
6769 PyDoc_STRVAR(istitle__doc__,
6770 "S.istitle() -> bool\n\
6771 \n\
6772 Return True if S is a titlecased string and there is at least one\n\
6773 character in S, i.e. upper- and titlecase characters may only\n\
6774 follow uncased characters and lowercase characters only cased ones.\n\
6775 Return False otherwise.");
6776
6777 static PyObject*
6778 unicode_istitle(PyUnicodeObject *self)
6779 {
6780 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6781 register const Py_UNICODE *e;
6782 int cased, previous_is_cased;
6783
6784 /* Shortcut for single character strings */
6785 if (PyUnicode_GET_SIZE(self) == 1)
6786 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6787 (Py_UNICODE_ISUPPER(*p) != 0));
6788
6789 /* Special case for empty strings */
6790 if (PyUnicode_GET_SIZE(self) == 0)
6791 return PyBool_FromLong(0);
6792
6793 e = p + PyUnicode_GET_SIZE(self);
6794 cased = 0;
6795 previous_is_cased = 0;
6796 for (; p < e; p++) {
6797 register const Py_UNICODE ch = *p;
6798
6799 if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6800 if (previous_is_cased)
6801 return PyBool_FromLong(0);
6802 previous_is_cased = 1;
6803 cased = 1;
6804 }
6805 else if (Py_UNICODE_ISLOWER(ch)) {
6806 if (!previous_is_cased)
6807 return PyBool_FromLong(0);
6808 previous_is_cased = 1;
6809 cased = 1;
6810 }
6811 else
6812 previous_is_cased = 0;
6813 }
6814 return PyBool_FromLong(cased);
6815 }
6816
6817 PyDoc_STRVAR(isspace__doc__,
6818 "S.isspace() -> bool\n\
6819 \n\
6820 Return True if all characters in S are whitespace\n\
6821 and there is at least one character in S, False otherwise.");
6822
6823 static PyObject*
6824 unicode_isspace(PyUnicodeObject *self)
6825 {
6826 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6827 register const Py_UNICODE *e;
6828
6829 /* Shortcut for single character strings */
6830 if (PyUnicode_GET_SIZE(self) == 1 &&
6831 Py_UNICODE_ISSPACE(*p))
6832 return PyBool_FromLong(1);
6833
6834 /* Special case for empty strings */
6835 if (PyUnicode_GET_SIZE(self) == 0)
6836 return PyBool_FromLong(0);
6837
6838 e = p + PyUnicode_GET_SIZE(self);
6839 for (; p < e; p++) {
6840 if (!Py_UNICODE_ISSPACE(*p))
6841 return PyBool_FromLong(0);
6842 }
6843 return PyBool_FromLong(1);
6844 }
6845
6846 PyDoc_STRVAR(isalpha__doc__,
6847 "S.isalpha() -> bool\n\
6848 \n\
6849 Return True if all characters in S are alphabetic\n\
6850 and there is at least one character in S, False otherwise.");
6851
6852 static PyObject*
6853 unicode_isalpha(PyUnicodeObject *self)
6854 {
6855 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6856 register const Py_UNICODE *e;
6857
6858 /* Shortcut for single character strings */
6859 if (PyUnicode_GET_SIZE(self) == 1 &&
6860 Py_UNICODE_ISALPHA(*p))
6861 return PyBool_FromLong(1);
6862
6863 /* Special case for empty strings */
6864 if (PyUnicode_GET_SIZE(self) == 0)
6865 return PyBool_FromLong(0);
6866
6867 e = p + PyUnicode_GET_SIZE(self);
6868 for (; p < e; p++) {
6869 if (!Py_UNICODE_ISALPHA(*p))
6870 return PyBool_FromLong(0);
6871 }
6872 return PyBool_FromLong(1);
6873 }
6874
6875 PyDoc_STRVAR(isalnum__doc__,
6876 "S.isalnum() -> bool\n\
6877 \n\
6878 Return True if all characters in S are alphanumeric\n\
6879 and there is at least one character in S, False otherwise.");
6880
6881 static PyObject*
6882 unicode_isalnum(PyUnicodeObject *self)
6883 {
6884 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6885 register const Py_UNICODE *e;
6886
6887 /* Shortcut for single character strings */
6888 if (PyUnicode_GET_SIZE(self) == 1 &&
6889 Py_UNICODE_ISALNUM(*p))
6890 return PyBool_FromLong(1);
6891
6892 /* Special case for empty strings */
6893 if (PyUnicode_GET_SIZE(self) == 0)
6894 return PyBool_FromLong(0);
6895
6896 e = p + PyUnicode_GET_SIZE(self);
6897 for (; p < e; p++) {
6898 if (!Py_UNICODE_ISALNUM(*p))
6899 return PyBool_FromLong(0);
6900 }
6901 return PyBool_FromLong(1);
6902 }
6903
6904 PyDoc_STRVAR(isdecimal__doc__,
6905 "S.isdecimal() -> bool\n\
6906 \n\
6907 Return True if there are only decimal characters in S,\n\
6908 False otherwise.");
6909
6910 static PyObject*
6911 unicode_isdecimal(PyUnicodeObject *self)
6912 {
6913 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6914 register const Py_UNICODE *e;
6915
6916 /* Shortcut for single character strings */
6917 if (PyUnicode_GET_SIZE(self) == 1 &&
6918 Py_UNICODE_ISDECIMAL(*p))
6919 return PyBool_FromLong(1);
6920
6921 /* Special case for empty strings */
6922 if (PyUnicode_GET_SIZE(self) == 0)
6923 return PyBool_FromLong(0);
6924
6925 e = p + PyUnicode_GET_SIZE(self);
6926 for (; p < e; p++) {
6927 if (!Py_UNICODE_ISDECIMAL(*p))
6928 return PyBool_FromLong(0);
6929 }
6930 return PyBool_FromLong(1);
6931 }
6932
6933 PyDoc_STRVAR(isdigit__doc__,
6934 "S.isdigit() -> bool\n\
6935 \n\
6936 Return True if all characters in S are digits\n\
6937 and there is at least one character in S, False otherwise.");
6938
6939 static PyObject*
6940 unicode_isdigit(PyUnicodeObject *self)
6941 {
6942 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6943 register const Py_UNICODE *e;
6944
6945 /* Shortcut for single character strings */
6946 if (PyUnicode_GET_SIZE(self) == 1 &&
6947 Py_UNICODE_ISDIGIT(*p))
6948 return PyBool_FromLong(1);
6949
6950 /* Special case for empty strings */
6951 if (PyUnicode_GET_SIZE(self) == 0)
6952 return PyBool_FromLong(0);
6953
6954 e = p + PyUnicode_GET_SIZE(self);
6955 for (; p < e; p++) {
6956 if (!Py_UNICODE_ISDIGIT(*p))
6957 return PyBool_FromLong(0);
6958 }
6959 return PyBool_FromLong(1);
6960 }
6961
6962 PyDoc_STRVAR(isnumeric__doc__,
6963 "S.isnumeric() -> bool\n\
6964 \n\
6965 Return True if there are only numeric characters in S,\n\
6966 False otherwise.");
6967
6968 static PyObject*
6969 unicode_isnumeric(PyUnicodeObject *self)
6970 {
6971 register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6972 register const Py_UNICODE *e;
6973
6974 /* Shortcut for single character strings */
6975 if (PyUnicode_GET_SIZE(self) == 1 &&
6976 Py_UNICODE_ISNUMERIC(*p))
6977 return PyBool_FromLong(1);
6978
6979 /* Special case for empty strings */
6980 if (PyUnicode_GET_SIZE(self) == 0)
6981 return PyBool_FromLong(0);
6982
6983 e = p + PyUnicode_GET_SIZE(self);
6984 for (; p < e; p++) {
6985 if (!Py_UNICODE_ISNUMERIC(*p))
6986 return PyBool_FromLong(0);
6987 }
6988 return PyBool_FromLong(1);
6989 }
6990
6991 PyDoc_STRVAR(join__doc__,
6992 "S.join(iterable) -> unicode\n\
6993 \n\
6994 Return a string which is the concatenation of the strings in the\n\
6995 iterable. The separator between elements is S.");
6996
6997 static PyObject*
6998 unicode_join(PyObject *self, PyObject *data)
6999 {
7000 return PyUnicode_Join(self, data);
7001 }
7002
7003 static Py_ssize_t
7004 unicode_length(PyUnicodeObject *self)
7005 {
7006 return self->length;
7007 }
7008
7009 PyDoc_STRVAR(ljust__doc__,
7010 "S.ljust(width[, fillchar]) -> int\n\
7011 \n\
7012 Return S left-justified in a Unicode string of length width. Padding is\n\
7013 done using the specified fill character (default is a space).");
7014
7015 static PyObject *
7016 unicode_ljust(PyUnicodeObject *self, PyObject *args)
7017 {
7018 Py_ssize_t width;
7019 Py_UNICODE fillchar = ' ';
7020
7021 if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
7022 return NULL;
7023
7024 if (self->length >= width && PyUnicode_CheckExact(self)) {
7025 Py_INCREF(self);
7026 return (PyObject*) self;
7027 }
7028
7029 return (PyObject*) pad(self, 0, width - self->length, fillchar);
7030 }
7031
7032 PyDoc_STRVAR(lower__doc__,
7033 "S.lower() -> unicode\n\
7034 \n\
7035 Return a copy of the string S converted to lowercase.");
7036
7037 static PyObject*
7038 unicode_lower(PyUnicodeObject *self)
7039 {
7040 return fixup(self, fixlower);
7041 }
7042
/* Selectors telling the strip helpers which end(s) of the string to trim. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the selectors above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for selector i: the format string past its 3-character
   "|O:" prefix. */
#define STRIPNAME(i) (stripformat[i]+3)
7051
7052 /* externally visible for str.strip(unicode) */
7053 PyObject *
7054 _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7055 {
7056 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7057 Py_ssize_t len = PyUnicode_GET_SIZE(self);
7058 Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7059 Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7060 Py_ssize_t i, j;
7061
7062 BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7063
7064 i = 0;
7065 if (striptype != RIGHTSTRIP) {
7066 while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7067 i++;
7068 }
7069 }
7070
7071 j = len;
7072 if (striptype != LEFTSTRIP) {
7073 do {
7074 j--;
7075 } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7076 j++;
7077 }
7078
7079 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7080 Py_INCREF(self);
7081 return (PyObject*)self;
7082 }
7083 else
7084 return PyUnicode_FromUnicode(s+i, j-i);
7085 }
7086
7087
7088 static PyObject *
7089 do_strip(PyUnicodeObject *self, int striptype)
7090 {
7091 Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7092 Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
7093
7094 i = 0;
7095 if (striptype != RIGHTSTRIP) {
7096 while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7097 i++;
7098 }
7099 }
7100
7101 j = len;
7102 if (striptype != LEFTSTRIP) {
7103 do {
7104 j--;
7105 } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7106 j++;
7107 }
7108
7109 if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7110 Py_INCREF(self);
7111 return (PyObject*)self;
7112 }
7113 else
7114 return PyUnicode_FromUnicode(s+i, j-i);
7115 }
7116
7117
7118 static PyObject *
7119 do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7120 {
7121 PyObject *sep = NULL;
7122
7123 if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7124 return NULL;
7125
7126 if (sep != NULL && sep != Py_None) {
7127 if (PyUnicode_Check(sep))
7128 return _PyUnicode_XStrip(self, striptype, sep);
7129 else if (PyString_Check(sep)) {
7130 PyObject *res;
7131 sep = PyUnicode_FromObject(sep);
7132 if (sep==NULL)
7133 return NULL;
7134 res = _PyUnicode_XStrip(self, striptype, sep);
7135 Py_DECREF(sep);
7136 return res;
7137 }
7138 else {
7139 PyErr_Format(PyExc_TypeError,
7140 "%s arg must be None, unicode or str",
7141 STRIPNAME(striptype));
7142 return NULL;
7143 }
7144 }
7145
7146 return do_strip(self, striptype);
7147 }
7148
7149
7150 PyDoc_STRVAR(strip__doc__,
7151 "S.strip([chars]) -> unicode\n\
7152 \n\
7153 Return a copy of the string S with leading and trailing\n\
7154 whitespace removed.\n\
7155 If chars is given and not None, remove characters in chars instead.\n\
7156 If chars is a str, it will be converted to unicode before stripping");
7157
7158 static PyObject *
7159 unicode_strip(PyUnicodeObject *self, PyObject *args)
7160 {
7161 if (PyTuple_GET_SIZE(args) == 0)
7162 return do_strip(self, BOTHSTRIP); /* Common case */
7163 else
7164 return do_argstrip(self, BOTHSTRIP, args);
7165 }
7166
7167
7168 PyDoc_STRVAR(lstrip__doc__,
7169 "S.lstrip([chars]) -> unicode\n\
7170 \n\
7171 Return a copy of the string S with leading whitespace removed.\n\
7172 If chars is given and not None, remove characters in chars instead.\n\
7173 If chars is a str, it will be converted to unicode before stripping");
7174
7175 static PyObject *
7176 unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7177 {
7178 if (PyTuple_GET_SIZE(args) == 0)
7179 return do_strip(self, LEFTSTRIP); /* Common case */
7180 else
7181 return do_argstrip(self, LEFTSTRIP, args);
7182 }
7183
7184
7185 PyDoc_STRVAR(rstrip__doc__,
7186 "S.rstrip([chars]) -> unicode\n\
7187 \n\
7188 Return a copy of the string S with trailing whitespace removed.\n\
7189 If chars is given and not None, remove characters in chars instead.\n\
7190 If chars is a str, it will be converted to unicode before stripping");
7191
7192 static PyObject *
7193 unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7194 {
7195 if (PyTuple_GET_SIZE(args) == 0)
7196 return do_strip(self, RIGHTSTRIP); /* Common case */
7197 else
7198 return do_argstrip(self, RIGHTSTRIP, args);
7199 }
7200
7201
7202 static PyObject*
7203 unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
7204 {
7205 PyUnicodeObject *u;
7206 Py_UNICODE *p;
7207 Py_ssize_t nchars;
7208 size_t nbytes;
7209
7210 if (len < 0)
7211 len = 0;
7212
7213 if (len == 1 && PyUnicode_CheckExact(str)) {
7214 /* no repeat, return original string */
7215 Py_INCREF(str);
7216 return (PyObject*) str;
7217 }
7218
7219 /* ensure # of chars needed doesn't overflow int and # of bytes
7220 * needed doesn't overflow size_t
7221 */
7222 nchars = len * str->length;
7223 if (len && nchars / len != str->length) {
7224 PyErr_SetString(PyExc_OverflowError,
7225 "repeated string is too long");
7226 return NULL;
7227 }
7228 nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7229 if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7230 PyErr_SetString(PyExc_OverflowError,
7231 "repeated string is too long");
7232 return NULL;
7233 }
7234 u = _PyUnicode_New(nchars);
7235 if (!u)
7236 return NULL;
7237
7238 p = u->str;
7239
7240 if (str->length == 1 && len > 0) {
7241 Py_UNICODE_FILL(p, str->str[0], len);
7242 } else {
7243 Py_ssize_t done = 0; /* number of characters copied this far */
7244 if (done < nchars) {
7245 Py_UNICODE_COPY(p, str->str, str->length);
7246 done = str->length;
7247 }
7248 while (done < nchars) {
7249 Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
7250 Py_UNICODE_COPY(p+done, p, n);
7251 done += n;
7252 }
7253 }
7254
7255 return (PyObject*) u;
7256 }
7257
7258 PyObject *PyUnicode_Replace(PyObject *obj,
7259 PyObject *subobj,
7260 PyObject *replobj,
7261 Py_ssize_t maxcount)
7262 {
7263 PyObject *self;
7264 PyObject *str1;
7265 PyObject *str2;
7266 PyObject *result;
7267
7268 self = PyUnicode_FromObject(obj);
7269 if (self == NULL)
7270 return NULL;
7271 str1 = PyUnicode_FromObject(subobj);
7272 if (str1 == NULL) {
7273 Py_DECREF(self);
7274 return NULL;
7275 }
7276 str2 = PyUnicode_FromObject(replobj);
7277 if (str2 == NULL) {
7278 Py_DECREF(self);
7279 Py_DECREF(str1);
7280 return NULL;
7281 }
7282 result = replace((PyUnicodeObject *)self,
7283 (PyUnicodeObject *)str1,
7284 (PyUnicodeObject *)str2,
7285 maxcount);
7286 Py_DECREF(self);
7287 Py_DECREF(str1);
7288 Py_DECREF(str2);
7289 return result;
7290 }
7291
7292 PyDoc_STRVAR(replace__doc__,
7293 "S.replace(old, new[, count]) -> unicode\n\
7294 \n\
7295 Return a copy of S with all occurrences of substring\n\
7296 old replaced by new. If the optional argument count is\n\
7297 given, only the first count occurrences are replaced.");
7298
7299 static PyObject*
7300 unicode_replace(PyUnicodeObject *self, PyObject *args)
7301 {
7302 PyUnicodeObject *str1;
7303 PyUnicodeObject *str2;
7304 Py_ssize_t maxcount = -1;
7305 PyObject *result;
7306
7307 if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
7308 return NULL;
7309 str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7310 if (str1 == NULL)
7311 return NULL;
7312 str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
7313 if (str2 == NULL) {
7314 Py_DECREF(str1);
7315 return NULL;
7316 }
7317
7318 result = replace(self, str1, str2, maxcount);
7319
7320 Py_DECREF(str1);
7321 Py_DECREF(str2);
7322 return result;
7323 }
7324
7325 static
7326 PyObject *unicode_repr(PyObject *unicode)
7327 {
7328 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
7329 PyUnicode_GET_SIZE(unicode),
7330 1);
7331 }
7332
7333 PyDoc_STRVAR(rfind__doc__,
7334 "S.rfind(sub [,start [,end]]) -> int\n\
7335 \n\
7336 Return the highest index in S where substring sub is found,\n\
7337 such that sub is contained within S[start:end]. Optional\n\
7338 arguments start and end are interpreted as in slice notation.\n\
7339 \n\
7340 Return -1 on failure.");
7341
7342 static PyObject *
7343 unicode_rfind(PyUnicodeObject *self, PyObject *args)
7344 {
7345 PyUnicodeObject *substring;
7346 Py_ssize_t start;
7347 Py_ssize_t end;
7348 Py_ssize_t result;
7349
7350 if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
7351 &start, &end))
7352 return NULL;
7353
7354 result = stringlib_rfind_slice(
7355 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7356 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7357 start, end
7358 );
7359
7360 Py_DECREF(substring);
7361
7362 return PyInt_FromSsize_t(result);
7363 }
7364
7365 PyDoc_STRVAR(rindex__doc__,
7366 "S.rindex(sub [,start [,end]]) -> int\n\
7367 \n\
7368 Like S.rfind() but raise ValueError when the substring is not found.");
7369
7370 static PyObject *
7371 unicode_rindex(PyUnicodeObject *self, PyObject *args)
7372 {
7373 PyUnicodeObject *substring;
7374 Py_ssize_t start;
7375 Py_ssize_t end;
7376 Py_ssize_t result;
7377
7378 if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
7379 &start, &end))
7380 return NULL;
7381
7382 result = stringlib_rfind_slice(
7383 PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7384 PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7385 start, end
7386 );
7387
7388 Py_DECREF(substring);
7389
7390 if (result < 0) {
7391 PyErr_SetString(PyExc_ValueError, "substring not found");
7392 return NULL;
7393 }
7394 return PyInt_FromSsize_t(result);
7395 }
7396
7397 PyDoc_STRVAR(rjust__doc__,
7398 "S.rjust(width[, fillchar]) -> unicode\n\
7399 \n\
7400 Return S right-justified in a Unicode string of length width. Padding is\n\
7401 done using the specified fill character (default is a space).");
7402
7403 static PyObject *
7404 unicode_rjust(PyUnicodeObject *self, PyObject *args)
7405 {
7406 Py_ssize_t width;
7407 Py_UNICODE fillchar = ' ';
7408
7409 if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
7410 return NULL;
7411
7412 if (self->length >= width && PyUnicode_CheckExact(self)) {
7413 Py_INCREF(self);
7414 return (PyObject*) self;
7415 }
7416
7417 return (PyObject*) pad(self, width - self->length, 0, fillchar);
7418 }
7419
7420 static PyObject*
7421 unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
7422 {
7423 /* standard clamping */
7424 if (start < 0)
7425 start = 0;
7426 if (end < 0)
7427 end = 0;
7428 if (end > self->length)
7429 end = self->length;
7430 if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
7431 /* full slice, return original string */
7432 Py_INCREF(self);
7433 return (PyObject*) self;
7434 }
7435 if (start > end)
7436 start = end;
7437 /* copy slice */
7438 return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7439 end - start);
7440 }
7441
7442 PyObject *PyUnicode_Split(PyObject *s,
7443 PyObject *sep,
7444 Py_ssize_t maxsplit)
7445 {
7446 PyObject *result;
7447
7448 s = PyUnicode_FromObject(s);
7449 if (s == NULL)
7450 return NULL;
7451 if (sep != NULL) {
7452 sep = PyUnicode_FromObject(sep);
7453 if (sep == NULL) {
7454 Py_DECREF(s);
7455 return NULL;
7456 }
7457 }
7458
7459 result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7460
7461 Py_DECREF(s);
7462 Py_XDECREF(sep);
7463 return result;
7464 }
7465
7466 PyDoc_STRVAR(split__doc__,
7467 "S.split([sep [,maxsplit]]) -> list of strings\n\
7468 \n\
7469 Return a list of the words in S, using sep as the\n\
7470 delimiter string. If maxsplit is given, at most maxsplit\n\
7471 splits are done. If sep is not specified or is None, any\n\
7472 whitespace string is a separator and empty strings are\n\
7473 removed from the result.");
7474
7475 static PyObject*
7476 unicode_split(PyUnicodeObject *self, PyObject *args)
7477 {
7478 PyObject *substring = Py_None;
7479 Py_ssize_t maxcount = -1;
7480
7481 if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
7482 return NULL;
7483
7484 if (substring == Py_None)
7485 return split(self, NULL, maxcount);
7486 else if (PyUnicode_Check(substring))
7487 return split(self, (PyUnicodeObject *)substring, maxcount);
7488 else
7489 return PyUnicode_Split((PyObject *)self, substring, maxcount);
7490 }
7491
7492 PyObject *
7493 PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7494 {
7495 PyObject* str_obj;
7496 PyObject* sep_obj;
7497 PyObject* out;
7498
7499 str_obj = PyUnicode_FromObject(str_in);
7500 if (!str_obj)
7501 return NULL;
7502 sep_obj = PyUnicode_FromObject(sep_in);
7503 if (!sep_obj) {
7504 Py_DECREF(str_obj);
7505 return NULL;
7506 }
7507
7508 out = stringlib_partition(
7509 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7510 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7511 );
7512
7513 Py_DECREF(sep_obj);
7514 Py_DECREF(str_obj);
7515
7516 return out;
7517 }
7518
7519
7520 PyObject *
7521 PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7522 {
7523 PyObject* str_obj;
7524 PyObject* sep_obj;
7525 PyObject* out;
7526
7527 str_obj = PyUnicode_FromObject(str_in);
7528 if (!str_obj)
7529 return NULL;
7530 sep_obj = PyUnicode_FromObject(sep_in);
7531 if (!sep_obj) {
7532 Py_DECREF(str_obj);
7533 return NULL;
7534 }
7535
7536 out = stringlib_rpartition(
7537 str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7538 sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7539 );
7540
7541 Py_DECREF(sep_obj);
7542 Py_DECREF(str_obj);
7543
7544 return out;
7545 }
7546
7547 PyDoc_STRVAR(partition__doc__,
7548 "S.partition(sep) -> (head, sep, tail)\n\
7549 \n\
7550 Search for the separator sep in S, and return the part before it,\n\
7551 the separator itself, and the part after it. If the separator is not\n\
7552 found, return S and two empty strings.");
7553
7554 static PyObject*
7555 unicode_partition(PyUnicodeObject *self, PyObject *separator)
7556 {
7557 return PyUnicode_Partition((PyObject *)self, separator);
7558 }
7559
7560 PyDoc_STRVAR(rpartition__doc__,
7561 "S.rpartition(sep) -> (head, sep, tail)\n\
7562 \n\
7563 Search for the separator sep in S, starting at the end of S, and return\n\
7564 the part before it, the separator itself, and the part after it. If the\n\
7565 separator is not found, return two empty strings and S.");
7566
7567 static PyObject*
7568 unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7569 {
7570 return PyUnicode_RPartition((PyObject *)self, separator);
7571 }
7572
7573 PyObject *PyUnicode_RSplit(PyObject *s,
7574 PyObject *sep,
7575 Py_ssize_t maxsplit)
7576 {
7577 PyObject *result;
7578
7579 s = PyUnicode_FromObject(s);
7580 if (s == NULL)
7581 return NULL;
7582 if (sep != NULL) {
7583 sep = PyUnicode_FromObject(sep);
7584 if (sep == NULL) {
7585 Py_DECREF(s);
7586 return NULL;
7587 }
7588 }
7589
7590 result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7591
7592 Py_DECREF(s);
7593 Py_XDECREF(sep);
7594 return result;
7595 }
7596
7597 PyDoc_STRVAR(rsplit__doc__,
7598 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7599 \n\
7600 Return a list of the words in S, using sep as the\n\
7601 delimiter string, starting at the end of the string and\n\
7602 working to the front. If maxsplit is given, at most maxsplit\n\
7603 splits are done. If sep is not specified, any whitespace string\n\
7604 is a separator.");
7605
7606 static PyObject*
7607 unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7608 {
7609 PyObject *substring = Py_None;
7610 Py_ssize_t maxcount = -1;
7611
7612 if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
7613 return NULL;
7614
7615 if (substring == Py_None)
7616 return rsplit(self, NULL, maxcount);
7617 else if (PyUnicode_Check(substring))
7618 return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7619 else
7620 return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7621 }
7622
7623 PyDoc_STRVAR(splitlines__doc__,
7624 "S.splitlines(keepends=False) -> list of strings\n\
7625 \n\
7626 Return a list of the lines in S, breaking at line boundaries.\n\
7627 Line breaks are not included in the resulting list unless keepends\n\
7628 is given and true.");
7629
7630 static PyObject*
7631 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7632 {
7633 int keepends = 0;
7634
7635 if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
7636 return NULL;
7637
7638 return PyUnicode_Splitlines((PyObject *)self, keepends);
7639 }
7640
7641 static
7642 PyObject *unicode_str(PyUnicodeObject *self)
7643 {
7644 return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
7645 }
7646
7647 PyDoc_STRVAR(swapcase__doc__,
7648 "S.swapcase() -> unicode\n\
7649 \n\
7650 Return a copy of S with uppercase characters converted to lowercase\n\
7651 and vice versa.");
7652
7653 static PyObject*
7654 unicode_swapcase(PyUnicodeObject *self)
7655 {
7656 return fixup(self, fixswapcase);
7657 }
7658
7659 PyDoc_STRVAR(translate__doc__,
7660 "S.translate(table) -> unicode\n\
7661 \n\
7662 Return a copy of the string S, where all characters have been mapped\n\
7663 through the given translation table, which must be a mapping of\n\
7664 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7665 Unmapped characters are left untouched. Characters mapped to None\n\
7666 are deleted.");
7667
7668 static PyObject*
7669 unicode_translate(PyUnicodeObject *self, PyObject *table)
7670 {
7671 return PyUnicode_TranslateCharmap(self->str,
7672 self->length,
7673 table,
7674 "ignore");
7675 }
7676
7677 PyDoc_STRVAR(upper__doc__,
7678 "S.upper() -> unicode\n\
7679 \n\
7680 Return a copy of S converted to uppercase.");
7681
7682 static PyObject*
7683 unicode_upper(PyUnicodeObject *self)
7684 {
7685 return fixup(self, fixupper);
7686 }
7687
7688 PyDoc_STRVAR(zfill__doc__,
7689 "S.zfill(width) -> unicode\n\
7690 \n\
7691 Pad a numeric string S with zeros on the left, to fill a field\n\
7692 of the specified width. The string S is never truncated.");
7693
7694 static PyObject *
7695 unicode_zfill(PyUnicodeObject *self, PyObject *args)
7696 {
7697 Py_ssize_t fill;
7698 PyUnicodeObject *u;
7699
7700 Py_ssize_t width;
7701 if (!PyArg_ParseTuple(args, "n:zfill", &width))
7702 return NULL;
7703
7704 if (self->length >= width) {
7705 if (PyUnicode_CheckExact(self)) {
7706 Py_INCREF(self);
7707 return (PyObject*) self;
7708 }
7709 else
7710 return PyUnicode_FromUnicode(
7711 PyUnicode_AS_UNICODE(self),
7712 PyUnicode_GET_SIZE(self)
7713 );
7714 }
7715
7716 fill = width - self->length;
7717
7718 u = pad(self, fill, 0, '0');
7719
7720 if (u == NULL)
7721 return NULL;
7722
7723 if (u->str[fill] == '+' || u->str[fill] == '-') {
7724 /* move sign to beginning of string */
7725 u->str[0] = u->str[fill];
7726 u->str[fill] = '0';
7727 }
7728
7729 return (PyObject*) u;
7730 }
7731
#if 0
/* Compiled out.  Would report the value of numfree as a Python int. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyInt_FromLong(count);
}
#endif
7739
7740 PyDoc_STRVAR(startswith__doc__,
7741 "S.startswith(prefix[, start[, end]]) -> bool\n\
7742 \n\
7743 Return True if S starts with the specified prefix, False otherwise.\n\
7744 With optional start, test S beginning at that position.\n\
7745 With optional end, stop comparing S at that position.\n\
7746 prefix can also be a tuple of strings to try.");
7747
7748 static PyObject *
7749 unicode_startswith(PyUnicodeObject *self,
7750 PyObject *args)
7751 {
7752 PyObject *subobj;
7753 PyUnicodeObject *substring;
7754 Py_ssize_t start = 0;
7755 Py_ssize_t end = PY_SSIZE_T_MAX;
7756 int result;
7757
7758 if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
7759 return NULL;
7760 if (PyTuple_Check(subobj)) {
7761 Py_ssize_t i;
7762 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7763 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7764 PyTuple_GET_ITEM(subobj, i));
7765 if (substring == NULL)
7766 return NULL;
7767 result = tailmatch(self, substring, start, end, -1);
7768 Py_DECREF(substring);
7769 if (result) {
7770 Py_RETURN_TRUE;
7771 }
7772 }
7773 /* nothing matched */
7774 Py_RETURN_FALSE;
7775 }
7776 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7777 if (substring == NULL) {
7778 if (PyErr_ExceptionMatches(PyExc_TypeError))
7779 PyErr_Format(PyExc_TypeError, "startswith first arg must be str, "
7780 "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
7781 return NULL;
7782 }
7783 result = tailmatch(self, substring, start, end, -1);
7784 Py_DECREF(substring);
7785 return PyBool_FromLong(result);
7786 }
7787
7788
7789 PyDoc_STRVAR(endswith__doc__,
7790 "S.endswith(suffix[, start[, end]]) -> bool\n\
7791 \n\
7792 Return True if S ends with the specified suffix, False otherwise.\n\
7793 With optional start, test S beginning at that position.\n\
7794 With optional end, stop comparing S at that position.\n\
7795 suffix can also be a tuple of strings to try.");
7796
7797 static PyObject *
7798 unicode_endswith(PyUnicodeObject *self,
7799 PyObject *args)
7800 {
7801 PyObject *subobj;
7802 PyUnicodeObject *substring;
7803 Py_ssize_t start = 0;
7804 Py_ssize_t end = PY_SSIZE_T_MAX;
7805 int result;
7806
7807 if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
7808 return NULL;
7809 if (PyTuple_Check(subobj)) {
7810 Py_ssize_t i;
7811 for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7812 substring = (PyUnicodeObject *)PyUnicode_FromObject(
7813 PyTuple_GET_ITEM(subobj, i));
7814 if (substring == NULL)
7815 return NULL;
7816 result = tailmatch(self, substring, start, end, +1);
7817 Py_DECREF(substring);
7818 if (result) {
7819 Py_RETURN_TRUE;
7820 }
7821 }
7822 Py_RETURN_FALSE;
7823 }
7824 substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7825 if (substring == NULL) {
7826 if (PyErr_ExceptionMatches(PyExc_TypeError))
7827 PyErr_Format(PyExc_TypeError, "endswith first arg must be str, "
7828 "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
7829 return NULL;
7830 }
7831 result = tailmatch(self, substring, start, end, +1);
7832 Py_DECREF(substring);
7833 return PyBool_FromLong(result);
7834 }
7835
7836
7837 /* Implements do_string_format, which is unicode because of stringlib */
7838 #include "stringlib/string_format.h"
7839
7840 PyDoc_STRVAR(format__doc__,
7841 "S.format(*args, **kwargs) -> unicode\n\
7842 \n\
7843 Return a formatted version of S, using substitutions from args and kwargs.\n\
7844 The substitutions are identified by braces ('{' and '}').");
7845
7846 static PyObject *
7847 unicode__format__(PyObject *self, PyObject *args)
7848 {
7849 PyObject *format_spec;
7850 PyObject *result = NULL;
7851 PyObject *tmp = NULL;
7852
7853 /* If 2.x, convert format_spec to the same type as value */
7854 /* This is to allow things like u''.format('') */
7855 if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7856 goto done;
7857 if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7858 PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
7859 "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
7860 goto done;
7861 }
7862 tmp = PyObject_Unicode(format_spec);
7863 if (tmp == NULL)
7864 goto done;
7865 format_spec = tmp;
7866
7867 result = _PyUnicode_FormatAdvanced(self,
7868 PyUnicode_AS_UNICODE(format_spec),
7869 PyUnicode_GET_SIZE(format_spec));
7870 done:
7871 Py_XDECREF(tmp);
7872 return result;
7873 }
7874
7875 PyDoc_STRVAR(p_format__doc__,
7876 "S.__format__(format_spec) -> unicode\n\
7877 \n\
7878 Return a formatted version of S as described by format_spec.");
7879
7880 static PyObject *
7881 unicode__sizeof__(PyUnicodeObject *v)
7882 {
7883 return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7884 sizeof(Py_UNICODE) * (v->length + 1));
7885 }
7886
7887 PyDoc_STRVAR(sizeof__doc__,
7888 "S.__sizeof__() -> size of S in memory, in bytes\n\
7889 \n\
7890 ");
7891
7892 static PyObject *
7893 unicode_getnewargs(PyUnicodeObject *v)
7894 {
7895 return Py_BuildValue("(u#)", v->str, v->length);
7896 }
7897
7898
7899 static PyMethodDef unicode_methods[] = {
7900 {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
7901 {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7902 {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
7903 {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
7904 {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7905 {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7906 {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7907 {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7908 {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7909 {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7910 {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
7911 {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
7912 {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7913 {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7914 {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
7915 {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
7916 {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
7917 /* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7918 {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7919 {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7920 {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
7921 {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
7922 {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
7923 {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
7924 {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
7925 {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7926 {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7927 {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7928 {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7929 {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7930 {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7931 {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7932 {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7933 {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7934 {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7935 {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7936 {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7937 {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7938 {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
7939 {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
7940 {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
7941 {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
7942 {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
7943 {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
7944 {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
7945 #if 0
7946 {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
7947 #endif
7948
7949 #if 0
7950 /* This one is just used for debugging the implementation. */
7951 {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
7952 #endif
7953
7954 {"__getnewargs__", (PyCFunction)unicode_getnewargs, METH_NOARGS},
7955 {NULL, NULL}
7956 };
7957
7958 static PyObject *
7959 unicode_mod(PyObject *v, PyObject *w)
7960 {
7961 if (!PyUnicode_Check(v)) {
7962 Py_INCREF(Py_NotImplemented);
7963 return Py_NotImplemented;
7964 }
7965 return PyUnicode_Format(v, w);
7966 }
7967
7968 static PyNumberMethods unicode_as_number = {
7969 0, /*nb_add*/
7970 0, /*nb_subtract*/
7971 0, /*nb_multiply*/
7972 0, /*nb_divide*/
7973 unicode_mod, /*nb_remainder*/
7974 };
7975
7976 static PySequenceMethods unicode_as_sequence = {
7977 (lenfunc) unicode_length, /* sq_length */
7978 PyUnicode_Concat, /* sq_concat */
7979 (ssizeargfunc) unicode_repeat, /* sq_repeat */
7980 (ssizeargfunc) unicode_getitem, /* sq_item */
7981 (ssizessizeargfunc) unicode_slice, /* sq_slice */
7982 0, /* sq_ass_item */
7983 0, /* sq_ass_slice */
7984 PyUnicode_Contains, /* sq_contains */
7985 };
7986
7987 static PyObject*
7988 unicode_subscript(PyUnicodeObject* self, PyObject* item)
7989 {
7990 if (PyIndex_Check(item)) {
7991 Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
7992 if (i == -1 && PyErr_Occurred())
7993 return NULL;
7994 if (i < 0)
7995 i += PyUnicode_GET_SIZE(self);
7996 return unicode_getitem(self, i);
7997 } else if (PySlice_Check(item)) {
7998 Py_ssize_t start, stop, step, slicelength, cur, i;
7999 Py_UNICODE* source_buf;
8000 Py_UNICODE* result_buf;
8001 PyObject* result;
8002
8003 if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
8004 &start, &stop, &step, &slicelength) < 0) {
8005 return NULL;
8006 }
8007
8008 if (slicelength <= 0) {
8009 return PyUnicode_FromUnicode(NULL, 0);
8010 } else if (start == 0 && step == 1 && slicelength == self->length &&
8011 PyUnicode_CheckExact(self)) {
8012 Py_INCREF(self);
8013 return (PyObject *)self;
8014 } else if (step == 1) {
8015 return PyUnicode_FromUnicode(self->str + start, slicelength);
8016 } else {
8017 source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
8018 result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8019 sizeof(Py_UNICODE));
8020
8021 if (result_buf == NULL)
8022 return PyErr_NoMemory();
8023
8024 for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8025 result_buf[i] = source_buf[cur];
8026 }
8027
8028 result = PyUnicode_FromUnicode(result_buf, slicelength);
8029 PyObject_FREE(result_buf);
8030 return result;
8031 }
8032 } else {
8033 PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8034 return NULL;
8035 }
8036 }
8037
8038 static PyMappingMethods unicode_as_mapping = {
8039 (lenfunc)unicode_length, /* mp_length */
8040 (binaryfunc)unicode_subscript, /* mp_subscript */
8041 (objobjargproc)0, /* mp_ass_subscript */
8042 };
8043
8044 static Py_ssize_t
8045 unicode_buffer_getreadbuf(PyUnicodeObject *self,
8046 Py_ssize_t index,
8047 const void **ptr)
8048 {
8049 if (index != 0) {
8050 PyErr_SetString(PyExc_SystemError,
8051 "accessing non-existent unicode segment");
8052 return -1;
8053 }
8054 *ptr = (void *) self->str;
8055 return PyUnicode_GET_DATA_SIZE(self);
8056 }
8057
8058 static Py_ssize_t
8059 unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
8060 const void **ptr)
8061 {
8062 PyErr_SetString(PyExc_TypeError,
8063 "cannot use unicode as modifiable buffer");
8064 return -1;
8065 }
8066
8067 static int
8068 unicode_buffer_getsegcount(PyUnicodeObject *self,
8069 Py_ssize_t *lenp)
8070 {
8071 if (lenp)
8072 *lenp = PyUnicode_GET_DATA_SIZE(self);
8073 return 1;
8074 }
8075
8076 static Py_ssize_t
8077 unicode_buffer_getcharbuf(PyUnicodeObject *self,
8078 Py_ssize_t index,
8079 const void **ptr)
8080 {
8081 PyObject *str;
8082
8083 if (index != 0) {
8084 PyErr_SetString(PyExc_SystemError,
8085 "accessing non-existent unicode segment");
8086 return -1;
8087 }
8088 str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
8089 if (str == NULL)
8090 return -1;
8091 *ptr = (void *) PyString_AS_STRING(str);
8092 return PyString_GET_SIZE(str);
8093 }
8094
8095 /* Helpers for PyUnicode_Format() */
8096
8097 static PyObject *
8098 getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
8099 {
8100 Py_ssize_t argidx = *p_argidx;
8101 if (argidx < arglen) {
8102 (*p_argidx)++;
8103 if (arglen < 0)
8104 return args;
8105 else
8106 return PyTuple_GetItem(args, argidx);
8107 }
8108 PyErr_SetString(PyExc_TypeError,
8109 "not enough arguments for format string");
8110 return NULL;
8111 }
8112
/* Flag bits accumulated while parsing a '%' conversion specifier. */
#define F_LJUST (1 << 0)    /* '-' flag */
#define F_SIGN  (1 << 1)    /* '+' flag */
#define F_BLANK (1 << 2)    /* ' ' flag */
#define F_ALT   (1 << 3)    /* '#' flag */
#define F_ZERO  (1 << 4)    /* '0' flag */
8118
8119 static Py_ssize_t
8120 strtounicode(Py_UNICODE *buffer, const char *charbuffer)
8121 {
8122 register Py_ssize_t i;
8123 Py_ssize_t len = strlen(charbuffer);
8124 for (i = len - 1; i >= 0; i--)
8125 buffer[i] = (Py_UNICODE) charbuffer[i];
8126
8127 return len;
8128 }
8129
8130 static int
8131 longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8132 {
8133 Py_ssize_t result;
8134
8135 PyOS_snprintf((char *)buffer, len, format, x);
8136 result = strtounicode(buffer, (char *)buffer);
8137 return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
8138 }
8139
8140 /* XXX To save some code duplication, formatfloat/long/int could have been
8141 shared with stringobject.c, converting from 8-bit to Unicode after the
8142 formatting is done. */
8143
8144 /* Returns a new reference to a PyUnicode object, or NULL on failure. */
8145
8146 static PyObject *
8147 formatfloat(PyObject *v, int flags, int prec, int type)
8148 {
8149 char *p;
8150 PyObject *result;
8151 double x;
8152
8153 x = PyFloat_AsDouble(v);
8154 if (x == -1.0 && PyErr_Occurred())
8155 return NULL;
8156
8157 if (prec < 0)
8158 prec = 6;
8159
8160 p = PyOS_double_to_string(x, type, prec,
8161 (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
8162 if (p == NULL)
8163 return NULL;
8164 result = PyUnicode_FromStringAndSize(p, strlen(p));
8165 PyMem_Free(p);
8166 return result;
8167 }
8168
8169 static PyObject*
8170 formatlong(PyObject *val, int flags, int prec, int type)
8171 {
8172 char *buf;
8173 int i, len;
8174 PyObject *str; /* temporary string object. */
8175 PyUnicodeObject *result;
8176
8177 str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8178 if (!str)
8179 return NULL;
8180 result = _PyUnicode_New(len);
8181 if (!result) {
8182 Py_DECREF(str);
8183 return NULL;
8184 }
8185 for (i = 0; i < len; i++)
8186 result->str[i] = buf[i];
8187 result->str[len] = 0;
8188 Py_DECREF(str);
8189 return (PyObject*)result;
8190 }
8191
8192 static int
8193 formatint(Py_UNICODE *buf,
8194 size_t buflen,
8195 int flags,
8196 int prec,
8197 int type,
8198 PyObject *v)
8199 {
8200 /* fmt = '%#.' + `prec` + 'l' + `type`
8201 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8202 * + 1 + 1
8203 * = 24
8204 */
8205 char fmt[64]; /* plenty big enough! */
8206 char *sign;
8207 long x;
8208
8209 x = PyInt_AsLong(v);
8210 if (x == -1 && PyErr_Occurred())
8211 return -1;
8212 if (x < 0 && type == 'u') {
8213 type = 'd';
8214 }
8215 if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8216 sign = "-";
8217 else
8218 sign = "";
8219 if (prec < 0)
8220 prec = 1;
8221
8222 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8223 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
8224 */
8225 if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
8226 PyErr_SetString(PyExc_OverflowError,
8227 "formatted integer is too long (precision too large?)");
8228 return -1;
8229 }
8230
8231 if ((flags & F_ALT) &&
8232 (type == 'x' || type == 'X')) {
8233 /* When converting under %#x or %#X, there are a number
8234 * of issues that cause pain:
8235 * - when 0 is being converted, the C standard leaves off
8236 * the '0x' or '0X', which is inconsistent with other
8237 * %#x/%#X conversions and inconsistent with Python's
8238 * hex() function
8239 * - there are platforms that violate the standard and
8240 * convert 0 with the '0x' or '0X'
8241 * (Metrowerks, Compaq Tru64)
8242 * - there are platforms that give '0x' when converting
8243 * under %#X, but convert 0 in accordance with the
8244 * standard (OS/2 EMX)
8245 *
8246 * We can achieve the desired consistency by inserting our
8247 * own '0x' or '0X' prefix, and substituting %x/%X in place
8248 * of %#x/%#X.
8249 *
8250 * Note that this is the same approach as used in
8251 * formatint() in stringobject.c
8252 */
8253 PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8254 sign, type, prec, type);
8255 }
8256 else {
8257 PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8258 sign, (flags&F_ALT) ? "#" : "",
8259 prec, type);
8260 }
8261 if (sign[0])
8262 return longtounicode(buf, buflen, fmt, -x);
8263 else
8264 return longtounicode(buf, buflen, fmt, x);
8265 }
8266
8267 static int
8268 formatchar(Py_UNICODE *buf,
8269 size_t buflen,
8270 PyObject *v)
8271 {
8272 PyObject *unistr;
8273 char *str;
8274 /* presume that the buffer is at least 2 characters long */
8275 if (PyUnicode_Check(v)) {
8276 if (PyUnicode_GET_SIZE(v) != 1)
8277 goto onError;
8278 buf[0] = PyUnicode_AS_UNICODE(v)[0];
8279 }
8280
8281 else if (PyString_Check(v)) {
8282 if (PyString_GET_SIZE(v) != 1)
8283 goto onError;
8284 /* #7649: "u'%c' % char" should behave like "u'%s' % char" and fail
8285 with a UnicodeDecodeError if 'char' is not decodable with the
8286 default encoding (usually ASCII, but it might be something else) */
8287 str = PyString_AS_STRING(v);
8288 if ((unsigned char)str[0] > 0x7F) {
8289 /* the char is not ASCII; try to decode the string using the
8290 default encoding and return -1 to let the UnicodeDecodeError
8291 be raised if the string can't be decoded */
8292 unistr = PyUnicode_Decode(str, 1, NULL, "strict");
8293 if (unistr == NULL)
8294 return -1;
8295 buf[0] = PyUnicode_AS_UNICODE(unistr)[0];
8296 Py_DECREF(unistr);
8297 }
8298 else
8299 buf[0] = (Py_UNICODE)str[0];
8300 }
8301
8302 else {
8303 /* Integer input truncated to a character */
8304 long x;
8305 x = PyInt_AsLong(v);
8306 if (x == -1 && PyErr_Occurred())
8307 goto onError;
8308 #ifdef Py_UNICODE_WIDE
8309 if (x < 0 || x > 0x10ffff) {
8310 PyErr_SetString(PyExc_OverflowError,
8311 "%c arg not in range(0x110000) "
8312 "(wide Python build)");
8313 return -1;
8314 }
8315 #else
8316 if (x < 0 || x > 0xffff) {
8317 PyErr_SetString(PyExc_OverflowError,
8318 "%c arg not in range(0x10000) "
8319 "(narrow Python build)");
8320 return -1;
8321 }
8322 #endif
8323 buf[0] = (Py_UNICODE) x;
8324 }
8325 buf[1] = '\0';
8326 return 1;
8327
8328 onError:
8329 PyErr_SetString(PyExc_TypeError,
8330 "%c requires int or char");
8331 return -1;
8332 }
8333
8334 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8335
8336 FORMATBUFLEN is the length of the buffer in which the ints &
8337 chars are formatted. XXX This is a magic number. Each formatting
8338 routine does bounds checking to ensure no overflow, but a better
8339 solution may be to malloc a buffer of appropriate size for each
8340 format. For now, the current solution is sufficient.
8341 */
8342 #define FORMATBUFLEN (size_t)120
8343
8344 PyObject *PyUnicode_Format(PyObject *format,
8345 PyObject *args)
8346 {
8347 Py_UNICODE *fmt, *res;
8348 Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
8349 int args_owned = 0;
8350 PyUnicodeObject *result = NULL;
8351 PyObject *dict = NULL;
8352 PyObject *uformat;
8353
8354 if (format == NULL || args == NULL) {
8355 PyErr_BadInternalCall();
8356 return NULL;
8357 }
8358 uformat = PyUnicode_FromObject(format);
8359 if (uformat == NULL)
8360 return NULL;
8361 fmt = PyUnicode_AS_UNICODE(uformat);
8362 fmtcnt = PyUnicode_GET_SIZE(uformat);
8363
8364 reslen = rescnt = fmtcnt + 100;
8365 result = _PyUnicode_New(reslen);
8366 if (result == NULL)
8367 goto onError;
8368 res = PyUnicode_AS_UNICODE(result);
8369
8370 if (PyTuple_Check(args)) {
8371 arglen = PyTuple_Size(args);
8372 argidx = 0;
8373 }
8374 else {
8375 arglen = -1;
8376 argidx = -2;
8377 }
8378 if (Py_TYPE(args)->tp_as_mapping && Py_TYPE(args)->tp_as_mapping->mp_subscript &&
8379 !PyTuple_Check(args) && !PyObject_TypeCheck(args, &PyBaseString_Type))
8380 dict = args;
8381
8382 while (--fmtcnt >= 0) {
8383 if (*fmt != '%') {
8384 if (--rescnt < 0) {
8385 rescnt = fmtcnt + 100;
8386 reslen += rescnt;
8387 if (_PyUnicode_Resize(&result, reslen) < 0)
8388 goto onError;
8389 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8390 --rescnt;
8391 }
8392 *res++ = *fmt++;
8393 }
8394 else {
8395 /* Got a format specifier */
8396 int flags = 0;
8397 Py_ssize_t width = -1;
8398 int prec = -1;
8399 Py_UNICODE c = '\0';
8400 Py_UNICODE fill;
8401 int isnumok;
8402 PyObject *v = NULL;
8403 PyObject *temp = NULL;
8404 Py_UNICODE *pbuf;
8405 Py_UNICODE sign;
8406 Py_ssize_t len;
8407 Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */
8408
8409 fmt++;
8410 if (*fmt == '(') {
8411 Py_UNICODE *keystart;
8412 Py_ssize_t keylen;
8413 PyObject *key;
8414 int pcount = 1;
8415
8416 if (dict == NULL) {
8417 PyErr_SetString(PyExc_TypeError,
8418 "format requires a mapping");
8419 goto onError;
8420 }
8421 ++fmt;
8422 --fmtcnt;
8423 keystart = fmt;
8424 /* Skip over balanced parentheses */
8425 while (pcount > 0 && --fmtcnt >= 0) {
8426 if (*fmt == ')')
8427 --pcount;
8428 else if (*fmt == '(')
8429 ++pcount;
8430 fmt++;
8431 }
8432 keylen = fmt - keystart - 1;
8433 if (fmtcnt < 0 || pcount > 0) {
8434 PyErr_SetString(PyExc_ValueError,
8435 "incomplete format key");
8436 goto onError;
8437 }
8438 #if 0
8439 /* keys are converted to strings using UTF-8 and
8440 then looked up since Python uses strings to hold
8441 variables names etc. in its namespaces and we
8442 wouldn't want to break common idioms. */
8443 key = PyUnicode_EncodeUTF8(keystart,
8444 keylen,
8445 NULL);
8446 #else
8447 key = PyUnicode_FromUnicode(keystart, keylen);
8448 #endif
8449 if (key == NULL)
8450 goto onError;
8451 if (args_owned) {
8452 Py_DECREF(args);
8453 args_owned = 0;
8454 }
8455 args = PyObject_GetItem(dict, key);
8456 Py_DECREF(key);
8457 if (args == NULL) {
8458 goto onError;
8459 }
8460 args_owned = 1;
8461 arglen = -1;
8462 argidx = -2;
8463 }
8464 while (--fmtcnt >= 0) {
8465 switch (c = *fmt++) {
8466 case '-': flags |= F_LJUST; continue;
8467 case '+': flags |= F_SIGN; continue;
8468 case ' ': flags |= F_BLANK; continue;
8469 case '#': flags |= F_ALT; continue;
8470 case '0': flags |= F_ZERO; continue;
8471 }
8472 break;
8473 }
8474 if (c == '*') {
8475 v = getnextarg(args, arglen, &argidx);
8476 if (v == NULL)
8477 goto onError;
8478 if (!PyInt_Check(v)) {
8479 PyErr_SetString(PyExc_TypeError,
8480 "* wants int");
8481 goto onError;
8482 }
8483 width = PyInt_AsSsize_t(v);
8484 if (width == -1 && PyErr_Occurred())
8485 goto onError;
8486 if (width < 0) {
8487 flags |= F_LJUST;
8488 width = -width;
8489 }
8490 if (--fmtcnt >= 0)
8491 c = *fmt++;
8492 }
8493 else if (c >= '0' && c <= '9') {
8494 width = c - '0';
8495 while (--fmtcnt >= 0) {
8496 c = *fmt++;
8497 if (c < '0' || c > '9')
8498 break;
8499 if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) {
8500 PyErr_SetString(PyExc_ValueError,
8501 "width too big");
8502 goto onError;
8503 }
8504 width = width*10 + (c - '0');
8505 }
8506 }
8507 if (c == '.') {
8508 prec = 0;
8509 if (--fmtcnt >= 0)
8510 c = *fmt++;
8511 if (c == '*') {
8512 v = getnextarg(args, arglen, &argidx);
8513 if (v == NULL)
8514 goto onError;
8515 if (!PyInt_Check(v)) {
8516 PyErr_SetString(PyExc_TypeError,
8517 "* wants int");
8518 goto onError;
8519 }
8520 prec = _PyInt_AsInt(v);
8521 if (prec == -1 && PyErr_Occurred())
8522 goto onError;
8523 if (prec < 0)
8524 prec = 0;
8525 if (--fmtcnt >= 0)
8526 c = *fmt++;
8527 }
8528 else if (c >= '0' && c <= '9') {
8529 prec = c - '0';
8530 while (--fmtcnt >= 0) {
8531 c = *fmt++;
8532 if (c < '0' || c > '9')
8533 break;
8534 if (prec > (INT_MAX - ((int)c - '0')) / 10) {
8535 PyErr_SetString(PyExc_ValueError,
8536 "prec too big");
8537 goto onError;
8538 }
8539 prec = prec*10 + (c - '0');
8540 }
8541 }
8542 } /* prec */
8543 if (fmtcnt >= 0) {
8544 if (c == 'h' || c == 'l' || c == 'L') {
8545 if (--fmtcnt >= 0)
8546 c = *fmt++;
8547 }
8548 }
8549 if (fmtcnt < 0) {
8550 PyErr_SetString(PyExc_ValueError,
8551 "incomplete format");
8552 goto onError;
8553 }
8554 if (c != '%') {
8555 v = getnextarg(args, arglen, &argidx);
8556 if (v == NULL)
8557 goto onError;
8558 }
8559 sign = 0;
8560 fill = ' ';
8561 switch (c) {
8562
8563 case '%':
8564 pbuf = formatbuf;
8565 /* presume that buffer length is at least 1 */
8566 pbuf[0] = '%';
8567 len = 1;
8568 break;
8569
8570 case 's':
8571 case 'r':
8572 if (PyUnicode_CheckExact(v) && c == 's') {
8573 temp = v;
8574 Py_INCREF(temp);
8575 }
8576 else {
8577 PyObject *unicode;
8578 if (c == 's')
8579 temp = PyObject_Unicode(v);
8580 else
8581 temp = PyObject_Repr(v);
8582 if (temp == NULL)
8583 goto onError;
8584 if (PyUnicode_Check(temp))
8585 /* nothing to do */;
8586 else if (PyString_Check(temp)) {
8587 /* convert to string to Unicode */
8588 unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8589 PyString_GET_SIZE(temp),
8590 NULL,
8591 "strict");
8592 Py_DECREF(temp);
8593 temp = unicode;
8594 if (temp == NULL)
8595 goto onError;
8596 }
8597 else {
8598 Py_DECREF(temp);
8599 PyErr_SetString(PyExc_TypeError,
8600 "%s argument has non-string str()");
8601 goto onError;
8602 }
8603 }
8604 pbuf = PyUnicode_AS_UNICODE(temp);
8605 len = PyUnicode_GET_SIZE(temp);
8606 if (prec >= 0 && len > prec)
8607 len = prec;
8608 break;
8609
8610 case 'i':
8611 case 'd':
8612 case 'u':
8613 case 'o':
8614 case 'x':
8615 case 'X':
8616 if (c == 'i')
8617 c = 'd';
8618 isnumok = 0;
8619 if (PyNumber_Check(v)) {
8620 PyObject *iobj=NULL;
8621
8622 if (PyInt_Check(v) || (PyLong_Check(v))) {
8623 iobj = v;
8624 Py_INCREF(iobj);
8625 }
8626 else {
8627 iobj = PyNumber_Int(v);
8628 if (iobj==NULL) iobj = PyNumber_Long(v);
8629 }
8630 if (iobj!=NULL) {
8631 if (PyInt_Check(iobj)) {
8632 isnumok = 1;
8633 pbuf = formatbuf;
8634 len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8635 flags, prec, c, iobj);
8636 Py_DECREF(iobj);
8637 if (len < 0)
8638 goto onError;
8639 sign = 1;
8640 }
8641 else if (PyLong_Check(iobj)) {
8642 isnumok = 1;
8643 temp = formatlong(iobj, flags, prec, c);
8644 Py_DECREF(iobj);
8645 if (!temp)
8646 goto onError;
8647 pbuf = PyUnicode_AS_UNICODE(temp);
8648 len = PyUnicode_GET_SIZE(temp);
8649 sign = 1;
8650 }
8651 else {
8652 Py_DECREF(iobj);
8653 }
8654 }
8655 }
8656 if (!isnumok) {
8657 PyErr_Format(PyExc_TypeError,
8658 "%%%c format: a number is required, "
8659 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8660 goto onError;
8661 }
8662 if (flags & F_ZERO)
8663 fill = '0';
8664 break;
8665
8666 case 'e':
8667 case 'E':
8668 case 'f':
8669 case 'F':
8670 case 'g':
8671 case 'G':
8672 temp = formatfloat(v, flags, prec, c);
8673 if (temp == NULL)
8674 goto onError;
8675 pbuf = PyUnicode_AS_UNICODE(temp);
8676 len = PyUnicode_GET_SIZE(temp);
8677 sign = 1;
8678 if (flags & F_ZERO)
8679 fill = '0';
8680 break;
8681
8682 case 'c':
8683 pbuf = formatbuf;
8684 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8685 if (len < 0)
8686 goto onError;
8687 break;
8688
8689 default:
8690 PyErr_Format(PyExc_ValueError,
8691 "unsupported format character '%c' (0x%x) "
8692 "at index %zd",
8693 (31<=c && c<=126) ? (char)c : '?',
8694 (int)c,
8695 (Py_ssize_t)(fmt - 1 -
8696 PyUnicode_AS_UNICODE(uformat)));
8697 goto onError;
8698 }
8699 if (sign) {
8700 if (*pbuf == '-' || *pbuf == '+') {
8701 sign = *pbuf++;
8702 len--;
8703 }
8704 else if (flags & F_SIGN)
8705 sign = '+';
8706 else if (flags & F_BLANK)
8707 sign = ' ';
8708 else
8709 sign = 0;
8710 }
8711 if (width < len)
8712 width = len;
8713 if (rescnt - (sign != 0) < width) {
8714 reslen -= rescnt;
8715 rescnt = width + fmtcnt + 100;
8716 reslen += rescnt;
8717 if (reslen < 0) {
8718 Py_XDECREF(temp);
8719 PyErr_NoMemory();
8720 goto onError;
8721 }
8722 if (_PyUnicode_Resize(&result, reslen) < 0) {
8723 Py_XDECREF(temp);
8724 goto onError;
8725 }
8726 res = PyUnicode_AS_UNICODE(result)
8727 + reslen - rescnt;
8728 }
8729 if (sign) {
8730 if (fill != ' ')
8731 *res++ = sign;
8732 rescnt--;
8733 if (width > len)
8734 width--;
8735 }
8736 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8737 assert(pbuf[0] == '0');
8738 assert(pbuf[1] == c);
8739 if (fill != ' ') {
8740 *res++ = *pbuf++;
8741 *res++ = *pbuf++;
8742 }
8743 rescnt -= 2;
8744 width -= 2;
8745 if (width < 0)
8746 width = 0;
8747 len -= 2;
8748 }
8749 if (width > len && !(flags & F_LJUST)) {
8750 do {
8751 --rescnt;
8752 *res++ = fill;
8753 } while (--width > len);
8754 }
8755 if (fill == ' ') {
8756 if (sign)
8757 *res++ = sign;
8758 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8759 assert(pbuf[0] == '0');
8760 assert(pbuf[1] == c);
8761 *res++ = *pbuf++;
8762 *res++ = *pbuf++;
8763 }
8764 }
8765 Py_UNICODE_COPY(res, pbuf, len);
8766 res += len;
8767 rescnt -= len;
8768 while (--width >= len) {
8769 --rescnt;
8770 *res++ = ' ';
8771 }
8772 if (dict && (argidx < arglen) && c != '%') {
8773 PyErr_SetString(PyExc_TypeError,
8774 "not all arguments converted during string formatting");
8775 Py_XDECREF(temp);
8776 goto onError;
8777 }
8778 Py_XDECREF(temp);
8779 } /* '%' */
8780 } /* until end */
8781 if (argidx < arglen && !dict) {
8782 PyErr_SetString(PyExc_TypeError,
8783 "not all arguments converted during string formatting");
8784 goto onError;
8785 }
8786
8787 if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8788 goto onError;
8789 if (args_owned) {
8790 Py_DECREF(args);
8791 }
8792 Py_DECREF(uformat);
8793 return (PyObject *)result;
8794
8795 onError:
8796 Py_XDECREF(result);
8797 Py_DECREF(uformat);
8798 if (args_owned) {
8799 Py_DECREF(args);
8800 }
8801 return NULL;
8802 }
8803
8804 static PyBufferProcs unicode_as_buffer = {
8805 (readbufferproc) unicode_buffer_getreadbuf,
8806 (writebufferproc) unicode_buffer_getwritebuf,
8807 (segcountproc) unicode_buffer_getsegcount,
8808 (charbufferproc) unicode_buffer_getcharbuf,
8809 };
8810
8811 static PyObject *
8812 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8813
8814 static PyObject *
8815 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8816 {
8817 PyObject *x = NULL;
8818 static char *kwlist[] = {"string", "encoding", "errors", 0};
8819 char *encoding = NULL;
8820 char *errors = NULL;
8821
8822 if (type != &PyUnicode_Type)
8823 return unicode_subtype_new(type, args, kwds);
8824 if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8825 kwlist, &x, &encoding, &errors))
8826 return NULL;
8827 if (x == NULL)
8828 return (PyObject *)_PyUnicode_New(0);
8829 if (encoding == NULL && errors == NULL)
8830 return PyObject_Unicode(x);
8831 else
8832 return PyUnicode_FromEncodedObject(x, encoding, errors);
8833 }
8834
8835 static PyObject *
8836 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8837 {
8838 PyUnicodeObject *tmp, *pnew;
8839 Py_ssize_t n;
8840
8841 assert(PyType_IsSubtype(type, &PyUnicode_Type));
8842 tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8843 if (tmp == NULL)
8844 return NULL;
8845 assert(PyUnicode_Check(tmp));
8846 pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8847 if (pnew == NULL) {
8848 Py_DECREF(tmp);
8849 return NULL;
8850 }
8851 pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8852 if (pnew->str == NULL) {
8853 _Py_ForgetReference((PyObject *)pnew);
8854 PyObject_Del(pnew);
8855 Py_DECREF(tmp);
8856 return PyErr_NoMemory();
8857 }
8858 Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8859 pnew->length = n;
8860 pnew->hash = tmp->hash;
8861 Py_DECREF(tmp);
8862 return (PyObject *)pnew;
8863 }
8864
8865 PyDoc_STRVAR(unicode_doc,
8866 "unicode(object='') -> unicode object\n\
8867 unicode(string[, encoding[, errors]]) -> unicode object\n\
8868 \n\
8869 Create a new Unicode object from the given encoded string.\n\
8870 encoding defaults to the current default string encoding.\n\
8871 errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
8872
8873 PyTypeObject PyUnicode_Type = {
8874 PyVarObject_HEAD_INIT(&PyType_Type, 0)
8875 "unicode", /* tp_name */
8876 sizeof(PyUnicodeObject), /* tp_size */
8877 0, /* tp_itemsize */
8878 /* Slots */
8879 (destructor)unicode_dealloc, /* tp_dealloc */
8880 0, /* tp_print */
8881 0, /* tp_getattr */
8882 0, /* tp_setattr */
8883 0, /* tp_compare */
8884 unicode_repr, /* tp_repr */
8885 &unicode_as_number, /* tp_as_number */
8886 &unicode_as_sequence, /* tp_as_sequence */
8887 &unicode_as_mapping, /* tp_as_mapping */
8888 (hashfunc) unicode_hash, /* tp_hash*/
8889 0, /* tp_call*/
8890 (reprfunc) unicode_str, /* tp_str */
8891 PyObject_GenericGetAttr, /* tp_getattro */
8892 0, /* tp_setattro */
8893 &unicode_as_buffer, /* tp_as_buffer */
8894 Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
8895 Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS, /* tp_flags */
8896 unicode_doc, /* tp_doc */
8897 0, /* tp_traverse */
8898 0, /* tp_clear */
8899 PyUnicode_RichCompare, /* tp_richcompare */
8900 0, /* tp_weaklistoffset */
8901 0, /* tp_iter */
8902 0, /* tp_iternext */
8903 unicode_methods, /* tp_methods */
8904 0, /* tp_members */
8905 0, /* tp_getset */
8906 &PyBaseString_Type, /* tp_base */
8907 0, /* tp_dict */
8908 0, /* tp_descr_get */
8909 0, /* tp_descr_set */
8910 0, /* tp_dictoffset */
8911 0, /* tp_init */
8912 0, /* tp_alloc */
8913 unicode_new, /* tp_new */
8914 PyObject_Del, /* tp_free */
8915 };
8916
8917 /* Initialize the Unicode implementation */
8918
8919 void _PyUnicode_Init(void)
8920 {
8921 /* XXX - move this array to unicodectype.c ? */
8922 Py_UNICODE linebreak[] = {
8923 0x000A, /* LINE FEED */
8924 0x000D, /* CARRIAGE RETURN */
8925 0x001C, /* FILE SEPARATOR */
8926 0x001D, /* GROUP SEPARATOR */
8927 0x001E, /* RECORD SEPARATOR */
8928 0x0085, /* NEXT LINE */
8929 0x2028, /* LINE SEPARATOR */
8930 0x2029, /* PARAGRAPH SEPARATOR */
8931 };
8932
8933 /* Init the implementation */
8934 if (!unicode_empty) {
8935 unicode_empty = _PyUnicode_New(0);
8936 if (!unicode_empty)
8937 return;
8938 }
8939
8940 if (PyType_Ready(&PyUnicode_Type) < 0)
8941 Py_FatalError("Can't initialize 'unicode'");
8942
8943 /* initialize the linebreak bloom filter */
8944 bloom_linebreak = make_bloom_mask(
8945 linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8946 );
8947
8948 PyType_Ready(&EncodingMapType);
8949
8950 if (PyType_Ready(&PyFieldNameIter_Type) < 0)
8951 Py_FatalError("Can't initialize field name iterator type");
8952
8953 if (PyType_Ready(&PyFormatterIter_Type) < 0)
8954 Py_FatalError("Can't initialize formatter iter type");
8955 }
8956
8957 /* Finalize the Unicode implementation */
8958
8959 int
8960 PyUnicode_ClearFreeList(void)
8961 {
8962 int freelist_size = numfree;
8963 PyUnicodeObject *u;
8964
8965 for (u = free_list; u != NULL;) {
8966 PyUnicodeObject *v = u;
8967 u = *(PyUnicodeObject **)u;
8968 if (v->str)
8969 PyObject_DEL(v->str);
8970 Py_XDECREF(v->defenc);
8971 PyObject_Del(v);
8972 numfree--;
8973 }
8974 free_list = NULL;
8975 assert(numfree == 0);
8976 return freelist_size;
8977 }
8978
8979 void
8980 _PyUnicode_Fini(void)
8981 {
8982 int i;
8983
8984 Py_CLEAR(unicode_empty);
8985
8986 for (i = 0; i < 256; i++)
8987 Py_CLEAR(unicode_latin1[i]);
8988
8989 (void)PyUnicode_ClearFreeList();
8990 }
8991
8992 #ifdef __cplusplus
8993 }
8994 #endif