3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5 Unicode Integration Proposal (see file Misc/unicode.txt).
7 Major speed upgrades to the method implementations at the Reykjavik
8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
10 Copyright (c) Corporation for National Research Initiatives.
12 --------------------------------------------------------------------
13 The original string type implementation is:
15 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
18 By obtaining, using, and/or copying this software and/or its
19 associated documentation, you agree that you have read, understood,
20 and will comply with the following terms and conditions:
22 Permission to use, copy, modify, and distribute this software and its
23 associated documentation for any purpose and without fee is hereby
24 granted, provided that the above copyright notice appears in all
25 copies, and that both that copyright notice and this permission notice
26 appear in supporting documentation, and that the name of Secret Labs
27 AB or the author not be used in advertising or publicity pertaining to
28 distribution of the software without specific, written prior
31 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38 --------------------------------------------------------------------
42 #define PY_SSIZE_T_CLEAN
45 #include "unicodeobject.h"
52 /* Limit for the Unicode object free list */
54 #define PyUnicode_MAXFREELIST 1024
56 /* Limit for the Unicode object free list stay alive optimization.
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
60 limit. This reduces malloc() overhead for small Unicode objects.
62 At worst this will result in PyUnicode_MAXFREELIST *
63 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
64 malloc()-overhead) bytes of unused garbage.
66 Setting the limit to 0 effectively turns the feature off.
68 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
73 #define KEEPALIVE_SIZE_LIMIT 9
75 /* Endianness switches; defaults to little endian */
77 #ifdef WORDS_BIGENDIAN
78 # define BYTEORDER_IS_BIG_ENDIAN
80 # define BYTEORDER_IS_LITTLE_ENDIAN
83 /* --- Globals ------------------------------------------------------------
85 NOTE: In the interpreter's initialization phase, some globals are currently
86 initialized dynamically as needed. In the process Unicode objects may
87 be created before the Unicode type is ready.
96 /* Free list for Unicode objects */
97 static PyUnicodeObject
*free_list
= NULL
;
98 static int numfree
= 0;
100 /* The empty Unicode object is shared to improve performance. */
101 static PyUnicodeObject
*unicode_empty
= NULL
;
103 #define _Py_RETURN_UNICODE_EMPTY() \
105 if (unicode_empty != NULL) \
106 Py_INCREF(unicode_empty); \
108 unicode_empty = _PyUnicode_New(0); \
109 if (unicode_empty != NULL) \
110 Py_INCREF(unicode_empty); \
112 return (PyObject *)unicode_empty; \
115 /* Single character Unicode strings in the Latin-1 range are being
117 static PyUnicodeObject
*unicode_latin1
[256] = {NULL
};
119 /* Default encoding to use and assume when NULL is passed as encoding
120 parameter; it is initialized by _PyUnicode_Init().
122 Always use the PyUnicode_SetDefaultEncoding() and
123 PyUnicode_GetDefaultEncoding() APIs to access this global.
126 static char unicode_default_encoding
[100 + 1] = "ascii";
128 /* Fast detection of the most frequent whitespace characters */
129 const unsigned char _Py_ascii_whitespace
[] = {
130 0, 0, 0, 0, 0, 0, 0, 0,
131 /* case 0x0009: * CHARACTER TABULATION */
132 /* case 0x000A: * LINE FEED */
133 /* case 0x000B: * LINE TABULATION */
134 /* case 0x000C: * FORM FEED */
135 /* case 0x000D: * CARRIAGE RETURN */
136 0, 1, 1, 1, 1, 1, 0, 0,
137 0, 0, 0, 0, 0, 0, 0, 0,
138 /* case 0x001C: * FILE SEPARATOR */
139 /* case 0x001D: * GROUP SEPARATOR */
140 /* case 0x001E: * RECORD SEPARATOR */
141 /* case 0x001F: * UNIT SEPARATOR */
142 0, 0, 0, 0, 1, 1, 1, 1,
143 /* case 0x0020: * SPACE */
144 1, 0, 0, 0, 0, 0, 0, 0,
145 0, 0, 0, 0, 0, 0, 0, 0,
146 0, 0, 0, 0, 0, 0, 0, 0,
147 0, 0, 0, 0, 0, 0, 0, 0,
149 0, 0, 0, 0, 0, 0, 0, 0,
150 0, 0, 0, 0, 0, 0, 0, 0,
151 0, 0, 0, 0, 0, 0, 0, 0,
152 0, 0, 0, 0, 0, 0, 0, 0,
153 0, 0, 0, 0, 0, 0, 0, 0,
154 0, 0, 0, 0, 0, 0, 0, 0,
155 0, 0, 0, 0, 0, 0, 0, 0,
156 0, 0, 0, 0, 0, 0, 0, 0
159 /* Same for linebreaks */
160 static unsigned char ascii_linebreak
[] = {
161 0, 0, 0, 0, 0, 0, 0, 0,
162 /* 0x000A, * LINE FEED */
163 /* 0x000B, * LINE TABULATION */
164 /* 0x000C, * FORM FEED */
165 /* 0x000D, * CARRIAGE RETURN */
166 0, 0, 1, 1, 1, 1, 0, 0,
167 0, 0, 0, 0, 0, 0, 0, 0,
168 /* 0x001C, * FILE SEPARATOR */
169 /* 0x001D, * GROUP SEPARATOR */
170 /* 0x001E, * RECORD SEPARATOR */
171 0, 0, 0, 0, 1, 1, 1, 0,
172 0, 0, 0, 0, 0, 0, 0, 0,
173 0, 0, 0, 0, 0, 0, 0, 0,
174 0, 0, 0, 0, 0, 0, 0, 0,
175 0, 0, 0, 0, 0, 0, 0, 0,
177 0, 0, 0, 0, 0, 0, 0, 0,
178 0, 0, 0, 0, 0, 0, 0, 0,
179 0, 0, 0, 0, 0, 0, 0, 0,
180 0, 0, 0, 0, 0, 0, 0, 0,
181 0, 0, 0, 0, 0, 0, 0, 0,
182 0, 0, 0, 0, 0, 0, 0, 0,
183 0, 0, 0, 0, 0, 0, 0, 0,
184 0, 0, 0, 0, 0, 0, 0, 0
189 PyUnicode_GetMax(void)
191 #ifdef Py_UNICODE_WIDE
194 /* This is actually an illegal character, so it should
195 not be passed to unichr. */
200 /* --- Bloom Filters ----------------------------------------------------- */
202 /* stuff to implement simple "bloom filters" for Unicode characters.
203 to keep things simple, we use a single bitmask, using the low
204 bits of each unicode character (ch & (BLOOM_WIDTH - 1)) as the bit index. */
206 /* the linebreak mask is set up by Unicode_Init below */
209 #define BLOOM_WIDTH 128
211 #define BLOOM_WIDTH 64
213 #define BLOOM_WIDTH 32
215 #error "LONG_BIT is smaller than 32"
218 #define BLOOM_MASK unsigned long
220 static BLOOM_MASK bloom_linebreak
= ~(BLOOM_MASK
)0;
/* Bloom-filter primitives.  A character selects bit
   (ch & (BLOOM_WIDTH - 1)) of an unsigned long mask.

   BLOOM_ADD(mask, ch)  sets ch's bit in the lvalue `mask` and yields the
                        updated mask.
   BLOOM(mask, ch)      is non-zero when ch's bit is set in `mask`.  If the
                        mask was built only with BLOOM_ADD, a zero result
                        means ch was never added; a non-zero result only
                        means it may have been.

   Every macro argument is parenthesized so that compound expressions such
   as BLOOM(a | b, ch) test the intended value. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Non-zero when ch is a line break.  Values below 128 are answered directly
   from the ascii_linebreak table; anything else is first screened through
   the bloom_linebreak mask and then confirmed by Py_UNICODE_ISLINEBREAK().
   NOTE: `ch` is evaluated more than once, so it must be free of side
   effects. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
229 Py_LOCAL_INLINE(BLOOM_MASK
) make_bloom_mask(Py_UNICODE
* ptr
, Py_ssize_t len
)
231 /* calculate simple bloom-style bitmask for a given unicode string */
237 for (i
= 0; i
< len
; i
++)
238 BLOOM_ADD(mask
, ptr
[i
]);
243 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr
, Py_UNICODE
* set
, Py_ssize_t setlen
)
247 for (i
= 0; i
< setlen
; i
++)
/* Non-zero when both BLOOM(mask, chr) and unicode_member(chr, set, setlen)
   are non-zero.  The cheap bit test runs first, so unicode_member() is only
   called for characters whose bit is set in `mask`.

   The whole expansion is parenthesized so the macro behaves as a single
   operand (for example under `!` or next to `||`).
   NOTE: `chr` is evaluated more than once, so it must be free of side
   effects. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
257 /* --- Unicode Object ----------------------------------------------------- */
260 int unicode_resize(register PyUnicodeObject
*unicode
,
265 /* Shortcut if there's nothing much to do. */
266 if (unicode
->length
== length
)
269 /* Resizing shared object (unicode_empty or single character
270 objects) in-place is not allowed. Use PyUnicode_Resize()
273 if (unicode
== unicode_empty
||
274 (unicode
->length
== 1 &&
275 unicode
->str
[0] < 256U &&
276 unicode_latin1
[unicode
->str
[0]] == unicode
)) {
277 PyErr_SetString(PyExc_SystemError
,
278 "can't resize shared unicode objects");
282 /* We allocate one extra Py_UNICODE element to make sure the string is U+0000 terminated.
283 The overallocation is also used by fastsearch, which assumes that it's
284 safe to look at str[length] (without making any assumptions about what
287 oldstr
= unicode
->str
;
288 unicode
->str
= PyObject_REALLOC(unicode
->str
,
289 sizeof(Py_UNICODE
) * (length
+ 1));
291 unicode
->str
= (Py_UNICODE
*)oldstr
;
295 unicode
->str
[length
] = 0;
296 unicode
->length
= length
;
299 /* Reset the object caches */
300 if (unicode
->defenc
) {
301 Py_CLEAR(unicode
->defenc
);
308 /* We allocate one extra Py_UNICODE element to make sure the string is
309 U+0000 terminated; some code relies on that.
311 XXX This allocator could further be enhanced by assuring that the
312 free list never reduces its size below 1.
317 PyUnicodeObject
*_PyUnicode_New(Py_ssize_t length
)
319 register PyUnicodeObject
*unicode
;
321 /* Optimization for empty strings */
322 if (length
== 0 && unicode_empty
!= NULL
) {
323 Py_INCREF(unicode_empty
);
324 return unicode_empty
;
327 /* Ensure we won't overflow the size. */
328 if (length
> ((PY_SSIZE_T_MAX
/ sizeof(Py_UNICODE
)) - 1)) {
329 return (PyUnicodeObject
*)PyErr_NoMemory();
332 /* Unicode freelist & memory allocation */
335 free_list
= *(PyUnicodeObject
**)unicode
;
338 /* Keep-Alive optimization: we only upsize the buffer,
339 never downsize it. */
340 if ((unicode
->length
< length
) &&
341 unicode_resize(unicode
, length
) < 0) {
342 PyObject_DEL(unicode
->str
);
347 size_t new_size
= sizeof(Py_UNICODE
) * ((size_t)length
+ 1);
348 unicode
->str
= (Py_UNICODE
*) PyObject_MALLOC(new_size
);
350 PyObject_INIT(unicode
, &PyUnicode_Type
);
354 unicode
= PyObject_New(PyUnicodeObject
, &PyUnicode_Type
);
357 new_size
= sizeof(Py_UNICODE
) * ((size_t)length
+ 1);
358 unicode
->str
= (Py_UNICODE
*) PyObject_MALLOC(new_size
);
365 /* Initialize the first element to guard against cases where
366 * the caller fails before initializing str -- unicode_resize()
367 * reads str[0], and the Keep-Alive optimization can keep memory
368 * allocated for str alive across a call to unicode_dealloc(unicode).
369 * We don't want unicode_resize to read uninitialized memory in
373 unicode
->str
[length
] = 0;
374 unicode
->length
= length
;
376 unicode
->defenc
= NULL
;
380 /* XXX UNREF/NEWREF interface should be more symmetrical */
382 _Py_ForgetReference((PyObject
*)unicode
);
383 PyObject_Del(unicode
);
388 void unicode_dealloc(register PyUnicodeObject
*unicode
)
390 if (PyUnicode_CheckExact(unicode
) &&
391 numfree
< PyUnicode_MAXFREELIST
) {
392 /* Keep-Alive optimization */
393 if (unicode
->length
>= KEEPALIVE_SIZE_LIMIT
) {
394 PyObject_DEL(unicode
->str
);
398 if (unicode
->defenc
) {
399 Py_CLEAR(unicode
->defenc
);
401 /* Add to free list */
402 *(PyUnicodeObject
**)unicode
= free_list
;
407 PyObject_DEL(unicode
->str
);
408 Py_XDECREF(unicode
->defenc
);
409 Py_TYPE(unicode
)->tp_free((PyObject
*)unicode
);
414 int _PyUnicode_Resize(PyUnicodeObject
**unicode
, Py_ssize_t length
)
416 register PyUnicodeObject
*v
;
418 /* Argument checks */
419 if (unicode
== NULL
) {
420 PyErr_BadInternalCall();
424 if (v
== NULL
|| !PyUnicode_Check(v
) || Py_REFCNT(v
) != 1 || length
< 0) {
425 PyErr_BadInternalCall();
429 /* Resizing unicode_empty and single character objects is not
430 possible since these are being shared. We simply return a fresh
431 copy with the same Unicode content. */
432 if (v
->length
!= length
&&
433 (v
== unicode_empty
|| v
->length
== 1)) {
434 PyUnicodeObject
*w
= _PyUnicode_New(length
);
437 Py_UNICODE_COPY(w
->str
, v
->str
,
438 length
< v
->length
? length
: v
->length
);
444 /* Note that we don't have to modify *unicode for unshared Unicode
445 objects, since we can modify them in-place. */
446 return unicode_resize(v
, length
);
449 int PyUnicode_Resize(PyObject
**unicode
, Py_ssize_t length
)
451 return _PyUnicode_Resize((PyUnicodeObject
**)unicode
, length
);
454 PyObject
*PyUnicode_FromUnicode(const Py_UNICODE
*u
,
457 PyUnicodeObject
*unicode
;
459 /* If the Unicode data is known at construction time, we can apply
460 some optimizations which share commonly used objects. */
463 /* Optimization for empty strings */
465 _Py_RETURN_UNICODE_EMPTY();
467 /* Single character Unicode objects in the Latin-1 range are
468 shared when using this constructor */
469 if (size
== 1 && *u
< 256) {
470 unicode
= unicode_latin1
[*u
];
472 unicode
= _PyUnicode_New(1);
475 unicode
->str
[0] = *u
;
476 unicode_latin1
[*u
] = unicode
;
479 return (PyObject
*)unicode
;
483 unicode
= _PyUnicode_New(size
);
487 /* Copy the Unicode data into the new object */
489 Py_UNICODE_COPY(unicode
->str
, u
, size
);
491 return (PyObject
*)unicode
;
494 PyObject
*PyUnicode_FromStringAndSize(const char *u
, Py_ssize_t size
)
496 PyUnicodeObject
*unicode
;
499 PyErr_SetString(PyExc_SystemError
,
500 "Negative size passed to PyUnicode_FromStringAndSize");
504 /* If the Unicode data is known at construction time, we can apply
505 some optimizations which share commonly used objects.
506 Also, this means the input must be UTF-8, so fall back to the
507 UTF-8 decoder at the end. */
510 /* Optimization for empty strings */
512 _Py_RETURN_UNICODE_EMPTY();
514 /* Single characters are shared when using this constructor.
515 Restrict to ASCII, since the input must be UTF-8. */
516 if (size
== 1 && Py_CHARMASK(*u
) < 128) {
517 unicode
= unicode_latin1
[Py_CHARMASK(*u
)];
519 unicode
= _PyUnicode_New(1);
522 unicode
->str
[0] = Py_CHARMASK(*u
);
523 unicode_latin1
[Py_CHARMASK(*u
)] = unicode
;
526 return (PyObject
*)unicode
;
529 return PyUnicode_DecodeUTF8(u
, size
, NULL
);
532 unicode
= _PyUnicode_New(size
);
536 return (PyObject
*)unicode
;
539 PyObject
*PyUnicode_FromString(const char *u
)
541 size_t size
= strlen(u
);
542 if (size
> PY_SSIZE_T_MAX
) {
543 PyErr_SetString(PyExc_OverflowError
, "input too long");
547 return PyUnicode_FromStringAndSize(u
, size
);
550 /* _Py_UNICODE_NEXT is a private macro used to retrieve the character pointed
551 * by 'ptr', possibly combining surrogate pairs on narrow builds.
552 * 'ptr' and 'end' must be Py_UNICODE*, with 'ptr' pointing at the character
553 * that should be returned and 'end' pointing to the end of the buffer.
554 * ('end' is used on narrow builds to detect a lone surrogate at the
555 * end of the buffer that should be returned unchanged.)
556 * The ptr and end arguments should be side-effect free and ptr must be an lvalue.
557 * The type of the returned char is always Py_UCS4.
559 * Note: the macro advances ptr to next char, so it might have side-effects
560 * (especially if used with other macros).
563 /* helper macros used by _Py_UNICODE_NEXT */
/* Non-zero when ch lies in the high (lead) surrogate range 0xD800..0xDBFF.
   The argument is parenthesized so expressions such as `c & 0xFFFF` are
   compared as a whole; it is evaluated twice. */
#define _Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
/* Non-zero when ch lies in the low (trail) surrogate range 0xDC00..0xDFFF.
   The argument is evaluated twice. */
#define _Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value:
   the low 10 bits of `high` become bits 10..19, the low 10 bits of `low`
   become bits 0..9, and 0x10000 is added to the result.  The arguments are
   not range-checked here. */
#define _Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)
571 #ifdef Py_UNICODE_WIDE
572 #define _Py_UNICODE_NEXT(ptr, end) *(ptr)++
574 #define _Py_UNICODE_NEXT(ptr, end) \
575 (((_Py_UNICODE_IS_HIGH_SURROGATE(*(ptr)) && (ptr) < (end)) && \
576 _Py_UNICODE_IS_LOW_SURROGATE((ptr)[1])) ? \
577 ((ptr) += 2,_Py_UNICODE_JOIN_SURROGATES((ptr)[-2], (ptr)[-1])) : \
583 #if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
584 # define CONVERT_WCHAR_TO_SURROGATES
587 #ifdef CONVERT_WCHAR_TO_SURROGATES
589 /* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
590 to convert from UTF32 to UTF16. */
592 PyObject
*PyUnicode_FromWideChar(register const wchar_t *w
,
595 PyUnicodeObject
*unicode
;
596 register Py_ssize_t i
;
598 const wchar_t *orig_w
;
601 PyErr_BadInternalCall();
607 for (i
= size
; i
> 0; i
--) {
613 unicode
= _PyUnicode_New(alloc
);
617 /* Copy the wchar_t data into the new object */
619 register Py_UNICODE
*u
;
620 u
= PyUnicode_AS_UNICODE(unicode
);
621 for (i
= size
; i
> 0; i
--) {
623 wchar_t ordinal
= *w
++;
625 *u
++ = 0xD800 | (ordinal
>> 10);
626 *u
++ = 0xDC00 | (ordinal
& 0x3FF);
632 return (PyObject
*)unicode
;
637 PyObject
*PyUnicode_FromWideChar(register const wchar_t *w
,
640 PyUnicodeObject
*unicode
;
643 PyErr_BadInternalCall();
647 unicode
= _PyUnicode_New(size
);
651 /* Copy the wchar_t data into the new object */
652 #ifdef HAVE_USABLE_WCHAR_T
653 memcpy(unicode
->str
, w
, size
* sizeof(wchar_t));
656 register Py_UNICODE
*u
;
657 register Py_ssize_t i
;
658 u
= PyUnicode_AS_UNICODE(unicode
);
659 for (i
= size
; i
> 0; i
--)
664 return (PyObject
*)unicode
;
667 #endif /* CONVERT_WCHAR_TO_SURROGATES */
669 #undef CONVERT_WCHAR_TO_SURROGATES
672 makefmt(char *fmt
, int longflag
, int size_tflag
, int zeropad
, int width
, int precision
, char c
)
678 fmt
+= sprintf(fmt
, "%d", width
);
681 fmt
+= sprintf(fmt
, ".%d", precision
);
684 else if (size_tflag
) {
685 char *f
= PY_FORMAT_SIZE_T
;
693 #define appendstring(string) \
695 for (copy = string;*copy; copy++) { \
696 *s++ = (unsigned char)*copy; \
701 PyUnicode_FromFormatV(const char *format
, va_list vargs
)
704 Py_ssize_t callcount
= 0;
705 PyObject
**callresults
= NULL
;
706 PyObject
**callresult
= NULL
;
714 /* used by sprintf */
716 /* use abuffer instead of buffer, if we need more space
717 * (which can happen if there's a format specifier with width). */
718 char *abuffer
= NULL
;
720 Py_ssize_t abuffersize
= 0;
721 char fmt
[60]; /* should be enough for %0width.precisionld */
724 #ifdef VA_LIST_IS_ARRAY
725 Py_MEMCPY(count
, vargs
, sizeof(va_list));
728 __va_copy(count
, vargs
);
733 /* step 1: count the number of %S/%R/%s format specifications
734 * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
735 * objects once during step 3 and put the result in an array) */
736 for (f
= format
; *f
; f
++) {
739 while (*f
&& *f
!= '%' && !isalpha((unsigned)*f
))
743 if (*f
== 's' || *f
=='S' || *f
=='R')
747 /* step 2: allocate memory for the results of
748 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
750 callresults
= PyObject_Malloc(sizeof(PyObject
*)*callcount
);
755 callresult
= callresults
;
757 /* step 3: figure out how large a buffer we need */
758 for (f
= format
; *f
; f
++) {
762 while (isdigit((unsigned)*f
))
763 width
= (width
*10) + *f
++ - '0';
767 while (isdigit((unsigned)*f
))
768 precision
= (precision
*10) + *f
++ - '0';
771 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
772 * they don't affect the amount of space we reserve.
774 if ((*f
== 'l' || *f
== 'z') &&
775 (f
[1] == 'd' || f
[1] == 'u'))
781 int ordinal
= va_arg(count
, int);
782 #ifdef Py_UNICODE_WIDE
783 if (ordinal
< 0 || ordinal
> 0x10ffff) {
784 PyErr_SetString(PyExc_OverflowError
,
785 "%c arg not in range(0x110000) "
786 "(wide Python build)");
790 if (ordinal
< 0 || ordinal
> 0xffff) {
791 PyErr_SetString(PyExc_OverflowError
,
792 "%c arg not in range(0x10000) "
793 "(narrow Python build)");
797 /* fall through... */
802 case 'd': case 'u': case 'i': case 'x':
803 (void) va_arg(count
, int);
804 if (width
< precision
)
806 /* 20 bytes is enough to hold a 64-bit
807 integer. Decimal takes the most space.
808 This isn't enough for octal.
809 If a width is specified we need more
810 (which we allocate later). */
814 if (abuffersize
< width
)
820 const char *s
= va_arg(count
, const char*);
821 PyObject
*str
= PyUnicode_DecodeUTF8(s
, strlen(s
), "replace");
824 n
+= PyUnicode_GET_SIZE(str
);
825 /* Remember the str and switch to the next slot */
831 PyObject
*obj
= va_arg(count
, PyObject
*);
832 assert(obj
&& PyUnicode_Check(obj
));
833 n
+= PyUnicode_GET_SIZE(obj
);
838 PyObject
*obj
= va_arg(count
, PyObject
*);
839 const char *str
= va_arg(count
, const char *);
841 assert(!obj
|| PyUnicode_Check(obj
));
843 n
+= PyUnicode_GET_SIZE(obj
);
850 PyObject
*obj
= va_arg(count
, PyObject
*);
853 str
= PyObject_Str(obj
);
856 n
+= PyString_GET_SIZE(str
);
857 /* Remember the str and switch to the next slot */
863 PyObject
*obj
= va_arg(count
, PyObject
*);
866 repr
= PyObject_Repr(obj
);
869 n
+= PyUnicode_GET_SIZE(repr
);
870 /* Remember the repr and switch to the next slot */
871 *callresult
++ = repr
;
875 (void) va_arg(count
, int);
876 /* maximum 64-bit pointer representation:
878 * so 19 characters is enough.
879 * XXX I count 18 -- what's the extra for?
884 /* if we stumble upon an unknown
885 formatting code, copy the rest of
886 the format string to the output
887 string. (we cannot just skip the
888 code, since there's no way to know
889 what's in the argument list) */
897 if (abuffersize
> 20) {
898 /* add 1 for sprintf's trailing null byte */
899 abuffer
= PyObject_Malloc(abuffersize
+ 1);
904 realbuffer
= abuffer
;
908 /* step 4: fill the buffer */
909 /* Since we've analyzed how much space we need for the worst case,
910 we don't have to resize the string.
911 There can be no errors beyond this point. */
912 string
= PyUnicode_FromUnicode(NULL
, n
);
916 s
= PyUnicode_AS_UNICODE(string
);
917 callresult
= callresults
;
919 for (f
= format
; *f
; f
++) {
924 zeropad
= (*f
== '0');
925 /* parse the width.precision part */
927 while (isdigit((unsigned)*f
))
928 width
= (width
*10) + *f
++ - '0';
932 while (isdigit((unsigned)*f
))
933 precision
= (precision
*10) + *f
++ - '0';
935 /* handle the long flag, but only for %ld and %lu.
936 others can be added when necessary. */
937 if (*f
== 'l' && (f
[1] == 'd' || f
[1] == 'u')) {
941 /* handle the size_t flag. */
942 if (*f
== 'z' && (f
[1] == 'd' || f
[1] == 'u')) {
949 *s
++ = va_arg(vargs
, int);
952 makefmt(fmt
, longflag
, size_tflag
, zeropad
, width
, precision
, 'd');
954 sprintf(realbuffer
, fmt
, va_arg(vargs
, long));
956 sprintf(realbuffer
, fmt
, va_arg(vargs
, Py_ssize_t
));
958 sprintf(realbuffer
, fmt
, va_arg(vargs
, int));
959 appendstring(realbuffer
);
962 makefmt(fmt
, longflag
, size_tflag
, zeropad
, width
, precision
, 'u');
964 sprintf(realbuffer
, fmt
, va_arg(vargs
, unsigned long));
966 sprintf(realbuffer
, fmt
, va_arg(vargs
, size_t));
968 sprintf(realbuffer
, fmt
, va_arg(vargs
, unsigned int));
969 appendstring(realbuffer
);
972 makefmt(fmt
, 0, 0, zeropad
, width
, precision
, 'i');
973 sprintf(realbuffer
, fmt
, va_arg(vargs
, int));
974 appendstring(realbuffer
);
977 makefmt(fmt
, 0, 0, zeropad
, width
, precision
, 'x');
978 sprintf(realbuffer
, fmt
, va_arg(vargs
, int));
979 appendstring(realbuffer
);
983 /* unused, since we already have the result */
984 (void) va_arg(vargs
, char *);
985 Py_UNICODE_COPY(s
, PyUnicode_AS_UNICODE(*callresult
),
986 PyUnicode_GET_SIZE(*callresult
));
987 s
+= PyUnicode_GET_SIZE(*callresult
);
988 /* We're done with the unicode()/repr() => forget it */
989 Py_DECREF(*callresult
);
990 /* switch to next unicode()/repr() result */
996 PyObject
*obj
= va_arg(vargs
, PyObject
*);
997 Py_ssize_t size
= PyUnicode_GET_SIZE(obj
);
998 Py_UNICODE_COPY(s
, PyUnicode_AS_UNICODE(obj
), size
);
1004 PyObject
*obj
= va_arg(vargs
, PyObject
*);
1005 const char *str
= va_arg(vargs
, const char *);
1007 Py_ssize_t size
= PyUnicode_GET_SIZE(obj
);
1008 Py_UNICODE_COPY(s
, PyUnicode_AS_UNICODE(obj
), size
);
1018 const char *str
= PyString_AS_STRING(*callresult
);
1019 /* unused, since we already have the result */
1020 (void) va_arg(vargs
, PyObject
*);
1022 /* We're done with the unicode()/repr() => forget it */
1023 Py_DECREF(*callresult
);
1024 /* switch to next unicode()/repr() result */
1029 sprintf(buffer
, "%p", va_arg(vargs
, void*));
1030 /* %p is ill-defined: ensure leading 0x. */
1031 if (buffer
[1] == 'X')
1033 else if (buffer
[1] != 'x') {
1034 memmove(buffer
+2, buffer
, strlen(buffer
)+1);
1038 appendstring(buffer
);
1053 PyObject_Free(callresults
);
1055 PyObject_Free(abuffer
);
1056 PyUnicode_Resize(&string
, s
- PyUnicode_AS_UNICODE(string
));
1060 PyObject
**callresult2
= callresults
;
1061 while (callresult2
< callresult
) {
1062 Py_DECREF(*callresult2
);
1065 PyObject_Free(callresults
);
1068 PyObject_Free(abuffer
);
1075 PyUnicode_FromFormat(const char *format
, ...)
1080 #ifdef HAVE_STDARG_PROTOTYPES
1081 va_start(vargs
, format
);
1085 ret
= PyUnicode_FromFormatV(format
, vargs
);
1090 Py_ssize_t
PyUnicode_AsWideChar(PyUnicodeObject
*unicode
,
1094 if (unicode
== NULL
) {
1095 PyErr_BadInternalCall();
1099 /* If possible, try to copy the 0-termination as well */
1100 if (size
> PyUnicode_GET_SIZE(unicode
))
1101 size
= PyUnicode_GET_SIZE(unicode
) + 1;
1103 #ifdef HAVE_USABLE_WCHAR_T
1104 memcpy(w
, unicode
->str
, size
* sizeof(wchar_t));
1107 register Py_UNICODE
*u
;
1108 register Py_ssize_t i
;
1109 u
= PyUnicode_AS_UNICODE(unicode
);
1110 for (i
= size
; i
> 0; i
--)
1115 if (size
> PyUnicode_GET_SIZE(unicode
))
1116 return PyUnicode_GET_SIZE(unicode
);
1123 PyObject
*PyUnicode_FromOrdinal(int ordinal
)
1127 #ifdef Py_UNICODE_WIDE
1128 if (ordinal
< 0 || ordinal
> 0x10ffff) {
1129 PyErr_SetString(PyExc_ValueError
,
1130 "unichr() arg not in range(0x110000) "
1131 "(wide Python build)");
1135 if (ordinal
< 0 || ordinal
> 0xffff) {
1136 PyErr_SetString(PyExc_ValueError
,
1137 "unichr() arg not in range(0x10000) "
1138 "(narrow Python build)");
1143 s
[0] = (Py_UNICODE
)ordinal
;
1144 return PyUnicode_FromUnicode(s
, 1);
1147 PyObject
*PyUnicode_FromObject(register PyObject
*obj
)
1149 /* XXX Perhaps we should make this API an alias of
1150 PyObject_Unicode() instead ?! */
1151 if (PyUnicode_CheckExact(obj
)) {
1155 if (PyUnicode_Check(obj
)) {
1156 /* For a Unicode subtype that's not a Unicode object,
1157 return a true Unicode object with the same data. */
1158 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj
),
1159 PyUnicode_GET_SIZE(obj
));
1161 return PyUnicode_FromEncodedObject(obj
, NULL
, "strict");
1164 PyObject
*PyUnicode_FromEncodedObject(register PyObject
*obj
,
1165 const char *encoding
,
1168 const char *s
= NULL
;
1173 PyErr_BadInternalCall();
1178 /* For b/w compatibility we also accept Unicode objects provided
1179 that no encoding is given and then redirect to
1180 PyObject_Unicode() which then applies the additional logic for
1183 NOTE: This API should really only be used for objects which
1184 represent *encoded* Unicode !
1187 if (PyUnicode_Check(obj
)) {
1189 PyErr_SetString(PyExc_TypeError
,
1190 "decoding Unicode is not supported");
1193 return PyObject_Unicode(obj
);
1196 if (PyUnicode_Check(obj
)) {
1197 PyErr_SetString(PyExc_TypeError
,
1198 "decoding Unicode is not supported");
1204 if (PyString_Check(obj
)) {
1205 s
= PyString_AS_STRING(obj
);
1206 len
= PyString_GET_SIZE(obj
);
1208 else if (PyByteArray_Check(obj
)) {
1209 /* Python 2.x specific */
1210 PyErr_Format(PyExc_TypeError
,
1211 "decoding bytearray is not supported");
1214 else if (PyObject_AsCharBuffer(obj
, &s
, &len
)) {
1215 /* Overwrite the error message with something more useful in
1216 case of a TypeError. */
1217 if (PyErr_ExceptionMatches(PyExc_TypeError
))
1218 PyErr_Format(PyExc_TypeError
,
1219 "coercing to Unicode: need string or buffer, "
1221 Py_TYPE(obj
)->tp_name
);
1225 /* Convert to Unicode */
1227 _Py_RETURN_UNICODE_EMPTY();
1229 v
= PyUnicode_Decode(s
, len
, encoding
, errors
);
1236 PyObject
*PyUnicode_Decode(const char *s
,
1238 const char *encoding
,
1241 PyObject
*buffer
= NULL
, *unicode
;
1243 if (encoding
== NULL
)
1244 encoding
= PyUnicode_GetDefaultEncoding();
1246 /* Shortcuts for common default encodings */
1247 if (strcmp(encoding
, "utf-8") == 0)
1248 return PyUnicode_DecodeUTF8(s
, size
, errors
);
1249 else if (strcmp(encoding
, "latin-1") == 0)
1250 return PyUnicode_DecodeLatin1(s
, size
, errors
);
1251 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1252 else if (strcmp(encoding
, "mbcs") == 0)
1253 return PyUnicode_DecodeMBCS(s
, size
, errors
);
1255 else if (strcmp(encoding
, "ascii") == 0)
1256 return PyUnicode_DecodeASCII(s
, size
, errors
);
1258 /* Decode via the codec registry */
1259 buffer
= PyBuffer_FromMemory((void *)s
, size
);
1262 unicode
= PyCodec_Decode(buffer
, encoding
, errors
);
1263 if (unicode
== NULL
)
1265 if (!PyUnicode_Check(unicode
)) {
1266 PyErr_Format(PyExc_TypeError
,
1267 "decoder did not return an unicode object (type=%.400s)",
1268 Py_TYPE(unicode
)->tp_name
);
1280 PyObject
*PyUnicode_AsDecodedObject(PyObject
*unicode
,
1281 const char *encoding
,
1286 if (!PyUnicode_Check(unicode
)) {
1287 PyErr_BadArgument();
1291 if (encoding
== NULL
)
1292 encoding
= PyUnicode_GetDefaultEncoding();
1294 /* Decode via the codec registry */
1295 v
= PyCodec_Decode(unicode
, encoding
, errors
);
1304 PyObject
*PyUnicode_Encode(const Py_UNICODE
*s
,
1306 const char *encoding
,
1309 PyObject
*v
, *unicode
;
1311 unicode
= PyUnicode_FromUnicode(s
, size
);
1312 if (unicode
== NULL
)
1314 v
= PyUnicode_AsEncodedString(unicode
, encoding
, errors
);
1319 PyObject
*PyUnicode_AsEncodedObject(PyObject
*unicode
,
1320 const char *encoding
,
1325 if (!PyUnicode_Check(unicode
)) {
1326 PyErr_BadArgument();
1330 if (encoding
== NULL
)
1331 encoding
= PyUnicode_GetDefaultEncoding();
1333 /* Encode via the codec registry */
1334 v
= PyCodec_Encode(unicode
, encoding
, errors
);
1343 PyObject
*PyUnicode_AsEncodedString(PyObject
*unicode
,
1344 const char *encoding
,
1349 if (!PyUnicode_Check(unicode
)) {
1350 PyErr_BadArgument();
1354 if (encoding
== NULL
)
1355 encoding
= PyUnicode_GetDefaultEncoding();
1357 /* Shortcuts for common default encodings */
1358 if (errors
== NULL
) {
1359 if (strcmp(encoding
, "utf-8") == 0)
1360 return PyUnicode_AsUTF8String(unicode
);
1361 else if (strcmp(encoding
, "latin-1") == 0)
1362 return PyUnicode_AsLatin1String(unicode
);
1363 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1364 else if (strcmp(encoding
, "mbcs") == 0)
1365 return PyUnicode_AsMBCSString(unicode
);
1367 else if (strcmp(encoding
, "ascii") == 0)
1368 return PyUnicode_AsASCIIString(unicode
);
1371 /* Encode via the codec registry */
1372 v
= PyCodec_Encode(unicode
, encoding
, errors
);
1375 if (!PyString_Check(v
)) {
1376 PyErr_Format(PyExc_TypeError
,
1377 "encoder did not return a string object (type=%.400s)",
1378 Py_TYPE(v
)->tp_name
);
1388 PyObject
*_PyUnicode_AsDefaultEncodedString(PyObject
*unicode
,
1391 PyObject
*v
= ((PyUnicodeObject
*)unicode
)->defenc
;
1395 v
= PyUnicode_AsEncodedString(unicode
, NULL
, errors
);
1396 if (v
&& errors
== NULL
)
1397 ((PyUnicodeObject
*)unicode
)->defenc
= v
;
1401 Py_UNICODE
*PyUnicode_AsUnicode(PyObject
*unicode
)
1403 if (!PyUnicode_Check(unicode
)) {
1404 PyErr_BadArgument();
1407 return PyUnicode_AS_UNICODE(unicode
);
1413 Py_ssize_t
PyUnicode_GetSize(PyObject
*unicode
)
1415 if (!PyUnicode_Check(unicode
)) {
1416 PyErr_BadArgument();
1419 return PyUnicode_GET_SIZE(unicode
);
1425 const char *PyUnicode_GetDefaultEncoding(void)
1427 return unicode_default_encoding
;
1430 int PyUnicode_SetDefaultEncoding(const char *encoding
)
1434 /* Make sure the encoding is valid. As a side effect, this also
1435 loads the encoding into the codec registry cache. */
1436 v
= _PyCodec_Lookup(encoding
);
1440 strncpy(unicode_default_encoding
,
1442 sizeof(unicode_default_encoding
) - 1);
1449 /* error handling callback helper:
1450 build arguments, call the callback and check the arguments,
1451 if no exception occurred, copy the replacement to the output
1452 and adjust various state variables.
1453 return 0 on success, -1 on error
1457 int unicode_decode_call_errorhandler(const char *errors
, PyObject
**errorHandler
,
1458 const char *encoding
, const char *reason
,
1459 const char *input
, Py_ssize_t insize
, Py_ssize_t
*startinpos
,
1460 Py_ssize_t
*endinpos
, PyObject
**exceptionObject
, const char **inptr
,
1461 PyUnicodeObject
**output
, Py_ssize_t
*outpos
, Py_UNICODE
**outptr
)
1463 static char *argparse
= "O!n;decoding error handler must return (unicode, int) tuple";
1465 PyObject
*restuple
= NULL
;
1466 PyObject
*repunicode
= NULL
;
1467 Py_ssize_t outsize
= PyUnicode_GET_SIZE(*output
);
1468 Py_ssize_t requiredsize
;
1474 if (*errorHandler
== NULL
) {
1475 *errorHandler
= PyCodec_LookupError(errors
);
1476 if (*errorHandler
== NULL
)
1480 if (*exceptionObject
== NULL
) {
1481 *exceptionObject
= PyUnicodeDecodeError_Create(
1482 encoding
, input
, insize
, *startinpos
, *endinpos
, reason
);
1483 if (*exceptionObject
== NULL
)
1487 if (PyUnicodeDecodeError_SetStart(*exceptionObject
, *startinpos
))
1489 if (PyUnicodeDecodeError_SetEnd(*exceptionObject
, *endinpos
))
1491 if (PyUnicodeDecodeError_SetReason(*exceptionObject
, reason
))
1495 restuple
= PyObject_CallFunctionObjArgs(*errorHandler
, *exceptionObject
, NULL
);
1496 if (restuple
== NULL
)
1498 if (!PyTuple_Check(restuple
)) {
1499 PyErr_SetString(PyExc_TypeError
, &argparse
[4]);
1502 if (!PyArg_ParseTuple(restuple
, argparse
, &PyUnicode_Type
, &repunicode
, &newpos
))
1505 newpos
= insize
+newpos
;
1506 if (newpos
<0 || newpos
>insize
) {
1507 PyErr_Format(PyExc_IndexError
, "position %zd from error handler out of bounds", newpos
);
1511 /* need more space? (at least enough for what we
1512 have+the replacement+the rest of the string (starting
1513 at the new input position), so we won't have to check space
1514 when there are no errors in the rest of the string) */
1515 repptr
= PyUnicode_AS_UNICODE(repunicode
);
1516 repsize
= PyUnicode_GET_SIZE(repunicode
);
1517 requiredsize
= *outpos
;
1518 if (requiredsize
> PY_SSIZE_T_MAX
- repsize
)
1520 requiredsize
+= repsize
;
1521 if (requiredsize
> PY_SSIZE_T_MAX
- (insize
- newpos
))
1523 requiredsize
+= insize
- newpos
;
1524 if (requiredsize
> outsize
) {
1525 if (outsize
<= PY_SSIZE_T_MAX
/2 && requiredsize
< 2*outsize
)
1526 requiredsize
= 2*outsize
;
1527 if (_PyUnicode_Resize(output
, requiredsize
) < 0)
1529 *outptr
= PyUnicode_AS_UNICODE(*output
) + *outpos
;
1532 *inptr
= input
+ newpos
;
1533 Py_UNICODE_COPY(*outptr
, repptr
, repsize
);
1540 Py_XDECREF(restuple
);
1544 PyErr_SetString(PyExc_OverflowError
,
1545 "decoded result is too long for a Python string");
1549 /* --- UTF-7 Codec -------------------------------------------------------- */
1551 /* See RFC2152 for details. We encode conservatively and decode liberally. */
1553 /* Three simple macros defining base-64. */
/* Is c a base-64 character?
 *
 * The test is spelled out as explicit ASCII range comparisons instead of
 * calling isalnum(): isalnum() is locale-dependent (some locales classify
 * bytes 128-255 as alphanumeric) and its behaviour is undefined for
 * arguments that do not fit in an unsigned char, whereas this macro is
 * applied to Py_UNICODE values as well as to single bytes.
 *
 * Note: c is evaluated more than once; pass an expression without side
 * effects. */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')
/* Map a base-64 character c to its 6-bit value (0..63).
 *
 * Only meaningful when c is already known to be a base-64 character:
 * anything that is not a letter, a digit or '+' is reported as 63 (the
 * value of '/').  c is evaluated more than once. */

#define FROM_BASE64(c) \
    ((c) == '+' ? 62 : \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \
     63)
/* Base-64 character that encodes the low 6 bits of n; any higher bits of
 * n are masked off before the alphabet lookup. */

#define TO_BASE64(n) \
    (("ABCDEFGHIJKLMNOPQRSTUVWXYZ" \
      "abcdefghijklmnopqrstuvwxyz" \
      "0123456789+/")[(n) & 0x3f])
/* DECODE_DIRECT: true when a byte met in UTF-7 input stands for itself.
 * Decoding is permissive: every value up to 127 is taken literally,
 * with the single exception of '+', which opens a base-64 section. */

#define DECODE_DIRECT(c) \
    (!((c) > 127 || (c) == '+'))
1581 /* The UTF-7 encoder treats ASCII characters differently according to
1582 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
1583 * the above). See RFC2152. This array identifies these different
1586 * alphanumeric and '(),-./:?
1588 * !"#$%&*;<=>@[]^_`{|}
1591 * 3 : special (must be base64 encoded)
1592 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
1596 char utf7_category
[128] = {
1597 /* nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */
1598 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
1599 /* dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */
1600 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
1601 /* sp ! " # $ % & ' ( ) * + , - . / */
1602 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0,
1603 /* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
1604 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
1605 /* @ A B C D E F G H I J K L M N O */
1606 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1607 /* P Q R S T U V W X Y Z [ \ ] ^ _ */
1608 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
1609 /* ` a b c d e f g h i j k l m n o */
1610 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1611 /* p q r s t u v w x y z { | } ~ del */
1612 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3,
/* ENCODE_DIRECT: should character c be written to the UTF-7 output as
 * itself?  RFC2152 leaves two choices to the application, so they are
 * parameters here:
 *
 *   directO  - non-zero to emit characters of utf7_category 1 directly
 *   directWS - non-zero to emit characters of utf7_category 2 directly
 *
 * Characters of utf7_category 0 are always direct; NUL, anything >= 128
 * and characters of category 3 never are.
 *
 * directO and directWS are parenthesized in the expansion so that a
 * compound argument (e.g. "a || b") is treated as a single flag.  c is
 * evaluated more than once; pass an expression without side effects. */

#define ENCODE_DIRECT(c, directO, directWS) \
    ((c) < 128 && (c) > 0 && \
     ((utf7_category[(c)] == 0) || \
      ((directWS) && (utf7_category[(c)] == 2)) || \
      ((directO) && (utf7_category[(c)] == 1))))
1627 PyObject
*PyUnicode_DecodeUTF7(const char *s
,
1631 return PyUnicode_DecodeUTF7Stateful(s
, size
, errors
, NULL
);
1634 /* The decoder. The only state we preserve is our read position,
1635 * i.e. how many characters we have consumed. So if we end in the
1636 * middle of a shift sequence we have to back off the read position
1637 * and the output to the beginning of the sequence, otherwise we lose
1638 * all the shift state (seen bits, number of bits seen, high
1641 PyObject
*PyUnicode_DecodeUTF7Stateful(const char *s
,
1644 Py_ssize_t
*consumed
)
1646 const char *starts
= s
;
1647 Py_ssize_t startinpos
;
1648 Py_ssize_t endinpos
;
1651 PyUnicodeObject
*unicode
;
1653 const char *errmsg
= "";
1655 Py_UNICODE
*shiftOutStart
;
1656 unsigned int base64bits
= 0;
1657 unsigned long base64buffer
= 0;
1658 Py_UNICODE surrogate
= 0;
1659 PyObject
*errorHandler
= NULL
;
1660 PyObject
*exc
= NULL
;
1662 unicode
= _PyUnicode_New(size
);
1668 return (PyObject
*)unicode
;
1676 Py_UNICODE ch
= (unsigned char) *s
;
1678 if (inShift
) { /* in a base-64 section */
1679 if (IS_BASE64(ch
)) { /* consume a base-64 character */
1680 base64buffer
= (base64buffer
<< 6) | FROM_BASE64(ch
);
1683 if (base64bits
>= 16) {
1684 /* we have enough bits for a UTF-16 value */
1685 Py_UNICODE outCh
= (Py_UNICODE
)
1686 (base64buffer
>> (base64bits
-16));
1688 base64buffer
&= (1 << base64bits
) - 1; /* clear high bits */
1689 assert(outCh
<= 0xffff);
1691 /* expecting a second surrogate */
1692 if (outCh
>= 0xDC00 && outCh
<= 0xDFFF) {
1693 #ifdef Py_UNICODE_WIDE
1694 *p
++ = (((surrogate
& 0x3FF)<<10)
1695 | (outCh
& 0x3FF)) + 0x10000;
1708 if (outCh
>= 0xD800 && outCh
<= 0xDBFF) {
1709 /* first surrogate */
1717 else { /* now leaving a base-64 section */
1724 if (base64bits
> 0) { /* left-over bits */
1725 if (base64bits
>= 6) {
1726 /* We've seen at least one base-64 character */
1727 errmsg
= "partial character in shift sequence";
1731 /* Some bits remain; they should be zero */
1732 if (base64buffer
!= 0) {
1733 errmsg
= "non-zero padding bits in shift sequence";
1739 /* '-' is absorbed; other terminating
1740 characters are preserved */
1745 else if ( ch
== '+' ) {
1746 startinpos
= s
-starts
;
1747 s
++; /* consume '+' */
1748 if (s
< e
&& *s
== '-') { /* '+-' encodes '+' */
1752 else { /* begin base64-encoded section */
1759 else if (DECODE_DIRECT(ch
)) { /* character decodes as itself */
1764 startinpos
= s
-starts
;
1766 errmsg
= "unexpected special character";
1771 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
1772 endinpos
= s
-starts
;
1773 if (unicode_decode_call_errorhandler(
1774 errors
, &errorHandler
,
1776 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
1777 &unicode
, &outpos
, &p
))
1783 if (inShift
&& !consumed
) { /* in shift sequence, no more to follow */
1784 /* if we're in an inconsistent state, that's an error */
1786 (base64bits
>= 6) ||
1787 (base64bits
> 0 && base64buffer
!= 0)) {
1788 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
1790 if (unicode_decode_call_errorhandler(
1791 errors
, &errorHandler
,
1792 "utf7", "unterminated shift sequence",
1793 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
1794 &unicode
, &outpos
, &p
))
1802 p
= shiftOutStart
; /* back off output */
1803 *consumed
= startinpos
;
1806 *consumed
= s
-starts
;
1810 if (_PyUnicode_Resize(&unicode
, p
- PyUnicode_AS_UNICODE(unicode
)) < 0)
1813 Py_XDECREF(errorHandler
);
1815 return (PyObject
*)unicode
;
1818 Py_XDECREF(errorHandler
);
1825 PyObject
*PyUnicode_EncodeUTF7(const Py_UNICODE
*s
,
1828 int base64WhiteSpace
,
1832 /* It might be possible to tighten this worst case */
1833 Py_ssize_t allocated
= 8 * size
;
1836 unsigned int base64bits
= 0;
1837 unsigned long base64buffer
= 0;
1841 if (allocated
/ 8 != size
)
1842 return PyErr_NoMemory();
1845 return PyString_FromStringAndSize(NULL
, 0);
1847 v
= PyString_FromStringAndSize(NULL
, allocated
);
1851 start
= out
= PyString_AS_STRING(v
);
1852 for (;i
< size
; ++i
) {
1853 Py_UNICODE ch
= s
[i
];
1856 if (ENCODE_DIRECT(ch
, !base64SetO
, !base64WhiteSpace
)) {
1858 if (base64bits
) { /* output remaining bits */
1859 *out
++ = TO_BASE64(base64buffer
<< (6-base64bits
));
1864 /* Characters not in the BASE64 set implicitly unshift the sequence
1865 so no '-' is required, except if the character is itself a '-' */
1866 if (IS_BASE64(ch
) || ch
== '-') {
1875 else { /* not in a shift sequence */
1880 else if (ENCODE_DIRECT(ch
, !base64SetO
, !base64WhiteSpace
)) {
1891 #ifdef Py_UNICODE_WIDE
1892 if (ch
>= 0x10000) {
1893 /* code first surrogate */
1895 base64buffer
= (base64buffer
<< 16) | 0xd800 | ((ch
-0x10000) >> 10);
1896 while (base64bits
>= 6) {
1897 *out
++ = TO_BASE64(base64buffer
>> (base64bits
-6));
1900 /* prepare second surrogate */
1901 ch
= 0xDC00 | ((ch
-0x10000) & 0x3FF);
1905 base64buffer
= (base64buffer
<< 16) | ch
;
1906 while (base64bits
>= 6) {
1907 *out
++ = TO_BASE64(base64buffer
>> (base64bits
-6));
1912 *out
++= TO_BASE64(base64buffer
<< (6-base64bits
) );
1916 if (_PyString_Resize(&v
, out
- start
))
1924 #undef DECODE_DIRECT
1925 #undef ENCODE_DIRECT
1927 /* --- UTF-8 Codec -------------------------------------------------------- */
1930 char utf8_code_length
[256] = {
1931 /* Map UTF-8 encoded prefix byte to sequence length. Zero means
1932 illegal prefix. See RFC 3629 for details */
1933 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
1934 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1935 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1936 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1937 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1938 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1939 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1940 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
1941 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
1942 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1943 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1944 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
1945 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
1946 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
1947 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
1948 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* F0-F4 + F5-FF */
1951 PyObject
*PyUnicode_DecodeUTF8(const char *s
,
1955 return PyUnicode_DecodeUTF8Stateful(s
, size
, errors
, NULL
);
1958 PyObject
*PyUnicode_DecodeUTF8Stateful(const char *s
,
1961 Py_ssize_t
*consumed
)
1963 const char *starts
= s
;
1966 Py_ssize_t startinpos
;
1967 Py_ssize_t endinpos
;
1970 PyUnicodeObject
*unicode
;
1972 const char *errmsg
= "";
1973 PyObject
*errorHandler
= NULL
;
1974 PyObject
*exc
= NULL
;
1976 /* Note: size will always be longer than the resulting Unicode
1978 unicode
= _PyUnicode_New(size
);
1984 return (PyObject
*)unicode
;
1987 /* Unpack UTF-8 encoded data */
1992 Py_UCS4 ch
= (unsigned char)*s
;
1995 *p
++ = (Py_UNICODE
)ch
;
2000 n
= utf8_code_length
[ch
];
2006 errmsg
= "unexpected end of data";
2007 startinpos
= s
-starts
;
2008 endinpos
= startinpos
+1;
2009 for (k
=1; (k
< size
-startinpos
) && ((s
[k
]&0xC0) == 0x80); k
++)
2018 errmsg
= "invalid start byte";
2019 startinpos
= s
-starts
;
2020 endinpos
= startinpos
+1;
2024 errmsg
= "internal error";
2025 startinpos
= s
-starts
;
2026 endinpos
= startinpos
+1;
2030 if ((s
[1] & 0xc0) != 0x80) {
2031 errmsg
= "invalid continuation byte";
2032 startinpos
= s
-starts
;
2033 endinpos
= startinpos
+ 1;
2036 ch
= ((s
[0] & 0x1f) << 6) + (s
[1] & 0x3f);
2037 assert ((ch
> 0x007F) && (ch
<= 0x07FF));
2038 *p
++ = (Py_UNICODE
)ch
;
2042 /* XXX: surrogates shouldn't be valid UTF-8!
2043 see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2044 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
2045 Uncomment the 2 lines below to make them invalid,
2046 code points: d800-dfff; UTF-8: \xed\xa0\x80-\xed\xbf\xbf. */
2047 if ((s
[1] & 0xc0) != 0x80 ||
2048 (s
[2] & 0xc0) != 0x80 ||
2049 ((unsigned char)s
[0] == 0xE0 &&
2050 (unsigned char)s
[1] < 0xA0)/* ||
2051 ((unsigned char)s[0] == 0xED &&
2052 (unsigned char)s[1] > 0x9F)*/) {
2053 errmsg
= "invalid continuation byte";
2054 startinpos
= s
-starts
;
2055 endinpos
= startinpos
+ 1;
2057 /* if s[1] first two bits are 1 and 0, then the invalid
2058 continuation byte is s[2], so increment endinpos by 1,
2059 if not, s[1] is invalid and endinpos doesn't need to
2061 if ((s
[1] & 0xC0) == 0x80)
2065 ch
= ((s
[0] & 0x0f) << 12) + ((s
[1] & 0x3f) << 6) + (s
[2] & 0x3f);
2066 assert ((ch
> 0x07FF) && (ch
<= 0xFFFF));
2067 *p
++ = (Py_UNICODE
)ch
;
2071 if ((s
[1] & 0xc0) != 0x80 ||
2072 (s
[2] & 0xc0) != 0x80 ||
2073 (s
[3] & 0xc0) != 0x80 ||
2074 ((unsigned char)s
[0] == 0xF0 &&
2075 (unsigned char)s
[1] < 0x90) ||
2076 ((unsigned char)s
[0] == 0xF4 &&
2077 (unsigned char)s
[1] > 0x8F)) {
2078 errmsg
= "invalid continuation byte";
2079 startinpos
= s
-starts
;
2080 endinpos
= startinpos
+ 1;
2081 if ((s
[1] & 0xC0) == 0x80) {
2083 if ((s
[2] & 0xC0) == 0x80)
2088 ch
= ((s
[0] & 0x7) << 18) + ((s
[1] & 0x3f) << 12) +
2089 ((s
[2] & 0x3f) << 6) + (s
[3] & 0x3f);
2090 assert ((ch
> 0xFFFF) && (ch
<= 0x10ffff));
2092 #ifdef Py_UNICODE_WIDE
2093 *p
++ = (Py_UNICODE
)ch
;
2095 /* compute and append the two surrogates: */
2097 /* translate from 10000..10FFFF to 0..FFFF */
2100 /* high surrogate = top 10 bits added to D800 */
2101 *p
++ = (Py_UNICODE
)(0xD800 + (ch
>> 10));
2103 /* low surrogate = bottom 10 bits added to DC00 */
2104 *p
++ = (Py_UNICODE
)(0xDC00 + (ch
& 0x03FF));
2112 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
2113 if (unicode_decode_call_errorhandler(
2114 errors
, &errorHandler
,
2116 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2117 &unicode
, &outpos
, &p
))
2121 *consumed
= s
-starts
;
2124 if (_PyUnicode_Resize(&unicode
, p
- unicode
->str
) < 0)
2127 Py_XDECREF(errorHandler
);
2129 return (PyObject
*)unicode
;
2132 Py_XDECREF(errorHandler
);
2138 /* Allocation strategy: if the string is short, convert into a stack buffer
2139 and allocate exactly as much space needed at the end. Else allocate the
2140 maximum possible needed (4 result bytes per Unicode character), and return
2141 the excess memory at the end.
2144 PyUnicode_EncodeUTF8(const Py_UNICODE
*s
,
2148 #define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
2150 Py_ssize_t i
; /* index into s of next input byte */
2151 PyObject
*v
; /* result string object */
2152 char *p
; /* next free byte in output buffer */
2153 Py_ssize_t nallocated
; /* number of result bytes allocated */
2154 Py_ssize_t nneeded
; /* number of result bytes needed */
2155 char stackbuf
[MAX_SHORT_UNICHARS
* 4];
2160 if (size
<= MAX_SHORT_UNICHARS
) {
2161 /* Write into the stack buffer; nallocated can't overflow.
2162 * At the end, we'll allocate exactly as much heap space as it
2163 * turns out we need.
2165 nallocated
= Py_SAFE_DOWNCAST(sizeof(stackbuf
), size_t, int);
2166 v
= NULL
; /* will allocate after we're done */
2170 /* Overallocate on the heap, and give the excess back at the end. */
2171 nallocated
= size
* 4;
2172 if (nallocated
/ 4 != size
) /* overflow! */
2173 return PyErr_NoMemory();
2174 v
= PyString_FromStringAndSize(NULL
, nallocated
);
2177 p
= PyString_AS_STRING(v
);
2180 for (i
= 0; i
< size
;) {
2181 Py_UCS4 ch
= s
[i
++];
2187 else if (ch
< 0x0800) {
2188 /* Encode Latin-1 */
2189 *p
++ = (char)(0xc0 | (ch
>> 6));
2190 *p
++ = (char)(0x80 | (ch
& 0x3f));
2193 /* Encode UCS2 Unicode ordinals */
2195 /* Special case: check for high surrogate */
2196 if (0xD800 <= ch
&& ch
<= 0xDBFF && i
!= size
) {
2198 /* Check for low surrogate and combine the two to
2199 form a UCS4 value */
2200 if (0xDC00 <= ch2
&& ch2
<= 0xDFFF) {
2201 ch
= ((ch
- 0xD800) << 10 | (ch2
- 0xDC00)) + 0x10000;
2205 /* Fall through: handles isolated high surrogates */
2207 *p
++ = (char)(0xe0 | (ch
>> 12));
2208 *p
++ = (char)(0x80 | ((ch
>> 6) & 0x3f));
2209 *p
++ = (char)(0x80 | (ch
& 0x3f));
2213 /* Encode UCS4 Unicode ordinals */
2214 *p
++ = (char)(0xf0 | (ch
>> 18));
2215 *p
++ = (char)(0x80 | ((ch
>> 12) & 0x3f));
2216 *p
++ = (char)(0x80 | ((ch
>> 6) & 0x3f));
2217 *p
++ = (char)(0x80 | (ch
& 0x3f));
2222 /* This was stack allocated. */
2223 nneeded
= p
- stackbuf
;
2224 assert(nneeded
<= nallocated
);
2225 v
= PyString_FromStringAndSize(stackbuf
, nneeded
);
2228 /* Cut back to size actually needed. */
2229 nneeded
= p
- PyString_AS_STRING(v
);
2230 assert(nneeded
<= nallocated
);
2231 if (_PyString_Resize(&v
, nneeded
))
2236 #undef MAX_SHORT_UNICHARS
2239 PyObject
*PyUnicode_AsUTF8String(PyObject
*unicode
)
2241 if (!PyUnicode_Check(unicode
)) {
2242 PyErr_BadArgument();
2245 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode
),
2246 PyUnicode_GET_SIZE(unicode
),
2250 /* --- UTF-32 Codec ------------------------------------------------------- */
2253 PyUnicode_DecodeUTF32(const char *s
,
2258 return PyUnicode_DecodeUTF32Stateful(s
, size
, errors
, byteorder
, NULL
);
2262 PyUnicode_DecodeUTF32Stateful(const char *s
,
2266 Py_ssize_t
*consumed
)
2268 const char *starts
= s
;
2269 Py_ssize_t startinpos
;
2270 Py_ssize_t endinpos
;
2272 PyUnicodeObject
*unicode
;
2274 #ifndef Py_UNICODE_WIDE
2276 const unsigned char *qq
;
2278 const int pairs
= 0;
2280 const unsigned char *q
, *e
;
2281 int bo
= 0; /* assume native ordering by default */
2282 const char *errmsg
= "";
2283 /* Offsets from q for retrieving bytes in the right order. */
2284 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2285 int iorder
[] = {0, 1, 2, 3};
2287 int iorder
[] = {3, 2, 1, 0};
2289 PyObject
*errorHandler
= NULL
;
2290 PyObject
*exc
= NULL
;
2292 q
= (unsigned char *)s
;
2298 /* Check for BOM marks (U+FEFF) in the input and adjust current
2299 byte order setting accordingly. In native mode, the leading BOM
2300 mark is skipped, in all other modes, it is copied to the output
2301 stream as-is (giving a ZWNBSP character). */
2304 const Py_UCS4 bom
= (q
[iorder
[3]] << 24) | (q
[iorder
[2]] << 16) |
2305 (q
[iorder
[1]] << 8) | q
[iorder
[0]];
2306 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2307 if (bom
== 0x0000FEFF) {
2311 else if (bom
== 0xFFFE0000) {
2316 if (bom
== 0x0000FEFF) {
2320 else if (bom
== 0xFFFE0000) {
2343 /* On narrow builds we split characters outside the BMP into two
2344 code points => count how much extra space we need. */
2345 #ifndef Py_UNICODE_WIDE
2346 for (qq
= q
; e
- qq
>= 4; qq
+= 4)
2347 if (qq
[iorder
[2]] != 0 || qq
[iorder
[3]] != 0)
2351 /* This might be one too many, because of a BOM */
2352 unicode
= _PyUnicode_New((size
+3)/4+pairs
);
2356 return (PyObject
*)unicode
;
2358 /* Unpack UTF-32 encoded data */
2363 /* remaining bytes at the end? (size should be divisible by 4) */
2367 errmsg
= "truncated data";
2368 startinpos
= ((const char *)q
)-starts
;
2369 endinpos
= ((const char *)e
)-starts
;
2371 /* The remaining input chars are ignored if the callback
2372 chooses to skip the input */
2374 ch
= (q
[iorder
[3]] << 24) | (q
[iorder
[2]] << 16) |
2375 (q
[iorder
[1]] << 8) | q
[iorder
[0]];
2379 errmsg
= "code point not in range(0x110000)";
2380 startinpos
= ((const char *)q
)-starts
;
2381 endinpos
= startinpos
+4;
2384 #ifndef Py_UNICODE_WIDE
2387 *p
++ = 0xD800 | ((ch
-0x10000) >> 10);
2388 *p
++ = 0xDC00 | ((ch
-0x10000) & 0x3FF);
2396 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
2397 if (unicode_decode_call_errorhandler(
2398 errors
, &errorHandler
,
2400 starts
, size
, &startinpos
, &endinpos
, &exc
, (const char **)&q
,
2401 &unicode
, &outpos
, &p
))
2409 *consumed
= (const char *)q
-starts
;
2412 if (_PyUnicode_Resize(&unicode
, p
- unicode
->str
) < 0)
2415 Py_XDECREF(errorHandler
);
2417 return (PyObject
*)unicode
;
2421 Py_XDECREF(errorHandler
);
2427 PyUnicode_EncodeUTF32(const Py_UNICODE
*s
,
2434 Py_ssize_t nsize
, bytesize
;
2435 #ifndef Py_UNICODE_WIDE
2436 Py_ssize_t i
, pairs
;
2438 const int pairs
= 0;
2440 /* Offsets from p for storing byte pairs in the right order. */
2441 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2442 int iorder
[] = {0, 1, 2, 3};
2444 int iorder
[] = {3, 2, 1, 0};
2447 #define STORECHAR(CH) \
2449 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2450 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2451 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2452 p[iorder[0]] = (CH) & 0xff; \
2456 /* In narrow builds we can output surrogate pairs as one code point,
2457 so we need less space. */
2458 #ifndef Py_UNICODE_WIDE
2459 for (i
= pairs
= 0; i
< size
-1; i
++)
2460 if (0xD800 <= s
[i
] && s
[i
] <= 0xDBFF &&
2461 0xDC00 <= s
[i
+1] && s
[i
+1] <= 0xDFFF)
2464 nsize
= (size
- pairs
+ (byteorder
== 0));
2465 bytesize
= nsize
* 4;
2466 if (bytesize
/ 4 != nsize
)
2467 return PyErr_NoMemory();
2468 v
= PyString_FromStringAndSize(NULL
, bytesize
);
2472 p
= (unsigned char *)PyString_AS_STRING(v
);
2478 if (byteorder
== -1) {
2485 else if (byteorder
== 1) {
2493 while (size
-- > 0) {
2495 #ifndef Py_UNICODE_WIDE
2496 if (0xD800 <= ch
&& ch
<= 0xDBFF && size
> 0) {
2498 if (0xDC00 <= ch2
&& ch2
<= 0xDFFF) {
2499 ch
= (((ch
& 0x3FF)<<10) | (ch2
& 0x3FF)) + 0x10000;
2511 PyObject
*PyUnicode_AsUTF32String(PyObject
*unicode
)
2513 if (!PyUnicode_Check(unicode
)) {
2514 PyErr_BadArgument();
2517 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode
),
2518 PyUnicode_GET_SIZE(unicode
),
2523 /* --- UTF-16 Codec ------------------------------------------------------- */
2526 PyUnicode_DecodeUTF16(const char *s
,
2531 return PyUnicode_DecodeUTF16Stateful(s
, size
, errors
, byteorder
, NULL
);
2535 PyUnicode_DecodeUTF16Stateful(const char *s
,
2539 Py_ssize_t
*consumed
)
2541 const char *starts
= s
;
2542 Py_ssize_t startinpos
;
2543 Py_ssize_t endinpos
;
2545 PyUnicodeObject
*unicode
;
2547 const unsigned char *q
, *e
;
2548 int bo
= 0; /* assume native ordering by default */
2549 const char *errmsg
= "";
2550 /* Offsets from q for retrieving byte pairs in the right order. */
2551 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2552 int ihi
= 1, ilo
= 0;
2554 int ihi
= 0, ilo
= 1;
2556 PyObject
*errorHandler
= NULL
;
2557 PyObject
*exc
= NULL
;
2559 /* Note: size will always be longer than the resulting Unicode
2561 unicode
= _PyUnicode_New(size
);
2565 return (PyObject
*)unicode
;
2567 /* Unpack UTF-16 encoded data */
2569 q
= (unsigned char *)s
;
2575 /* Check for BOM marks (U+FEFF) in the input and adjust current
2576 byte order setting accordingly. In native mode, the leading BOM
2577 mark is skipped, in all other modes, it is copied to the output
2578 stream as-is (giving a ZWNBSP character). */
2581 const Py_UNICODE bom
= (q
[ihi
] << 8) | q
[ilo
];
2582 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2583 if (bom
== 0xFEFF) {
2587 else if (bom
== 0xFFFE) {
2592 if (bom
== 0xFEFF) {
2596 else if (bom
== 0xFFFE) {
2617 /* remaining bytes at the end? (size should be even) */
2621 errmsg
= "truncated data";
2622 startinpos
= ((const char *)q
)-starts
;
2623 endinpos
= ((const char *)e
)-starts
;
2625 /* The remaining input chars are ignored if the callback
2626 chooses to skip the input */
2628 ch
= (q
[ihi
] << 8) | q
[ilo
];
2632 if (ch
< 0xD800 || ch
> 0xDFFF) {
2637 /* UTF-16 code pair: */
2642 errmsg
= "unexpected end of data";
2643 startinpos
= ((const char *)q
)-starts
;
2644 endinpos
= ((const char *)e
)-starts
;
2647 if (0xD800 <= ch
&& ch
<= 0xDBFF) {
2648 Py_UNICODE ch2
= (q
[ihi
] << 8) | q
[ilo
];
2650 if (0xDC00 <= ch2
&& ch2
<= 0xDFFF) {
2651 #ifndef Py_UNICODE_WIDE
2655 *p
++ = (((ch
& 0x3FF)<<10) | (ch2
& 0x3FF)) + 0x10000;
2660 errmsg
= "illegal UTF-16 surrogate";
2661 startinpos
= (((const char *)q
)-4)-starts
;
2662 endinpos
= startinpos
+2;
2667 errmsg
= "illegal encoding";
2668 startinpos
= (((const char *)q
)-2)-starts
;
2669 endinpos
= startinpos
+2;
2670 /* Fall through to report the error */
2673 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
2674 if (unicode_decode_call_errorhandler(
2675 errors
, &errorHandler
,
2677 starts
, size
, &startinpos
, &endinpos
, &exc
, (const char **)&q
,
2678 &unicode
, &outpos
, &p
))
2686 *consumed
= (const char *)q
-starts
;
2689 if (_PyUnicode_Resize(&unicode
, p
- unicode
->str
) < 0)
2692 Py_XDECREF(errorHandler
);
2694 return (PyObject
*)unicode
;
2698 Py_XDECREF(errorHandler
);
2704 PyUnicode_EncodeUTF16(const Py_UNICODE
*s
,
2711 Py_ssize_t nsize
, bytesize
;
2712 #ifdef Py_UNICODE_WIDE
2713 Py_ssize_t i
, pairs
;
2715 const int pairs
= 0;
2717 /* Offsets from p for storing byte pairs in the right order. */
2718 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2719 int ihi
= 1, ilo
= 0;
2721 int ihi
= 0, ilo
= 1;
2724 #define STORECHAR(CH) \
2726 p[ihi] = ((CH) >> 8) & 0xff; \
2727 p[ilo] = (CH) & 0xff; \
2731 #ifdef Py_UNICODE_WIDE
2732 for (i
= pairs
= 0; i
< size
; i
++)
2733 if (s
[i
] >= 0x10000)
2736 /* 2 * (size + pairs + (byteorder == 0)) */
2737 if (size
> PY_SSIZE_T_MAX
||
2738 size
> PY_SSIZE_T_MAX
- pairs
- (byteorder
== 0))
2739 return PyErr_NoMemory();
2740 nsize
= size
+ pairs
+ (byteorder
== 0);
2741 bytesize
= nsize
* 2;
2742 if (bytesize
/ 2 != nsize
)
2743 return PyErr_NoMemory();
2744 v
= PyString_FromStringAndSize(NULL
, bytesize
);
2748 p
= (unsigned char *)PyString_AS_STRING(v
);
2754 if (byteorder
== -1) {
2759 else if (byteorder
== 1) {
2765 while (size
-- > 0) {
2766 Py_UNICODE ch
= *s
++;
2768 #ifdef Py_UNICODE_WIDE
2769 if (ch
>= 0x10000) {
2770 ch2
= 0xDC00 | ((ch
-0x10000) & 0x3FF);
2771 ch
= 0xD800 | ((ch
-0x10000) >> 10);
2782 PyObject
*PyUnicode_AsUTF16String(PyObject
*unicode
)
2784 if (!PyUnicode_Check(unicode
)) {
2785 PyErr_BadArgument();
2788 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode
),
2789 PyUnicode_GET_SIZE(unicode
),
2794 /* --- Unicode Escape Codec ----------------------------------------------- */
2796 static _PyUnicode_Name_CAPI
*ucnhash_CAPI
= NULL
;
2798 PyObject
*PyUnicode_DecodeUnicodeEscape(const char *s
,
2802 const char *starts
= s
;
2803 Py_ssize_t startinpos
;
2804 Py_ssize_t endinpos
;
2810 Py_UCS4 chr
= 0xffffffff; /* in case 'getcode' messes up */
2811 PyObject
*errorHandler
= NULL
;
2812 PyObject
*exc
= NULL
;
2814 /* Escaped strings will always be longer than the resulting
2815 Unicode string, so we start with size here and then reduce the
2816 length after conversion to the true value.
2817 (but if the error callback returns a long replacement string
2818 we'll have to allocate more space) */
2819 v
= _PyUnicode_New(size
);
2823 return (PyObject
*)v
;
2825 p
= PyUnicode_AS_UNICODE(v
);
2833 /* Non-escape characters are interpreted as Unicode ordinals */
2835 *p
++ = (unsigned char) *s
++;
2839 startinpos
= s
-starts
;
2844 c
= '\0'; /* Invalid after \ */
2849 case '\\': *p
++ = '\\'; break;
2850 case '\'': *p
++ = '\''; break;
2851 case '\"': *p
++ = '\"'; break;
2852 case 'b': *p
++ = '\b'; break;
2853 case 'f': *p
++ = '\014'; break; /* FF */
2854 case 't': *p
++ = '\t'; break;
2855 case 'n': *p
++ = '\n'; break;
2856 case 'r': *p
++ = '\r'; break;
2857 case 'v': *p
++ = '\013'; break; /* VT */
2858 case 'a': *p
++ = '\007'; break; /* BEL, not classic C */
2860 /* \OOO (octal) escapes */
2861 case '0': case '1': case '2': case '3':
2862 case '4': case '5': case '6': case '7':
2864 if (s
< end
&& '0' <= *s
&& *s
<= '7') {
2865 x
= (x
<<3) + *s
++ - '0';
2866 if (s
< end
&& '0' <= *s
&& *s
<= '7')
2867 x
= (x
<<3) + *s
++ - '0';
2876 message
= "truncated \\xXX escape";
2882 message
= "truncated \\uXXXX escape";
2888 message
= "truncated \\UXXXXXXXX escape";
2891 if (end
- s
< digits
) {
2892 /* count only hex digits */
2893 for (; s
< end
; ++s
) {
2894 c
= (unsigned char)*s
;
2895 if (!Py_ISXDIGIT(c
))
2900 for (; digits
--; ++s
) {
2901 c
= (unsigned char)*s
;
2902 if (!Py_ISXDIGIT(c
))
2904 chr
= (chr
<<4) & ~0xF;
2905 if (c
>= '0' && c
<= '9')
2907 else if (c
>= 'a' && c
<= 'f')
2908 chr
+= 10 + c
- 'a';
2910 chr
+= 10 + c
- 'A';
2912 if (chr
== 0xffffffff && PyErr_Occurred())
2913 /* _decoding_error will have already written into the
2917 /* when we get here, chr is a 32-bit unicode character */
2919 /* UCS-2 character */
2920 *p
++ = (Py_UNICODE
) chr
;
2921 else if (chr
<= 0x10ffff) {
2922 /* UCS-4 character. Either store directly, or as
2924 #ifdef Py_UNICODE_WIDE
2928 *p
++ = 0xD800 + (Py_UNICODE
) (chr
>> 10);
2929 *p
++ = 0xDC00 + (Py_UNICODE
) (chr
& 0x03FF);
2932 message
= "illegal Unicode character";
2939 message
= "malformed \\N character escape";
2940 if (ucnhash_CAPI
== NULL
) {
2941 /* load the unicode data module */
2942 ucnhash_CAPI
= (_PyUnicode_Name_CAPI
*)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME
, 1);
2943 if (ucnhash_CAPI
== NULL
)
2947 const char *start
= s
+1;
2948 /* look for the closing brace */
2949 while (*s
!= '}' && s
< end
)
2951 if (s
> start
&& s
< end
&& *s
== '}') {
2952 /* found a name. look it up in the unicode database */
2953 message
= "unknown Unicode character name";
2955 if (s
- start
- 1 <= INT_MAX
&&
2956 ucnhash_CAPI
->getcode(NULL
, start
, (int)(s
-start
-1), &chr
))
2964 message
= "\\ at end of string";
2970 *p
++ = (unsigned char)s
[-1];
2977 endinpos
= s
-starts
;
2978 outpos
= p
-PyUnicode_AS_UNICODE(v
);
2979 if (unicode_decode_call_errorhandler(
2980 errors
, &errorHandler
,
2981 "unicodeescape", message
,
2982 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2987 if (_PyUnicode_Resize(&v
, p
- PyUnicode_AS_UNICODE(v
)) < 0)
2989 Py_XDECREF(errorHandler
);
2991 return (PyObject
*)v
;
2996 "\\N escapes not supported (can't load unicodedata module)"
2999 Py_XDECREF(errorHandler
);
3005 Py_XDECREF(errorHandler
);
3010 /* Return a Unicode-Escape string version of the Unicode object.
3012 If quotes is true, the string is enclosed in u"" or u'' quotes as
3017 Py_LOCAL_INLINE(const Py_UNICODE
*) findchar(const Py_UNICODE
*s
,
3021 /* like wcschr, but doesn't stop at NULL characters */
3023 while (size
-- > 0) {
3033 PyObject
*unicodeescape_string(const Py_UNICODE
*s
,
3040 static const char *hexdigit
= "0123456789abcdef";
3041 #ifdef Py_UNICODE_WIDE
3042 const Py_ssize_t expandsize
= 10;
3044 const Py_ssize_t expandsize
= 6;
3047 /* XXX(nnorwitz): rather than over-allocating, it would be
3048 better to choose a different scheme. Perhaps scan the
3049 first N-chars of the string and allocate based on that size.
3051 /* Initial allocation is based on the longest-possible unichr
3054 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3055 unichr, so in this case it's the longest unichr escape. In
3056 narrow (UTF-16) builds this is five chars per source unichr
3057 since there are two unichrs in the surrogate pair, so in narrow
3058 (UTF-16) builds it's not the longest unichr escape.
3060 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3061 so in the narrow (UTF-16) build case it's the longest unichr
3065 if (size
> (PY_SSIZE_T_MAX
- 2 - 1) / expandsize
)
3066 return PyErr_NoMemory();
3068 repr
= PyString_FromStringAndSize(NULL
,
3075 p
= PyString_AS_STRING(repr
);
3079 *p
++ = (findchar(s
, size
, '\'') &&
3080 !findchar(s
, size
, '"')) ? '"' : '\'';
3082 while (size
-- > 0) {
3083 Py_UNICODE ch
= *s
++;
3085 /* Escape quotes and backslashes */
3087 ch
== (Py_UNICODE
) PyString_AS_STRING(repr
)[1]) || ch
== '\\') {
3093 #ifdef Py_UNICODE_WIDE
3094 /* Map 21-bit characters to '\U00xxxxxx' */
3095 else if (ch
>= 0x10000) {
3098 *p
++ = hexdigit
[(ch
>> 28) & 0x0000000F];
3099 *p
++ = hexdigit
[(ch
>> 24) & 0x0000000F];
3100 *p
++ = hexdigit
[(ch
>> 20) & 0x0000000F];
3101 *p
++ = hexdigit
[(ch
>> 16) & 0x0000000F];
3102 *p
++ = hexdigit
[(ch
>> 12) & 0x0000000F];
3103 *p
++ = hexdigit
[(ch
>> 8) & 0x0000000F];
3104 *p
++ = hexdigit
[(ch
>> 4) & 0x0000000F];
3105 *p
++ = hexdigit
[ch
& 0x0000000F];
3109 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3110 else if (ch
>= 0xD800 && ch
< 0xDC00) {
3116 if (ch2
>= 0xDC00 && ch2
<= 0xDFFF) {
3117 ucs
= (((ch
& 0x03FF) << 10) | (ch2
& 0x03FF)) + 0x00010000;
3120 *p
++ = hexdigit
[(ucs
>> 28) & 0x0000000F];
3121 *p
++ = hexdigit
[(ucs
>> 24) & 0x0000000F];
3122 *p
++ = hexdigit
[(ucs
>> 20) & 0x0000000F];
3123 *p
++ = hexdigit
[(ucs
>> 16) & 0x0000000F];
3124 *p
++ = hexdigit
[(ucs
>> 12) & 0x0000000F];
3125 *p
++ = hexdigit
[(ucs
>> 8) & 0x0000000F];
3126 *p
++ = hexdigit
[(ucs
>> 4) & 0x0000000F];
3127 *p
++ = hexdigit
[ucs
& 0x0000000F];
3130 /* Fall through: isolated surrogates are copied as-is */
3136 /* Map 16-bit characters to '\uxxxx' */
3140 *p
++ = hexdigit
[(ch
>> 12) & 0x000F];
3141 *p
++ = hexdigit
[(ch
>> 8) & 0x000F];
3142 *p
++ = hexdigit
[(ch
>> 4) & 0x000F];
3143 *p
++ = hexdigit
[ch
& 0x000F];
3146 /* Map special whitespace to '\t', '\n', '\r' */
3147 else if (ch
== '\t') {
3151 else if (ch
== '\n') {
3155 else if (ch
== '\r') {
3160 /* Map non-printable US ASCII to '\xhh' */
3161 else if (ch
< ' ' || ch
>= 0x7F) {
3164 *p
++ = hexdigit
[(ch
>> 4) & 0x000F];
3165 *p
++ = hexdigit
[ch
& 0x000F];
3168 /* Copy everything else as-is */
3173 *p
++ = PyString_AS_STRING(repr
)[1];
3176 if (_PyString_Resize(&repr
, p
- PyString_AS_STRING(repr
)))
3181 PyObject
*PyUnicode_EncodeUnicodeEscape(const Py_UNICODE
*s
,
3184 return unicodeescape_string(s
, size
, 0);
3187 PyObject
*PyUnicode_AsUnicodeEscapeString(PyObject
*unicode
)
3189 if (!PyUnicode_Check(unicode
)) {
3190 PyErr_BadArgument();
3193 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode
),
3194 PyUnicode_GET_SIZE(unicode
));
3197 /* --- Raw Unicode Escape Codec ------------------------------------------- */
3199 PyObject
*PyUnicode_DecodeRawUnicodeEscape(const char *s
,
3203 const char *starts
= s
;
3204 Py_ssize_t startinpos
;
3205 Py_ssize_t endinpos
;
3211 PyObject
*errorHandler
= NULL
;
3212 PyObject
*exc
= NULL
;
3214 /* Escaped strings will always be longer than the resulting
3215 Unicode string, so we start with size here and then reduce the
3216 length after conversion to the true value. (But decoding error
3217 handler might have to resize the string) */
3218 v
= _PyUnicode_New(size
);
3222 return (PyObject
*)v
;
3223 p
= PyUnicode_AS_UNICODE(v
);
3231 /* Non-escape characters are interpreted as Unicode ordinals */
3233 *p
++ = (unsigned char)*s
++;
3236 startinpos
= s
-starts
;
3238 /* \u-escapes are only interpreted iff the number of leading
3239 backslashes is odd */
3244 *p
++ = (unsigned char)*s
++;
3246 if (((s
- bs
) & 1) == 0 ||
3248 (*s
!= 'u' && *s
!= 'U')) {
3252 count
= *s
=='u' ? 4 : 8;
3255 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3256 outpos
= p
-PyUnicode_AS_UNICODE(v
);
3257 for (x
= 0, i
= 0; i
< count
; ++i
, ++s
) {
3258 c
= (unsigned char)*s
;
3260 endinpos
= s
-starts
;
3261 if (unicode_decode_call_errorhandler(
3262 errors
, &errorHandler
,
3263 "rawunicodeescape", "truncated \\uXXXX",
3264 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
3270 if (c
>= '0' && c
<= '9')
3272 else if (c
>= 'a' && c
<= 'f')
3278 /* UCS-2 character */
3279 *p
++ = (Py_UNICODE
) x
;
3280 else if (x
<= 0x10ffff) {
3281 /* UCS-4 character. Either store directly, or as
3283 #ifdef Py_UNICODE_WIDE
3284 *p
++ = (Py_UNICODE
) x
;
3287 *p
++ = 0xD800 + (Py_UNICODE
) (x
>> 10);
3288 *p
++ = 0xDC00 + (Py_UNICODE
) (x
& 0x03FF);
3291 endinpos
= s
-starts
;
3292 outpos
= p
-PyUnicode_AS_UNICODE(v
);
3293 if (unicode_decode_call_errorhandler(
3294 errors
, &errorHandler
,
3295 "rawunicodeescape", "\\Uxxxxxxxx out of range",
3296 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
3303 if (_PyUnicode_Resize(&v
, p
- PyUnicode_AS_UNICODE(v
)) < 0)
3305 Py_XDECREF(errorHandler
);
3307 return (PyObject
*)v
;
3311 Py_XDECREF(errorHandler
);
3316 PyObject
*PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE
*s
,
3323 static const char *hexdigit
= "0123456789abcdef";
3324 #ifdef Py_UNICODE_WIDE
3325 const Py_ssize_t expandsize
= 10;
3327 const Py_ssize_t expandsize
= 6;
3330 if (size
> PY_SSIZE_T_MAX
/ expandsize
)
3331 return PyErr_NoMemory();
3333 repr
= PyString_FromStringAndSize(NULL
, expandsize
* size
);
3339 p
= q
= PyString_AS_STRING(repr
);
3340 while (size
-- > 0) {
3341 Py_UNICODE ch
= *s
++;
3342 #ifdef Py_UNICODE_WIDE
3343 /* Map 32-bit characters to '\Uxxxxxxxx' */
3344 if (ch
>= 0x10000) {
3347 *p
++ = hexdigit
[(ch
>> 28) & 0xf];
3348 *p
++ = hexdigit
[(ch
>> 24) & 0xf];
3349 *p
++ = hexdigit
[(ch
>> 20) & 0xf];
3350 *p
++ = hexdigit
[(ch
>> 16) & 0xf];
3351 *p
++ = hexdigit
[(ch
>> 12) & 0xf];
3352 *p
++ = hexdigit
[(ch
>> 8) & 0xf];
3353 *p
++ = hexdigit
[(ch
>> 4) & 0xf];
3354 *p
++ = hexdigit
[ch
& 15];
3358 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3359 if (ch
>= 0xD800 && ch
< 0xDC00) {
3365 if (ch2
>= 0xDC00 && ch2
<= 0xDFFF) {
3366 ucs
= (((ch
& 0x03FF) << 10) | (ch2
& 0x03FF)) + 0x00010000;
3369 *p
++ = hexdigit
[(ucs
>> 28) & 0xf];
3370 *p
++ = hexdigit
[(ucs
>> 24) & 0xf];
3371 *p
++ = hexdigit
[(ucs
>> 20) & 0xf];
3372 *p
++ = hexdigit
[(ucs
>> 16) & 0xf];
3373 *p
++ = hexdigit
[(ucs
>> 12) & 0xf];
3374 *p
++ = hexdigit
[(ucs
>> 8) & 0xf];
3375 *p
++ = hexdigit
[(ucs
>> 4) & 0xf];
3376 *p
++ = hexdigit
[ucs
& 0xf];
3379 /* Fall through: isolated surrogates are copied as-is */
3384 /* Map 16-bit characters to '\uxxxx' */
3388 *p
++ = hexdigit
[(ch
>> 12) & 0xf];
3389 *p
++ = hexdigit
[(ch
>> 8) & 0xf];
3390 *p
++ = hexdigit
[(ch
>> 4) & 0xf];
3391 *p
++ = hexdigit
[ch
& 15];
3393 /* Copy everything else as-is */
3398 if (_PyString_Resize(&repr
, p
- q
))
3403 PyObject
*PyUnicode_AsRawUnicodeEscapeString(PyObject
*unicode
)
3405 if (!PyUnicode_Check(unicode
)) {
3406 PyErr_BadArgument();
3409 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode
),
3410 PyUnicode_GET_SIZE(unicode
));
3413 /* --- Unicode Internal Codec ------------------------------------------- */
3415 PyObject
*_PyUnicode_DecodeUnicodeInternal(const char *s
,
3419 const char *starts
= s
;
3420 Py_ssize_t startinpos
;
3421 Py_ssize_t endinpos
;
3427 PyObject
*errorHandler
= NULL
;
3428 PyObject
*exc
= NULL
;
3430 #ifdef Py_UNICODE_WIDE
3431 Py_UNICODE unimax
= PyUnicode_GetMax();
3434 /* XXX overflow detection missing */
3435 v
= _PyUnicode_New((size
+Py_UNICODE_SIZE
-1)/ Py_UNICODE_SIZE
);
3438 if (PyUnicode_GetSize((PyObject
*)v
) == 0)
3439 return (PyObject
*)v
;
3440 p
= PyUnicode_AS_UNICODE(v
);
3444 if (end
-s
< Py_UNICODE_SIZE
) {
3445 endinpos
= end
-starts
;
3446 reason
= "truncated input";
3449 memcpy(p
, s
, sizeof(Py_UNICODE
));
3450 #ifdef Py_UNICODE_WIDE
3451 /* We have to sanity check the raw data, otherwise doom looms for
3452 some malformed UCS-4 data. */
3453 if (*p
> unimax
|| *p
< 0) {
3454 endinpos
= s
- starts
+ Py_UNICODE_SIZE
;
3455 reason
= "illegal code point (> 0x10FFFF)";
3460 s
+= Py_UNICODE_SIZE
;
3464 startinpos
= s
- starts
;
3465 outpos
= p
- PyUnicode_AS_UNICODE(v
);
3466 if (unicode_decode_call_errorhandler(
3467 errors
, &errorHandler
,
3468 "unicode_internal", reason
,
3469 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
3475 if (_PyUnicode_Resize(&v
, p
- PyUnicode_AS_UNICODE(v
)) < 0)
3477 Py_XDECREF(errorHandler
);
3479 return (PyObject
*)v
;
3483 Py_XDECREF(errorHandler
);
3488 /* --- Latin-1 Codec ------------------------------------------------------ */
3490 PyObject
*PyUnicode_DecodeLatin1(const char *s
,
3497 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
3499 Py_UNICODE r
= *(unsigned char*)s
;
3500 return PyUnicode_FromUnicode(&r
, 1);
3503 v
= _PyUnicode_New(size
);
3507 return (PyObject
*)v
;
3508 p
= PyUnicode_AS_UNICODE(v
);
3510 *p
++ = (unsigned char)*s
++;
3511 return (PyObject
*)v
;
3518 /* create or adjust a UnicodeEncodeError */
3519 static void make_encode_exception(PyObject
**exceptionObject
,
3520 const char *encoding
,
3521 const Py_UNICODE
*unicode
, Py_ssize_t size
,
3522 Py_ssize_t startpos
, Py_ssize_t endpos
,
3525 if (*exceptionObject
== NULL
) {
3526 *exceptionObject
= PyUnicodeEncodeError_Create(
3527 encoding
, unicode
, size
, startpos
, endpos
, reason
);
3530 if (PyUnicodeEncodeError_SetStart(*exceptionObject
, startpos
))
3532 if (PyUnicodeEncodeError_SetEnd(*exceptionObject
, endpos
))
3534 if (PyUnicodeEncodeError_SetReason(*exceptionObject
, reason
))
3538 Py_CLEAR(*exceptionObject
);
3542 /* raises a UnicodeEncodeError */
3543 static void raise_encode_exception(PyObject
**exceptionObject
,
3544 const char *encoding
,
3545 const Py_UNICODE
*unicode
, Py_ssize_t size
,
3546 Py_ssize_t startpos
, Py_ssize_t endpos
,
3549 make_encode_exception(exceptionObject
,
3550 encoding
, unicode
, size
, startpos
, endpos
, reason
);
3551 if (*exceptionObject
!= NULL
)
3552 PyCodec_StrictErrors(*exceptionObject
);
3555 /* error handling callback helper:
3556 build arguments, call the callback and check the arguments,
3557 put the result into newpos and return the replacement string, which
3558 has to be freed by the caller */
3559 static PyObject
*unicode_encode_call_errorhandler(const char *errors
,
3560 PyObject
**errorHandler
,
3561 const char *encoding
, const char *reason
,
3562 const Py_UNICODE
*unicode
, Py_ssize_t size
, PyObject
**exceptionObject
,
3563 Py_ssize_t startpos
, Py_ssize_t endpos
,
3566 static char *argparse
= "O!n;encoding error handler must return (unicode, int) tuple";
3569 PyObject
*resunicode
;
3571 if (*errorHandler
== NULL
) {
3572 *errorHandler
= PyCodec_LookupError(errors
);
3573 if (*errorHandler
== NULL
)
3577 make_encode_exception(exceptionObject
,
3578 encoding
, unicode
, size
, startpos
, endpos
, reason
);
3579 if (*exceptionObject
== NULL
)
3582 restuple
= PyObject_CallFunctionObjArgs(
3583 *errorHandler
, *exceptionObject
, NULL
);
3584 if (restuple
== NULL
)
3586 if (!PyTuple_Check(restuple
)) {
3587 PyErr_SetString(PyExc_TypeError
, &argparse
[4]);
3588 Py_DECREF(restuple
);
3591 if (!PyArg_ParseTuple(restuple
, argparse
, &PyUnicode_Type
,
3592 &resunicode
, newpos
)) {
3593 Py_DECREF(restuple
);
3597 *newpos
= size
+*newpos
;
3598 if (*newpos
<0 || *newpos
>size
) {
3599 PyErr_Format(PyExc_IndexError
, "position %zd from error handler out of bounds", *newpos
);
3600 Py_DECREF(restuple
);
3603 Py_INCREF(resunicode
);
3604 Py_DECREF(restuple
);
3608 static PyObject
*unicode_encode_ucs1(const Py_UNICODE
*p
,
3615 /* pointers to the beginning and end+1 of input */
3616 const Py_UNICODE
*startp
= p
;
3617 const Py_UNICODE
*endp
= p
+ size
;
3618 /* pointer to the beginning of the unencodable characters */
3619 /* const Py_UNICODE *badp = NULL; */
3620 /* pointer into the output */
3622 /* current output position */
3623 Py_ssize_t respos
= 0;
3625 const char *encoding
= (limit
== 256) ? "latin-1" : "ascii";
3626 const char *reason
= (limit
== 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
3627 PyObject
*errorHandler
= NULL
;
3628 PyObject
*exc
= NULL
;
3629 /* the following variable is used for caching string comparisons
3630 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3631 int known_errorHandler
= -1;
3633 /* allocate enough for a simple encoding without
3634 replacements, if we need more, we'll resize */
3635 res
= PyString_FromStringAndSize(NULL
, size
);
3640 str
= PyString_AS_STRING(res
);
3646 /* can we encode this? */
3648 /* no overflow check, because we know that the space is enough */
3653 Py_ssize_t unicodepos
= p
-startp
;
3654 Py_ssize_t requiredsize
;
3655 PyObject
*repunicode
;
3660 /* startpos for collecting unencodable chars */
3661 const Py_UNICODE
*collstart
= p
;
3662 const Py_UNICODE
*collend
= p
;
3663 /* find all unencodable characters */
3664 while ((collend
< endp
) && ((*collend
) >= limit
))
3666 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3667 if (known_errorHandler
==-1) {
3668 if ((errors
==NULL
) || (!strcmp(errors
, "strict")))
3669 known_errorHandler
= 1;
3670 else if (!strcmp(errors
, "replace"))
3671 known_errorHandler
= 2;
3672 else if (!strcmp(errors
, "ignore"))
3673 known_errorHandler
= 3;
3674 else if (!strcmp(errors
, "xmlcharrefreplace"))
3675 known_errorHandler
= 4;
3677 known_errorHandler
= 0;
3679 switch (known_errorHandler
) {
3680 case 1: /* strict */
3681 raise_encode_exception(&exc
, encoding
, startp
, size
, collstart
-startp
, collend
-startp
, reason
);
3683 case 2: /* replace */
3684 while (collstart
++ < collend
)
3685 *str
++ = '?'; /* fall through */
3686 case 3: /* ignore */
3689 case 4: /* xmlcharrefreplace */
3690 respos
= str
- PyString_AS_STRING(res
);
3691 /* determine replacement size (temporarily (mis)uses p) */
3692 requiredsize
= respos
;
3693 for (p
= collstart
; p
< collend
;) {
3694 Py_UCS4 ch
= _Py_UNICODE_NEXT(p
, collend
);
3702 else if (ch
< 10000)
3704 else if (ch
< 100000)
3706 else if (ch
< 1000000)
3710 if (requiredsize
> PY_SSIZE_T_MAX
- incr
)
3712 requiredsize
+= incr
;
3714 if (requiredsize
> PY_SSIZE_T_MAX
- (endp
- collend
))
3716 requiredsize
+= endp
- collend
;
3717 if (requiredsize
> ressize
) {
3718 if (ressize
<= PY_SSIZE_T_MAX
/2 && requiredsize
< 2*ressize
)
3719 requiredsize
= 2*ressize
;
3720 if (_PyString_Resize(&res
, requiredsize
))
3722 str
= PyString_AS_STRING(res
) + respos
;
3723 ressize
= requiredsize
;
3725 /* generate replacement (temporarily (mis)uses p) */
3726 for (p
= collstart
; p
< collend
;) {
3727 Py_UCS4 ch
= _Py_UNICODE_NEXT(p
, collend
);
3728 str
+= sprintf(str
, "&#%d;", (int)ch
);
3733 repunicode
= unicode_encode_call_errorhandler(errors
, &errorHandler
,
3734 encoding
, reason
, startp
, size
, &exc
,
3735 collstart
-startp
, collend
-startp
, &newpos
);
3736 if (repunicode
== NULL
)
3738 /* need more space? (at least enough for what we have+the
3739 replacement+the rest of the string, so we won't have to
3740 check space for encodable characters) */
3741 respos
= str
- PyString_AS_STRING(res
);
3742 repsize
= PyUnicode_GET_SIZE(repunicode
);
3743 if (respos
> PY_SSIZE_T_MAX
- repsize
)
3745 requiredsize
= respos
+ repsize
;
3746 if (requiredsize
> PY_SSIZE_T_MAX
- (endp
- collend
))
3748 requiredsize
+= endp
- collend
;
3749 if (requiredsize
> ressize
) {
3750 if (ressize
<= PY_SSIZE_T_MAX
/2 && requiredsize
< 2*ressize
)
3751 requiredsize
= 2*ressize
;
3752 if (_PyString_Resize(&res
, requiredsize
)) {
3753 Py_DECREF(repunicode
);
3756 str
= PyString_AS_STRING(res
) + respos
;
3757 ressize
= requiredsize
;
3759 /* check if there is anything unencodable in the replacement
3760 and copy it to the output */
3761 for (uni2
= PyUnicode_AS_UNICODE(repunicode
); repsize
-->0; ++uni2
, ++str
) {
3764 raise_encode_exception(&exc
, encoding
, startp
, size
,
3765 unicodepos
, unicodepos
+1, reason
);
3766 Py_DECREF(repunicode
);
3771 p
= startp
+ newpos
;
3772 Py_DECREF(repunicode
);
3776 /* Resize if we allocated too much */
3777 respos
= str
- PyString_AS_STRING(res
);
3778 if (respos
< ressize
)
3779 /* If this fails res will be NULL */
3780 _PyString_Resize(&res
, respos
);
3781 Py_XDECREF(errorHandler
);
3786 PyErr_SetString(PyExc_OverflowError
,
3787 "encoded result is too long for a Python string");
3791 Py_XDECREF(errorHandler
);
3796 PyObject
*PyUnicode_EncodeLatin1(const Py_UNICODE
*p
,
3800 return unicode_encode_ucs1(p
, size
, errors
, 256);
3803 PyObject
*PyUnicode_AsLatin1String(PyObject
*unicode
)
3805 if (!PyUnicode_Check(unicode
)) {
3806 PyErr_BadArgument();
3809 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode
),
3810 PyUnicode_GET_SIZE(unicode
),
3814 /* --- 7-bit ASCII Codec -------------------------------------------------- */
3816 PyObject
*PyUnicode_DecodeASCII(const char *s
,
3820 const char *starts
= s
;
3823 Py_ssize_t startinpos
;
3824 Py_ssize_t endinpos
;
3827 PyObject
*errorHandler
= NULL
;
3828 PyObject
*exc
= NULL
;
3830 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
3831 if (size
== 1 && *(unsigned char*)s
< 128) {
3832 Py_UNICODE r
= *(unsigned char*)s
;
3833 return PyUnicode_FromUnicode(&r
, 1);
3836 v
= _PyUnicode_New(size
);
3840 return (PyObject
*)v
;
3841 p
= PyUnicode_AS_UNICODE(v
);
3844 register unsigned char c
= (unsigned char)*s
;
3850 startinpos
= s
-starts
;
3851 endinpos
= startinpos
+ 1;
3852 outpos
= p
- (Py_UNICODE
*)PyUnicode_AS_UNICODE(v
);
3853 if (unicode_decode_call_errorhandler(
3854 errors
, &errorHandler
,
3855 "ascii", "ordinal not in range(128)",
3856 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
3861 if (p
- PyUnicode_AS_UNICODE(v
) < PyString_GET_SIZE(v
))
3862 if (_PyUnicode_Resize(&v
, p
- PyUnicode_AS_UNICODE(v
)) < 0)
3864 Py_XDECREF(errorHandler
);
3866 return (PyObject
*)v
;
3870 Py_XDECREF(errorHandler
);
3875 PyObject
*PyUnicode_EncodeASCII(const Py_UNICODE
*p
,
3879 return unicode_encode_ucs1(p
, size
, errors
, 128);
3882 PyObject
*PyUnicode_AsASCIIString(PyObject
*unicode
)
3884 if (!PyUnicode_Check(unicode
)) {
3885 PyErr_BadArgument();
3888 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode
),
3889 PyUnicode_GET_SIZE(unicode
),
3893 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
3895 /* --- MBCS codecs for Windows -------------------------------------------- */
3897 #if SIZEOF_INT < SIZEOF_SIZE_T
3901 /* XXX This code is limited to "true" double-byte encodings, as
3902 a) it assumes an incomplete character consists of a single byte, and
3903 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3904 encodings, see IsDBCSLeadByteEx documentation. */
3906 static int is_dbcs_lead_byte(const char *s
, int offset
)
3908 const char *curr
= s
+ offset
;
3910 if (IsDBCSLeadByte(*curr
)) {
3911 const char *prev
= CharPrev(s
, curr
);
3912 return (prev
== curr
) || !IsDBCSLeadByte(*prev
) || (curr
- prev
== 2);
3918 * Decode MBCS string into unicode object. If 'final' is set, converts
3919 * trailing lead-byte too. Returns consumed size on success, -1 otherwise.
3921 static int decode_mbcs(PyUnicodeObject
**v
,
3922 const char *s
, /* MBCS string */
3923 int size
, /* sizeof MBCS string */
3932 /* Skip trailing lead-byte unless 'final' is set */
3933 if (!final
&& size
>= 1 && is_dbcs_lead_byte(s
, size
- 1))
3936 /* First get the size of the result */
3938 usize
= MultiByteToWideChar(CP_ACP
, 0, s
, size
, NULL
, 0);
3940 PyErr_SetFromWindowsErrWithFilename(0, NULL
);
3946 /* Create unicode object */
3947 *v
= _PyUnicode_New(usize
);
3952 /* Extend unicode object */
3953 n
= PyUnicode_GET_SIZE(*v
);
3954 if (_PyUnicode_Resize(v
, n
+ usize
) < 0)
3958 /* Do the conversion */
3960 p
= PyUnicode_AS_UNICODE(*v
) + n
;
3961 if (0 == MultiByteToWideChar(CP_ACP
, 0, s
, size
, p
, usize
)) {
3962 PyErr_SetFromWindowsErrWithFilename(0, NULL
);
3970 PyObject
*PyUnicode_DecodeMBCSStateful(const char *s
,
3973 Py_ssize_t
*consumed
)
3975 PyUnicodeObject
*v
= NULL
;
3984 done
= decode_mbcs(&v
, s
, INT_MAX
, 0);
3987 done
= decode_mbcs(&v
, s
, (int)size
, !consumed
);
3998 if (size
> INT_MAX
) {
4005 return (PyObject
*)v
;
4008 PyObject
*PyUnicode_DecodeMBCS(const char *s
,
4012 return PyUnicode_DecodeMBCSStateful(s
, size
, errors
, NULL
);
4016 * Convert unicode into string object (MBCS).
4017 * Returns 0 on success, -1 otherwise.
4019 static int encode_mbcs(PyObject
**repr
,
4020 const Py_UNICODE
*p
, /* unicode */
4021 int size
) /* size of unicode */
4028 /* First get the size of the result */
4030 mbcssize
= WideCharToMultiByte(CP_ACP
, 0, p
, size
, NULL
, 0, NULL
, NULL
);
4031 if (mbcssize
== 0) {
4032 PyErr_SetFromWindowsErrWithFilename(0, NULL
);
4037 if (*repr
== NULL
) {
4038 /* Create string object */
4039 *repr
= PyString_FromStringAndSize(NULL
, mbcssize
);
4044 /* Extend string object */
4045 n
= PyString_Size(*repr
);
4046 if (_PyString_Resize(repr
, n
+ mbcssize
) < 0)
4050 /* Do the conversion */
4052 char *s
= PyString_AS_STRING(*repr
) + n
;
4053 if (0 == WideCharToMultiByte(CP_ACP
, 0, p
, size
, s
, mbcssize
, NULL
, NULL
)) {
4054 PyErr_SetFromWindowsErrWithFilename(0, NULL
);
4062 PyObject
*PyUnicode_EncodeMBCS(const Py_UNICODE
*p
,
4066 PyObject
*repr
= NULL
;
4072 ret
= encode_mbcs(&repr
, p
, INT_MAX
);
4075 ret
= encode_mbcs(&repr
, p
, (int)size
);
4083 if (size
> INT_MAX
) {
4093 PyObject
*PyUnicode_AsMBCSString(PyObject
*unicode
)
4095 if (!PyUnicode_Check(unicode
)) {
4096 PyErr_BadArgument();
4099 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode
),
4100 PyUnicode_GET_SIZE(unicode
),
4106 #endif /* MS_WINDOWS */
4108 /* --- Character Mapping Codec -------------------------------------------- */
4110 PyObject
*PyUnicode_DecodeCharmap(const char *s
,
4115 const char *starts
= s
;
4116 Py_ssize_t startinpos
;
4117 Py_ssize_t endinpos
;
4122 Py_ssize_t extrachars
= 0;
4123 PyObject
*errorHandler
= NULL
;
4124 PyObject
*exc
= NULL
;
4125 Py_UNICODE
*mapstring
= NULL
;
4126 Py_ssize_t maplen
= 0;
4128 /* Default to Latin-1 */
4129 if (mapping
== NULL
)
4130 return PyUnicode_DecodeLatin1(s
, size
, errors
);
4132 v
= _PyUnicode_New(size
);
4136 return (PyObject
*)v
;
4137 p
= PyUnicode_AS_UNICODE(v
);
4139 if (PyUnicode_CheckExact(mapping
)) {
4140 mapstring
= PyUnicode_AS_UNICODE(mapping
);
4141 maplen
= PyUnicode_GET_SIZE(mapping
);
4143 unsigned char ch
= *s
;
4144 Py_UNICODE x
= 0xfffe; /* illegal value */
4150 /* undefined mapping */
4151 outpos
= p
-PyUnicode_AS_UNICODE(v
);
4152 startinpos
= s
-starts
;
4153 endinpos
= startinpos
+1;
4154 if (unicode_decode_call_errorhandler(
4155 errors
, &errorHandler
,
4156 "charmap", "character maps to <undefined>",
4157 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
4169 unsigned char ch
= *s
;
4172 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4173 w
= PyInt_FromLong((long)ch
);
4176 x
= PyObject_GetItem(mapping
, w
);
4179 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
4180 /* No mapping found means: mapping is undefined. */
4190 if (PyInt_Check(x
)) {
4191 long value
= PyInt_AS_LONG(x
);
4192 if (value
== 0xFFFE)
4194 if (value
< 0 || value
> 0x10FFFF) {
4195 PyErr_SetString(PyExc_TypeError
,
4196 "character mapping must be in range(0x110000)");
4201 #ifndef Py_UNICODE_WIDE
4202 if (value
> 0xFFFF) {
4203 /* see the code for 1-n mapping below */
4204 if (extrachars
< 2) {
4206 Py_ssize_t oldpos
= p
- PyUnicode_AS_UNICODE(v
);
4207 Py_ssize_t needed
= 10 - extrachars
;
4208 extrachars
+= needed
;
4209 /* XXX overflow detection missing */
4210 if (_PyUnicode_Resize(&v
,
4211 PyUnicode_GET_SIZE(v
) + needed
) < 0) {
4215 p
= PyUnicode_AS_UNICODE(v
) + oldpos
;
4218 *p
++ = 0xD800 | (value
>> 10);
4219 *p
++ = 0xDC00 | (value
& 0x3FF);
4224 *p
++ = (Py_UNICODE
)value
;
4226 else if (PyUnicode_Check(x
)) {
4227 Py_ssize_t targetsize
= PyUnicode_GET_SIZE(x
);
4229 if (targetsize
== 1) {
4231 Py_UNICODE value
= *PyUnicode_AS_UNICODE(x
);
4232 if (value
== 0xFFFE)
4236 else if (targetsize
> 1) {
4238 if (targetsize
> extrachars
) {
4240 Py_ssize_t oldpos
= p
- PyUnicode_AS_UNICODE(v
);
4241 Py_ssize_t needed
= (targetsize
- extrachars
) + \
4243 extrachars
+= needed
;
4244 /* XXX overflow detection missing */
4245 if (_PyUnicode_Resize(&v
,
4246 PyUnicode_GET_SIZE(v
) + needed
) < 0) {
4250 p
= PyUnicode_AS_UNICODE(v
) + oldpos
;
4253 PyUnicode_AS_UNICODE(x
),
4256 extrachars
-= targetsize
;
4258 /* 1-0 mapping: skip the character */
4261 /* wrong return value */
4262 PyErr_SetString(PyExc_TypeError
,
4263 "character mapping must return integer, None or unicode");
4271 /* undefined mapping */
4273 outpos
= p
-PyUnicode_AS_UNICODE(v
);
4274 startinpos
= s
-starts
;
4275 endinpos
= startinpos
+1;
4276 if (unicode_decode_call_errorhandler(
4277 errors
, &errorHandler
,
4278 "charmap", "character maps to <undefined>",
4279 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
4285 if (p
- PyUnicode_AS_UNICODE(v
) < PyUnicode_GET_SIZE(v
))
4286 if (_PyUnicode_Resize(&v
, p
- PyUnicode_AS_UNICODE(v
)) < 0)
4288 Py_XDECREF(errorHandler
);
4290 return (PyObject
*)v
;
4293 Py_XDECREF(errorHandler
);
4299 /* Charmap encoding: the lookup table */
4301 struct encoding_map
{
4303 unsigned char level1
[32];
4305 unsigned char level23
[1];
4309 encoding_map_size(PyObject
*obj
, PyObject
* args
)
4311 struct encoding_map
*map
= (struct encoding_map
*)obj
;
4312 return PyInt_FromLong(sizeof(*map
) - 1 + 16*map
->count2
+
4316 static PyMethodDef encoding_map_methods
[] = {
4317 {"size", encoding_map_size
, METH_NOARGS
,
4318 PyDoc_STR("Return the size (in bytes) of this object") },
4323 encoding_map_dealloc(PyObject
* o
)
4328 static PyTypeObject EncodingMapType
= {
4329 PyVarObject_HEAD_INIT(NULL
, 0)
4330 "EncodingMap", /*tp_name*/
4331 sizeof(struct encoding_map
), /*tp_basicsize*/
4334 encoding_map_dealloc
, /*tp_dealloc*/
4341 0, /*tp_as_sequence*/
4342 0, /*tp_as_mapping*/
4349 Py_TPFLAGS_DEFAULT
, /*tp_flags*/
4353 0, /*tp_richcompare*/
4354 0, /*tp_weaklistoffset*/
4357 encoding_map_methods
, /*tp_methods*/
4364 0, /*tp_dictoffset*/
4373 PyUnicode_BuildEncodingMap(PyObject
* string
)
4377 struct encoding_map
*mresult
;
4380 unsigned char level1
[32];
4381 unsigned char level2
[512];
4382 unsigned char *mlevel1
, *mlevel2
, *mlevel3
;
4383 int count2
= 0, count3
= 0;
4385 if (!PyUnicode_Check(string
) || PyUnicode_GetSize(string
) != 256) {
4386 PyErr_BadArgument();
4389 decode
= PyUnicode_AS_UNICODE(string
);
4390 memset(level1
, 0xFF, sizeof level1
);
4391 memset(level2
, 0xFF, sizeof level2
);
4393 /* If there isn't a one-to-one mapping of NULL to \0,
4394 or if there are non-BMP characters, we need to use
4395 a mapping dictionary. */
4398 for (i
= 1; i
< 256; i
++) {
4401 #ifdef Py_UNICODE_WIDE
4402 || decode
[i
] > 0xFFFF
4408 if (decode
[i
] == 0xFFFE)
4409 /* unmapped character */
4411 l1
= decode
[i
] >> 11;
4412 l2
= decode
[i
] >> 7;
4413 if (level1
[l1
] == 0xFF)
4414 level1
[l1
] = count2
++;
4415 if (level2
[l2
] == 0xFF)
4416 level2
[l2
] = count3
++;
4419 if (count2
>= 0xFF || count3
>= 0xFF)
4423 PyObject
*result
= PyDict_New();
4424 PyObject
*key
, *value
;
4427 for (i
= 0; i
< 256; i
++) {
4429 key
= PyInt_FromLong(decode
[i
]);
4430 value
= PyInt_FromLong(i
);
4433 if (PyDict_SetItem(result
, key
, value
) == -1)
4446 /* Create a three-level trie */
4447 result
= PyObject_MALLOC(sizeof(struct encoding_map
) +
4448 16*count2
+ 128*count3
- 1);
4450 return PyErr_NoMemory();
4451 PyObject_Init(result
, &EncodingMapType
);
4452 mresult
= (struct encoding_map
*)result
;
4453 mresult
->count2
= count2
;
4454 mresult
->count3
= count3
;
4455 mlevel1
= mresult
->level1
;
4456 mlevel2
= mresult
->level23
;
4457 mlevel3
= mresult
->level23
+ 16*count2
;
4458 memcpy(mlevel1
, level1
, 32);
4459 memset(mlevel2
, 0xFF, 16*count2
);
4460 memset(mlevel3
, 0, 128*count3
);
4462 for (i
= 1; i
< 256; i
++) {
4463 int o1
, o2
, o3
, i2
, i3
;
4464 if (decode
[i
] == 0xFFFE)
4465 /* unmapped character */
4468 o2
= (decode
[i
]>>7) & 0xF;
4469 i2
= 16*mlevel1
[o1
] + o2
;
4470 if (mlevel2
[i2
] == 0xFF)
4471 mlevel2
[i2
] = count3
++;
4472 o3
= decode
[i
] & 0x7F;
4473 i3
= 128*mlevel2
[i2
] + o3
;
4480 encoding_map_lookup(Py_UNICODE c
, PyObject
*mapping
)
4482 struct encoding_map
*map
= (struct encoding_map
*)mapping
;
4484 int l2
= (c
>>7) & 0xF;
4488 #ifdef Py_UNICODE_WIDE
4496 i
= map
->level1
[l1
];
4501 i
= map
->level23
[16*i
+l2
];
4506 i
= map
->level23
[16*map
->count2
+ 128*i
+ l3
];
4513 /* Lookup the character ch in the mapping. If the character
4514 can't be found, Py_None is returned (or NULL, if another
4516 static PyObject
*charmapencode_lookup(Py_UNICODE c
, PyObject
*mapping
)
4518 PyObject
*w
= PyInt_FromLong((long)c
);
4523 x
= PyObject_GetItem(mapping
, w
);
4526 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
4527 /* No mapping found means: mapping is undefined. */
4535 else if (x
== Py_None
)
4537 else if (PyInt_Check(x
)) {
4538 long value
= PyInt_AS_LONG(x
);
4539 if (value
< 0 || value
> 255) {
4540 PyErr_SetString(PyExc_TypeError
,
4541 "character mapping must be in range(256)");
4547 else if (PyString_Check(x
))
4550 /* wrong return value */
4551 PyErr_SetString(PyExc_TypeError
,
4552 "character mapping must return integer, None or str");
4559 charmapencode_resize(PyObject
**outobj
, Py_ssize_t
*outpos
, Py_ssize_t requiredsize
)
4561 Py_ssize_t outsize
= PyString_GET_SIZE(*outobj
);
4562 /* exponentially overallocate to minimize reallocations */
4563 if (requiredsize
< 2*outsize
)
4564 requiredsize
= 2*outsize
;
4565 if (_PyString_Resize(outobj
, requiredsize
)) {
4571 typedef enum charmapencode_result
{
4572 enc_SUCCESS
, enc_FAILED
, enc_EXCEPTION
4573 }charmapencode_result
;
4574 /* lookup the character, put the result in the output string and adjust
4575 various state variables. Reallocate the output string if not enough
4576 space is available. Return enc_SUCCESS if the character was written,
4577 enc_FAILED if the mapping was undefined (in which case no character
4578 was written), or enc_EXCEPTION if an error such as a failed
4579 reallocation occurred. */
4581 charmapencode_result
charmapencode_output(Py_UNICODE c
, PyObject
*mapping
,
4582 PyObject
**outobj
, Py_ssize_t
*outpos
)
4586 Py_ssize_t outsize
= PyString_GET_SIZE(*outobj
);
4588 if (Py_TYPE(mapping
) == &EncodingMapType
) {
4589 int res
= encoding_map_lookup(c
, mapping
);
4590 Py_ssize_t requiredsize
= *outpos
+1;
4593 if (outsize
<requiredsize
)
4594 if (!charmapencode_resize(outobj
, outpos
, requiredsize
))
4595 return enc_EXCEPTION
;
4596 outstart
= PyString_AS_STRING(*outobj
);
4597 outstart
[(*outpos
)++] = (char)res
;
4601 rep
= charmapencode_lookup(c
, mapping
);
4603 return enc_EXCEPTION
;
4604 else if (rep
==Py_None
) {
4608 if (PyInt_Check(rep
)) {
4609 Py_ssize_t requiredsize
= *outpos
+1;
4610 if (outsize
<requiredsize
)
4611 if (!charmapencode_resize(outobj
, outpos
, requiredsize
)) {
4613 return enc_EXCEPTION
;
4615 outstart
= PyString_AS_STRING(*outobj
);
4616 outstart
[(*outpos
)++] = (char)PyInt_AS_LONG(rep
);
4619 const char *repchars
= PyString_AS_STRING(rep
);
4620 Py_ssize_t repsize
= PyString_GET_SIZE(rep
);
4621 Py_ssize_t requiredsize
= *outpos
+repsize
;
4622 if (outsize
<requiredsize
)
4623 if (!charmapencode_resize(outobj
, outpos
, requiredsize
)) {
4625 return enc_EXCEPTION
;
4627 outstart
= PyString_AS_STRING(*outobj
);
4628 memcpy(outstart
+ *outpos
, repchars
, repsize
);
4636 /* handle an error in PyUnicode_EncodeCharmap
4637 Return 0 on success, -1 on error */
4639 int charmap_encoding_error(
4640 const Py_UNICODE
*p
, Py_ssize_t size
, Py_ssize_t
*inpos
, PyObject
*mapping
,
4641 PyObject
**exceptionObject
,
4642 int *known_errorHandler
, PyObject
**errorHandler
, const char *errors
,
4643 PyObject
**res
, Py_ssize_t
*respos
)
4645 PyObject
*repunicode
= NULL
; /* initialize to prevent gcc warning */
4649 /* startpos for collecting unencodable chars */
4650 Py_ssize_t collstartpos
= *inpos
;
4651 Py_ssize_t collendpos
= *inpos
+1;
4653 char *encoding
= "charmap";
4654 char *reason
= "character maps to <undefined>";
4655 charmapencode_result x
;
4657 /* find all unencodable characters */
4658 while (collendpos
< size
) {
4660 if (Py_TYPE(mapping
) == &EncodingMapType
) {
4661 int res
= encoding_map_lookup(p
[collendpos
], mapping
);
4668 rep
= charmapencode_lookup(p
[collendpos
], mapping
);
4671 else if (rep
!=Py_None
) {
4678 /* cache callback name lookup
4679 * (if not done yet, i.e. it's the first error) */
4680 if (*known_errorHandler
==-1) {
4681 if ((errors
==NULL
) || (!strcmp(errors
, "strict")))
4682 *known_errorHandler
= 1;
4683 else if (!strcmp(errors
, "replace"))
4684 *known_errorHandler
= 2;
4685 else if (!strcmp(errors
, "ignore"))
4686 *known_errorHandler
= 3;
4687 else if (!strcmp(errors
, "xmlcharrefreplace"))
4688 *known_errorHandler
= 4;
4690 *known_errorHandler
= 0;
4692 switch (*known_errorHandler
) {
4693 case 1: /* strict */
4694 raise_encode_exception(exceptionObject
, encoding
, p
, size
, collstartpos
, collendpos
, reason
);
4696 case 2: /* replace */
4697 for (collpos
= collstartpos
; collpos
<collendpos
; ++collpos
) {
4698 x
= charmapencode_output('?', mapping
, res
, respos
);
4699 if (x
==enc_EXCEPTION
) {
4702 else if (x
==enc_FAILED
) {
4703 raise_encode_exception(exceptionObject
, encoding
, p
, size
, collstartpos
, collendpos
, reason
);
4708 case 3: /* ignore */
4709 *inpos
= collendpos
;
4711 case 4: /* xmlcharrefreplace */
4712 /* generate replacement */
4713 for (collpos
= collstartpos
; collpos
< collendpos
;) {
4714 char buffer
[2+29+1+1];
4716 Py_UCS4 ch
= p
[collpos
++];
4717 #ifndef Py_UNICODE_WIDE
4718 if ((0xD800 <= ch
&& ch
<= 0xDBFF) &&
4719 (collpos
< collendpos
) &&
4720 (0xDC00 <= p
[collpos
] && p
[collpos
] <= 0xDFFF)) {
4721 ch
= ((((ch
& 0x03FF) << 10) |
4722 ((Py_UCS4
)p
[collpos
++] & 0x03FF)) + 0x10000);
4725 sprintf(buffer
, "&#%d;", (int)ch
);
4726 for (cp
= buffer
; *cp
; ++cp
) {
4727 x
= charmapencode_output(*cp
, mapping
, res
, respos
);
4728 if (x
==enc_EXCEPTION
)
4730 else if (x
==enc_FAILED
) {
4731 raise_encode_exception(exceptionObject
, encoding
, p
, size
, collstartpos
, collendpos
, reason
);
4736 *inpos
= collendpos
;
4739 repunicode
= unicode_encode_call_errorhandler(errors
, errorHandler
,
4740 encoding
, reason
, p
, size
, exceptionObject
,
4741 collstartpos
, collendpos
, &newpos
);
4742 if (repunicode
== NULL
)
4744 /* generate replacement */
4745 repsize
= PyUnicode_GET_SIZE(repunicode
);
4746 for (uni2
= PyUnicode_AS_UNICODE(repunicode
); repsize
-->0; ++uni2
) {
4747 x
= charmapencode_output(*uni2
, mapping
, res
, respos
);
4748 if (x
==enc_EXCEPTION
) {
4751 else if (x
==enc_FAILED
) {
4752 Py_DECREF(repunicode
);
4753 raise_encode_exception(exceptionObject
, encoding
, p
, size
, collstartpos
, collendpos
, reason
);
4758 Py_DECREF(repunicode
);
4763 PyObject
*PyUnicode_EncodeCharmap(const Py_UNICODE
*p
,
4769 PyObject
*res
= NULL
;
4770 /* current input position */
4771 Py_ssize_t inpos
= 0;
4772 /* current output position */
4773 Py_ssize_t respos
= 0;
4774 PyObject
*errorHandler
= NULL
;
4775 PyObject
*exc
= NULL
;
4776 /* the following variable is used for caching string comparisons
4777 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4778 * 3=ignore, 4=xmlcharrefreplace */
4779 int known_errorHandler
= -1;
4781 /* Default to Latin-1 */
4782 if (mapping
== NULL
)
4783 return PyUnicode_EncodeLatin1(p
, size
, errors
);
4785 /* allocate enough for a simple encoding without
4786 replacements, if we need more, we'll resize */
4787 res
= PyString_FromStringAndSize(NULL
, size
);
4793 while (inpos
<size
) {
4794 /* try to encode it */
4795 charmapencode_result x
= charmapencode_output(p
[inpos
], mapping
, &res
, &respos
);
4796 if (x
==enc_EXCEPTION
) /* error */
4798 if (x
==enc_FAILED
) { /* unencodable character */
4799 if (charmap_encoding_error(p
, size
, &inpos
, mapping
,
4801 &known_errorHandler
, &errorHandler
, errors
,
4807 /* done with this character => adjust input position */
4811 /* Resize if we allocated too much */
4812 if (respos
<PyString_GET_SIZE(res
)) {
4813 if (_PyString_Resize(&res
, respos
))
4817 Py_XDECREF(errorHandler
);
4823 Py_XDECREF(errorHandler
);
4827 PyObject
*PyUnicode_AsCharmapString(PyObject
*unicode
,
4830 if (!PyUnicode_Check(unicode
) || mapping
== NULL
) {
4831 PyErr_BadArgument();
4834 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode
),
4835 PyUnicode_GET_SIZE(unicode
),
4840 /* create or adjust a UnicodeTranslateError */
4841 static void make_translate_exception(PyObject
**exceptionObject
,
4842 const Py_UNICODE
*unicode
, Py_ssize_t size
,
4843 Py_ssize_t startpos
, Py_ssize_t endpos
,
4846 if (*exceptionObject
== NULL
) {
4847 *exceptionObject
= PyUnicodeTranslateError_Create(
4848 unicode
, size
, startpos
, endpos
, reason
);
4851 if (PyUnicodeTranslateError_SetStart(*exceptionObject
, startpos
))
4853 if (PyUnicodeTranslateError_SetEnd(*exceptionObject
, endpos
))
4855 if (PyUnicodeTranslateError_SetReason(*exceptionObject
, reason
))
4859 Py_CLEAR(*exceptionObject
);
4863 /* raises a UnicodeTranslateError */
4864 static void raise_translate_exception(PyObject
**exceptionObject
,
4865 const Py_UNICODE
*unicode
, Py_ssize_t size
,
4866 Py_ssize_t startpos
, Py_ssize_t endpos
,
4869 make_translate_exception(exceptionObject
,
4870 unicode
, size
, startpos
, endpos
, reason
);
4871 if (*exceptionObject
!= NULL
)
4872 PyCodec_StrictErrors(*exceptionObject
);
4875 /* error handling callback helper:
4876 build arguments, call the callback and check the arguments,
4877 put the result into newpos and return the replacement string, which
4878 has to be freed by the caller */
4879 static PyObject
*unicode_translate_call_errorhandler(const char *errors
,
4880 PyObject
**errorHandler
,
4882 const Py_UNICODE
*unicode
, Py_ssize_t size
, PyObject
**exceptionObject
,
4883 Py_ssize_t startpos
, Py_ssize_t endpos
,
4886 static char *argparse
= "O!n;translating error handler must return (unicode, int) tuple";
4888 Py_ssize_t i_newpos
;
4890 PyObject
*resunicode
;
4892 if (*errorHandler
== NULL
) {
4893 *errorHandler
= PyCodec_LookupError(errors
);
4894 if (*errorHandler
== NULL
)
4898 make_translate_exception(exceptionObject
,
4899 unicode
, size
, startpos
, endpos
, reason
);
4900 if (*exceptionObject
== NULL
)
4903 restuple
= PyObject_CallFunctionObjArgs(
4904 *errorHandler
, *exceptionObject
, NULL
);
4905 if (restuple
== NULL
)
4907 if (!PyTuple_Check(restuple
)) {
4908 PyErr_SetString(PyExc_TypeError
, &argparse
[4]);
4909 Py_DECREF(restuple
);
4912 if (!PyArg_ParseTuple(restuple
, argparse
, &PyUnicode_Type
,
4913 &resunicode
, &i_newpos
)) {
4914 Py_DECREF(restuple
);
4918 *newpos
= size
+i_newpos
;
4921 if (*newpos
<0 || *newpos
>size
) {
4922 PyErr_Format(PyExc_IndexError
, "position %zd from error handler out of bounds", *newpos
);
4923 Py_DECREF(restuple
);
4926 Py_INCREF(resunicode
);
4927 Py_DECREF(restuple
);
4931 /* Lookup the character c in the mapping and put the result in result,
4932 which must be decrefed by the caller.
4933 Return 0 on success, -1 on error */
4935 int charmaptranslate_lookup(Py_UNICODE c
, PyObject
*mapping
, PyObject
**result
)
4937 PyObject
*w
= PyInt_FromLong((long)c
);
4942 x
= PyObject_GetItem(mapping
, w
);
4945 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
4946 /* No mapping found means: use 1:1 mapping. */
4953 else if (x
== Py_None
) {
4957 else if (PyInt_Check(x
)) {
4958 long value
= PyInt_AS_LONG(x
);
4959 long max
= PyUnicode_GetMax();
4960 if (value
< 0 || value
> max
) {
4961 PyErr_Format(PyExc_TypeError
,
4962 "character mapping must be in range(0x%lx)", max
+1);
4969 else if (PyUnicode_Check(x
)) {
4974 /* wrong return value */
4975 PyErr_SetString(PyExc_TypeError
,
4976 "character mapping must return integer, None or unicode");
4981 /* ensure that *outobj is at least requiredsize characters long,
4982 if not reallocate and adjust various state variables.
4983 Return 0 on success, -1 on error */
4985 int charmaptranslate_makespace(PyObject
**outobj
, Py_UNICODE
**outp
,
4986 Py_ssize_t requiredsize
)
4988 Py_ssize_t oldsize
= PyUnicode_GET_SIZE(*outobj
);
4989 if (requiredsize
> oldsize
) {
4990 /* remember old output position */
4991 Py_ssize_t outpos
= *outp
-PyUnicode_AS_UNICODE(*outobj
);
4992 /* exponentially overallocate to minimize reallocations */
4993 if (requiredsize
< 2 * oldsize
)
4994 requiredsize
= 2 * oldsize
;
4995 if (PyUnicode_Resize(outobj
, requiredsize
) < 0)
4997 *outp
= PyUnicode_AS_UNICODE(*outobj
) + outpos
;
5001 /* lookup the character, put the result in the output string and adjust
5002 various state variables. Return a new reference to the object that
5003 was put in the output buffer in *res, or Py_None, if the mapping was
5004 undefined (in which case no character was written).
5005 The caller must decref *res.
5006 Return 0 on success, -1 on error. */
5008 int charmaptranslate_output(const Py_UNICODE
*startinp
, const Py_UNICODE
*curinp
,
5009 Py_ssize_t insize
, PyObject
*mapping
, PyObject
**outobj
, Py_UNICODE
**outp
,
5012 if (charmaptranslate_lookup(*curinp
, mapping
, res
))
5015 /* not found => default to 1:1 mapping */
5016 *(*outp
)++ = *curinp
;
5018 else if (*res
==Py_None
)
5020 else if (PyInt_Check(*res
)) {
5021 /* no overflow check, because we know that the space is enough */
5022 *(*outp
)++ = (Py_UNICODE
)PyInt_AS_LONG(*res
);
5024 else if (PyUnicode_Check(*res
)) {
5025 Py_ssize_t repsize
= PyUnicode_GET_SIZE(*res
);
5027 /* no overflow check, because we know that the space is enough */
5028 *(*outp
)++ = *PyUnicode_AS_UNICODE(*res
);
5030 else if (repsize
!=0) {
5031 /* more than one character */
5032 Py_ssize_t requiredsize
= (*outp
-PyUnicode_AS_UNICODE(*outobj
)) +
5033 (insize
- (curinp
-startinp
)) +
5035 if (charmaptranslate_makespace(outobj
, outp
, requiredsize
))
5037 memcpy(*outp
, PyUnicode_AS_UNICODE(*res
), sizeof(Py_UNICODE
)*repsize
);
5046 PyObject
*PyUnicode_TranslateCharmap(const Py_UNICODE
*p
,
5052 PyObject
*res
= NULL
;
5053 /* pointers to the beginning and end+1 of input */
5054 const Py_UNICODE
*startp
= p
;
5055 const Py_UNICODE
*endp
= p
+ size
;
5056 /* pointer into the output */
5058 /* current output position */
5059 Py_ssize_t respos
= 0;
5060 char *reason
= "character maps to <undefined>";
5061 PyObject
*errorHandler
= NULL
;
5062 PyObject
*exc
= NULL
;
5063 /* the following variable is used for caching string comparisons
5064 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5065 * 3=ignore, 4=xmlcharrefreplace */
5066 int known_errorHandler
= -1;
5068 if (mapping
== NULL
) {
5069 PyErr_BadArgument();
5073 /* allocate enough for a simple 1:1 translation without
5074 replacements, if we need more, we'll resize */
5075 res
= PyUnicode_FromUnicode(NULL
, size
);
5080 str
= PyUnicode_AS_UNICODE(res
);
5083 /* try to translate it */
5085 if (charmaptranslate_output(startp
, p
, size
, mapping
, &res
, &str
, &x
)) {
5090 if (x
!=Py_None
) /* it worked => adjust input pointer */
5092 else { /* untranslatable character */
5093 PyObject
*repunicode
= NULL
; /* initialize to prevent gcc warning */
5097 /* startpos for collecting untranslatable chars */
5098 const Py_UNICODE
*collstart
= p
;
5099 const Py_UNICODE
*collend
= p
+1;
5100 const Py_UNICODE
*coll
;
5102 /* find all untranslatable characters */
5103 while (collend
< endp
) {
5104 if (charmaptranslate_lookup(*collend
, mapping
, &x
))
5111 /* cache callback name lookup
5112 * (if not done yet, i.e. it's the first error) */
5113 if (known_errorHandler
==-1) {
5114 if ((errors
==NULL
) || (!strcmp(errors
, "strict")))
5115 known_errorHandler
= 1;
5116 else if (!strcmp(errors
, "replace"))
5117 known_errorHandler
= 2;
5118 else if (!strcmp(errors
, "ignore"))
5119 known_errorHandler
= 3;
5120 else if (!strcmp(errors
, "xmlcharrefreplace"))
5121 known_errorHandler
= 4;
5123 known_errorHandler
= 0;
5125 switch (known_errorHandler
) {
5126 case 1: /* strict */
5127 raise_translate_exception(&exc
, startp
, size
, collstart
-startp
, collend
-startp
, reason
);
5129 case 2: /* replace */
5130 /* No need to check for space, this is a 1:1 replacement */
5131 for (coll
= collstart
; coll
<collend
; ++coll
)
5134 case 3: /* ignore */
5137 case 4: /* xmlcharrefreplace */
5138 /* generate replacement (temporarily (mis)uses p) */
5139 for (p
= collstart
; p
< collend
;) {
5140 char buffer
[2+29+1+1];
5142 Py_UCS4 ch
= _Py_UNICODE_NEXT(p
, collend
);
5143 sprintf(buffer
, "&#%d;", (int)ch
);
5144 if (charmaptranslate_makespace(&res
, &str
,
5145 (str
-PyUnicode_AS_UNICODE(res
))+strlen(buffer
)+(endp
-collend
)))
5147 for (cp
= buffer
; *cp
; ++cp
)
5153 repunicode
= unicode_translate_call_errorhandler(errors
, &errorHandler
,
5154 reason
, startp
, size
, &exc
,
5155 collstart
-startp
, collend
-startp
, &newpos
);
5156 if (repunicode
== NULL
)
5158 /* generate replacement */
5159 repsize
= PyUnicode_GET_SIZE(repunicode
);
5160 if (charmaptranslate_makespace(&res
, &str
,
5161 (str
-PyUnicode_AS_UNICODE(res
))+repsize
+(endp
-collend
))) {
5162 Py_DECREF(repunicode
);
5165 for (uni2
= PyUnicode_AS_UNICODE(repunicode
); repsize
-->0; ++uni2
)
5167 p
= startp
+ newpos
;
5168 Py_DECREF(repunicode
);
5172 /* Resize if we allocated too much */
5173 respos
= str
-PyUnicode_AS_UNICODE(res
);
5174 if (respos
<PyUnicode_GET_SIZE(res
)) {
5175 if (PyUnicode_Resize(&res
, respos
) < 0)
5179 Py_XDECREF(errorHandler
);
5185 Py_XDECREF(errorHandler
);
5189 PyObject
*PyUnicode_Translate(PyObject
*str
,
5195 str
= PyUnicode_FromObject(str
);
5198 result
= PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str
),
5199 PyUnicode_GET_SIZE(str
),
5210 /* --- Decimal Encoder ---------------------------------------------------- */
5212 int PyUnicode_EncodeDecimal(Py_UNICODE
*s
,
5217 Py_UNICODE
*p
, *end
;
5218 PyObject
*errorHandler
= NULL
;
5219 PyObject
*exc
= NULL
;
5220 const char *encoding
= "decimal";
5221 const char *reason
= "invalid decimal Unicode string";
5222 /* the following variable is used for caching string comparisons
5223 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5224 int known_errorHandler
= -1;
5226 if (output
== NULL
) {
5227 PyErr_BadArgument();
5234 register Py_UNICODE ch
= *p
;
5236 PyObject
*repunicode
;
5240 Py_UNICODE
*collstart
;
5241 Py_UNICODE
*collend
;
5243 if (Py_UNICODE_ISSPACE(ch
)) {
5248 decimal
= Py_UNICODE_TODECIMAL(ch
);
5250 *output
++ = '0' + decimal
;
5254 if (0 < ch
&& ch
< 256) {
5255 *output
++ = (char)ch
;
5259 /* All other characters are considered unencodable */
5261 for (collend
= p
+1; collend
< end
; collend
++) {
5262 if ((0 < *collend
&& *collend
< 256) ||
5263 Py_UNICODE_ISSPACE(*collend
) ||
5264 0 <= Py_UNICODE_TODECIMAL(*collend
))
5267 /* cache callback name lookup
5268 * (if not done yet, i.e. it's the first error) */
5269 if (known_errorHandler
==-1) {
5270 if ((errors
==NULL
) || (!strcmp(errors
, "strict")))
5271 known_errorHandler
= 1;
5272 else if (!strcmp(errors
, "replace"))
5273 known_errorHandler
= 2;
5274 else if (!strcmp(errors
, "ignore"))
5275 known_errorHandler
= 3;
5276 else if (!strcmp(errors
, "xmlcharrefreplace"))
5277 known_errorHandler
= 4;
5279 known_errorHandler
= 0;
5281 switch (known_errorHandler
) {
5282 case 1: /* strict */
5283 raise_encode_exception(&exc
, encoding
, s
, length
, collstart
-s
, collend
-s
, reason
);
5285 case 2: /* replace */
5286 for (p
= collstart
; p
< collend
; ++p
)
5289 case 3: /* ignore */
5292 case 4: /* xmlcharrefreplace */
5293 /* generate replacement (temporarily (mis)uses p) */
5294 for (p
= collstart
; p
< collend
;) {
5295 Py_UCS4 ch
= _Py_UNICODE_NEXT(p
, collend
);
5296 output
+= sprintf(output
, "&#%d;", ch
);
5301 repunicode
= unicode_encode_call_errorhandler(errors
, &errorHandler
,
5302 encoding
, reason
, s
, length
, &exc
,
5303 collstart
-s
, collend
-s
, &newpos
);
5304 if (repunicode
== NULL
)
5306 /* generate replacement */
5307 repsize
= PyUnicode_GET_SIZE(repunicode
);
5308 for (uni2
= PyUnicode_AS_UNICODE(repunicode
); repsize
-->0; ++uni2
) {
5309 Py_UNICODE ch
= *uni2
;
5310 if (Py_UNICODE_ISSPACE(ch
))
5313 decimal
= Py_UNICODE_TODECIMAL(ch
);
5315 *output
++ = '0' + decimal
;
5316 else if (0 < ch
&& ch
< 256)
5317 *output
++ = (char)ch
;
5319 Py_DECREF(repunicode
);
5320 raise_encode_exception(&exc
, encoding
,
5321 s
, length
, collstart
-s
, collend
-s
, reason
);
5327 Py_DECREF(repunicode
);
5330 /* 0-terminate the output string */
5333 Py_XDECREF(errorHandler
);
5338 Py_XDECREF(errorHandler
);
5342 /* --- Helpers ------------------------------------------------------------ */
5344 #include "stringlib/unicodedefs.h"
5345 #include "stringlib/fastsearch.h"
5347 #include "stringlib/count.h"
5348 #include "stringlib/find.h"
5349 #include "stringlib/partition.h"
5350 #include "stringlib/split.h"
5352 /* helper macro to fixup start/end slice values */
5353 #define ADJUST_INDICES(start, end, len) \
5356 else if (end < 0) { \
5367 Py_ssize_t
PyUnicode_Count(PyObject
*str
,
5373 PyUnicodeObject
* str_obj
;
5374 PyUnicodeObject
* sub_obj
;
5376 str_obj
= (PyUnicodeObject
*) PyUnicode_FromObject(str
);
5379 sub_obj
= (PyUnicodeObject
*) PyUnicode_FromObject(substr
);
5385 ADJUST_INDICES(start
, end
, str_obj
->length
);
5386 result
= stringlib_count(
5387 str_obj
->str
+ start
, end
- start
, sub_obj
->str
, sub_obj
->length
,
5397 Py_ssize_t
PyUnicode_Find(PyObject
*str
,
5405 str
= PyUnicode_FromObject(str
);
5408 sub
= PyUnicode_FromObject(sub
);
5415 result
= stringlib_find_slice(
5416 PyUnicode_AS_UNICODE(str
), PyUnicode_GET_SIZE(str
),
5417 PyUnicode_AS_UNICODE(sub
), PyUnicode_GET_SIZE(sub
),
5421 result
= stringlib_rfind_slice(
5422 PyUnicode_AS_UNICODE(str
), PyUnicode_GET_SIZE(str
),
5423 PyUnicode_AS_UNICODE(sub
), PyUnicode_GET_SIZE(sub
),
5434 int tailmatch(PyUnicodeObject
*self
,
5435 PyUnicodeObject
*substring
,
5440 if (substring
->length
== 0)
5443 ADJUST_INDICES(start
, end
, self
->length
);
5444 end
-= substring
->length
;
5448 if (direction
> 0) {
5449 if (Py_UNICODE_MATCH(self
, end
, substring
))
5452 if (Py_UNICODE_MATCH(self
, start
, substring
))
5459 Py_ssize_t
PyUnicode_Tailmatch(PyObject
*str
,
5467 str
= PyUnicode_FromObject(str
);
5470 substr
= PyUnicode_FromObject(substr
);
5471 if (substr
== NULL
) {
5476 result
= tailmatch((PyUnicodeObject
*)str
,
5477 (PyUnicodeObject
*)substr
,
5478 start
, end
, direction
);
5484 /* Apply fixfct filter to the Unicode object self and return a
5485 reference to the modified object */
5488 PyObject
*fixup(PyUnicodeObject
*self
,
5489 int (*fixfct
)(PyUnicodeObject
*s
))
5494 u
= (PyUnicodeObject
*) PyUnicode_FromUnicode(NULL
, self
->length
);
5498 Py_UNICODE_COPY(u
->str
, self
->str
, self
->length
);
5500 if (!fixfct(u
) && PyUnicode_CheckExact(self
)) {
5501 /* fixfct should return TRUE if it modified the buffer. If
5502 FALSE, return a reference to the original buffer instead
5503 (to save space, not time) */
5506 return (PyObject
*) self
;
5508 return (PyObject
*) u
;
5512 int fixupper(PyUnicodeObject
*self
)
5514 Py_ssize_t len
= self
->length
;
5515 Py_UNICODE
*s
= self
->str
;
5519 register Py_UNICODE ch
;
5521 ch
= Py_UNICODE_TOUPPER(*s
);
5533 int fixlower(PyUnicodeObject
*self
)
5535 Py_ssize_t len
= self
->length
;
5536 Py_UNICODE
*s
= self
->str
;
5540 register Py_UNICODE ch
;
5542 ch
= Py_UNICODE_TOLOWER(*s
);
5554 int fixswapcase(PyUnicodeObject
*self
)
5556 Py_ssize_t len
= self
->length
;
5557 Py_UNICODE
*s
= self
->str
;
5561 if (Py_UNICODE_ISUPPER(*s
)) {
5562 *s
= Py_UNICODE_TOLOWER(*s
);
5564 } else if (Py_UNICODE_ISLOWER(*s
)) {
5565 *s
= Py_UNICODE_TOUPPER(*s
);
5575 int fixcapitalize(PyUnicodeObject
*self
)
5577 Py_ssize_t len
= self
->length
;
5578 Py_UNICODE
*s
= self
->str
;
5583 if (!Py_UNICODE_ISUPPER(*s
)) {
5584 *s
= Py_UNICODE_TOUPPER(*s
);
5589 if (!Py_UNICODE_ISLOWER(*s
)) {
5590 *s
= Py_UNICODE_TOLOWER(*s
);
5599 int fixtitle(PyUnicodeObject
*self
)
5601 register Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
5602 register Py_UNICODE
*e
;
5603 int previous_is_cased
;
5605 /* Shortcut for single character strings */
5606 if (PyUnicode_GET_SIZE(self
) == 1) {
5607 Py_UNICODE ch
= Py_UNICODE_TOTITLE(*p
);
5616 e
= p
+ PyUnicode_GET_SIZE(self
);
5617 previous_is_cased
= 0;
5618 for (; p
< e
; p
++) {
5619 register const Py_UNICODE ch
= *p
;
5621 if (previous_is_cased
)
5622 *p
= Py_UNICODE_TOLOWER(ch
);
5624 *p
= Py_UNICODE_TOTITLE(ch
);
5626 if (Py_UNICODE_ISLOWER(ch
) ||
5627 Py_UNICODE_ISUPPER(ch
) ||
5628 Py_UNICODE_ISTITLE(ch
))
5629 previous_is_cased
= 1;
5631 previous_is_cased
= 0;
5637 PyUnicode_Join(PyObject
*separator
, PyObject
*seq
)
5639 PyObject
*internal_separator
= NULL
;
5640 const Py_UNICODE blank
= ' ';
5641 const Py_UNICODE
*sep
= &blank
;
5642 Py_ssize_t seplen
= 1;
5643 PyUnicodeObject
*res
= NULL
; /* the result */
5644 Py_ssize_t res_alloc
= 100; /* # allocated Py_UNICODE units for string in res */
5645 Py_ssize_t res_used
; /* # used Py_UNICODE units */
5646 Py_UNICODE
*res_p
; /* pointer to next free Py_UNICODE slot in res's string area */
5647 PyObject
*fseq
; /* PySequence_Fast(seq) */
5648 Py_ssize_t seqlen
; /* len(fseq) -- number of items in sequence */
5652 fseq
= PySequence_Fast(seq
, "can only join an iterable");
5657 /* Grrrr. A codec may be invoked to convert str objects to
5658 * Unicode, and so it's possible to call back into Python code
5659 * during PyUnicode_FromObject(), and so it's possible for a sick
5660 * codec to change the size of fseq (if seq is a list). Therefore
5661 * we have to keep refetching the size -- can't assume seqlen
5664 seqlen
= PySequence_Fast_GET_SIZE(fseq
);
5665 /* If empty sequence, return u"". */
5667 res
= _PyUnicode_New(0); /* empty sequence; return u"" */
5670 /* If singleton sequence with an exact Unicode, return that. */
5672 item
= PySequence_Fast_GET_ITEM(fseq
, 0);
5673 if (PyUnicode_CheckExact(item
)) {
5675 res
= (PyUnicodeObject
*)item
;
5680 /* At least two items to join, or one that isn't exact Unicode. */
5682 /* Set up sep and seplen -- they're needed. */
5683 if (separator
== NULL
) {
5688 internal_separator
= PyUnicode_FromObject(separator
);
5689 if (internal_separator
== NULL
)
5691 sep
= PyUnicode_AS_UNICODE(internal_separator
);
5692 seplen
= PyUnicode_GET_SIZE(internal_separator
);
5693 /* In case PyUnicode_FromObject() mutated seq. */
5694 seqlen
= PySequence_Fast_GET_SIZE(fseq
);
5699 res
= _PyUnicode_New(res_alloc
);
5702 res_p
= PyUnicode_AS_UNICODE(res
);
5705 for (i
= 0; i
< seqlen
; ++i
) {
5707 Py_ssize_t new_res_used
;
5709 item
= PySequence_Fast_GET_ITEM(fseq
, i
);
5710 /* Convert item to Unicode. */
5711 if (! PyUnicode_Check(item
) && ! PyString_Check(item
)) {
5712 PyErr_Format(PyExc_TypeError
,
5713 "sequence item %zd: expected string or Unicode,"
5715 i
, Py_TYPE(item
)->tp_name
);
5718 item
= PyUnicode_FromObject(item
);
5721 /* We own a reference to item from here on. */
5723 /* In case PyUnicode_FromObject() mutated seq. */
5724 seqlen
= PySequence_Fast_GET_SIZE(fseq
);
5726 /* Make sure we have enough space for the separator and the item. */
5727 itemlen
= PyUnicode_GET_SIZE(item
);
5728 new_res_used
= res_used
+ itemlen
;
5729 if (new_res_used
< 0)
5731 if (i
< seqlen
- 1) {
5732 new_res_used
+= seplen
;
5733 if (new_res_used
< 0)
5736 if (new_res_used
> res_alloc
) {
5737 /* double allocated size until it's big enough */
5739 res_alloc
+= res_alloc
;
5742 } while (new_res_used
> res_alloc
);
5743 if (_PyUnicode_Resize(&res
, res_alloc
) < 0) {
5747 res_p
= PyUnicode_AS_UNICODE(res
) + res_used
;
5750 /* Copy item, and maybe the separator. */
5751 Py_UNICODE_COPY(res_p
, PyUnicode_AS_UNICODE(item
), itemlen
);
5753 if (i
< seqlen
- 1) {
5754 Py_UNICODE_COPY(res_p
, sep
, seplen
);
5758 res_used
= new_res_used
;
5761 /* Shrink res to match the used area; this probably can't fail,
5762 * but it's cheap to check.
5764 if (_PyUnicode_Resize(&res
, res_used
) < 0)
5768 Py_XDECREF(internal_separator
);
5770 return (PyObject
*)res
;
5773 PyErr_SetString(PyExc_OverflowError
,
5774 "join() result is too long for a Python string");
5779 Py_XDECREF(internal_separator
);
5786 PyUnicodeObject
*pad(PyUnicodeObject
*self
,
5798 if (left
== 0 && right
== 0 && PyUnicode_CheckExact(self
)) {
5803 if (left
> PY_SSIZE_T_MAX
- self
->length
||
5804 right
> PY_SSIZE_T_MAX
- (left
+ self
->length
)) {
5805 PyErr_SetString(PyExc_OverflowError
, "padded string is too long");
5808 u
= _PyUnicode_New(left
+ self
->length
+ right
);
5811 Py_UNICODE_FILL(u
->str
, fill
, left
);
5812 Py_UNICODE_COPY(u
->str
+ left
, self
->str
, self
->length
);
5814 Py_UNICODE_FILL(u
->str
+ left
+ self
->length
, fill
, right
);
5820 PyObject
*PyUnicode_Splitlines(PyObject
*string
, int keepends
)
5824 string
= PyUnicode_FromObject(string
);
5828 list
= stringlib_splitlines(
5829 (PyObject
*) string
, PyUnicode_AS_UNICODE(string
),
5830 PyUnicode_GET_SIZE(string
), keepends
);
5837 PyObject
*split(PyUnicodeObject
*self
,
5838 PyUnicodeObject
*substring
,
5839 Py_ssize_t maxcount
)
5842 maxcount
= PY_SSIZE_T_MAX
;
5844 if (substring
== NULL
)
5845 return stringlib_split_whitespace(
5846 (PyObject
*) self
, self
->str
, self
->length
, maxcount
5849 return stringlib_split(
5850 (PyObject
*) self
, self
->str
, self
->length
,
5851 substring
->str
, substring
->length
,
5857 PyObject
*rsplit(PyUnicodeObject
*self
,
5858 PyUnicodeObject
*substring
,
5859 Py_ssize_t maxcount
)
5862 maxcount
= PY_SSIZE_T_MAX
;
5864 if (substring
== NULL
)
5865 return stringlib_rsplit_whitespace(
5866 (PyObject
*) self
, self
->str
, self
->length
, maxcount
5869 return stringlib_rsplit(
5870 (PyObject
*) self
, self
->str
, self
->length
,
5871 substring
->str
, substring
->length
,
5877 PyObject
*replace(PyUnicodeObject
*self
,
5878 PyUnicodeObject
*str1
,
5879 PyUnicodeObject
*str2
,
5880 Py_ssize_t maxcount
)
5885 maxcount
= PY_SSIZE_T_MAX
;
5886 else if (maxcount
== 0 || self
->length
== 0)
5889 if (str1
->length
== str2
->length
) {
5892 if (str1
->length
== 0)
5894 if (str1
->length
== 1) {
5895 /* replace characters */
5897 if (!findchar(self
->str
, self
->length
, str1
->str
[0]))
5899 u
= (PyUnicodeObject
*) PyUnicode_FromUnicode(NULL
, self
->length
);
5902 Py_UNICODE_COPY(u
->str
, self
->str
, self
->length
);
5905 for (i
= 0; i
< u
->length
; i
++)
5906 if (u
->str
[i
] == u1
) {
5913 self
->str
, self
->length
, str1
->str
, str1
->length
, 0
5917 u
= (PyUnicodeObject
*) PyUnicode_FromUnicode(NULL
, self
->length
);
5920 Py_UNICODE_COPY(u
->str
, self
->str
, self
->length
);
5922 /* change everything in-place, starting with this one */
5923 Py_UNICODE_COPY(u
->str
+i
, str2
->str
, str2
->length
);
5926 while ( --maxcount
> 0) {
5927 i
= stringlib_find(self
->str
+i
, self
->length
-i
,
5928 str1
->str
, str1
->length
,
5932 Py_UNICODE_COPY(u
->str
+i
, str2
->str
, str2
->length
);
5939 Py_ssize_t product
, new_size
, delta
;
5942 /* replace strings */
5943 n
= stringlib_count(self
->str
, self
->length
, str1
->str
, str1
->length
,
5947 /* new_size = self->length + n * (str2->length - str1->length)); */
5948 delta
= (str2
->length
- str1
->length
);
5950 new_size
= self
->length
;
5952 product
= n
* (str2
->length
- str1
->length
);
5953 if ((product
/ (str2
->length
- str1
->length
)) != n
) {
5954 PyErr_SetString(PyExc_OverflowError
,
5955 "replace string is too long");
5958 new_size
= self
->length
+ product
;
5960 PyErr_SetString(PyExc_OverflowError
,
5961 "replace string is too long");
5965 u
= _PyUnicode_New(new_size
);
5970 if (str1
->length
> 0) {
5972 /* look for next match */
5973 j
= stringlib_find(self
->str
+i
, self
->length
-i
,
5974 str1
->str
, str1
->length
,
5979 /* copy unchanged part [i:j] */
5980 Py_UNICODE_COPY(p
, self
->str
+i
, j
-i
);
5983 /* copy substitution string */
5984 if (str2
->length
> 0) {
5985 Py_UNICODE_COPY(p
, str2
->str
, str2
->length
);
5988 i
= j
+ str1
->length
;
5990 if (i
< self
->length
)
5991 /* copy tail [i:] */
5992 Py_UNICODE_COPY(p
, self
->str
+i
, self
->length
-i
);
5996 Py_UNICODE_COPY(p
, str2
->str
, str2
->length
);
6000 *p
++ = self
->str
[i
++];
6002 Py_UNICODE_COPY(p
, self
->str
+i
, self
->length
-i
);
6005 return (PyObject
*) u
;
6008 /* nothing to replace; return original string (when possible) */
6009 if (PyUnicode_CheckExact(self
)) {
6011 return (PyObject
*) self
;
6013 return PyUnicode_FromUnicode(self
->str
, self
->length
);
6016 /* --- Unicode Object Methods --------------------------------------------- */
6018 PyDoc_STRVAR(title__doc__
,
6019 "S.title() -> unicode\n\
6021 Return a titlecased version of S, i.e. words start with title case\n\
6022 characters, all remaining cased characters have lower case.");
6025 unicode_title(PyUnicodeObject
*self
)
6027 return fixup(self
, fixtitle
);
6030 PyDoc_STRVAR(capitalize__doc__
,
6031 "S.capitalize() -> unicode\n\
6033 Return a capitalized version of S, i.e. make the first character\n\
6034 have upper case and the rest lower case.");
6037 unicode_capitalize(PyUnicodeObject
*self
)
6039 return fixup(self
, fixcapitalize
);
6043 PyDoc_STRVAR(capwords__doc__
,
6044 "S.capwords() -> unicode\n\
6046 Apply .capitalize() to all words in S and return the result with\n\
6047 normalized whitespace (all whitespace strings are replaced by ' ').");
6050 unicode_capwords(PyUnicodeObject
*self
)
6056 /* Split into words */
6057 list
= split(self
, NULL
, -1);
6061 /* Capitalize each word */
6062 for (i
= 0; i
< PyList_GET_SIZE(list
); i
++) {
6063 item
= fixup((PyUnicodeObject
*)PyList_GET_ITEM(list
, i
),
6067 Py_DECREF(PyList_GET_ITEM(list
, i
));
6068 PyList_SET_ITEM(list
, i
, item
);
6071 /* Join the words to form a new string */
6072 item
= PyUnicode_Join(NULL
, list
);
6076 return (PyObject
*)item
;
6080 /* Argument converter. Coerces to a single unicode character */
6083 convert_uc(PyObject
*obj
, void *addr
)
6085 Py_UNICODE
*fillcharloc
= (Py_UNICODE
*)addr
;
6089 uniobj
= PyUnicode_FromObject(obj
);
6090 if (uniobj
== NULL
) {
6091 PyErr_SetString(PyExc_TypeError
,
6092 "The fill character cannot be converted to Unicode");
6095 if (PyUnicode_GET_SIZE(uniobj
) != 1) {
6096 PyErr_SetString(PyExc_TypeError
,
6097 "The fill character must be exactly one character long");
6101 unistr
= PyUnicode_AS_UNICODE(uniobj
);
6102 *fillcharloc
= unistr
[0];
6107 PyDoc_STRVAR(center__doc__
,
6108 "S.center(width[, fillchar]) -> unicode\n\
6110 Return S centered in a Unicode string of length width. Padding is\n\
6111 done using the specified fill character (default is a space)");
6114 unicode_center(PyUnicodeObject
*self
, PyObject
*args
)
6116 Py_ssize_t marg
, left
;
6118 Py_UNICODE fillchar
= ' ';
6120 if (!PyArg_ParseTuple(args
, "n|O&:center", &width
, convert_uc
, &fillchar
))
6123 if (self
->length
>= width
&& PyUnicode_CheckExact(self
)) {
6125 return (PyObject
*) self
;
6128 marg
= width
- self
->length
;
6129 left
= marg
/ 2 + (marg
& width
& 1);
6131 return (PyObject
*) pad(self
, left
, marg
- left
, fillchar
);
6136 /* This code should go into some future Unicode collation support
6137 module. The basic comparison should compare ordinals on a naive
6138 basis (this is what Java does and thus Jython too). */
6140 /* speedy UTF-16 code point order comparison */
6142 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6144 static short utf16Fixup
[32] =
6146 0, 0, 0, 0, 0, 0, 0, 0,
6147 0, 0, 0, 0, 0, 0, 0, 0,
6148 0, 0, 0, 0, 0, 0, 0, 0,
6149 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
6153 unicode_compare(PyUnicodeObject
*str1
, PyUnicodeObject
*str2
)
6155 Py_ssize_t len1
, len2
;
6157 Py_UNICODE
*s1
= str1
->str
;
6158 Py_UNICODE
*s2
= str2
->str
;
6160 len1
= str1
->length
;
6161 len2
= str2
->length
;
6163 while (len1
> 0 && len2
> 0) {
6169 if (c1
> (1<<11) * 26)
6170 c1
+= utf16Fixup
[c1
>>11];
6171 if (c2
> (1<<11) * 26)
6172 c2
+= utf16Fixup
[c2
>>11];
6173 /* now c1 and c2 are in UTF-32-compatible order */
6176 return (c1
< c2
) ? -1 : 1;
6181 return (len1
< len2
) ? -1 : (len1
!= len2
);
6187 unicode_compare(PyUnicodeObject
*str1
, PyUnicodeObject
*str2
)
6189 register Py_ssize_t len1
, len2
;
6191 Py_UNICODE
*s1
= str1
->str
;
6192 Py_UNICODE
*s2
= str2
->str
;
6194 len1
= str1
->length
;
6195 len2
= str2
->length
;
6197 while (len1
> 0 && len2
> 0) {
6204 return (c1
< c2
) ? -1 : 1;
6209 return (len1
< len2
) ? -1 : (len1
!= len2
);
6214 int PyUnicode_Compare(PyObject
*left
,
6217 PyUnicodeObject
*u
= NULL
, *v
= NULL
;
6220 /* Coerce the two arguments */
6221 u
= (PyUnicodeObject
*)PyUnicode_FromObject(left
);
6224 v
= (PyUnicodeObject
*)PyUnicode_FromObject(right
);
6228 /* Shortcut for empty or interned objects */
6235 result
= unicode_compare(u
, v
);
/* Rich comparison built on PyUnicode_Compare().  A result of -1 with an
   exception set is treated as a failure.  Otherwise the three-way result
   is turned into a boolean (== 0, != 0, <= 0, >= 0, == -1, == 1 are the
   visible conversions) and returned through PyBool_FromLong().
   On failure: a TypeError makes the function return Py_NotImplemented;
   for Py_EQ / Py_NE a UnicodeDecodeError is replaced by a UnicodeWarning
   and the operands are reported as unequal (result is op == Py_NE).
   NOTE(review): the switch labels selecting each conversion and the
   statements under several conditions are not visible in this view. */
6247 PyObject
*PyUnicode_RichCompare(PyObject
*left
,
6253 result
= PyUnicode_Compare(left
, right
);
6254 if (result
== -1 && PyErr_Occurred())
6257 /* Convert the return value to a Boolean */
6260 result
= (result
== 0);
6263 result
= (result
!= 0);
6266 result
= (result
<= 0);
6269 result
= (result
>= 0);
6272 result
= (result
== -1);
6275 result
= (result
== 1);
6278 return PyBool_FromLong(result
);
6284 Type errors mean that PyUnicode_FromObject() could not convert
6285 one of the arguments (usually the right hand side) to Unicode,
6286 ie. we can't handle the comparison request. However, it is
6287 possible that the other object knows a comparison method, which
6288 is why we return Py_NotImplemented to give the other object a
6292 if (PyErr_ExceptionMatches(PyExc_TypeError
)) {
6294 Py_INCREF(Py_NotImplemented
);
6295 return Py_NotImplemented
;
6297 if (op
!= Py_EQ
&& op
!= Py_NE
)
6300 /* Equality comparison.
6302 This is a special case: we silence any PyExc_UnicodeDecodeError
6303 and instead turn it into a PyErr_UnicodeWarning.
6306 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError
))
6309 if (PyErr_Warn(PyExc_UnicodeWarning
,
6311 "Unicode equal comparison "
6312 "failed to convert both arguments to Unicode - "
6313 "interpreting them as being unequal" :
6314 "Unicode unequal comparison "
6315 "failed to convert both arguments to Unicode - "
6316 "interpreting them as being unequal"
6319 result
= (op
== Py_NE
);
6320 return PyBool_FromLong(result
);
/* Containment test.  The element is coerced to Unicode first (sub), then
   the container (str); the answer comes from
   stringlib_contains_obj(str, sub).
   NOTE(review): error handling and reference cleanup are not visible in
   this view. */
6323 int PyUnicode_Contains(PyObject
*container
,
6326 PyObject
*str
, *sub
;
6329 /* Coerce the two arguments */
6330 sub
= PyUnicode_FromObject(element
);
6335 str
= PyUnicode_FromObject(container
);
6341 result
= stringlib_contains_obj(str
, sub
);
/* Both operands are coerced with PyUnicode_FromObject().  When v is the
   shared empty string the function returns u, and when u is the shared
   empty string it returns v.  Otherwise a new object of length
   u->length + v->length is created with _PyUnicode_New() and filled with
   the contents of u followed by the contents of v.
   NOTE(review): NULL checks and reference handling are not visible in
   this view. */
6349 /* Concat to string or Unicode object giving a new Unicode object. */
6351 PyObject
*PyUnicode_Concat(PyObject
*left
,
6354 PyUnicodeObject
*u
= NULL
, *v
= NULL
, *w
;
6356 /* Coerce the two arguments */
6357 u
= (PyUnicodeObject
*)PyUnicode_FromObject(left
);
6360 v
= (PyUnicodeObject
*)PyUnicode_FromObject(right
);
6365 if (v
== unicode_empty
) {
6367 return (PyObject
*)u
;
6369 if (u
== unicode_empty
) {
6371 return (PyObject
*)v
;
6374 /* Concat the two Unicode strings */
6375 w
= _PyUnicode_New(u
->length
+ v
->length
);
6378 Py_UNICODE_COPY(w
->str
, u
->str
, u
->length
);
6379 Py_UNICODE_COPY(w
->str
+ u
->length
, v
->str
, v
->length
);
6383 return (PyObject
*)w
;
/* S.count(): docstring followed by the implementation.  start defaults
   to 0 and end to PY_SSIZE_T_MAX; the arguments are parsed by
   stringlib_parse_args_finds_unicode(), the indices are normalised with
   ADJUST_INDICES against self->length, and the count returned by
   stringlib_count() over self->str + start .. end is wrapped with
   PyInt_FromSsize_t().  The substring reference is released afterwards. */
6391 PyDoc_STRVAR(count__doc__
,
6392 "S.count(sub[, start[, end]]) -> int\n\
6394 Return the number of non-overlapping occurrences of substring sub in\n\
6395 Unicode string S[start:end]. Optional arguments start and end are\n\
6396 interpreted as in slice notation.");
6399 unicode_count(PyUnicodeObject
*self
, PyObject
*args
)
6401 PyUnicodeObject
*substring
;
6402 Py_ssize_t start
= 0;
6403 Py_ssize_t end
= PY_SSIZE_T_MAX
;
6406 if (!stringlib_parse_args_finds_unicode("count", args
, &substring
,
6410 ADJUST_INDICES(start
, end
, self
->length
);
6411 result
= PyInt_FromSsize_t(
6412 stringlib_count(self
->str
+ start
, end
- start
,
6413 substring
->str
, substring
->length
,
6417 Py_DECREF(substring
);
/* S.encode(): docstring followed by the implementation.  Accepts the
   optional keyword arguments encoding and errors (format |ss:encode, both
   defaulting to NULL) and delegates to PyUnicode_AsEncodedObject().  When
   the codec result is neither a str nor a unicode object a TypeError is
   formatted that includes the type name of the result.
   NOTE(review): the return statements and cleanup are not visible in
   this view. */
6422 PyDoc_STRVAR(encode__doc__
,
6423 "S.encode([encoding[,errors]]) -> string or unicode\n\
6425 Encodes S using the codec registered for encoding. encoding defaults\n\
6426 to the default encoding. errors may be given to set a different error\n\
6427 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6428 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6429 'xmlcharrefreplace' as well as any other name registered with\n\
6430 codecs.register_error that can handle UnicodeEncodeErrors.");
6433 unicode_encode(PyUnicodeObject
*self
, PyObject
*args
, PyObject
*kwargs
)
6435 static char *kwlist
[] = {"encoding", "errors", 0};
6436 char *encoding
= NULL
;
6437 char *errors
= NULL
;
6440 if (!PyArg_ParseTupleAndKeywords(args
, kwargs
, "|ss:encode",
6441 kwlist
, &encoding
, &errors
))
6443 v
= PyUnicode_AsEncodedObject((PyObject
*)self
, encoding
, errors
);
6446 if (!PyString_Check(v
) && !PyUnicode_Check(v
)) {
6447 PyErr_Format(PyExc_TypeError
,
6448 "encoder did not return a string/unicode object "
6450 Py_TYPE(v
)->tp_name
);
/* S.decode(): docstring followed by the implementation.  Accepts the
   optional keyword arguments encoding and errors (format |ss:decode, both
   defaulting to NULL) and delegates to PyUnicode_AsDecodedObject().  When
   the codec result is neither a str nor a unicode object a TypeError is
   formatted that includes the type name of the result.
   NOTE(review): the return statements and cleanup are not visible in
   this view. */
6460 PyDoc_STRVAR(decode__doc__
,
6461 "S.decode([encoding[,errors]]) -> string or unicode\n\
6463 Decodes S using the codec registered for encoding. encoding defaults\n\
6464 to the default encoding. errors may be given to set a different error\n\
6465 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6466 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6467 as well as any other name registered with codecs.register_error that is\n\
6468 able to handle UnicodeDecodeErrors.");
6471 unicode_decode(PyUnicodeObject
*self
, PyObject
*args
, PyObject
*kwargs
)
6473 static char *kwlist
[] = {"encoding", "errors", 0};
6474 char *encoding
= NULL
;
6475 char *errors
= NULL
;
6478 if (!PyArg_ParseTupleAndKeywords(args
, kwargs
, "|ss:decode",
6479 kwlist
, &encoding
, &errors
))
6481 v
= PyUnicode_AsDecodedObject((PyObject
*)self
, encoding
, errors
);
6484 if (!PyString_Check(v
) && !PyUnicode_Check(v
)) {
6485 PyErr_Format(PyExc_TypeError
,
6486 "decoder did not return a string/unicode object "
6488 Py_TYPE(v
)->tp_name
);
/* S.expandtabs(): docstring followed by a two-pass implementation.
   tabsize is parsed as an optional int (format |i:expandtabs).
   Pass 1 walks self->str to size the output: i counts characters up to
   and including the most recent newline or carriage return, j counts
   characters since then, and each addition is checked against
   PY_SSIZE_T_MAX.  Pass 2 allocates i + j characters with _PyUnicode_New()
   and fills them, resetting the column counter on newline / carriage
   return.  The visible overflow handler raises OverflowError with the
   message new string is too long.
   NOTE(review): the tab test, the copy statements and the jump targets
   are not visible in this view. */
6498 PyDoc_STRVAR(expandtabs__doc__
,
6499 "S.expandtabs([tabsize]) -> unicode\n\
6501 Return a copy of S where all tab characters are expanded using spaces.\n\
6502 If tabsize is not given, a tab size of 8 characters is assumed.");
6505 unicode_expandtabs(PyUnicodeObject
*self
, PyObject
*args
)
6511 Py_ssize_t i
, j
, incr
;
6515 if (!PyArg_ParseTuple(args
, "|i:expandtabs", &tabsize
))
6518 /* First pass: determine size of output string */
6519 i
= 0; /* chars up to and including most recent \n or \r */
6520 j
= 0; /* chars since most recent \n or \r (use in tab calculations) */
6521 e
= self
->str
+ self
->length
; /* end of input */
6522 for (p
= self
->str
; p
< e
; p
++)
6525 incr
= tabsize
- (j
% tabsize
); /* cannot overflow */
6526 if (j
> PY_SSIZE_T_MAX
- incr
)
6532 if (j
> PY_SSIZE_T_MAX
- 1)
6535 if (*p
== '\n' || *p
== '\r') {
6536 if (i
> PY_SSIZE_T_MAX
- j
)
6543 if (i
> PY_SSIZE_T_MAX
- j
)
6546 /* Second pass: create output string and fill it */
6547 u
= _PyUnicode_New(i
+ j
);
6551 j
= 0; /* same as in first pass */
6552 q
= u
->str
; /* next output char */
6553 qe
= u
->str
+ u
->length
; /* end of output */
6555 for (p
= self
->str
; p
< e
; p
++)
6558 i
= tabsize
- (j
% tabsize
);
6572 if (*p
== '\n' || *p
== '\r')
6576 return (PyObject
*) u
;
6581 PyErr_SetString(PyExc_OverflowError
, "new string is too long");
/* S.find(): docstring followed by the implementation.  Arguments are
   parsed by stringlib_parse_args_finds_unicode(), the search is done by
   stringlib_find_slice() over the buffers of self and substring, the
   substring reference is released, and the resulting index is returned
   as a Python int via PyInt_FromSsize_t(). */
6585 PyDoc_STRVAR(find__doc__
,
6586 "S.find(sub [,start [,end]]) -> int\n\
6588 Return the lowest index in S where substring sub is found,\n\
6589 such that sub is contained within S[start:end]. Optional\n\
6590 arguments start and end are interpreted as in slice notation.\n\
6592 Return -1 on failure.");
6595 unicode_find(PyUnicodeObject
*self
, PyObject
*args
)
6597 PyUnicodeObject
*substring
;
6602 if (!stringlib_parse_args_finds_unicode("find", args
, &substring
,
6606 result
= stringlib_find_slice(
6607 PyUnicode_AS_UNICODE(self
), PyUnicode_GET_SIZE(self
),
6608 PyUnicode_AS_UNICODE(substring
), PyUnicode_GET_SIZE(substring
),
6612 Py_DECREF(substring
);
6614 return PyInt_FromSsize_t(result
);
/* Item access: an index below 0 or at/after self->length sets IndexError
   (string index out of range); otherwise a new one-character Unicode
   object is built from &self->str[index] with PyUnicode_FromUnicode(). */
6618 unicode_getitem(PyUnicodeObject
*self
, Py_ssize_t index
)
6620 if (index
< 0 || index
>= self
->length
) {
6621 PyErr_SetString(PyExc_IndexError
, "string index out of range");
6625 return (PyObject
*) PyUnicode_FromUnicode(&self
->str
[index
], 1);
/* Hash of a Unicode object.  Asserts that the hash secret has been
   initialised and consults the cached self->hash (tested against -1)
   first.  The computation starts from _Py_HashSecret.prefix, folds each
   code unit in with x = (1000003*x) ^ unit, then xors in the string
   length and _Py_HashSecret.suffix.  Per the inline comment the empty
   string hashes to 0.
   NOTE(review): the loop header, the early returns and the final store
   of the result are not visible in this view. */
6629 unicode_hash(PyUnicodeObject
*self
)
6631 /* Since Unicode objects compare equal to their ASCII string
6632 counterparts, they should use the individual character values
6633 as basis for their hash value. This is needed to assure that
6634 strings and Unicode objects behave in the same way as
6637 register Py_ssize_t len
;
6638 register Py_UNICODE
*p
;
6642 assert(_Py_HashSecret_Initialized
);
6644 if (self
->hash
!= -1)
6646 len
= PyUnicode_GET_SIZE(self
);
6648 We make the hash of the empty string be 0, rather than using
6649 (prefix ^ suffix), since this slightly obfuscates the hash secret
6655 p
= PyUnicode_AS_UNICODE(self
);
6656 x
= _Py_HashSecret
.prefix
;
6659 x
= (1000003*x
) ^ *p
++;
6660 x
^= PyUnicode_GET_SIZE(self
);
6661 x
^= _Py_HashSecret
.suffix
;
/* S.index(): docstring followed by the implementation.  Same search as
   S.find() (stringlib_parse_args_finds_unicode + stringlib_find_slice),
   but a failed search sets ValueError (substring not found); a
   successful one returns the index via PyInt_FromSsize_t().
   NOTE(review): the condition guarding the ValueError is not visible in
   this view. */
6668 PyDoc_STRVAR(index__doc__
,
6669 "S.index(sub [,start [,end]]) -> int\n\
6671 Like S.find() but raise ValueError when the substring is not found.");
6674 unicode_index(PyUnicodeObject
*self
, PyObject
*args
)
6677 PyUnicodeObject
*substring
;
6681 if (!stringlib_parse_args_finds_unicode("index", args
, &substring
,
6685 result
= stringlib_find_slice(
6686 PyUnicode_AS_UNICODE(self
), PyUnicode_GET_SIZE(self
),
6687 PyUnicode_AS_UNICODE(substring
), PyUnicode_GET_SIZE(substring
),
6691 Py_DECREF(substring
);
6694 PyErr_SetString(PyExc_ValueError
, "substring not found");
6698 return PyInt_FromSsize_t(result
);
/* S.islower(): docstring followed by the implementation.  A
   one-character string is answered directly with Py_UNICODE_ISLOWER and
   an empty string gives False.  Otherwise the scan returns False at the
   first uppercase or titlecase character and finally returns the cased
   flag.
   NOTE(review): the declaration and the assignment of cased are not
   visible here; presumably it is set when a lowercase character is
   seen -- confirm. */
6701 PyDoc_STRVAR(islower__doc__
,
6702 "S.islower() -> bool\n\
6704 Return True if all cased characters in S are lowercase and there is\n\
6705 at least one cased character in S, False otherwise.");
6708 unicode_islower(PyUnicodeObject
*self
)
6710 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6711 register const Py_UNICODE
*e
;
6714 /* Shortcut for single character strings */
6715 if (PyUnicode_GET_SIZE(self
) == 1)
6716 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p
));
6718 /* Special case for empty strings */
6719 if (PyUnicode_GET_SIZE(self
) == 0)
6720 return PyBool_FromLong(0);
6722 e
= p
+ PyUnicode_GET_SIZE(self
);
6724 for (; p
< e
; p
++) {
6725 register const Py_UNICODE ch
= *p
;
6727 if (Py_UNICODE_ISUPPER(ch
) || Py_UNICODE_ISTITLE(ch
))
6728 return PyBool_FromLong(0);
6729 else if (!cased
&& Py_UNICODE_ISLOWER(ch
))
6732 return PyBool_FromLong(cased
);
/* S.isupper(): docstring followed by the implementation.  A
   one-character string is answered directly with
   Py_UNICODE_ISUPPER(*p) != 0 and an empty string gives False.
   Otherwise the scan returns False at the first lowercase or titlecase
   character and finally returns the cased flag.
   NOTE(review): the declaration and the assignment of cased are not
   visible here; presumably it is set when an uppercase character is
   seen -- confirm. */
6735 PyDoc_STRVAR(isupper__doc__
,
6736 "S.isupper() -> bool\n\
6738 Return True if all cased characters in S are uppercase and there is\n\
6739 at least one cased character in S, False otherwise.");
6742 unicode_isupper(PyUnicodeObject
*self
)
6744 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6745 register const Py_UNICODE
*e
;
6748 /* Shortcut for single character strings */
6749 if (PyUnicode_GET_SIZE(self
) == 1)
6750 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p
) != 0);
6752 /* Special case for empty strings */
6753 if (PyUnicode_GET_SIZE(self
) == 0)
6754 return PyBool_FromLong(0);
6756 e
= p
+ PyUnicode_GET_SIZE(self
);
6758 for (; p
< e
; p
++) {
6759 register const Py_UNICODE ch
= *p
;
6761 if (Py_UNICODE_ISLOWER(ch
) || Py_UNICODE_ISTITLE(ch
))
6762 return PyBool_FromLong(0);
6763 else if (!cased
&& Py_UNICODE_ISUPPER(ch
))
6766 return PyBool_FromLong(cased
);
/* S.istitle(): docstring followed by the implementation.  A
   one-character string is title-cased when the character is titlecase or
   uppercase; an empty string gives False.  The scan tracks
   previous_is_cased: an uppercase/titlecase character directly after a
   cased one gives False, a lowercase character that does not follow a
   cased one gives False, and any other character resets the flag.  The
   final answer is the cased flag.
   NOTE(review): the statements that set cased are not visible in this
   view. */
6769 PyDoc_STRVAR(istitle__doc__
,
6770 "S.istitle() -> bool\n\
6772 Return True if S is a titlecased string and there is at least one\n\
6773 character in S, i.e. upper- and titlecase characters may only\n\
6774 follow uncased characters and lowercase characters only cased ones.\n\
6775 Return False otherwise.");
6778 unicode_istitle(PyUnicodeObject
*self
)
6780 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6781 register const Py_UNICODE
*e
;
6782 int cased
, previous_is_cased
;
6784 /* Shortcut for single character strings */
6785 if (PyUnicode_GET_SIZE(self
) == 1)
6786 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p
) != 0) ||
6787 (Py_UNICODE_ISUPPER(*p
) != 0));
6789 /* Special case for empty strings */
6790 if (PyUnicode_GET_SIZE(self
) == 0)
6791 return PyBool_FromLong(0);
6793 e
= p
+ PyUnicode_GET_SIZE(self
);
6795 previous_is_cased
= 0;
6796 for (; p
< e
; p
++) {
6797 register const Py_UNICODE ch
= *p
;
6799 if (Py_UNICODE_ISUPPER(ch
) || Py_UNICODE_ISTITLE(ch
)) {
6800 if (previous_is_cased
)
6801 return PyBool_FromLong(0);
6802 previous_is_cased
= 1;
6805 else if (Py_UNICODE_ISLOWER(ch
)) {
6806 if (!previous_is_cased
)
6807 return PyBool_FromLong(0);
6808 previous_is_cased
= 1;
6812 previous_is_cased
= 0;
6814 return PyBool_FromLong(cased
);
/* S.isspace(): docstring followed by the implementation.  A
   one-character string that passes Py_UNICODE_ISSPACE gives True
   immediately and an empty string gives False.  Otherwise every
   character is tested; the first one failing Py_UNICODE_ISSPACE gives
   False, and a full pass gives True. */
6817 PyDoc_STRVAR(isspace__doc__
,
6818 "S.isspace() -> bool\n\
6820 Return True if all characters in S are whitespace\n\
6821 and there is at least one character in S, False otherwise.");
6824 unicode_isspace(PyUnicodeObject
*self
)
6826 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6827 register const Py_UNICODE
*e
;
6829 /* Shortcut for single character strings */
6830 if (PyUnicode_GET_SIZE(self
) == 1 &&
6831 Py_UNICODE_ISSPACE(*p
))
6832 return PyBool_FromLong(1);
6834 /* Special case for empty strings */
6835 if (PyUnicode_GET_SIZE(self
) == 0)
6836 return PyBool_FromLong(0);
6838 e
= p
+ PyUnicode_GET_SIZE(self
);
6839 for (; p
< e
; p
++) {
6840 if (!Py_UNICODE_ISSPACE(*p
))
6841 return PyBool_FromLong(0);
6843 return PyBool_FromLong(1);
/* S.isalpha(): docstring followed by the implementation.  A
   one-character string that passes Py_UNICODE_ISALPHA gives True
   immediately and an empty string gives False.  Otherwise every
   character is tested; the first one failing Py_UNICODE_ISALPHA gives
   False, and a full pass gives True. */
6846 PyDoc_STRVAR(isalpha__doc__
,
6847 "S.isalpha() -> bool\n\
6849 Return True if all characters in S are alphabetic\n\
6850 and there is at least one character in S, False otherwise.");
6853 unicode_isalpha(PyUnicodeObject
*self
)
6855 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6856 register const Py_UNICODE
*e
;
6858 /* Shortcut for single character strings */
6859 if (PyUnicode_GET_SIZE(self
) == 1 &&
6860 Py_UNICODE_ISALPHA(*p
))
6861 return PyBool_FromLong(1);
6863 /* Special case for empty strings */
6864 if (PyUnicode_GET_SIZE(self
) == 0)
6865 return PyBool_FromLong(0);
6867 e
= p
+ PyUnicode_GET_SIZE(self
);
6868 for (; p
< e
; p
++) {
6869 if (!Py_UNICODE_ISALPHA(*p
))
6870 return PyBool_FromLong(0);
6872 return PyBool_FromLong(1);
/* S.isalnum(): docstring followed by the implementation.  A
   one-character string that passes Py_UNICODE_ISALNUM gives True
   immediately and an empty string gives False.  Otherwise every
   character is tested; the first one failing Py_UNICODE_ISALNUM gives
   False, and a full pass gives True. */
6875 PyDoc_STRVAR(isalnum__doc__
,
6876 "S.isalnum() -> bool\n\
6878 Return True if all characters in S are alphanumeric\n\
6879 and there is at least one character in S, False otherwise.");
6882 unicode_isalnum(PyUnicodeObject
*self
)
6884 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6885 register const Py_UNICODE
*e
;
6887 /* Shortcut for single character strings */
6888 if (PyUnicode_GET_SIZE(self
) == 1 &&
6889 Py_UNICODE_ISALNUM(*p
))
6890 return PyBool_FromLong(1);
6892 /* Special case for empty strings */
6893 if (PyUnicode_GET_SIZE(self
) == 0)
6894 return PyBool_FromLong(0);
6896 e
= p
+ PyUnicode_GET_SIZE(self
);
6897 for (; p
< e
; p
++) {
6898 if (!Py_UNICODE_ISALNUM(*p
))
6899 return PyBool_FromLong(0);
6901 return PyBool_FromLong(1);
/* S.isdecimal(): docstring followed by the implementation.  A
   one-character string that passes Py_UNICODE_ISDECIMAL gives True
   immediately and an empty string gives False.  Otherwise every
   character is tested; the first one failing Py_UNICODE_ISDECIMAL gives
   False, and a full pass gives True.
   NOTE(review): the closing line of the docstring is not visible in this
   view. */
6904 PyDoc_STRVAR(isdecimal__doc__
,
6905 "S.isdecimal() -> bool\n\
6907 Return True if there are only decimal characters in S,\n\
6911 unicode_isdecimal(PyUnicodeObject
*self
)
6913 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6914 register const Py_UNICODE
*e
;
6916 /* Shortcut for single character strings */
6917 if (PyUnicode_GET_SIZE(self
) == 1 &&
6918 Py_UNICODE_ISDECIMAL(*p
))
6919 return PyBool_FromLong(1);
6921 /* Special case for empty strings */
6922 if (PyUnicode_GET_SIZE(self
) == 0)
6923 return PyBool_FromLong(0);
6925 e
= p
+ PyUnicode_GET_SIZE(self
);
6926 for (; p
< e
; p
++) {
6927 if (!Py_UNICODE_ISDECIMAL(*p
))
6928 return PyBool_FromLong(0);
6930 return PyBool_FromLong(1);
/* S.isdigit(): docstring followed by the implementation.  A
   one-character string that passes Py_UNICODE_ISDIGIT gives True
   immediately and an empty string gives False.  Otherwise every
   character is tested; the first one failing Py_UNICODE_ISDIGIT gives
   False, and a full pass gives True. */
6933 PyDoc_STRVAR(isdigit__doc__
,
6934 "S.isdigit() -> bool\n\
6936 Return True if all characters in S are digits\n\
6937 and there is at least one character in S, False otherwise.");
6940 unicode_isdigit(PyUnicodeObject
*self
)
6942 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6943 register const Py_UNICODE
*e
;
6945 /* Shortcut for single character strings */
6946 if (PyUnicode_GET_SIZE(self
) == 1 &&
6947 Py_UNICODE_ISDIGIT(*p
))
6948 return PyBool_FromLong(1);
6950 /* Special case for empty strings */
6951 if (PyUnicode_GET_SIZE(self
) == 0)
6952 return PyBool_FromLong(0);
6954 e
= p
+ PyUnicode_GET_SIZE(self
);
6955 for (; p
< e
; p
++) {
6956 if (!Py_UNICODE_ISDIGIT(*p
))
6957 return PyBool_FromLong(0);
6959 return PyBool_FromLong(1);
/* S.isnumeric(): docstring followed by the implementation.  A
   one-character string that passes Py_UNICODE_ISNUMERIC gives True
   immediately and an empty string gives False.  Otherwise every
   character is tested; the first one failing Py_UNICODE_ISNUMERIC gives
   False, and a full pass gives True.
   NOTE(review): the closing line of the docstring is not visible in this
   view. */
6962 PyDoc_STRVAR(isnumeric__doc__
,
6963 "S.isnumeric() -> bool\n\
6965 Return True if there are only numeric characters in S,\n\
6969 unicode_isnumeric(PyUnicodeObject
*self
)
6971 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6972 register const Py_UNICODE
*e
;
6974 /* Shortcut for single character strings */
6975 if (PyUnicode_GET_SIZE(self
) == 1 &&
6976 Py_UNICODE_ISNUMERIC(*p
))
6977 return PyBool_FromLong(1);
6979 /* Special case for empty strings */
6980 if (PyUnicode_GET_SIZE(self
) == 0)
6981 return PyBool_FromLong(0);
6983 e
= p
+ PyUnicode_GET_SIZE(self
);
6984 for (; p
< e
; p
++) {
6985 if (!Py_UNICODE_ISNUMERIC(*p
))
6986 return PyBool_FromLong(0);
6988 return PyBool_FromLong(1);
/* S.join(): docstring followed by the method, which forwards self (the
   separator) and data (the iterable) unchanged to PyUnicode_Join(). */
6991 PyDoc_STRVAR(join__doc__
,
6992 "S.join(iterable) -> unicode\n\
6994 Return a string which is the concatenation of the strings in the\n\
6995 iterable. The separator between elements is S.");
6998 unicode_join(PyObject
*self
, PyObject
*data
)
7000 return PyUnicode_Join(self
, data
);
/* Length slot: returns the stored self->length field. */
7004 unicode_length(PyUnicodeObject
*self
)
7006 return self
->length
;
/* Docstring for S.ljust().  The signature line advertises a unicode
   result: unicode_ljust() returns either self or the padded string built
   by pad(), never an int, and this now agrees with the S.rjust()
   docstring. */
7009 PyDoc_STRVAR(ljust__doc__
,
7010 "S.ljust(width[, fillchar]) -> unicode\n\
7012 Return S left-justified in a Unicode string of length width. Padding is\n\
7013 done using the specified fill character (default is a space).");
/* S.ljust() implementation.  Parses a required width and an optional
   fill character (format n|O&:ljust, converted by convert_uc, default is
   a space).  When self is an exact unicode object already at least width
   long, self is returned; otherwise pad() adds width - self->length
   fill characters on the right (left padding is 0).
   NOTE(review): the reference-count handling on the return-self path is
   not visible in this view. */
7016 unicode_ljust(PyUnicodeObject
*self
, PyObject
*args
)
7019 Py_UNICODE fillchar
= ' ';
7021 if (!PyArg_ParseTuple(args
, "n|O&:ljust", &width
, convert_uc
, &fillchar
))
7024 if (self
->length
>= width
&& PyUnicode_CheckExact(self
)) {
7026 return (PyObject
*) self
;
7029 return (PyObject
*) pad(self
, 0, width
- self
->length
, fillchar
);
/* S.lower(): docstring followed by the method, which returns
   fixup(self, fixlower). */
7032 PyDoc_STRVAR(lower__doc__
,
7033 "S.lower() -> unicode\n\
7035 Return a copy of the string S converted to lowercase.");
7038 unicode_lower(PyUnicodeObject
*self
)
7040 return fixup(self
, fixlower
);
/* Strip-mode support.  stripformat holds the argument format string for
   lstrip, rstrip and strip in that order, so RIGHTSTRIP (1) selects the
   rstrip entry.  STRIPNAME(i) skips the first three characters of
   stripformat[i] (the |O: prefix), leaving the bare method name.
   NOTE(review): LEFTSTRIP and BOTHSTRIP are used by the functions below
   but their definitions are not visible in this view. */
7044 #define RIGHTSTRIP 1
7047 /* Arrays indexed by above */
7048 static const char *stripformat
[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};
7050 #define STRIPNAME(i) (stripformat[i]+3)
/* Strip the characters contained in sepobj from self.  A bloom mask of
   the separator characters is built once with make_bloom_mask().  Unless
   striptype is RIGHTSTRIP, index i advances past leading members of the
   set; unless striptype is LEFTSTRIP, index j moves back past trailing
   members (never below i).  When nothing was stripped from an exact
   unicode object, self is returned; otherwise a new string is built from
   s+i with length j-i.
   NOTE(review): the initialisation and stepping of i and j are not
   visible in this view. */
7052 /* externally visible for str.strip(unicode) */
7054 _PyUnicode_XStrip(PyUnicodeObject
*self
, int striptype
, PyObject
*sepobj
)
7056 Py_UNICODE
*s
= PyUnicode_AS_UNICODE(self
);
7057 Py_ssize_t len
= PyUnicode_GET_SIZE(self
);
7058 Py_UNICODE
*sep
= PyUnicode_AS_UNICODE(sepobj
);
7059 Py_ssize_t seplen
= PyUnicode_GET_SIZE(sepobj
);
7062 BLOOM_MASK sepmask
= make_bloom_mask(sep
, seplen
);
7065 if (striptype
!= RIGHTSTRIP
) {
7066 while (i
< len
&& BLOOM_MEMBER(sepmask
, s
[i
], sep
, seplen
)) {
7072 if (striptype
!= LEFTSTRIP
) {
7075 } while (j
>= i
&& BLOOM_MEMBER(sepmask
, s
[j
], sep
, seplen
));
7079 if (i
== 0 && j
== len
&& PyUnicode_CheckExact(self
)) {
7081 return (PyObject
*)self
;
7084 return PyUnicode_FromUnicode(s
+i
, j
-i
);
/* Whitespace strip.  Unless striptype is RIGHTSTRIP, index i advances
   past leading characters that satisfy Py_UNICODE_ISSPACE; unless
   striptype is LEFTSTRIP, index j moves back past trailing whitespace
   (never below i).  When nothing was stripped from an exact unicode
   object, self is returned; otherwise a new string is built from s+i
   with length j-i.
   NOTE(review): the initialisation and stepping of i and j are not
   visible in this view. */
7089 do_strip(PyUnicodeObject
*self
, int striptype
)
7091 Py_UNICODE
*s
= PyUnicode_AS_UNICODE(self
);
7092 Py_ssize_t len
= PyUnicode_GET_SIZE(self
), i
, j
;
7095 if (striptype
!= RIGHTSTRIP
) {
7096 while (i
< len
&& Py_UNICODE_ISSPACE(s
[i
])) {
7102 if (striptype
!= LEFTSTRIP
) {
7105 } while (j
>= i
&& Py_UNICODE_ISSPACE(s
[j
]));
7109 if (i
== 0 && j
== len
&& PyUnicode_CheckExact(self
)) {
7111 return (PyObject
*)self
;
7114 return PyUnicode_FromUnicode(s
+i
, j
-i
);
/* Strip with an optional chars argument, parsed with the format string
   stripformat[striptype].  A unicode argument goes straight to
   _PyUnicode_XStrip(); a str argument is first converted with
   PyUnicode_FromObject(); any other non-None argument produces a
   TypeError naming the method through STRIPNAME().  With no argument or
   None the whitespace version do_strip() is used.
   NOTE(review): the failure returns and the release of the converted
   separator are not visible in this view. */
7119 do_argstrip(PyUnicodeObject
*self
, int striptype
, PyObject
*args
)
7121 PyObject
*sep
= NULL
;
7123 if (!PyArg_ParseTuple(args
, (char *)stripformat
[striptype
], &sep
))
7126 if (sep
!= NULL
&& sep
!= Py_None
) {
7127 if (PyUnicode_Check(sep
))
7128 return _PyUnicode_XStrip(self
, striptype
, sep
);
7129 else if (PyString_Check(sep
)) {
7131 sep
= PyUnicode_FromObject(sep
);
7134 res
= _PyUnicode_XStrip(self
, striptype
, sep
);
7139 PyErr_Format(PyExc_TypeError
,
7140 "%s arg must be None, unicode or str",
7141 STRIPNAME(striptype
));
7146 return do_strip(self
, striptype
);
/* S.strip(): docstring followed by the method.  With an empty argument
   tuple it takes the fast path do_strip(self, BOTHSTRIP); otherwise the
   arguments are handled by do_argstrip(self, BOTHSTRIP, args). */
7150 PyDoc_STRVAR(strip__doc__
,
7151 "S.strip([chars]) -> unicode\n\
7153 Return a copy of the string S with leading and trailing\n\
7154 whitespace removed.\n\
7155 If chars is given and not None, remove characters in chars instead.\n\
7156 If chars is a str, it will be converted to unicode before stripping");
7159 unicode_strip(PyUnicodeObject
*self
, PyObject
*args
)
7161 if (PyTuple_GET_SIZE(args
) == 0)
7162 return do_strip(self
, BOTHSTRIP
); /* Common case */
7164 return do_argstrip(self
, BOTHSTRIP
, args
);
/* S.lstrip(): docstring followed by the method.  With an empty argument
   tuple it takes the fast path do_strip(self, LEFTSTRIP); otherwise the
   arguments are handled by do_argstrip(self, LEFTSTRIP, args). */
7168 PyDoc_STRVAR(lstrip__doc__
,
7169 "S.lstrip([chars]) -> unicode\n\
7171 Return a copy of the string S with leading whitespace removed.\n\
7172 If chars is given and not None, remove characters in chars instead.\n\
7173 If chars is a str, it will be converted to unicode before stripping");
7176 unicode_lstrip(PyUnicodeObject
*self
, PyObject
*args
)
7178 if (PyTuple_GET_SIZE(args
) == 0)
7179 return do_strip(self
, LEFTSTRIP
); /* Common case */
7181 return do_argstrip(self
, LEFTSTRIP
, args
);
/* S.rstrip(): docstring followed by the method.  With an empty argument
   tuple it takes the fast path do_strip(self, RIGHTSTRIP); otherwise the
   arguments are handled by do_argstrip(self, RIGHTSTRIP, args). */
7185 PyDoc_STRVAR(rstrip__doc__
,
7186 "S.rstrip([chars]) -> unicode\n\
7188 Return a copy of the string S with trailing whitespace removed.\n\
7189 If chars is given and not None, remove characters in chars instead.\n\
7190 If chars is a str, it will be converted to unicode before stripping");
7193 unicode_rstrip(PyUnicodeObject
*self
, PyObject
*args
)
7195 if (PyTuple_GET_SIZE(args
) == 0)
7196 return do_strip(self
, RIGHTSTRIP
); /* Common case */
7198 return do_argstrip(self
, RIGHTSTRIP
, args
);
/* Sequence repeat (str * len).  A repeat count of 1 on an exact unicode
   object returns the original string.  The character count
   len * str->length and the byte count (nchars + 1) * sizeof(Py_UNICODE)
   are both verified by dividing back; a mismatch sets OverflowError
   (repeated string is too long).  The result is allocated with
   _PyUnicode_New(nchars).  A one-character source is written with
   Py_UNICODE_FILL; otherwise the source is copied once and the filled
   prefix is then copied onto the remainder in chunks of
   min(done, nchars - done) characters.
   NOTE(review): the local declarations, the updates of done and the
   error returns are not visible in this view. */
7203 unicode_repeat(PyUnicodeObject
*str
, Py_ssize_t len
)
7213 if (len
== 1 && PyUnicode_CheckExact(str
)) {
7214 /* no repeat, return original string */
7216 return (PyObject
*) str
;
7219 /* ensure # of chars needed doesn't overflow int and # of bytes
7220 * needed doesn't overflow size_t
7222 nchars
= len
* str
->length
;
7223 if (len
&& nchars
/ len
!= str
->length
) {
7224 PyErr_SetString(PyExc_OverflowError
,
7225 "repeated string is too long");
7228 nbytes
= (nchars
+ 1) * sizeof(Py_UNICODE
);
7229 if (nbytes
/ sizeof(Py_UNICODE
) != (size_t)(nchars
+ 1)) {
7230 PyErr_SetString(PyExc_OverflowError
,
7231 "repeated string is too long");
7234 u
= _PyUnicode_New(nchars
);
7240 if (str
->length
== 1 && len
> 0) {
7241 Py_UNICODE_FILL(p
, str
->str
[0], len
);
7243 Py_ssize_t done
= 0; /* number of characters copied this far */
7244 if (done
< nchars
) {
7245 Py_UNICODE_COPY(p
, str
->str
, str
->length
);
7248 while (done
< nchars
) {
7249 Py_ssize_t n
= (done
<= nchars
-done
) ? done
: nchars
-done
;
7250 Py_UNICODE_COPY(p
+done
, p
, n
);
7255 return (PyObject
*) u
;
/* Public replace entry point.  obj, subobj and replobj are each coerced
   with PyUnicode_FromObject() (into self, str1 and str2) and the work is
   delegated to replace().
   NOTE(review): the remaining parameters, the NULL checks and the
   reference cleanup are not visible in this view. */
7258 PyObject
*PyUnicode_Replace(PyObject
*obj
,
7261 Py_ssize_t maxcount
)
7268 self
= PyUnicode_FromObject(obj
);
7271 str1
= PyUnicode_FromObject(subobj
);
7276 str2
= PyUnicode_FromObject(replobj
);
7282 result
= replace((PyUnicodeObject
*)self
,
7283 (PyUnicodeObject
*)str1
,
7284 (PyUnicodeObject
*)str2
,
/* S.replace(): docstring followed by the method.  Parses old, new and an
   optional count (format OO|n:replace, count defaults to -1), coerces
   old and new with PyUnicode_FromObject() and delegates to
   replace(self, str1, str2, maxcount).
   NOTE(review): the NULL checks and the reference cleanup are not
   visible in this view. */
7292 PyDoc_STRVAR(replace__doc__
,
7293 "S.replace(old, new[, count]) -> unicode\n\
7295 Return a copy of S with all occurrences of substring\n\
7296 old replaced by new. If the optional argument count is\n\
7297 given, only the first count occurrences are replaced.");
7300 unicode_replace(PyUnicodeObject
*self
, PyObject
*args
)
7302 PyUnicodeObject
*str1
;
7303 PyUnicodeObject
*str2
;
7304 Py_ssize_t maxcount
= -1;
7307 if (!PyArg_ParseTuple(args
, "OO|n:replace", &str1
, &str2
, &maxcount
))
7309 str1
= (PyUnicodeObject
*)PyUnicode_FromObject((PyObject
*)str1
);
7312 str2
= (PyUnicodeObject
*)PyUnicode_FromObject((PyObject
*)str2
);
7318 result
= replace(self
, str1
, str2
, maxcount
);
/* repr() slot: passes the buffer and size of the object to
   unicodeescape_string().
   NOTE(review): the final argument of that call is not visible in this
   view. */
7326 PyObject
*unicode_repr(PyObject
*unicode
)
7328 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode
),
7329 PyUnicode_GET_SIZE(unicode
),
/* S.rfind(): docstring followed by the implementation.  Arguments are
   parsed by stringlib_parse_args_finds_unicode(), the search is done by
   stringlib_rfind_slice() over the buffers of self and substring, the
   substring reference is released, and the resulting index is returned
   as a Python int via PyInt_FromSsize_t(). */
7333 PyDoc_STRVAR(rfind__doc__
,
7334 "S.rfind(sub [,start [,end]]) -> int\n\
7336 Return the highest index in S where substring sub is found,\n\
7337 such that sub is contained within S[start:end]. Optional\n\
7338 arguments start and end are interpreted as in slice notation.\n\
7340 Return -1 on failure.");
7343 unicode_rfind(PyUnicodeObject
*self
, PyObject
*args
)
7345 PyUnicodeObject
*substring
;
7350 if (!stringlib_parse_args_finds_unicode("rfind", args
, &substring
,
7354 result
= stringlib_rfind_slice(
7355 PyUnicode_AS_UNICODE(self
), PyUnicode_GET_SIZE(self
),
7356 PyUnicode_AS_UNICODE(substring
), PyUnicode_GET_SIZE(substring
),
7360 Py_DECREF(substring
);
7362 return PyInt_FromSsize_t(result
);
/* S.rindex(): docstring followed by the implementation.  Same search as
   S.rfind() (stringlib_parse_args_finds_unicode + stringlib_rfind_slice),
   but a failed search sets ValueError (substring not found); a
   successful one returns the index via PyInt_FromSsize_t().
   NOTE(review): the condition guarding the ValueError is not visible in
   this view. */
7365 PyDoc_STRVAR(rindex__doc__
,
7366 "S.rindex(sub [,start [,end]]) -> int\n\
7368 Like S.rfind() but raise ValueError when the substring is not found.");
7371 unicode_rindex(PyUnicodeObject
*self
, PyObject
*args
)
7373 PyUnicodeObject
*substring
;
7378 if (!stringlib_parse_args_finds_unicode("rindex", args
, &substring
,
7382 result
= stringlib_rfind_slice(
7383 PyUnicode_AS_UNICODE(self
), PyUnicode_GET_SIZE(self
),
7384 PyUnicode_AS_UNICODE(substring
), PyUnicode_GET_SIZE(substring
),
7388 Py_DECREF(substring
);
7391 PyErr_SetString(PyExc_ValueError
, "substring not found");
7394 return PyInt_FromSsize_t(result
);
/* S.rjust(): docstring followed by the method.  Parses a required width
   and an optional fill character (format n|O&:rjust, converted by
   convert_uc, default is a space).  When self is an exact unicode object
   already at least width long, self is returned; otherwise pad() adds
   width - self->length fill characters on the left (right padding is 0).
   NOTE(review): the reference-count handling on the return-self path is
   not visible in this view. */
7397 PyDoc_STRVAR(rjust__doc__
,
7398 "S.rjust(width[, fillchar]) -> unicode\n\
7400 Return S right-justified in a Unicode string of length width. Padding is\n\
7401 done using the specified fill character (default is a space).");
7404 unicode_rjust(PyUnicodeObject
*self
, PyObject
*args
)
7407 Py_UNICODE fillchar
= ' ';
7409 if (!PyArg_ParseTuple(args
, "n|O&:rjust", &width
, convert_uc
, &fillchar
))
7412 if (self
->length
>= width
&& PyUnicode_CheckExact(self
)) {
7414 return (PyObject
*) self
;
7417 return (PyObject
*) pad(self
, width
- self
->length
, 0, fillchar
);
/* Simple slice [start:end].  The bounds are clamped (the visible test
   handles end beyond self->length).  A full slice of an exact unicode
   object returns the original string; any other slice is copied with
   PyUnicode_FromUnicode() starting at self->str + start.
   NOTE(review): the remaining clamping statements and the length
   argument of the copy are not visible in this view. */
7421 unicode_slice(PyUnicodeObject
*self
, Py_ssize_t start
, Py_ssize_t end
)
7423 /* standard clamping */
7428 if (end
> self
->length
)
7430 if (start
== 0 && end
== self
->length
&& PyUnicode_CheckExact(self
)) {
7431 /* full slice, return original string */
7433 return (PyObject
*) self
;
7438 return (PyObject
*) PyUnicode_FromUnicode(self
->str
+ start
,
/* Public split entry point.  s and sep are coerced with
   PyUnicode_FromObject() and the work is delegated to split() with the
   given maxsplit.
   NOTE(review): the NULL checks, any special handling of a missing
   separator and the reference cleanup are not visible in this view. */
7442 PyObject
*PyUnicode_Split(PyObject
*s
,
7444 Py_ssize_t maxsplit
)
7448 s
= PyUnicode_FromObject(s
);
7452 sep
= PyUnicode_FromObject(sep
);
7459 result
= split((PyUnicodeObject
*)s
, (PyUnicodeObject
*)sep
, maxsplit
);
/* S.split(): docstring followed by the method.  Parses an optional
   separator (default None) and an optional maxsplit (default -1) with
   format |On:split.  None splits on whitespace through
   split(self, NULL, maxcount); a unicode separator is passed to split()
   directly; any other object goes through PyUnicode_Split(), which
   coerces it first. */
7466 PyDoc_STRVAR(split__doc__
,
7467 "S.split([sep [,maxsplit]]) -> list of strings\n\
7469 Return a list of the words in S, using sep as the\n\
7470 delimiter string. If maxsplit is given, at most maxsplit\n\
7471 splits are done. If sep is not specified or is None, any\n\
7472 whitespace string is a separator and empty strings are\n\
7473 removed from the result.");
7476 unicode_split(PyUnicodeObject
*self
, PyObject
*args
)
7478 PyObject
*substring
= Py_None
;
7479 Py_ssize_t maxcount
= -1;
7481 if (!PyArg_ParseTuple(args
, "|On:split", &substring
, &maxcount
))
7484 if (substring
== Py_None
)
7485 return split(self
, NULL
, maxcount
);
7486 else if (PyUnicode_Check(substring
))
7487 return split(self
, (PyUnicodeObject
*)substring
, maxcount
);
7489 return PyUnicode_Split((PyObject
*)self
, substring
, maxcount
);
/* Public partition entry point.  str_in and sep_in are coerced with
   PyUnicode_FromObject() and the result comes from
   stringlib_partition(), which receives each object together with its
   buffer and size.
   NOTE(review): the NULL checks and the reference cleanup are not
   visible in this view. */
7493 PyUnicode_Partition(PyObject
*str_in
, PyObject
*sep_in
)
7499 str_obj
= PyUnicode_FromObject(str_in
);
7502 sep_obj
= PyUnicode_FromObject(sep_in
);
7508 out
= stringlib_partition(
7509 str_obj
, PyUnicode_AS_UNICODE(str_obj
), PyUnicode_GET_SIZE(str_obj
),
7510 sep_obj
, PyUnicode_AS_UNICODE(sep_obj
), PyUnicode_GET_SIZE(sep_obj
)
/* Public rpartition entry point.  str_in and sep_in are coerced with
   PyUnicode_FromObject() and the result comes from
   stringlib_rpartition(), which receives each object together with its
   buffer and size.
   NOTE(review): the NULL checks and the reference cleanup are not
   visible in this view. */
7521 PyUnicode_RPartition(PyObject
*str_in
, PyObject
*sep_in
)
7527 str_obj
= PyUnicode_FromObject(str_in
);
7530 sep_obj
= PyUnicode_FromObject(sep_in
);
7536 out
= stringlib_rpartition(
7537 str_obj
, PyUnicode_AS_UNICODE(str_obj
), PyUnicode_GET_SIZE(str_obj
),
7538 sep_obj
, PyUnicode_AS_UNICODE(sep_obj
), PyUnicode_GET_SIZE(sep_obj
)
/* S.partition(): docstring followed by the method, which forwards self
   and the separator to PyUnicode_Partition(). */
7547 PyDoc_STRVAR(partition__doc__
,
7548 "S.partition(sep) -> (head, sep, tail)\n\
7550 Search for the separator sep in S, and return the part before it,\n\
7551 the separator itself, and the part after it. If the separator is not\n\
7552 found, return S and two empty strings.");
7555 unicode_partition(PyUnicodeObject
*self
, PyObject
*separator
)
7557 return PyUnicode_Partition((PyObject
*)self
, separator
);
/* S.rpartition(): docstring followed by the method, which forwards self
   and the separator to PyUnicode_RPartition(). */
7560 PyDoc_STRVAR(rpartition__doc__
,
7561 "S.rpartition(sep) -> (head, sep, tail)\n\
7563 Search for the separator sep in S, starting at the end of S, and return\n\
7564 the part before it, the separator itself, and the part after it. If the\n\
7565 separator is not found, return two empty strings and S.");
7568 unicode_rpartition(PyUnicodeObject
*self
, PyObject
*separator
)
7570 return PyUnicode_RPartition((PyObject
*)self
, separator
);
/* Public rsplit entry point.  s and sep are coerced with
   PyUnicode_FromObject() and the work is delegated to rsplit() with the
   given maxsplit.
   NOTE(review): the NULL checks, any special handling of a missing
   separator and the reference cleanup are not visible in this view. */
7573 PyObject
*PyUnicode_RSplit(PyObject
*s
,
7575 Py_ssize_t maxsplit
)
7579 s
= PyUnicode_FromObject(s
);
7583 sep
= PyUnicode_FromObject(sep
);
7590 result
= rsplit((PyUnicodeObject
*)s
, (PyUnicodeObject
*)sep
, maxsplit
);
/* S.rsplit(): docstring followed by the method.  Parses an optional
   separator (default None) and an optional maxsplit (default -1) with
   format |On:rsplit.  None splits on whitespace through
   rsplit(self, NULL, maxcount); a unicode separator is passed to
   rsplit() directly; any other object goes through PyUnicode_RSplit(),
   which coerces it first.
   NOTE(review): the closing line of the docstring is not visible in this
   view. */
7597 PyDoc_STRVAR(rsplit__doc__
,
7598 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7600 Return a list of the words in S, using sep as the\n\
7601 delimiter string, starting at the end of the string and\n\
7602 working to the front. If maxsplit is given, at most maxsplit\n\
7603 splits are done. If sep is not specified, any whitespace string\n\
7607 unicode_rsplit(PyUnicodeObject
*self
, PyObject
*args
)
7609 PyObject
*substring
= Py_None
;
7610 Py_ssize_t maxcount
= -1;
7612 if (!PyArg_ParseTuple(args
, "|On:rsplit", &substring
, &maxcount
))
7615 if (substring
== Py_None
)
7616 return rsplit(self
, NULL
, maxcount
);
7617 else if (PyUnicode_Check(substring
))
7618 return rsplit(self
, (PyUnicodeObject
*)substring
, maxcount
);
7620 return PyUnicode_RSplit((PyObject
*)self
, substring
, maxcount
);
/* S.splitlines(): docstring followed by the method.  Parses the optional
   integer keepends (format |i:splitlines) and delegates to
   PyUnicode_Splitlines().
   NOTE(review): the declaration and default of keepends are not visible
   in this view. */
7623 PyDoc_STRVAR(splitlines__doc__
,
7624 "S.splitlines(keepends=False) -> list of strings\n\
7626 Return a list of the lines in S, breaking at line boundaries.\n\
7627 Line breaks are not included in the resulting list unless keepends\n\
7628 is given and true.");
7631 unicode_splitlines(PyUnicodeObject
*self
, PyObject
*args
)
7635 if (!PyArg_ParseTuple(args
, "|i:splitlines", &keepends
))
7638 return PyUnicode_Splitlines((PyObject
*)self
, keepends
);
/* str() slot: encodes self with PyUnicode_AsEncodedString(), passing
   NULL for both the encoding and the errors argument. */
7642 PyObject
*unicode_str(PyUnicodeObject
*self
)
7644 return PyUnicode_AsEncodedString((PyObject
*)self
, NULL
, NULL
);
/* S.swapcase(): docstring followed by the method, which returns
   fixup(self, fixswapcase).
   NOTE(review): the closing line of the docstring is not visible in this
   view. */
7647 PyDoc_STRVAR(swapcase__doc__
,
7648 "S.swapcase() -> unicode\n\
7650 Return a copy of S with uppercase characters converted to lowercase\n\
7654 unicode_swapcase(PyUnicodeObject
*self
)
7656 return fixup(self
, fixswapcase
);
/* S.translate(): docstring followed by the method, which hands the
   buffer of self to PyUnicode_TranslateCharmap().
   NOTE(review): the closing line of the docstring and the remaining
   arguments of the call are not visible in this view. */
7659 PyDoc_STRVAR(translate__doc__
,
7660 "S.translate(table) -> unicode\n\
7662 Return a copy of the string S, where all characters have been mapped\n\
7663 through the given translation table, which must be a mapping of\n\
7664 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7665 Unmapped characters are left untouched. Characters mapped to None\n\
7669 unicode_translate(PyUnicodeObject
*self
, PyObject
*table
)
7671 return PyUnicode_TranslateCharmap(self
->str
,
/* S.upper(): docstring followed by the method, which returns
   fixup(self, fixupper). */
7677 PyDoc_STRVAR(upper__doc__
,
7678 "S.upper() -> unicode\n\
7680 Return a copy of S converted to uppercase.");
7683 unicode_upper(PyUnicodeObject
*self
)
7685 return fixup(self
, fixupper
);
/* S.zfill(): docstring followed by the method.  Parses the required
   width (format n:zfill).  When self is already at least width long, an
   exact unicode object is returned as is and any other object is copied
   with PyUnicode_FromUnicode().  Otherwise pad() prepends
   fill = width - self->length zero characters, and a leading + or - of
   the original text (now at index fill) is copied to index 0.
   NOTE(review): the NULL check on u and the statement that overwrites
   the old sign position are not visible in this view. */
7688 PyDoc_STRVAR(zfill__doc__
,
7689 "S.zfill(width) -> unicode\n\
7691 Pad a numeric string S with zeros on the left, to fill a field\n\
7692 of the specified width. The string S is never truncated.");
7695 unicode_zfill(PyUnicodeObject
*self
, PyObject
*args
)
7701 if (!PyArg_ParseTuple(args
, "n:zfill", &width
))
7704 if (self
->length
>= width
) {
7705 if (PyUnicode_CheckExact(self
)) {
7707 return (PyObject
*) self
;
7710 return PyUnicode_FromUnicode(
7711 PyUnicode_AS_UNICODE(self
),
7712 PyUnicode_GET_SIZE(self
)
7716 fill
= width
- self
->length
;
7718 u
= pad(self
, fill
, 0, '0');
7723 if (u
->str
[fill
] == '+' || u
->str
[fill
] == '-') {
7724 /* move sign to beginning of string */
7725 u
->str
[0] = u
->str
[fill
];
7729 return (PyObject
*) u
;
/* Returns the current value of numfree as a Python int; self is not
   used. */
7734 free_listsize(PyUnicodeObject
*self
)
7736 return PyInt_FromLong(numfree
);
7740 PyDoc_STRVAR(startswith__doc__
,
7741 "S.startswith(prefix[, start[, end]]) -> bool\n\
7743 Return True if S starts with the specified prefix, False otherwise.\n\
7744 With optional start, test S beginning at that position.\n\
7745 With optional end, stop comparing S at that position.\n\
7746 prefix can also be a tuple of strings to try.");
/* S.startswith(prefix[, start[, end]]).

   Arguments are parsed by stringlib_parse_args_finds(); start defaults to
   0 and end to PY_SSIZE_T_MAX.  A tuple prefix is tried item by item:
   each item is coerced with PyUnicode_FromObject(), tested with
   tailmatch(self, substring, start, end, -1) and released again.  Any
   other prefix is coerced the same way; when that coercion fails with a
   TypeError the message is replaced by one that names the offending
   type.  The single-prefix result is returned via PyBool_FromLong().

   NOTE(review): the return statements inside the tuple loop and on the
   error paths are not visible in this extract. */
7749 unicode_startswith(PyUnicodeObject
*self
,
7753 PyUnicodeObject
*substring
;
7754 Py_ssize_t start
= 0;
7755 Py_ssize_t end
= PY_SSIZE_T_MAX
;
7758 if (!stringlib_parse_args_finds("startswith", args
, &subobj
, &start
, &end
))
7760 if (PyTuple_Check(subobj
)) {
7762 for (i
= 0; i
< PyTuple_GET_SIZE(subobj
); i
++) {
7763 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
7764 PyTuple_GET_ITEM(subobj
, i
));
7765 if (substring
== NULL
)
7767 result
= tailmatch(self
, substring
, start
, end
, -1);
7768 Py_DECREF(substring
);
7773 /* nothing matched */
7776 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(subobj
);
7777 if (substring
== NULL
) {
7778 if (PyErr_ExceptionMatches(PyExc_TypeError
))
7779 PyErr_Format(PyExc_TypeError
, "startswith first arg must be str, "
7780 "unicode, or tuple, not %s", Py_TYPE(subobj
)->tp_name
);
7783 result
= tailmatch(self
, substring
, start
, end
, -1);
7784 Py_DECREF(substring
);
7785 return PyBool_FromLong(result
);
/* Docstring for the "endswith" method (see the "endswith" entry of
   unicode_methods). */
7789 PyDoc_STRVAR(endswith__doc__
,
7790 "S.endswith(suffix[, start[, end]]) -> bool\n\
7792 Return True if S ends with the specified suffix, False otherwise.\n\
7793 With optional start, test S beginning at that position.\n\
7794 With optional end, stop comparing S at that position.\n\
7795 suffix can also be a tuple of strings to try.");
/* S.endswith(suffix[, start[, end]]).

   Mirror image of unicode_startswith(): arguments come from
   stringlib_parse_args_finds() (start defaults to 0, end to
   PY_SSIZE_T_MAX), each candidate is coerced with PyUnicode_FromObject()
   and tested with tailmatch(self, substring, start, end, +1).  A tuple
   suffix is tried item by item.  When coercing a non-tuple suffix fails
   with a TypeError the message is replaced by one that names the
   offending type.  The single-suffix result is returned via
   PyBool_FromLong().

   NOTE(review): the return statements inside the tuple loop and on the
   error paths are not visible in this extract. */
7798 unicode_endswith(PyUnicodeObject
*self
,
7802 PyUnicodeObject
*substring
;
7803 Py_ssize_t start
= 0;
7804 Py_ssize_t end
= PY_SSIZE_T_MAX
;
7807 if (!stringlib_parse_args_finds("endswith", args
, &subobj
, &start
, &end
))
7809 if (PyTuple_Check(subobj
)) {
7811 for (i
= 0; i
< PyTuple_GET_SIZE(subobj
); i
++) {
7812 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
7813 PyTuple_GET_ITEM(subobj
, i
));
7814 if (substring
== NULL
)
7816 result
= tailmatch(self
, substring
, start
, end
, +1);
7817 Py_DECREF(substring
);
7824 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(subobj
);
7825 if (substring
== NULL
) {
7826 if (PyErr_ExceptionMatches(PyExc_TypeError
))
7827 PyErr_Format(PyExc_TypeError
, "endswith first arg must be str, "
7828 "unicode, or tuple, not %s", Py_TYPE(subobj
)->tp_name
);
7831 result
= tailmatch(self
, substring
, start
, end
, +1);
7832 Py_DECREF(substring
);
7833 return PyBool_FromLong(result
);
7837 /* Implements do_string_format, which is unicode because of stringlib */
7838 #include "stringlib/string_format.h"
/* Docstring for the "format" method; the "format" entry of
   unicode_methods pairs it with do_string_format. */
7840 PyDoc_STRVAR(format__doc__
,
7841 "S.format(*args, **kwargs) -> unicode\n\
7843 Return a formatted version of S, using substitutions from args and kwargs.\n\
7844 The substitutions are identified by braces ('{' and '}').");
/* S.__format__(format_spec).

   Accepts exactly one argument ("O:__format__") that must be a byte
   string or a unicode object; anything else raises TypeError naming the
   offending type.  The spec is converted with PyObject_Unicode() and the
   formatting itself is done by _PyUnicode_FormatAdvanced(), which is
   handed the spec's Py_UNICODE buffer and size.

   NOTE(review): the error exits, the hand-over of `tmp` to `format_spec`
   and the release of `tmp` are not visible in this extract. */
7847 unicode__format__(PyObject
*self
, PyObject
*args
)
7849 PyObject
*format_spec
;
7850 PyObject
*result
= NULL
;
7851 PyObject
*tmp
= NULL
;
7853 /* If 2.x, convert format_spec to the same type as value */
7854 /* This is to allow things like u''.format('') */
7855 if (!PyArg_ParseTuple(args
, "O:__format__", &format_spec
))
7857 if (!(PyBytes_Check(format_spec
) || PyUnicode_Check(format_spec
))) {
7858 PyErr_Format(PyExc_TypeError
, "__format__ arg must be str "
7859 "or unicode, not %s", Py_TYPE(format_spec
)->tp_name
);
7862 tmp
= PyObject_Unicode(format_spec
);
7867 result
= _PyUnicode_FormatAdvanced(self
,
7868 PyUnicode_AS_UNICODE(format_spec
),
7869 PyUnicode_GET_SIZE(format_spec
));
/* Docstring for the "__format__" method (see the "__format__" entry of
   unicode_methods). */
7875 PyDoc_STRVAR(p_format__doc__
,
7876 "S.__format__(format_spec) -> unicode\n\
7878 Return a formatted version of S as described by format_spec.");
7881 unicode__sizeof__(PyUnicodeObject
*v
)
7883 return PyInt_FromSsize_t(sizeof(PyUnicodeObject
) +
7884 sizeof(Py_UNICODE
) * (v
->length
+ 1));
/* Docstring for the "__sizeof__" method (see the "__sizeof__" entry of
   unicode_methods). */
7887 PyDoc_STRVAR(sizeof__doc__
,
7888 "S.__sizeof__() -> size of S in memory, in bytes\n\
7893 unicode_getnewargs(PyUnicodeObject
*v
)
7895 return Py_BuildValue("(u#)", v
->str
, v
->length
);
/* Method table for the unicode type; PyUnicode_Type installs it as
   tp_methods.  Each entry gives the Python-level name, the C
   implementation cast to PyCFunction, the calling convention
   (METH_NOARGS, METH_O, METH_VARARGS, optionally | METH_KEYWORDS) and,
   for most entries, a docstring.
   NOTE(review): preprocessor guards around some entries and the
   terminating sentinel are not visible in this extract. */
7899 static PyMethodDef unicode_methods
[] = {
7900 {"encode", (PyCFunction
) unicode_encode
, METH_VARARGS
| METH_KEYWORDS
, encode__doc__
},
7901 {"replace", (PyCFunction
) unicode_replace
, METH_VARARGS
, replace__doc__
},
7902 {"split", (PyCFunction
) unicode_split
, METH_VARARGS
, split__doc__
},
7903 {"rsplit", (PyCFunction
) unicode_rsplit
, METH_VARARGS
, rsplit__doc__
},
7904 {"join", (PyCFunction
) unicode_join
, METH_O
, join__doc__
},
7905 {"capitalize", (PyCFunction
) unicode_capitalize
, METH_NOARGS
, capitalize__doc__
},
7906 {"title", (PyCFunction
) unicode_title
, METH_NOARGS
, title__doc__
},
7907 {"center", (PyCFunction
) unicode_center
, METH_VARARGS
, center__doc__
},
7908 {"count", (PyCFunction
) unicode_count
, METH_VARARGS
, count__doc__
},
7909 {"expandtabs", (PyCFunction
) unicode_expandtabs
, METH_VARARGS
, expandtabs__doc__
},
7910 {"find", (PyCFunction
) unicode_find
, METH_VARARGS
, find__doc__
},
7911 {"partition", (PyCFunction
) unicode_partition
, METH_O
, partition__doc__
},
7912 {"index", (PyCFunction
) unicode_index
, METH_VARARGS
, index__doc__
},
7913 {"ljust", (PyCFunction
) unicode_ljust
, METH_VARARGS
, ljust__doc__
},
7914 {"lower", (PyCFunction
) unicode_lower
, METH_NOARGS
, lower__doc__
},
7915 {"lstrip", (PyCFunction
) unicode_lstrip
, METH_VARARGS
, lstrip__doc__
},
7916 {"decode", (PyCFunction
) unicode_decode
, METH_VARARGS
| METH_KEYWORDS
, decode__doc__
},
7917 /* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7918 {"rfind", (PyCFunction
) unicode_rfind
, METH_VARARGS
, rfind__doc__
},
7919 {"rindex", (PyCFunction
) unicode_rindex
, METH_VARARGS
, rindex__doc__
},
7920 {"rjust", (PyCFunction
) unicode_rjust
, METH_VARARGS
, rjust__doc__
},
7921 {"rstrip", (PyCFunction
) unicode_rstrip
, METH_VARARGS
, rstrip__doc__
},
7922 {"rpartition", (PyCFunction
) unicode_rpartition
, METH_O
, rpartition__doc__
},
7923 {"splitlines", (PyCFunction
) unicode_splitlines
, METH_VARARGS
, splitlines__doc__
},
7924 {"strip", (PyCFunction
) unicode_strip
, METH_VARARGS
, strip__doc__
},
7925 {"swapcase", (PyCFunction
) unicode_swapcase
, METH_NOARGS
, swapcase__doc__
},
7926 {"translate", (PyCFunction
) unicode_translate
, METH_O
, translate__doc__
},
7927 {"upper", (PyCFunction
) unicode_upper
, METH_NOARGS
, upper__doc__
},
7928 {"startswith", (PyCFunction
) unicode_startswith
, METH_VARARGS
, startswith__doc__
},
7929 {"endswith", (PyCFunction
) unicode_endswith
, METH_VARARGS
, endswith__doc__
},
7930 {"islower", (PyCFunction
) unicode_islower
, METH_NOARGS
, islower__doc__
},
7931 {"isupper", (PyCFunction
) unicode_isupper
, METH_NOARGS
, isupper__doc__
},
7932 {"istitle", (PyCFunction
) unicode_istitle
, METH_NOARGS
, istitle__doc__
},
7933 {"isspace", (PyCFunction
) unicode_isspace
, METH_NOARGS
, isspace__doc__
},
7934 {"isdecimal", (PyCFunction
) unicode_isdecimal
, METH_NOARGS
, isdecimal__doc__
},
7935 {"isdigit", (PyCFunction
) unicode_isdigit
, METH_NOARGS
, isdigit__doc__
},
7936 {"isnumeric", (PyCFunction
) unicode_isnumeric
, METH_NOARGS
, isnumeric__doc__
},
7937 {"isalpha", (PyCFunction
) unicode_isalpha
, METH_NOARGS
, isalpha__doc__
},
7938 {"isalnum", (PyCFunction
) unicode_isalnum
, METH_NOARGS
, isalnum__doc__
},
7939 {"zfill", (PyCFunction
) unicode_zfill
, METH_VARARGS
, zfill__doc__
},
7940 {"format", (PyCFunction
) do_string_format
, METH_VARARGS
| METH_KEYWORDS
, format__doc__
},
7941 {"__format__", (PyCFunction
) unicode__format__
, METH_VARARGS
, p_format__doc__
},
7942 {"_formatter_field_name_split", (PyCFunction
) formatter_field_name_split
, METH_NOARGS
},
7943 {"_formatter_parser", (PyCFunction
) formatter_parser
, METH_NOARGS
},
7944 {"__sizeof__", (PyCFunction
) unicode__sizeof__
, METH_NOARGS
, sizeof__doc__
},
7946 {"capwords", (PyCFunction
) unicode_capwords
, METH_NOARGS
, capwords__doc__
},
7950 /* This one is just used for debugging the implementation. */
7951 {"freelistsize", (PyCFunction
) free_listsize
, METH_NOARGS
},
7954 {"__getnewargs__", (PyCFunction
)unicode_getnewargs
, METH_NOARGS
},
7959 unicode_mod(PyObject
*v
, PyObject
*w
)
7961 if (!PyUnicode_Check(v
)) {
7962 Py_INCREF(Py_NotImplemented
);
7963 return Py_NotImplemented
;
7965 return PyUnicode_Format(v
, w
);
/* Number protocol table for unicode.  The only slot visible here is
   nb_remainder, filled with unicode_mod (the % operator).
   NOTE(review): the slots before nb_remainder are not visible in this
   extract. */
7968 static PyNumberMethods unicode_as_number
= {
7973 unicode_mod
, /*nb_remainder*/
7976 static PySequenceMethods unicode_as_sequence
= {
7977 (lenfunc
) unicode_length
, /* sq_length */
7978 PyUnicode_Concat
, /* sq_concat */
7979 (ssizeargfunc
) unicode_repeat
, /* sq_repeat */
7980 (ssizeargfunc
) unicode_getitem
, /* sq_item */
7981 (ssizessizeargfunc
) unicode_slice
, /* sq_slice */
7982 0, /* sq_ass_item */
7983 0, /* sq_ass_slice */
7984 PyUnicode_Contains
, /* sq_contains */
/* mp_subscript slot: S[item] for an index or a slice.

   Index-like item: converted with PyNumber_AsSsize_t() (overflow is
   reported as IndexError), adjusted by the string size, then delegated
   to unicode_getitem().
   Slice: PySlice_GetIndicesEx() resolves start/stop/step against the
   string size, then
     - an empty slice yields PyUnicode_FromUnicode(NULL, 0);
     - a slice covering an exact unicode instance completely (start 0,
       step 1, full length) returns self;
     - any other step-1 slice copies the contiguous range directly;
     - other steps gather every step-th code unit into a scratch buffer
       from PyObject_MALLOC(), build the result from it and free the
       buffer (allocation failure -> PyErr_NoMemory()).
   Anything else raises TypeError "string indices must be integers".

   NOTE(review): the guard in front of the index adjustment, the
   reference-count handling when returning self, and the error returns
   are not visible in this extract. */
7988 unicode_subscript(PyUnicodeObject
* self
, PyObject
* item
)
7990 if (PyIndex_Check(item
)) {
7991 Py_ssize_t i
= PyNumber_AsSsize_t(item
, PyExc_IndexError
);
7992 if (i
== -1 && PyErr_Occurred())
7995 i
+= PyUnicode_GET_SIZE(self
);
7996 return unicode_getitem(self
, i
);
7997 } else if (PySlice_Check(item
)) {
7998 Py_ssize_t start
, stop
, step
, slicelength
, cur
, i
;
7999 Py_UNICODE
* source_buf
;
8000 Py_UNICODE
* result_buf
;
8003 if (PySlice_GetIndicesEx((PySliceObject
*)item
, PyUnicode_GET_SIZE(self
),
8004 &start
, &stop
, &step
, &slicelength
) < 0) {
8008 if (slicelength
<= 0) {
8009 return PyUnicode_FromUnicode(NULL
, 0);
8010 } else if (start
== 0 && step
== 1 && slicelength
== self
->length
&&
8011 PyUnicode_CheckExact(self
)) {
8013 return (PyObject
*)self
;
8014 } else if (step
== 1) {
8015 return PyUnicode_FromUnicode(self
->str
+ start
, slicelength
);
8017 source_buf
= PyUnicode_AS_UNICODE((PyObject
*)self
);
8018 result_buf
= (Py_UNICODE
*)PyObject_MALLOC(slicelength
*
8019 sizeof(Py_UNICODE
));
8021 if (result_buf
== NULL
)
8022 return PyErr_NoMemory();
8024 for (cur
= start
, i
= 0; i
< slicelength
; cur
+= step
, i
++) {
8025 result_buf
[i
] = source_buf
[cur
];
8028 result
= PyUnicode_FromUnicode(result_buf
, slicelength
);
8029 PyObject_FREE(result_buf
);
8033 PyErr_SetString(PyExc_TypeError
, "string indices must be integers");
8038 static PyMappingMethods unicode_as_mapping
= {
8039 (lenfunc
)unicode_length
, /* mp_length */
8040 (binaryfunc
)unicode_subscript
, /* mp_subscript */
8041 (objobjargproc
)0, /* mp_ass_subscript */
/* Read-buffer slot of unicode_as_buffer: stores self->str through `ptr`
   and returns PyUnicode_GET_DATA_SIZE(self).  A SystemError
   ("accessing non-existent unicode segment") is raised on the rejected
   path.
   NOTE(review): the remaining parameters and the condition guarding the
   SystemError are not visible in this extract. */
8045 unicode_buffer_getreadbuf(PyUnicodeObject
*self
,
8050 PyErr_SetString(PyExc_SystemError
,
8051 "accessing non-existent unicode segment");
8054 *ptr
= (void *) self
->str
;
8055 return PyUnicode_GET_DATA_SIZE(self
);
/* Write-buffer slot of unicode_as_buffer: the visible body only raises
   TypeError "cannot use unicode as modifiable buffer".
   NOTE(review): the last parameter and the return statement are not
   visible in this extract. */
8059 unicode_buffer_getwritebuf(PyUnicodeObject
*self
, Py_ssize_t index
,
8062 PyErr_SetString(PyExc_TypeError
,
8063 "cannot use unicode as modifiable buffer");
/* Segment-count slot of unicode_as_buffer: stores
   PyUnicode_GET_DATA_SIZE(self) through `lenp`.
   NOTE(review): the `lenp` parameter declaration, any guard around the
   store and the returned segment count are not visible in this
   extract. */
8068 unicode_buffer_getsegcount(PyUnicodeObject
*self
,
8072 *lenp
= PyUnicode_GET_DATA_SIZE(self
);
/* Char-buffer slot of unicode_as_buffer: obtains the default-encoded
   byte string for self via _PyUnicode_AsDefaultEncodedString(), stores
   that string's data pointer through `ptr` and returns its size.  A
   SystemError ("accessing non-existent unicode segment") is raised on
   the rejected path.
   NOTE(review): the remaining parameters, the condition guarding the
   SystemError and the NULL check on `str` are not visible in this
   extract. */
8077 unicode_buffer_getcharbuf(PyUnicodeObject
*self
,
8084 PyErr_SetString(PyExc_SystemError
,
8085 "accessing non-existent unicode segment");
8088 str
= _PyUnicode_AsDefaultEncodedString((PyObject
*)self
, NULL
);
8091 *ptr
= (void *) PyString_AS_STRING(str
);
8092 return PyString_GET_SIZE(str
);
8095 /* Helpers for PyUnicode_Format() */
/* Fetch the next argument for PyUnicode_Format().

   `*p_argidx` is the position of the next unused argument and `arglen`
   the number available.  While the position is below `arglen` the item
   is taken from the args tuple with PyTuple_GetItem(); otherwise
   TypeError "not enough arguments for format string" is raised.
   NOTE(review): the update of *p_argidx and any special handling of
   non-tuple args are not visible in this extract. */
8098 getnextarg(PyObject
*args
, Py_ssize_t arglen
, Py_ssize_t
*p_argidx
)
8100 Py_ssize_t argidx
= *p_argidx
;
8101 if (argidx
< arglen
) {
8106 return PyTuple_GetItem(args
, argidx
);
8108 PyErr_SetString(PyExc_TypeError
,
8109 "not enough arguments for format string");
/* Flag bits collected in `flags` while PyUnicode_Format() scans a
   conversion specifier: '-' sets F_LJUST, '+' sets F_SIGN, ' ' sets
   F_BLANK, '#' sets F_ALT and '0' sets F_ZERO. */
8113 #define F_LJUST (1<<0)
8114 #define F_SIGN (1<<1)
8115 #define F_BLANK (1<<2)
8116 #define F_ALT (1<<3)
8117 #define F_ZERO (1<<4)
/* Widen the NUL-terminated C string `charbuffer` into `buffer`, one
   Py_UNICODE per char.

   The copy deliberately runs from the last character to the first:
   longtounicode() passes the same memory for both arguments, so a
   front-to-back copy could overwrite bytes that have not been read yet
   (presumably the reason for the direction -- confirm).
   NOTE(review): the return statement is not visible in this extract;
   longtounicode() treats the result as a Py_ssize_t. */
8120 strtounicode(Py_UNICODE
*buffer
, const char *charbuffer
)
8122 register Py_ssize_t i
;
8123 Py_ssize_t len
= strlen(charbuffer
);
8124 for (i
= len
- 1; i
>= 0; i
--)
8125 buffer
[i
] = (Py_UNICODE
) charbuffer
[i
];
8131 longtounicode(Py_UNICODE
*buffer
, size_t len
, const char *format
, long x
)
8135 PyOS_snprintf((char *)buffer
, len
, format
, x
);
8136 result
= strtounicode(buffer
, (char *)buffer
);
8137 return Py_SAFE_DOWNCAST(result
, Py_ssize_t
, int);
8140 /* XXX To save some code duplication, formatfloat/long/int could have been
8141 shared with stringobject.c, converting from 8-bit to Unicode after the
8142 formatting is done. */
8144 /* Returns a new reference to a PyUnicode object, or NULL on failure. */
/* Format `v` as a float for PyUnicode_Format().

   The value is read with PyFloat_AsDouble() (-1.0 together with a
   pending exception signals failure), rendered by
   PyOS_double_to_string() using format code `type` and precision `prec`
   -- Py_DTSF_ALT is requested when F_ALT is set -- and the resulting C
   string is turned into a unicode object with
   PyUnicode_FromStringAndSize().
   NOTE(review): any precision default, the NULL checks and the release
   of the intermediate C string are not visible in this extract. */
8147 formatfloat(PyObject
*v
, int flags
, int prec
, int type
)
8153 x
= PyFloat_AsDouble(v
);
8154 if (x
== -1.0 && PyErr_Occurred())
8160 p
= PyOS_double_to_string(x
, type
, prec
,
8161 (flags
& F_ALT
) ? Py_DTSF_ALT
: 0, NULL
);
8164 result
= PyUnicode_FromStringAndSize(p
, strlen(p
));
/* Format the integer object `val` for PyUnicode_Format().

   The digits are produced by _PyString_FormatLong(), which returns a
   temporary string object and reports the text through `buf`/`len`.  A
   fresh unicode object of `len` characters is then created with
   _PyUnicode_New(), filled by widening each char of `buf`, and
   terminated with a 0.
   NOTE(review): the NULL checks and the release of the temporary string
   are not visible in this extract. */
8170 formatlong(PyObject
*val
, int flags
, int prec
, int type
)
8174 PyObject
*str
; /* temporary string object. */
8175 PyUnicodeObject
*result
;
8177 str
= _PyString_FormatLong(val
, flags
, prec
, type
, &buf
, &len
);
8180 result
= _PyUnicode_New(len
);
8185 for (i
= 0; i
< len
; i
++)
8186 result
->str
[i
] = buf
[i
];
8187 result
->str
[len
] = 0;
8189 return (PyObject
*)result
;
/* Format a C-long-sized integer into `buf` for PyUnicode_Format().

   The value comes from PyInt_AsLong().  A printf format is assembled in
   the local `fmt` array and applied through longtounicode().  The
   request is refused with OverflowError when `buflen` is at most 14 or
   at most 3 + prec.  For %#x / %#X the "0x"/"0X" prefix is written into
   the format by this function rather than left to the C library (the
   in-code comment explains the platform inconsistencies).  One exit
   passes -x and the other passes x to longtounicode().
   NOTE(review): the remaining parameters, the handling of negative
   values for 'u'/'x'/'X'/'o', and the condition choosing between the
   two final calls are not visible in this extract. */
8193 formatint(Py_UNICODE
*buf
,
8200 /* fmt = '%#.' + `prec` + 'l' + `type`
8201 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8205 char fmt
[64]; /* plenty big enough! */
8209 x
= PyInt_AsLong(v
);
8210 if (x
== -1 && PyErr_Occurred())
8212 if (x
< 0 && type
== 'u') {
8215 if (x
< 0 && (type
== 'x' || type
== 'X' || type
== 'o'))
8222 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8223 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
8225 if (buflen
<= 14 || buflen
<= (size_t)3 + (size_t)prec
) {
8226 PyErr_SetString(PyExc_OverflowError
,
8227 "formatted integer is too long (precision too large?)");
8231 if ((flags
& F_ALT
) &&
8232 (type
== 'x' || type
== 'X')) {
8233 /* When converting under %#x or %#X, there are a number
8234 * of issues that cause pain:
8235 * - when 0 is being converted, the C standard leaves off
8236 * the '0x' or '0X', which is inconsistent with other
8237 * %#x/%#X conversions and inconsistent with Python's
8239 * - there are platforms that violate the standard and
8240 * convert 0 with the '0x' or '0X'
8241 * (Metrowerks, Compaq Tru64)
8242 * - there are platforms that give '0x' when converting
8243 * under %#X, but convert 0 in accordance with the
8244 * standard (OS/2 EMX)
8246 * We can achieve the desired consistency by inserting our
8247 * own '0x' or '0X' prefix, and substituting %x/%X in place
8250 * Note that this is the same approach as used in
8251 * formatint() in stringobject.c
8253 PyOS_snprintf(fmt
, sizeof(fmt
), "%s0%c%%.%dl%c",
8254 sign
, type
, prec
, type
);
8257 PyOS_snprintf(fmt
, sizeof(fmt
), "%s%%%s.%dl%c",
8258 sign
, (flags
&F_ALT
) ? "#" : "",
8262 return longtounicode(buf
, buflen
, fmt
, -x
);
8264 return longtounicode(buf
, buflen
, fmt
, x
);
/* Produce the single character for a %c conversion in `buf`.

   - unicode argument: must have size 1; its first code unit is copied.
   - byte-string argument: must have size 1; a byte above 0x7F is decoded
     with PyUnicode_Decode(str, 1, NULL, "strict") so that undecodable
     input fails like %s would (issue #7649); an ASCII byte is widened
     directly.
   - otherwise the argument is read with PyInt_AsLong() and range-checked:
     0..0x10ffff when Py_UNICODE_WIDE is defined, 0..0xffff in the other
     preprocessor branch (OverflowError outside the range).
   The error exit raises TypeError "%c requires int or char".
   NOTE(review): the remaining parameters, the jumps to the error exit
   and the return values are not visible in this extract. */
8268 formatchar(Py_UNICODE
*buf
,
8274 /* presume that the buffer is at least 2 characters long */
8275 if (PyUnicode_Check(v
)) {
8276 if (PyUnicode_GET_SIZE(v
) != 1)
8278 buf
[0] = PyUnicode_AS_UNICODE(v
)[0];
8281 else if (PyString_Check(v
)) {
8282 if (PyString_GET_SIZE(v
) != 1)
8284 /* #7649: "u'%c' % char" should behave like "u'%s' % char" and fail
8285 with a UnicodeDecodeError if 'char' is not decodable with the
8286 default encoding (usually ASCII, but it might be something else) */
8287 str
= PyString_AS_STRING(v
);
8288 if ((unsigned char)str
[0] > 0x7F) {
8289 /* the char is not ASCII; try to decode the string using the
8290 default encoding and return -1 to let the UnicodeDecodeError
8291 be raised if the string can't be decoded */
8292 unistr
= PyUnicode_Decode(str
, 1, NULL
, "strict");
8295 buf
[0] = PyUnicode_AS_UNICODE(unistr
)[0];
8299 buf
[0] = (Py_UNICODE
)str
[0];
8303 /* Integer input truncated to a character */
8305 x
= PyInt_AsLong(v
);
8306 if (x
== -1 && PyErr_Occurred())
8308 #ifdef Py_UNICODE_WIDE
8309 if (x
< 0 || x
> 0x10ffff) {
8310 PyErr_SetString(PyExc_OverflowError
,
8311 "%c arg not in range(0x110000) "
8312 "(wide Python build)");
8316 if (x
< 0 || x
> 0xffff) {
8317 PyErr_SetString(PyExc_OverflowError
,
8318 "%c arg not in range(0x10000) "
8319 "(narrow Python build)");
8323 buf
[0] = (Py_UNICODE
) x
;
8329 PyErr_SetString(PyExc_TypeError
,
8330 "%c requires int or char");
8334 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8336 FORMATBUFLEN is the length of the buffer in which the ints &
8337 chars are formatted. XXX This is a magic number. Each formatting
8338 routine does bounds checking to ensure no overflow, but a better
8339 solution may be to malloc a buffer of appropriate size for each
8340 format. For now, the current solution is sufficient.
/* Element count of the `formatbuf` scratch array that PyUnicode_Format()
   hands to formatint() and formatchar(). */
8342 #define FORMATBUFLEN (size_t)120
/* Implementation of "format % args" for unicode format strings.

   Visible outline:
   - NULL `format` or `args` is rejected with PyErr_BadInternalCall().
   - `format` is coerced with PyUnicode_FromObject(); the result buffer
     starts at fmtcnt + 100 code units and is enlarged with
     _PyUnicode_Resize() whenever the remaining room (`rescnt`) runs out.
   - A tuple `args` supplies positional arguments (arglen from
     PyTuple_Size()).  An object whose type provides mp_subscript and
     that is neither a tuple nor a basestring is tested separately,
     presumably to enable "%(key)s" lookups -- confirm.  Keys are
     extracted between balanced parentheses and looked up with
     PyObject_GetItem(); without a mapping TypeError
     "format requires a mapping" is raised.
   - For each specifier the flags '-', '+', ' ', '#', '0' are collected
     into `flags`.  Width and precision come either from digits in the
     format (guarded against overflowing PY_SSIZE_T_MAX and INT_MAX
     respectively) or from the next argument, which must be an int.  The
     length modifiers 'h', 'l' and 'L' are recognised.
   - Conversions use PyObject_Unicode()/PyObject_Repr() for string-like
     codes, formatint()/formatlong() for integers, formatfloat() for
     floats and formatchar() for %c.  An unknown code raises ValueError
     naming the character and its index in the format.
   - Sign, padding and the "0x"/"0X" prefix are then laid out around the
     converted text according to `flags` and `width`.
   - Left-over positional arguments raise TypeError "not all arguments
     converted during string formatting"; finally the result is trimmed
     to the used length (reslen - rescnt).

   NOTE(review): many statements (error exits, reference-count handling,
   several branches) are not visible in this extract; treat the outline
   above as a guide to the visible lines only. */
8344 PyObject
*PyUnicode_Format(PyObject
*format
,
8347 Py_UNICODE
*fmt
, *res
;
8348 Py_ssize_t fmtcnt
, rescnt
, reslen
, arglen
, argidx
;
8350 PyUnicodeObject
*result
= NULL
;
8351 PyObject
*dict
= NULL
;
8354 if (format
== NULL
|| args
== NULL
) {
8355 PyErr_BadInternalCall();
8358 uformat
= PyUnicode_FromObject(format
);
8359 if (uformat
== NULL
)
8361 fmt
= PyUnicode_AS_UNICODE(uformat
);
8362 fmtcnt
= PyUnicode_GET_SIZE(uformat
);
8364 reslen
= rescnt
= fmtcnt
+ 100;
8365 result
= _PyUnicode_New(reslen
);
8368 res
= PyUnicode_AS_UNICODE(result
);
8370 if (PyTuple_Check(args
)) {
8371 arglen
= PyTuple_Size(args
);
8378 if (Py_TYPE(args
)->tp_as_mapping
&& Py_TYPE(args
)->tp_as_mapping
->mp_subscript
&&
8379 !PyTuple_Check(args
) && !PyObject_TypeCheck(args
, &PyBaseString_Type
))
8382 while (--fmtcnt
>= 0) {
8385 rescnt
= fmtcnt
+ 100;
8387 if (_PyUnicode_Resize(&result
, reslen
) < 0)
8389 res
= PyUnicode_AS_UNICODE(result
) + reslen
- rescnt
;
8395 /* Got a format specifier */
8397 Py_ssize_t width
= -1;
8399 Py_UNICODE c
= '\0';
8403 PyObject
*temp
= NULL
;
8407 Py_UNICODE formatbuf
[FORMATBUFLEN
]; /* For format{int,char}() */
8411 Py_UNICODE
*keystart
;
8417 PyErr_SetString(PyExc_TypeError
,
8418 "format requires a mapping");
8424 /* Skip over balanced parentheses */
8425 while (pcount
> 0 && --fmtcnt
>= 0) {
8428 else if (*fmt
== '(')
8432 keylen
= fmt
- keystart
- 1;
8433 if (fmtcnt
< 0 || pcount
> 0) {
8434 PyErr_SetString(PyExc_ValueError
,
8435 "incomplete format key");
8439 /* keys are converted to strings using UTF-8 and
8440 then looked up since Python uses strings to hold
8441 variables names etc. in its namespaces and we
8442 wouldn't want to break common idioms. */
8443 key
= PyUnicode_EncodeUTF8(keystart
,
8447 key
= PyUnicode_FromUnicode(keystart
, keylen
);
8455 args
= PyObject_GetItem(dict
, key
);
8464 while (--fmtcnt
>= 0) {
8465 switch (c
= *fmt
++) {
8466 case '-': flags
|= F_LJUST
; continue;
8467 case '+': flags
|= F_SIGN
; continue;
8468 case ' ': flags
|= F_BLANK
; continue;
8469 case '#': flags
|= F_ALT
; continue;
8470 case '0': flags
|= F_ZERO
; continue;
8475 v
= getnextarg(args
, arglen
, &argidx
);
8478 if (!PyInt_Check(v
)) {
8479 PyErr_SetString(PyExc_TypeError
,
8483 width
= PyInt_AsSsize_t(v
);
8484 if (width
== -1 && PyErr_Occurred())
8493 else if (c
>= '0' && c
<= '9') {
8495 while (--fmtcnt
>= 0) {
8497 if (c
< '0' || c
> '9')
8499 if (width
> (PY_SSIZE_T_MAX
- ((int)c
- '0')) / 10) {
8500 PyErr_SetString(PyExc_ValueError
,
8504 width
= width
*10 + (c
- '0');
8512 v
= getnextarg(args
, arglen
, &argidx
);
8515 if (!PyInt_Check(v
)) {
8516 PyErr_SetString(PyExc_TypeError
,
8520 prec
= _PyInt_AsInt(v
);
8521 if (prec
== -1 && PyErr_Occurred())
8528 else if (c
>= '0' && c
<= '9') {
8530 while (--fmtcnt
>= 0) {
8532 if (c
< '0' || c
> '9')
8534 if (prec
> (INT_MAX
- ((int)c
- '0')) / 10) {
8535 PyErr_SetString(PyExc_ValueError
,
8539 prec
= prec
*10 + (c
- '0');
8544 if (c
== 'h' || c
== 'l' || c
== 'L') {
8550 PyErr_SetString(PyExc_ValueError
,
8551 "incomplete format");
8555 v
= getnextarg(args
, arglen
, &argidx
);
8565 /* presume that buffer length is at least 1 */
8572 if (PyUnicode_CheckExact(v
) && c
== 's') {
8579 temp
= PyObject_Unicode(v
);
8581 temp
= PyObject_Repr(v
);
8584 if (PyUnicode_Check(temp
))
8585 /* nothing to do */;
8586 else if (PyString_Check(temp
)) {
8587 /* convert to string to Unicode */
8588 unicode
= PyUnicode_Decode(PyString_AS_STRING(temp
),
8589 PyString_GET_SIZE(temp
),
8599 PyErr_SetString(PyExc_TypeError
,
8600 "%s argument has non-string str()");
8604 pbuf
= PyUnicode_AS_UNICODE(temp
);
8605 len
= PyUnicode_GET_SIZE(temp
);
8606 if (prec
>= 0 && len
> prec
)
8619 if (PyNumber_Check(v
)) {
8620 PyObject
*iobj
=NULL
;
8622 if (PyInt_Check(v
) || (PyLong_Check(v
))) {
8627 iobj
= PyNumber_Int(v
);
8628 if (iobj
==NULL
) iobj
= PyNumber_Long(v
);
8631 if (PyInt_Check(iobj
)) {
8634 len
= formatint(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
),
8635 flags
, prec
, c
, iobj
);
8641 else if (PyLong_Check(iobj
)) {
8643 temp
= formatlong(iobj
, flags
, prec
, c
);
8647 pbuf
= PyUnicode_AS_UNICODE(temp
);
8648 len
= PyUnicode_GET_SIZE(temp
);
8657 PyErr_Format(PyExc_TypeError
,
8658 "%%%c format: a number is required, "
8659 "not %.200s", (char)c
, Py_TYPE(v
)->tp_name
);
8672 temp
= formatfloat(v
, flags
, prec
, c
);
8675 pbuf
= PyUnicode_AS_UNICODE(temp
);
8676 len
= PyUnicode_GET_SIZE(temp
);
8684 len
= formatchar(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
), v
);
8690 PyErr_Format(PyExc_ValueError
,
8691 "unsupported format character '%c' (0x%x) "
8693 (31<=c
&& c
<=126) ? (char)c
: '?',
8695 (Py_ssize_t
)(fmt
- 1 -
8696 PyUnicode_AS_UNICODE(uformat
)));
8700 if (*pbuf
== '-' || *pbuf
== '+') {
8704 else if (flags
& F_SIGN
)
8706 else if (flags
& F_BLANK
)
8713 if (rescnt
- (sign
!= 0) < width
) {
8715 rescnt
= width
+ fmtcnt
+ 100;
8722 if (_PyUnicode_Resize(&result
, reslen
) < 0) {
8726 res
= PyUnicode_AS_UNICODE(result
)
8736 if ((flags
& F_ALT
) && (c
== 'x' || c
== 'X')) {
8737 assert(pbuf
[0] == '0');
8738 assert(pbuf
[1] == c
);
8749 if (width
> len
&& !(flags
& F_LJUST
)) {
8753 } while (--width
> len
);
8758 if ((flags
& F_ALT
) && (c
== 'x' || c
== 'X')) {
8759 assert(pbuf
[0] == '0');
8760 assert(pbuf
[1] == c
);
8765 Py_UNICODE_COPY(res
, pbuf
, len
);
8768 while (--width
>= len
) {
8772 if (dict
&& (argidx
< arglen
) && c
!= '%') {
8773 PyErr_SetString(PyExc_TypeError
,
8774 "not all arguments converted during string formatting");
8781 if (argidx
< arglen
&& !dict
) {
8782 PyErr_SetString(PyExc_TypeError
,
8783 "not all arguments converted during string formatting");
8787 if (_PyUnicode_Resize(&result
, reslen
- rescnt
) < 0)
8793 return (PyObject
*)result
;
8804 static PyBufferProcs unicode_as_buffer
= {
8805 (readbufferproc
) unicode_buffer_getreadbuf
,
8806 (writebufferproc
) unicode_buffer_getwritebuf
,
8807 (segcountproc
) unicode_buffer_getsegcount
,
8808 (charbufferproc
) unicode_buffer_getcharbuf
,
/* Forward declaration: unicode_new() calls unicode_subtype_new() before
   the latter is defined. */
8812 unicode_subtype_new(PyTypeObject
*type
, PyObject
*args
, PyObject
*kwds
);
/* tp_new for unicode: unicode(string='', encoding=None, errors=None).

   Requests for a subtype are forwarded to unicode_subtype_new().  For
   the base type the keyword arguments "string", "encoding" and "errors"
   are parsed ("|Oss:unicode").  Three outcomes are visible: an empty
   object from _PyUnicode_New(0); PyObject_Unicode(x) when neither
   encoding nor errors was supplied; and otherwise
   PyUnicode_FromEncodedObject(x, encoding, errors).
   NOTE(review): the declaration of `x`, the parse-failure return and the
   condition selecting the empty-object case are not visible in this
   extract. */
8815 unicode_new(PyTypeObject
*type
, PyObject
*args
, PyObject
*kwds
)
8818 static char *kwlist
[] = {"string", "encoding", "errors", 0};
8819 char *encoding
= NULL
;
8820 char *errors
= NULL
;
8822 if (type
!= &PyUnicode_Type
)
8823 return unicode_subtype_new(type
, args
, kwds
);
8824 if (!PyArg_ParseTupleAndKeywords(args
, kwds
, "|Oss:unicode",
8825 kwlist
, &x
, &encoding
, &errors
))
8828 return (PyObject
*)_PyUnicode_New(0);
8829 if (encoding
== NULL
&& errors
== NULL
)
8830 return PyObject_Unicode(x
);
8832 return PyUnicode_FromEncodedObject(x
, encoding
, errors
);
/* tp_new helper for subclasses of unicode.

   A plain unicode object is first built by calling unicode_new() with
   &PyUnicode_Type.  An instance of the requested `type` is then
   allocated through type->tp_alloc, given its own character buffer of
   n + 1 Py_UNICODE units from PyObject_MALLOC(), and filled by copying
   the temporary's characters including the terminator (n + 1 units).
   The temporary's hash is carried over as well.  If the buffer cannot
   be allocated, the new object is forgotten via _Py_ForgetReference()
   and PyErr_NoMemory() is returned.
   NOTE(review): the NULL checks after unicode_new()/tp_alloc, the
   length assignment and the release of the temporary are not visible in
   this extract. */
8836 unicode_subtype_new(PyTypeObject
*type
, PyObject
*args
, PyObject
*kwds
)
8838 PyUnicodeObject
*tmp
, *pnew
;
8841 assert(PyType_IsSubtype(type
, &PyUnicode_Type
));
8842 tmp
= (PyUnicodeObject
*)unicode_new(&PyUnicode_Type
, args
, kwds
);
8845 assert(PyUnicode_Check(tmp
));
8846 pnew
= (PyUnicodeObject
*) type
->tp_alloc(type
, n
= tmp
->length
);
8851 pnew
->str
= (Py_UNICODE
*) PyObject_MALLOC(sizeof(Py_UNICODE
) * (n
+1));
8852 if (pnew
->str
== NULL
) {
8853 _Py_ForgetReference((PyObject
*)pnew
);
8856 return PyErr_NoMemory();
8858 Py_UNICODE_COPY(pnew
->str
, tmp
->str
, n
+1);
8860 pnew
->hash
= tmp
->hash
;
8862 return (PyObject
*)pnew
;
/* Docstring of the unicode type itself; PyUnicode_Type uses it as
   tp_doc. */
8865 PyDoc_STRVAR(unicode_doc
,
8866 "unicode(object='') -> unicode object\n\
8867 unicode(string[, encoding[, errors]]) -> unicode object\n\
8869 Create a new Unicode object from the given encoded string.\n\
8870 encoding defaults to the current default string encoding.\n\
8871 errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
/* Type object for "unicode".

   It wires together the pieces defined in this file: the number,
   sequence, mapping and buffer tables, the method table, the
   hash/repr/str/richcompare slots, unicode_new as constructor and
   PyObject_Del as tp_free.  The type derives from PyBaseString_Type, may
   itself be subclassed (Py_TPFLAGS_BASETYPE) and carries
   Py_TPFLAGS_UNICODE_SUBCLASS.
   NOTE(review): slots whose initialiser lines are not visible in this
   extract are not described here. */
8873 PyTypeObject PyUnicode_Type
= {
8874 PyVarObject_HEAD_INIT(&PyType_Type
, 0)
8875 "unicode", /* tp_name */
8876 sizeof(PyUnicodeObject
), /* tp_size */
8877 0, /* tp_itemsize */
8879 (destructor
)unicode_dealloc
, /* tp_dealloc */
8884 unicode_repr
, /* tp_repr */
8885 &unicode_as_number
, /* tp_as_number */
8886 &unicode_as_sequence
, /* tp_as_sequence */
8887 &unicode_as_mapping
, /* tp_as_mapping */
8888 (hashfunc
) unicode_hash
, /* tp_hash*/
8890 (reprfunc
) unicode_str
, /* tp_str */
8891 PyObject_GenericGetAttr
, /* tp_getattro */
8892 0, /* tp_setattro */
8893 &unicode_as_buffer
, /* tp_as_buffer */
8894 Py_TPFLAGS_DEFAULT
| Py_TPFLAGS_CHECKTYPES
|
8895 Py_TPFLAGS_BASETYPE
| Py_TPFLAGS_UNICODE_SUBCLASS
, /* tp_flags */
8896 unicode_doc
, /* tp_doc */
8897 0, /* tp_traverse */
8899 PyUnicode_RichCompare
, /* tp_richcompare */
8900 0, /* tp_weaklistoffset */
8902 0, /* tp_iternext */
8903 unicode_methods
, /* tp_methods */
8906 &PyBaseString_Type
, /* tp_base */
8908 0, /* tp_descr_get */
8909 0, /* tp_descr_set */
8910 0, /* tp_dictoffset */
8913 unicode_new
, /* tp_new */
8914 PyObject_Del
, /* tp_free */
8917 /* Initialize the Unicode implementation */
8919 void _PyUnicode_Init(void)
8921 /* XXX - move this array to unicodectype.c ? */
8922 Py_UNICODE linebreak
[] = {
8923 0x000A, /* LINE FEED */
8924 0x000D, /* CARRIAGE RETURN */
8925 0x001C, /* FILE SEPARATOR */
8926 0x001D, /* GROUP SEPARATOR */
8927 0x001E, /* RECORD SEPARATOR */
8928 0x0085, /* NEXT LINE */
8929 0x2028, /* LINE SEPARATOR */
8930 0x2029, /* PARAGRAPH SEPARATOR */
8933 /* Init the implementation */
8934 if (!unicode_empty
) {
8935 unicode_empty
= _PyUnicode_New(0);
8940 if (PyType_Ready(&PyUnicode_Type
) < 0)
8941 Py_FatalError("Can't initialize 'unicode'");
8943 /* initialize the linebreak bloom filter */
8944 bloom_linebreak
= make_bloom_mask(
8945 linebreak
, sizeof(linebreak
) / sizeof(linebreak
[0])
8948 PyType_Ready(&EncodingMapType
);
8950 if (PyType_Ready(&PyFieldNameIter_Type
) < 0)
8951 Py_FatalError("Can't initialize field name iterator type");
8953 if (PyType_Ready(&PyFormatterIter_Type
) < 0)
8954 Py_FatalError("Can't initialize formatter iter type");
8957 /* Finalize the Unicode implementation */
/* Release every object held on the unicode free list.

   The current value of `numfree` is remembered up front and returned at
   the end.  The list is walked through the link stored in the first
   pointer-sized word of each entry; for each entry the character buffer
   is released with PyObject_DEL() and the cached default-encoded string
   with Py_XDECREF().  On exit `numfree` is asserted to be 0.
   NOTE(review): the release of the entry itself, the counter decrement
   and the reset of `free_list` are not visible in this extract. */
8960 PyUnicode_ClearFreeList(void)
8962 int freelist_size
= numfree
;
8965 for (u
= free_list
; u
!= NULL
;) {
8966 PyUnicodeObject
*v
= u
;
8967 u
= *(PyUnicodeObject
**)u
;
8969 PyObject_DEL(v
->str
);
8970 Py_XDECREF(v
->defenc
);
8975 assert(numfree
== 0);
8976 return freelist_size
;
8980 _PyUnicode_Fini(void)
8984 Py_CLEAR(unicode_empty
);
8986 for (i
= 0; i
< 256; i
++)
8987 Py_CLEAR(unicode_latin1
[i
]);
8989 (void)PyUnicode_ClearFreeList();