3 Unicode implementation based on original code by Fredrik Lundh,
4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5 Unicode Integration Proposal (see file Misc/unicode.txt).
7 Major speed upgrades to the method implementations at the Reykjavik
8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
10 Copyright (c) Corporation for National Research Initiatives.
12 --------------------------------------------------------------------
13 The original string type implementation is:
15 Copyright (c) 1999 by Secret Labs AB
16 Copyright (c) 1999 by Fredrik Lundh
18 By obtaining, using, and/or copying this software and/or its
19 associated documentation, you agree that you have read, understood,
20 and will comply with the following terms and conditions:
22 Permission to use, copy, modify, and distribute this software and its
23 associated documentation for any purpose and without fee is hereby
24 granted, provided that the above copyright notice appears in all
25 copies, and that both that copyright notice and this permission notice
26 appear in supporting documentation, and that the name of Secret Labs
27 AB or the author not be used in advertising or publicity pertaining to
28 distribution of the software without specific, written prior
31 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33 FITNESS. IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38 --------------------------------------------------------------------
42 #define PY_SSIZE_T_CLEAN
45 #include "unicodeobject.h"
52 /* Limit for the Unicode object free list */
54 #define PyUnicode_MAXFREELIST 1024
56 /* Limit for the Unicode object free list stay alive optimization.
58 The implementation will keep allocated Unicode memory intact for
59 all objects on the free list having a size less than this
60 limit. This reduces malloc() overhead for small Unicode objects.
62 At worst this will result in PyUnicode_MAXFREELIST *
63 (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
64 malloc()-overhead) bytes of unused garbage.
66 Setting the limit to 0 effectively turns the feature off.
68 Note: This is an experimental feature ! If you get core dumps when
69 using Unicode objects, turn this feature off.
73 #define KEEPALIVE_SIZE_LIMIT 9
75 /* Endianness switches; defaults to little endian */
77 #ifdef WORDS_BIGENDIAN
78 # define BYTEORDER_IS_BIG_ENDIAN
80 # define BYTEORDER_IS_LITTLE_ENDIAN
83 /* --- Globals ------------------------------------------------------------
85 The globals are initialized by the _PyUnicode_Init() API and should
86 not be used before calling that API.
95 /* Free list for Unicode objects */
96 static PyUnicodeObject
*free_list
;
99 /* The empty Unicode object is shared to improve performance. */
100 static PyUnicodeObject
*unicode_empty
;
102 /* Single character Unicode strings in the Latin-1 range are being
104 static PyUnicodeObject
*unicode_latin1
[256];
106 /* Default encoding to use and assume when NULL is passed as encoding
107 parameter; it is initialized by _PyUnicode_Init().
109 Always use the PyUnicode_SetDefaultEncoding() and
110 PyUnicode_GetDefaultEncoding() APIs to access this global.
113 static char unicode_default_encoding
[100];
115 /* Fast detection of the most frequent whitespace characters */
116 const unsigned char _Py_ascii_whitespace
[] = {
117 0, 0, 0, 0, 0, 0, 0, 0,
118 /* case 0x0009: * CHARACTER TABULATION */
119 /* case 0x000A: * LINE FEED */
120 /* case 0x000B: * LINE TABULATION */
121 /* case 0x000C: * FORM FEED */
122 /* case 0x000D: * CARRIAGE RETURN */
123 0, 1, 1, 1, 1, 1, 0, 0,
124 0, 0, 0, 0, 0, 0, 0, 0,
125 /* case 0x001C: * FILE SEPARATOR */
126 /* case 0x001D: * GROUP SEPARATOR */
127 /* case 0x001E: * RECORD SEPARATOR */
128 /* case 0x001F: * UNIT SEPARATOR */
129 0, 0, 0, 0, 1, 1, 1, 1,
130 /* case 0x0020: * SPACE */
131 1, 0, 0, 0, 0, 0, 0, 0,
132 0, 0, 0, 0, 0, 0, 0, 0,
133 0, 0, 0, 0, 0, 0, 0, 0,
134 0, 0, 0, 0, 0, 0, 0, 0,
136 0, 0, 0, 0, 0, 0, 0, 0,
137 0, 0, 0, 0, 0, 0, 0, 0,
138 0, 0, 0, 0, 0, 0, 0, 0,
139 0, 0, 0, 0, 0, 0, 0, 0,
140 0, 0, 0, 0, 0, 0, 0, 0,
141 0, 0, 0, 0, 0, 0, 0, 0,
142 0, 0, 0, 0, 0, 0, 0, 0,
143 0, 0, 0, 0, 0, 0, 0, 0
146 /* Same for linebreaks */
147 static unsigned char ascii_linebreak
[] = {
148 0, 0, 0, 0, 0, 0, 0, 0,
149 /* 0x000A, * LINE FEED */
150 /* 0x000B, * LINE TABULATION */
151 /* 0x000C, * FORM FEED */
152 /* 0x000D, * CARRIAGE RETURN */
153 0, 0, 1, 1, 1, 1, 0, 0,
154 0, 0, 0, 0, 0, 0, 0, 0,
155 /* 0x001C, * FILE SEPARATOR */
156 /* 0x001D, * GROUP SEPARATOR */
157 /* 0x001E, * RECORD SEPARATOR */
158 0, 0, 0, 0, 1, 1, 1, 0,
159 0, 0, 0, 0, 0, 0, 0, 0,
160 0, 0, 0, 0, 0, 0, 0, 0,
161 0, 0, 0, 0, 0, 0, 0, 0,
162 0, 0, 0, 0, 0, 0, 0, 0,
164 0, 0, 0, 0, 0, 0, 0, 0,
165 0, 0, 0, 0, 0, 0, 0, 0,
166 0, 0, 0, 0, 0, 0, 0, 0,
167 0, 0, 0, 0, 0, 0, 0, 0,
168 0, 0, 0, 0, 0, 0, 0, 0,
169 0, 0, 0, 0, 0, 0, 0, 0,
170 0, 0, 0, 0, 0, 0, 0, 0,
171 0, 0, 0, 0, 0, 0, 0, 0
176 PyUnicode_GetMax(void)
178 #ifdef Py_UNICODE_WIDE
181 /* This is actually an illegal character, so it should
182 not be passed to unichr. */
187 /* --- Bloom Filters ----------------------------------------------------- */
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits of each Unicode character as the bit index. */
193 /* the linebreak mask is set up by Unicode_Init below */
196 #define BLOOM_WIDTH 128
198 #define BLOOM_WIDTH 64
200 #define BLOOM_WIDTH 32
202 #error "LONG_BIT is smaller than 32"
205 #define BLOOM_MASK unsigned long
207 static BLOOM_MASK bloom_linebreak
;
/* Single-word "bloom filter" primitives.  The bit index is the character
   value masked with BLOOM_WIDTH - 1 (BLOOM_WIDTH is a power of two).

   BLOOM_ADD(mask, ch): set the bit for ch in mask; mask must be a
   modifiable lvalue.
   BLOOM(mask, ch): non-zero if the bit for ch is set in mask.  A zero
   result means no character with that bit index was added; a non-zero
   result may be a false positive and needs an exact check.

   The mask argument is parenthesized so that expression arguments such
   as BLOOM(a | b, ch) group as intended.  ch is evaluated once. */
#define BLOOM_ADD(mask, ch) ((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1))))
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))
212 #define BLOOM_LINEBREAK(ch) \
213 ((ch) < 128U ? ascii_linebreak[(ch)] : \
214 (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
216 Py_LOCAL_INLINE(BLOOM_MASK
) make_bloom_mask(Py_UNICODE
* ptr
, Py_ssize_t len
)
218 /* calculate simple bloom-style bitmask for a given unicode string */
224 for (i
= 0; i
< len
; i
++)
225 BLOOM_ADD(mask
, ptr
[i
]);
230 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr
, Py_UNICODE
* set
, Py_ssize_t setlen
)
234 for (i
= 0; i
< setlen
; i
++)
/* BLOOM_MEMBER(mask, chr, set, setlen): membership test for chr in the
   character array set[0..setlen).  The bloom mask is consulted first as a
   cheap pre-filter; unicode_member() is only called to confirm when the
   mask bit for chr is set.  (Assumes mask was built from the same set --
   confirm at the call sites.)

   The whole expansion is parenthesized so the macro can safely be negated
   or combined with other operators, e.g. !BLOOM_MEMBER(...) or
   x || BLOOM_MEMBER(...). */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
244 /* --- Unicode Object ----------------------------------------------------- */
247 int unicode_resize(register PyUnicodeObject
*unicode
,
252 /* Shortcut if there's nothing much to do. */
253 if (unicode
->length
== length
)
256 /* Resizing shared object (unicode_empty or single character
257 objects) in-place is not allowed. Use PyUnicode_Resize()
260 if (unicode
== unicode_empty
||
261 (unicode
->length
== 1 &&
262 unicode
->str
[0] < 256U &&
263 unicode_latin1
[unicode
->str
[0]] == unicode
)) {
264 PyErr_SetString(PyExc_SystemError
,
265 "can't resize shared unicode objects");
/* We allocate one more Py_UNICODE element to make sure the string is U+0000 terminated.
270 The overallocation is also used by fastsearch, which assumes that it's
271 safe to look at str[length] (without making any assumptions about what
274 oldstr
= unicode
->str
;
275 unicode
->str
= PyObject_REALLOC(unicode
->str
,
276 sizeof(Py_UNICODE
) * (length
+ 1));
278 unicode
->str
= (Py_UNICODE
*)oldstr
;
282 unicode
->str
[length
] = 0;
283 unicode
->length
= length
;
286 /* Reset the object caches */
287 if (unicode
->defenc
) {
288 Py_CLEAR(unicode
->defenc
);
/* We allocate one more Py_UNICODE element to make sure the string is
   U+0000 terminated; some code relies on that.
298 XXX This allocator could further be enhanced by assuring that the
299 free list never reduces its size below 1.
304 PyUnicodeObject
*_PyUnicode_New(Py_ssize_t length
)
306 register PyUnicodeObject
*unicode
;
308 /* Optimization for empty strings */
309 if (length
== 0 && unicode_empty
!= NULL
) {
310 Py_INCREF(unicode_empty
);
311 return unicode_empty
;
314 /* Ensure we won't overflow the size. */
315 if (length
> ((PY_SSIZE_T_MAX
/ sizeof(Py_UNICODE
)) - 1)) {
316 return (PyUnicodeObject
*)PyErr_NoMemory();
319 /* Unicode freelist & memory allocation */
322 free_list
= *(PyUnicodeObject
**)unicode
;
325 /* Keep-Alive optimization: we only upsize the buffer,
326 never downsize it. */
327 if ((unicode
->length
< length
) &&
328 unicode_resize(unicode
, length
) < 0) {
329 PyObject_DEL(unicode
->str
);
334 size_t new_size
= sizeof(Py_UNICODE
) * ((size_t)length
+ 1);
335 unicode
->str
= (Py_UNICODE
*) PyObject_MALLOC(new_size
);
337 PyObject_INIT(unicode
, &PyUnicode_Type
);
341 unicode
= PyObject_New(PyUnicodeObject
, &PyUnicode_Type
);
344 new_size
= sizeof(Py_UNICODE
) * ((size_t)length
+ 1);
345 unicode
->str
= (Py_UNICODE
*) PyObject_MALLOC(new_size
);
352 /* Initialize the first element to guard against cases where
353 * the caller fails before initializing str -- unicode_resize()
354 * reads str[0], and the Keep-Alive optimization can keep memory
355 * allocated for str alive across a call to unicode_dealloc(unicode).
356 * We don't want unicode_resize to read uninitialized memory in
360 unicode
->str
[length
] = 0;
361 unicode
->length
= length
;
363 unicode
->defenc
= NULL
;
367 /* XXX UNREF/NEWREF interface should be more symmetrical */
369 _Py_ForgetReference((PyObject
*)unicode
);
370 PyObject_Del(unicode
);
375 void unicode_dealloc(register PyUnicodeObject
*unicode
)
377 if (PyUnicode_CheckExact(unicode
) &&
378 numfree
< PyUnicode_MAXFREELIST
) {
379 /* Keep-Alive optimization */
380 if (unicode
->length
>= KEEPALIVE_SIZE_LIMIT
) {
381 PyObject_DEL(unicode
->str
);
385 if (unicode
->defenc
) {
386 Py_CLEAR(unicode
->defenc
);
388 /* Add to free list */
389 *(PyUnicodeObject
**)unicode
= free_list
;
394 PyObject_DEL(unicode
->str
);
395 Py_XDECREF(unicode
->defenc
);
396 Py_TYPE(unicode
)->tp_free((PyObject
*)unicode
);
401 int _PyUnicode_Resize(PyUnicodeObject
**unicode
, Py_ssize_t length
)
403 register PyUnicodeObject
*v
;
405 /* Argument checks */
406 if (unicode
== NULL
) {
407 PyErr_BadInternalCall();
411 if (v
== NULL
|| !PyUnicode_Check(v
) || Py_REFCNT(v
) != 1 || length
< 0) {
412 PyErr_BadInternalCall();
416 /* Resizing unicode_empty and single character objects is not
417 possible since these are being shared. We simply return a fresh
418 copy with the same Unicode content. */
419 if (v
->length
!= length
&&
420 (v
== unicode_empty
|| v
->length
== 1)) {
421 PyUnicodeObject
*w
= _PyUnicode_New(length
);
424 Py_UNICODE_COPY(w
->str
, v
->str
,
425 length
< v
->length
? length
: v
->length
);
431 /* Note that we don't have to modify *unicode for unshared Unicode
432 objects, since we can modify them in-place. */
433 return unicode_resize(v
, length
);
436 int PyUnicode_Resize(PyObject
**unicode
, Py_ssize_t length
)
438 return _PyUnicode_Resize((PyUnicodeObject
**)unicode
, length
);
441 PyObject
*PyUnicode_FromUnicode(const Py_UNICODE
*u
,
444 PyUnicodeObject
*unicode
;
446 /* If the Unicode data is known at construction time, we can apply
447 some optimizations which share commonly used objects. */
450 /* Optimization for empty strings */
451 if (size
== 0 && unicode_empty
!= NULL
) {
452 Py_INCREF(unicode_empty
);
453 return (PyObject
*)unicode_empty
;
456 /* Single character Unicode objects in the Latin-1 range are
457 shared when using this constructor */
458 if (size
== 1 && *u
< 256) {
459 unicode
= unicode_latin1
[*u
];
461 unicode
= _PyUnicode_New(1);
464 unicode
->str
[0] = *u
;
465 unicode_latin1
[*u
] = unicode
;
468 return (PyObject
*)unicode
;
472 unicode
= _PyUnicode_New(size
);
476 /* Copy the Unicode data into the new object */
478 Py_UNICODE_COPY(unicode
->str
, u
, size
);
480 return (PyObject
*)unicode
;
483 PyObject
*PyUnicode_FromStringAndSize(const char *u
, Py_ssize_t size
)
485 PyUnicodeObject
*unicode
;
488 PyErr_SetString(PyExc_SystemError
,
489 "Negative size passed to PyUnicode_FromStringAndSize");
493 /* If the Unicode data is known at construction time, we can apply
494 some optimizations which share commonly used objects.
495 Also, this means the input must be UTF-8, so fall back to the
496 UTF-8 decoder at the end. */
499 /* Optimization for empty strings */
500 if (size
== 0 && unicode_empty
!= NULL
) {
501 Py_INCREF(unicode_empty
);
502 return (PyObject
*)unicode_empty
;
505 /* Single characters are shared when using this constructor.
506 Restrict to ASCII, since the input must be UTF-8. */
507 if (size
== 1 && Py_CHARMASK(*u
) < 128) {
508 unicode
= unicode_latin1
[Py_CHARMASK(*u
)];
510 unicode
= _PyUnicode_New(1);
513 unicode
->str
[0] = Py_CHARMASK(*u
);
514 unicode_latin1
[Py_CHARMASK(*u
)] = unicode
;
517 return (PyObject
*)unicode
;
520 return PyUnicode_DecodeUTF8(u
, size
, NULL
);
523 unicode
= _PyUnicode_New(size
);
527 return (PyObject
*)unicode
;
530 PyObject
*PyUnicode_FromString(const char *u
)
532 size_t size
= strlen(u
);
533 if (size
> PY_SSIZE_T_MAX
) {
534 PyErr_SetString(PyExc_OverflowError
, "input too long");
538 return PyUnicode_FromStringAndSize(u
, size
);
543 #if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
544 # define CONVERT_WCHAR_TO_SURROGATES
547 #ifdef CONVERT_WCHAR_TO_SURROGATES
549 /* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
550 to convert from UTF32 to UTF16. */
552 PyObject
*PyUnicode_FromWideChar(register const wchar_t *w
,
555 PyUnicodeObject
*unicode
;
556 register Py_ssize_t i
;
558 const wchar_t *orig_w
;
561 PyErr_BadInternalCall();
567 for (i
= size
; i
> 0; i
--) {
573 unicode
= _PyUnicode_New(alloc
);
577 /* Copy the wchar_t data into the new object */
579 register Py_UNICODE
*u
;
580 u
= PyUnicode_AS_UNICODE(unicode
);
581 for (i
= size
; i
> 0; i
--) {
583 wchar_t ordinal
= *w
++;
585 *u
++ = 0xD800 | (ordinal
>> 10);
586 *u
++ = 0xDC00 | (ordinal
& 0x3FF);
592 return (PyObject
*)unicode
;
597 PyObject
*PyUnicode_FromWideChar(register const wchar_t *w
,
600 PyUnicodeObject
*unicode
;
603 PyErr_BadInternalCall();
607 unicode
= _PyUnicode_New(size
);
611 /* Copy the wchar_t data into the new object */
612 #ifdef HAVE_USABLE_WCHAR_T
613 memcpy(unicode
->str
, w
, size
* sizeof(wchar_t));
616 register Py_UNICODE
*u
;
617 register Py_ssize_t i
;
618 u
= PyUnicode_AS_UNICODE(unicode
);
619 for (i
= size
; i
> 0; i
--)
624 return (PyObject
*)unicode
;
627 #endif /* CONVERT_WCHAR_TO_SURROGATES */
629 #undef CONVERT_WCHAR_TO_SURROGATES
632 makefmt(char *fmt
, int longflag
, int size_tflag
, int zeropad
, int width
, int precision
, char c
)
638 fmt
+= sprintf(fmt
, "%d", width
);
641 fmt
+= sprintf(fmt
, ".%d", precision
);
644 else if (size_tflag
) {
645 char *f
= PY_FORMAT_SIZE_T
;
653 #define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
656 PyUnicode_FromFormatV(const char *format
, va_list vargs
)
659 Py_ssize_t callcount
= 0;
660 PyObject
**callresults
= NULL
;
661 PyObject
**callresult
= NULL
;
669 /* used by sprintf */
671 /* use abuffer instead of buffer, if we need more space
672 * (which can happen if there's a format specifier with width). */
673 char *abuffer
= NULL
;
675 Py_ssize_t abuffersize
= 0;
676 char fmt
[60]; /* should be enough for %0width.precisionld */
679 #ifdef VA_LIST_IS_ARRAY
680 Py_MEMCPY(count
, vargs
, sizeof(va_list));
683 __va_copy(count
, vargs
);
688 /* step 1: count the number of %S/%R/%s format specifications
689 * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
690 * objects once during step 3 and put the result in an array) */
691 for (f
= format
; *f
; f
++) {
695 if (*(f
+1)=='S' || *(f
+1)=='R')
697 while (isdigit((unsigned)*f
))
698 width
= (width
*10) + *f
++ - '0';
699 while (*++f
&& *f
!= '%' && !isalpha((unsigned)*f
))
705 /* step 2: allocate memory for the results of
706 * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
708 callresults
= PyObject_Malloc(sizeof(PyObject
*)*callcount
);
713 callresult
= callresults
;
715 /* step 3: figure out how large a buffer we need */
716 for (f
= format
; *f
; f
++) {
720 while (isdigit((unsigned)*f
))
721 width
= (width
*10) + *f
++ - '0';
722 while (*++f
&& *f
!= '%' && !isalpha((unsigned)*f
))
725 /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
726 * they don't affect the amount of space we reserve.
728 if ((*f
== 'l' || *f
== 'z') &&
729 (f
[1] == 'd' || f
[1] == 'u'))
734 (void)va_arg(count
, int);
735 /* fall through... */
739 case 'd': case 'u': case 'i': case 'x':
740 (void) va_arg(count
, int);
741 /* 20 bytes is enough to hold a 64-bit
742 integer. Decimal takes the most space.
743 This isn't enough for octal.
744 If a width is specified we need more
745 (which we allocate later). */
749 if (abuffersize
< width
)
755 const char *s
= va_arg(count
, const char*);
756 PyObject
*str
= PyUnicode_DecodeUTF8(s
, strlen(s
), "replace");
759 n
+= PyUnicode_GET_SIZE(str
);
760 /* Remember the str and switch to the next slot */
766 PyObject
*obj
= va_arg(count
, PyObject
*);
767 assert(obj
&& PyUnicode_Check(obj
));
768 n
+= PyUnicode_GET_SIZE(obj
);
773 PyObject
*obj
= va_arg(count
, PyObject
*);
774 const char *str
= va_arg(count
, const char *);
776 assert(!obj
|| PyUnicode_Check(obj
));
778 n
+= PyUnicode_GET_SIZE(obj
);
785 PyObject
*obj
= va_arg(count
, PyObject
*);
788 str
= PyObject_Str(obj
);
791 n
+= PyUnicode_GET_SIZE(str
);
792 /* Remember the str and switch to the next slot */
798 PyObject
*obj
= va_arg(count
, PyObject
*);
801 repr
= PyObject_Repr(obj
);
804 n
+= PyUnicode_GET_SIZE(repr
);
805 /* Remember the repr and switch to the next slot */
806 *callresult
++ = repr
;
810 (void) va_arg(count
, int);
811 /* maximum 64-bit pointer representation:
813 * so 19 characters is enough.
814 * XXX I count 18 -- what's the extra for?
819 /* if we stumble upon an unknown
820 formatting code, copy the rest of
821 the format string to the output
822 string. (we cannot just skip the
823 code, since there's no way to know
824 what's in the argument list) */
832 if (abuffersize
> 20) {
833 abuffer
= PyObject_Malloc(abuffersize
);
838 realbuffer
= abuffer
;
842 /* step 4: fill the buffer */
843 /* Since we've analyzed how much space we need for the worst case,
844 we don't have to resize the string.
845 There can be no errors beyond this point. */
846 string
= PyUnicode_FromUnicode(NULL
, n
);
850 s
= PyUnicode_AS_UNICODE(string
);
851 callresult
= callresults
;
853 for (f
= format
; *f
; f
++) {
858 zeropad
= (*f
== '0');
859 /* parse the width.precision part */
861 while (isdigit((unsigned)*f
))
862 width
= (width
*10) + *f
++ - '0';
866 while (isdigit((unsigned)*f
))
867 precision
= (precision
*10) + *f
++ - '0';
869 /* handle the long flag, but only for %ld and %lu.
870 others can be added when necessary. */
871 if (*f
== 'l' && (f
[1] == 'd' || f
[1] == 'u')) {
875 /* handle the size_t flag. */
876 if (*f
== 'z' && (f
[1] == 'd' || f
[1] == 'u')) {
883 *s
++ = va_arg(vargs
, int);
886 makefmt(fmt
, longflag
, size_tflag
, zeropad
, width
, precision
, 'd');
888 sprintf(realbuffer
, fmt
, va_arg(vargs
, long));
890 sprintf(realbuffer
, fmt
, va_arg(vargs
, Py_ssize_t
));
892 sprintf(realbuffer
, fmt
, va_arg(vargs
, int));
893 appendstring(realbuffer
);
896 makefmt(fmt
, longflag
, size_tflag
, zeropad
, width
, precision
, 'u');
898 sprintf(realbuffer
, fmt
, va_arg(vargs
, unsigned long));
900 sprintf(realbuffer
, fmt
, va_arg(vargs
, size_t));
902 sprintf(realbuffer
, fmt
, va_arg(vargs
, unsigned int));
903 appendstring(realbuffer
);
906 makefmt(fmt
, 0, 0, zeropad
, width
, precision
, 'i');
907 sprintf(realbuffer
, fmt
, va_arg(vargs
, int));
908 appendstring(realbuffer
);
911 makefmt(fmt
, 0, 0, zeropad
, width
, precision
, 'x');
912 sprintf(realbuffer
, fmt
, va_arg(vargs
, int));
913 appendstring(realbuffer
);
917 /* unused, since we already have the result */
918 (void) va_arg(vargs
, char *);
919 Py_UNICODE_COPY(s
, PyUnicode_AS_UNICODE(*callresult
),
920 PyUnicode_GET_SIZE(*callresult
));
921 s
+= PyUnicode_GET_SIZE(*callresult
);
922 /* We're done with the unicode()/repr() => forget it */
923 Py_DECREF(*callresult
);
924 /* switch to next unicode()/repr() result */
930 PyObject
*obj
= va_arg(vargs
, PyObject
*);
931 Py_ssize_t size
= PyUnicode_GET_SIZE(obj
);
932 Py_UNICODE_COPY(s
, PyUnicode_AS_UNICODE(obj
), size
);
938 PyObject
*obj
= va_arg(vargs
, PyObject
*);
939 const char *str
= va_arg(vargs
, const char *);
941 Py_ssize_t size
= PyUnicode_GET_SIZE(obj
);
942 Py_UNICODE_COPY(s
, PyUnicode_AS_UNICODE(obj
), size
);
955 /* unused, since we already have the result */
956 (void) va_arg(vargs
, PyObject
*);
957 ucopy
= PyUnicode_AS_UNICODE(*callresult
);
958 usize
= PyUnicode_GET_SIZE(*callresult
);
959 for (upos
= 0; upos
<usize
;)
960 *s
++ = ucopy
[upos
++];
961 /* We're done with the unicode()/repr() => forget it */
962 Py_DECREF(*callresult
);
963 /* switch to next unicode()/repr() result */
968 sprintf(buffer
, "%p", va_arg(vargs
, void*));
969 /* %p is ill-defined: ensure leading 0x. */
970 if (buffer
[1] == 'X')
972 else if (buffer
[1] != 'x') {
973 memmove(buffer
+2, buffer
, strlen(buffer
)+1);
977 appendstring(buffer
);
992 PyObject_Free(callresults
);
994 PyObject_Free(abuffer
);
995 PyUnicode_Resize(&string
, s
- PyUnicode_AS_UNICODE(string
));
999 PyObject
**callresult2
= callresults
;
1000 while (callresult2
< callresult
) {
1001 Py_DECREF(*callresult2
);
1004 PyObject_Free(callresults
);
1007 PyObject_Free(abuffer
);
1014 PyUnicode_FromFormat(const char *format
, ...)
1019 #ifdef HAVE_STDARG_PROTOTYPES
1020 va_start(vargs
, format
);
1024 ret
= PyUnicode_FromFormatV(format
, vargs
);
1029 Py_ssize_t
PyUnicode_AsWideChar(PyUnicodeObject
*unicode
,
1033 if (unicode
== NULL
) {
1034 PyErr_BadInternalCall();
1038 /* If possible, try to copy the 0-termination as well */
1039 if (size
> PyUnicode_GET_SIZE(unicode
))
1040 size
= PyUnicode_GET_SIZE(unicode
) + 1;
1042 #ifdef HAVE_USABLE_WCHAR_T
1043 memcpy(w
, unicode
->str
, size
* sizeof(wchar_t));
1046 register Py_UNICODE
*u
;
1047 register Py_ssize_t i
;
1048 u
= PyUnicode_AS_UNICODE(unicode
);
1049 for (i
= size
; i
> 0; i
--)
1054 if (size
> PyUnicode_GET_SIZE(unicode
))
1055 return PyUnicode_GET_SIZE(unicode
);
1062 PyObject
*PyUnicode_FromOrdinal(int ordinal
)
1066 #ifdef Py_UNICODE_WIDE
1067 if (ordinal
< 0 || ordinal
> 0x10ffff) {
1068 PyErr_SetString(PyExc_ValueError
,
1069 "unichr() arg not in range(0x110000) "
1070 "(wide Python build)");
1074 if (ordinal
< 0 || ordinal
> 0xffff) {
1075 PyErr_SetString(PyExc_ValueError
,
1076 "unichr() arg not in range(0x10000) "
1077 "(narrow Python build)");
1082 s
[0] = (Py_UNICODE
)ordinal
;
1083 return PyUnicode_FromUnicode(s
, 1);
1086 PyObject
*PyUnicode_FromObject(register PyObject
*obj
)
1088 /* XXX Perhaps we should make this API an alias of
1089 PyObject_Unicode() instead ?! */
1090 if (PyUnicode_CheckExact(obj
)) {
1094 if (PyUnicode_Check(obj
)) {
1095 /* For a Unicode subtype that's not a Unicode object,
1096 return a true Unicode object with the same data. */
1097 return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj
),
1098 PyUnicode_GET_SIZE(obj
));
1100 return PyUnicode_FromEncodedObject(obj
, NULL
, "strict");
1103 PyObject
*PyUnicode_FromEncodedObject(register PyObject
*obj
,
1104 const char *encoding
,
1107 const char *s
= NULL
;
1112 PyErr_BadInternalCall();
1117 /* For b/w compatibility we also accept Unicode objects provided
that no encoding is given and then redirect to
1119 PyObject_Unicode() which then applies the additional logic for
NOTE: This API should really only be used for objects which
1123 represent *encoded* Unicode !
1126 if (PyUnicode_Check(obj
)) {
1128 PyErr_SetString(PyExc_TypeError
,
1129 "decoding Unicode is not supported");
1132 return PyObject_Unicode(obj
);
1135 if (PyUnicode_Check(obj
)) {
1136 PyErr_SetString(PyExc_TypeError
,
1137 "decoding Unicode is not supported");
1143 if (PyString_Check(obj
)) {
1144 s
= PyString_AS_STRING(obj
);
1145 len
= PyString_GET_SIZE(obj
);
1147 else if (PyByteArray_Check(obj
)) {
1148 /* Python 2.x specific */
1149 PyErr_Format(PyExc_TypeError
,
1150 "decoding bytearray is not supported");
1153 else if (PyObject_AsCharBuffer(obj
, &s
, &len
)) {
1154 /* Overwrite the error message with something more useful in
1155 case of a TypeError. */
1156 if (PyErr_ExceptionMatches(PyExc_TypeError
))
1157 PyErr_Format(PyExc_TypeError
,
1158 "coercing to Unicode: need string or buffer, "
1160 Py_TYPE(obj
)->tp_name
);
1164 /* Convert to Unicode */
1166 Py_INCREF(unicode_empty
);
1167 v
= (PyObject
*)unicode_empty
;
1170 v
= PyUnicode_Decode(s
, len
, encoding
, errors
);
1178 PyObject
*PyUnicode_Decode(const char *s
,
1180 const char *encoding
,
1183 PyObject
*buffer
= NULL
, *unicode
;
1185 if (encoding
== NULL
)
1186 encoding
= PyUnicode_GetDefaultEncoding();
1188 /* Shortcuts for common default encodings */
1189 if (strcmp(encoding
, "utf-8") == 0)
1190 return PyUnicode_DecodeUTF8(s
, size
, errors
);
1191 else if (strcmp(encoding
, "latin-1") == 0)
1192 return PyUnicode_DecodeLatin1(s
, size
, errors
);
1193 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1194 else if (strcmp(encoding
, "mbcs") == 0)
1195 return PyUnicode_DecodeMBCS(s
, size
, errors
);
1197 else if (strcmp(encoding
, "ascii") == 0)
1198 return PyUnicode_DecodeASCII(s
, size
, errors
);
1200 /* Decode via the codec registry */
1201 buffer
= PyBuffer_FromMemory((void *)s
, size
);
1204 unicode
= PyCodec_Decode(buffer
, encoding
, errors
);
1205 if (unicode
== NULL
)
1207 if (!PyUnicode_Check(unicode
)) {
1208 PyErr_Format(PyExc_TypeError
,
1209 "decoder did not return an unicode object (type=%.400s)",
1210 Py_TYPE(unicode
)->tp_name
);
1222 PyObject
*PyUnicode_AsDecodedObject(PyObject
*unicode
,
1223 const char *encoding
,
1228 if (!PyUnicode_Check(unicode
)) {
1229 PyErr_BadArgument();
1233 if (encoding
== NULL
)
1234 encoding
= PyUnicode_GetDefaultEncoding();
1236 /* Decode via the codec registry */
1237 v
= PyCodec_Decode(unicode
, encoding
, errors
);
1246 PyObject
*PyUnicode_Encode(const Py_UNICODE
*s
,
1248 const char *encoding
,
1251 PyObject
*v
, *unicode
;
1253 unicode
= PyUnicode_FromUnicode(s
, size
);
1254 if (unicode
== NULL
)
1256 v
= PyUnicode_AsEncodedString(unicode
, encoding
, errors
);
1261 PyObject
*PyUnicode_AsEncodedObject(PyObject
*unicode
,
1262 const char *encoding
,
1267 if (!PyUnicode_Check(unicode
)) {
1268 PyErr_BadArgument();
1272 if (encoding
== NULL
)
1273 encoding
= PyUnicode_GetDefaultEncoding();
1275 /* Encode via the codec registry */
1276 v
= PyCodec_Encode(unicode
, encoding
, errors
);
1285 PyObject
*PyUnicode_AsEncodedString(PyObject
*unicode
,
1286 const char *encoding
,
1291 if (!PyUnicode_Check(unicode
)) {
1292 PyErr_BadArgument();
1296 if (encoding
== NULL
)
1297 encoding
= PyUnicode_GetDefaultEncoding();
1299 /* Shortcuts for common default encodings */
1300 if (errors
== NULL
) {
1301 if (strcmp(encoding
, "utf-8") == 0)
1302 return PyUnicode_AsUTF8String(unicode
);
1303 else if (strcmp(encoding
, "latin-1") == 0)
1304 return PyUnicode_AsLatin1String(unicode
);
1305 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1306 else if (strcmp(encoding
, "mbcs") == 0)
1307 return PyUnicode_AsMBCSString(unicode
);
1309 else if (strcmp(encoding
, "ascii") == 0)
1310 return PyUnicode_AsASCIIString(unicode
);
1313 /* Encode via the codec registry */
1314 v
= PyCodec_Encode(unicode
, encoding
, errors
);
1317 if (!PyString_Check(v
)) {
1318 PyErr_Format(PyExc_TypeError
,
1319 "encoder did not return a string object (type=%.400s)",
1320 Py_TYPE(v
)->tp_name
);
1330 PyObject
*_PyUnicode_AsDefaultEncodedString(PyObject
*unicode
,
1333 PyObject
*v
= ((PyUnicodeObject
*)unicode
)->defenc
;
1337 v
= PyUnicode_AsEncodedString(unicode
, NULL
, errors
);
1338 if (v
&& errors
== NULL
)
1339 ((PyUnicodeObject
*)unicode
)->defenc
= v
;
1343 Py_UNICODE
*PyUnicode_AsUnicode(PyObject
*unicode
)
1345 if (!PyUnicode_Check(unicode
)) {
1346 PyErr_BadArgument();
1349 return PyUnicode_AS_UNICODE(unicode
);
1355 Py_ssize_t
PyUnicode_GetSize(PyObject
*unicode
)
1357 if (!PyUnicode_Check(unicode
)) {
1358 PyErr_BadArgument();
1361 return PyUnicode_GET_SIZE(unicode
);
1367 const char *PyUnicode_GetDefaultEncoding(void)
1369 return unicode_default_encoding
;
1372 int PyUnicode_SetDefaultEncoding(const char *encoding
)
1376 /* Make sure the encoding is valid. As side effect, this also
1377 loads the encoding into the codec registry cache. */
1378 v
= _PyCodec_Lookup(encoding
);
1382 strncpy(unicode_default_encoding
,
1384 sizeof(unicode_default_encoding
));
/* Error handling callback helper:
   build the exception object, call the callback and check its result;
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.
   Returns 0 on success, -1 on error.
1399 int unicode_decode_call_errorhandler(const char *errors
, PyObject
**errorHandler
,
1400 const char *encoding
, const char *reason
,
1401 const char *input
, Py_ssize_t insize
, Py_ssize_t
*startinpos
,
1402 Py_ssize_t
*endinpos
, PyObject
**exceptionObject
, const char **inptr
,
1403 PyUnicodeObject
**output
, Py_ssize_t
*outpos
, Py_UNICODE
**outptr
)
1405 static char *argparse
= "O!n;decoding error handler must return (unicode, int) tuple";
1407 PyObject
*restuple
= NULL
;
1408 PyObject
*repunicode
= NULL
;
1409 Py_ssize_t outsize
= PyUnicode_GET_SIZE(*output
);
1410 Py_ssize_t requiredsize
;
1416 if (*errorHandler
== NULL
) {
1417 *errorHandler
= PyCodec_LookupError(errors
);
1418 if (*errorHandler
== NULL
)
1422 if (*exceptionObject
== NULL
) {
1423 *exceptionObject
= PyUnicodeDecodeError_Create(
1424 encoding
, input
, insize
, *startinpos
, *endinpos
, reason
);
1425 if (*exceptionObject
== NULL
)
1429 if (PyUnicodeDecodeError_SetStart(*exceptionObject
, *startinpos
))
1431 if (PyUnicodeDecodeError_SetEnd(*exceptionObject
, *endinpos
))
1433 if (PyUnicodeDecodeError_SetReason(*exceptionObject
, reason
))
1437 restuple
= PyObject_CallFunctionObjArgs(*errorHandler
, *exceptionObject
, NULL
);
1438 if (restuple
== NULL
)
1440 if (!PyTuple_Check(restuple
)) {
1441 PyErr_SetString(PyExc_TypeError
, &argparse
[4]);
1444 if (!PyArg_ParseTuple(restuple
, argparse
, &PyUnicode_Type
, &repunicode
, &newpos
))
1447 newpos
= insize
+newpos
;
1448 if (newpos
<0 || newpos
>insize
) {
1449 PyErr_Format(PyExc_IndexError
, "position %zd from error handler out of bounds", newpos
);
1453 /* need more space? (at least enough for what we
1454 have+the replacement+the rest of the string (starting
1455 at the new input position), so we won't have to check space
1456 when there are no errors in the rest of the string) */
1457 repptr
= PyUnicode_AS_UNICODE(repunicode
);
1458 repsize
= PyUnicode_GET_SIZE(repunicode
);
1459 requiredsize
= *outpos
+ repsize
+ insize
-newpos
;
1460 if (requiredsize
> outsize
) {
1461 if (requiredsize
<2*outsize
)
1462 requiredsize
= 2*outsize
;
1463 if (_PyUnicode_Resize(output
, requiredsize
) < 0)
1465 *outptr
= PyUnicode_AS_UNICODE(*output
) + *outpos
;
1468 *inptr
= input
+ newpos
;
1469 Py_UNICODE_COPY(*outptr
, repptr
, repsize
);
1476 Py_XDECREF(restuple
);
1480 /* --- UTF-7 Codec -------------------------------------------------------- */
1482 /* See RFC2152 for details. We encode conservatively and decode liberally. */
1484 /* Three simple macros defining base-64. */
/* Is c a base-64 character?

   The test is spelled out with explicit ASCII ranges instead of
   isalnum(): the <ctype.h> classification depends on the current locale
   and is undefined for arguments that are neither EOF nor representable
   as unsigned char, whereas the base-64 alphabet is fixed.
   Note that c is evaluated more than once, so it must be free of side
   effects. */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') || \
     ((c) >= 'a' && (c) <= 'z') || \
     ((c) >= '0' && (c) <= '9') || \
     (c) == '+' || (c) == '/')
/* Given that c is a base-64 character, yield its 6-bit value (0..63).
 * '+' maps to 62; anything that is neither '+', a digit nor a letter
 * falls through to 63, which is how '/' is handled.  c is evaluated
 * more than once. */

#define FROM_BASE64(c) \
    ((c) == '+' ? 62 : \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 : \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 : \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' : \
     63)
/* Map the bottom 6 bits of n to the corresponding base-64 character.
 * Any higher bits of n are masked off before the alphabet lookup. */

#define TO_BASE64(n) \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ" \
     "abcdefghijklmnopqrstuvwxyz" \
     "0123456789+/"[(n) & 0x3f])
/* DECODE_DIRECT: true when a byte met in a UTF-7 string stands for
 * itself.  Decoding is deliberately permissive: every value not above
 * 127 is taken literally, with the single exception of '+', which
 * opens a base-64 section. */

#define DECODE_DIRECT(c) \
    ((c) != '+' && (c) <= 127)
1512 /* The UTF-7 encoder treats ASCII characters differently according to
1513 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
1514 * the above). See RFC2152. This array identifies these different
1515 * sets:
1516 * 0 : "Set D"
1517 * alphanumeric and '(),-./:?
1518 * 1 : "Set O"
1519 * !"#$%&*;<=>@[]^_`{|}
1520 * 2 : "whitespace"
1521 * ht nl cr sp
1522 * 3 : special (must be base64 encoded)
1523 * everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
1524 */
1527 char utf7_category
[128] = {
1528 /* nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */
1529 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
1530 /* dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */
1531 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
1532 /* sp ! " # $ % & ' ( ) * + , - . / */
1533 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0,
1534 /* 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
1535 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
1536 /* @ A B C D E F G H I J K L M N O */
1537 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1538 /* P Q R S T U V W X Y Z [ \ ] ^ _ */
1539 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
1540 /* ` a b c d e f g h i j k l m n o */
1541 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1542 /* p q r s t u v w x y z { | } ~ del */
1543 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3,
/* ENCODE_DIRECT: should character c be emitted as itself by the UTF-7
 * encoder?  Only values 1..127 can qualify (NUL and everything from 128
 * up are never direct).  Within that range the answer depends on the
 * utf7_category entry for c:
 *   category 0            -> always direct
 *   category 1 ("Set O")  -> direct only when directO is true
 *   category 2 (whitespace) -> direct only when directWS is true
 *   anything else         -> never direct
 * RFC2152 makes it clear that the choices for Set O and whitespace vary
 * between applications, which is why they are parameters.
 *
 * directO and directWS are parenthesized in the expansion so that a
 * compound argument such as "a || b" is tested as a whole instead of
 * being split by the surrounding && operator.  c is evaluated more than
 * once and must be free of side effects. */

#define ENCODE_DIRECT(c, directO, directWS) \
    ((c) < 128 && (c) > 0 && \
     ((utf7_category[(c)] == 0) || \
      ((directWS) && (utf7_category[(c)] == 2)) || \
      ((directO) && (utf7_category[(c)] == 1))))
1558 PyObject
*PyUnicode_DecodeUTF7(const char *s
,
1562 return PyUnicode_DecodeUTF7Stateful(s
, size
, errors
, NULL
);
1565 /* The decoder. The only state we preserve is our read position,
1566 * i.e. how many characters we have consumed. So if we end in the
1567 * middle of a shift sequence we have to back off the read position
1568 * and the output to the beginning of the sequence, otherwise we lose
1569 * all the shift state (seen bits, number of bits seen, high surrogate). */
1572 PyObject
*PyUnicode_DecodeUTF7Stateful(const char *s
,
1575 Py_ssize_t
*consumed
)
1577 const char *starts
= s
;
1578 Py_ssize_t startinpos
;
1579 Py_ssize_t endinpos
;
1582 PyUnicodeObject
*unicode
;
1584 const char *errmsg
= "";
1586 Py_UNICODE
*shiftOutStart
;
1587 unsigned int base64bits
= 0;
1588 unsigned long base64buffer
= 0;
1589 Py_UNICODE surrogate
= 0;
1590 PyObject
*errorHandler
= NULL
;
1591 PyObject
*exc
= NULL
;
1593 unicode
= _PyUnicode_New(size
);
1599 return (PyObject
*)unicode
;
1607 Py_UNICODE ch
= (unsigned char) *s
;
1609 if (inShift
) { /* in a base-64 section */
1610 if (IS_BASE64(ch
)) { /* consume a base-64 character */
1611 base64buffer
= (base64buffer
<< 6) | FROM_BASE64(ch
);
1614 if (base64bits
>= 16) {
1615 /* we have enough bits for a UTF-16 value */
1616 Py_UNICODE outCh
= (Py_UNICODE
)
1617 (base64buffer
>> (base64bits
-16));
1619 base64buffer
&= (1 << base64bits
) - 1; /* clear high bits */
1621 /* expecting a second surrogate */
1622 if (outCh
>= 0xDC00 && outCh
<= 0xDFFF) {
1623 #ifdef Py_UNICODE_WIDE
1624 *p
++ = (((surrogate
& 0x3FF)<<10)
1625 | (outCh
& 0x3FF)) + 0x10000;
1634 errmsg
= "second surrogate missing";
1638 else if (outCh
>= 0xD800 && outCh
<= 0xDBFF) {
1639 /* first surrogate */
1642 else if (outCh
>= 0xDC00 && outCh
<= 0xDFFF) {
1643 errmsg
= "unexpected second surrogate";
1651 else { /* now leaving a base-64 section */
1655 errmsg
= "second surrogate missing at end of shift sequence";
1658 if (base64bits
> 0) { /* left-over bits */
1659 if (base64bits
>= 6) {
1660 /* We've seen at least one base-64 character */
1661 errmsg
= "partial character in shift sequence";
1665 /* Some bits remain; they should be zero */
1666 if (base64buffer
!= 0) {
1667 errmsg
= "non-zero padding bits in shift sequence";
1673 /* '-' is absorbed; other terminating
1674 characters are preserved */
1679 else if ( ch
== '+' ) {
1680 startinpos
= s
-starts
;
1681 s
++; /* consume '+' */
1682 if (s
< e
&& *s
== '-') { /* '+-' encodes '+' */
1686 else { /* begin base64-encoded section */
1692 else if (DECODE_DIRECT(ch
)) { /* character decodes as itself */
1697 startinpos
= s
-starts
;
1699 errmsg
= "unexpected special character";
1704 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
1705 endinpos
= s
-starts
;
1706 if (unicode_decode_call_errorhandler(
1707 errors
, &errorHandler
,
1709 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
1710 &unicode
, &outpos
, &p
))
1716 if (inShift
&& !consumed
) { /* in shift sequence, no more to follow */
1717 /* if we're in an inconsistent state, that's an error */
1719 (base64bits
>= 6) ||
1720 (base64bits
> 0 && base64buffer
!= 0)) {
1721 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
1723 if (unicode_decode_call_errorhandler(
1724 errors
, &errorHandler
,
1725 "utf7", "unterminated shift sequence",
1726 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
1727 &unicode
, &outpos
, &p
))
1735 p
= shiftOutStart
; /* back off output */
1736 *consumed
= startinpos
;
1739 *consumed
= s
-starts
;
1743 if (_PyUnicode_Resize(&unicode
, p
- PyUnicode_AS_UNICODE(unicode
)) < 0)
1746 Py_XDECREF(errorHandler
);
1748 return (PyObject
*)unicode
;
1751 Py_XDECREF(errorHandler
);
1758 PyObject
*PyUnicode_EncodeUTF7(const Py_UNICODE
*s
,
1761 int base64WhiteSpace
,
1765 /* It might be possible to tighten this worst case */
1766 Py_ssize_t allocated
= 8 * size
;
1769 unsigned int base64bits
= 0;
1770 unsigned long base64buffer
= 0;
1774 if (allocated
/ 8 != size
)
1775 return PyErr_NoMemory();
1778 return PyString_FromStringAndSize(NULL
, 0);
1780 v
= PyString_FromStringAndSize(NULL
, allocated
);
1784 start
= out
= PyString_AS_STRING(v
);
1785 for (;i
< size
; ++i
) {
1786 Py_UNICODE ch
= s
[i
];
1789 if (ENCODE_DIRECT(ch
, !base64SetO
, !base64WhiteSpace
)) {
1791 if (base64bits
) { /* output remaining bits */
1792 *out
++ = TO_BASE64(base64buffer
<< (6-base64bits
));
1797 /* Characters not in the BASE64 set implicitly unshift the sequence
1798 so no '-' is required, except if the character is itself a '-' */
1799 if (IS_BASE64(ch
) || ch
== '-') {
1808 else { /* not in a shift sequence */
1813 else if (ENCODE_DIRECT(ch
, !base64SetO
, !base64WhiteSpace
)) {
1824 #ifdef Py_UNICODE_WIDE
1825 if (ch
>= 0x10000) {
1826 /* code first surrogate */
1828 base64buffer
= (base64buffer
<< 16) | 0xd800 | ((ch
-0x10000) >> 10);
1829 while (base64bits
>= 6) {
1830 *out
++ = TO_BASE64(base64buffer
>> (base64bits
-6));
1833 /* prepare second surrogate */
1834 ch
= 0xDC00 | ((ch
-0x10000) & 0x3FF);
1838 base64buffer
= (base64buffer
<< 16) | ch
;
1839 while (base64bits
>= 6) {
1840 *out
++ = TO_BASE64(base64buffer
>> (base64bits
-6));
1845 *out
++= TO_BASE64(base64buffer
<< (6-base64bits
) );
1849 if (_PyString_Resize(&v
, out
- start
))
1857 #undef DECODE_DIRECT
1858 #undef ENCODE_DIRECT
1860 /* --- UTF-8 Codec -------------------------------------------------------- */
1863 char utf8_code_length
[256] = {
1864 /* Map UTF-8 encoded prefix byte to sequence length. Zero means
1865 illegal prefix. See RFC 3629 for details */
1866 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
1867 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1868 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1869 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1870 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1871 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1872 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1873 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
1874 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
1875 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1876 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
1877 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
1878 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
1879 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
1880 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
1881 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 /* F0-F4 + F5-FF */
1884 PyObject
*PyUnicode_DecodeUTF8(const char *s
,
1888 return PyUnicode_DecodeUTF8Stateful(s
, size
, errors
, NULL
);
1891 PyObject
*PyUnicode_DecodeUTF8Stateful(const char *s
,
1894 Py_ssize_t
*consumed
)
1896 const char *starts
= s
;
1899 Py_ssize_t startinpos
;
1900 Py_ssize_t endinpos
;
1903 PyUnicodeObject
*unicode
;
1905 const char *errmsg
= "";
1906 PyObject
*errorHandler
= NULL
;
1907 PyObject
*exc
= NULL
;
1909 /* Note: size will always be longer than the resulting Unicode
1911 unicode
= _PyUnicode_New(size
);
1917 return (PyObject
*)unicode
;
1920 /* Unpack UTF-8 encoded data */
1925 Py_UCS4 ch
= (unsigned char)*s
;
1928 *p
++ = (Py_UNICODE
)ch
;
1933 n
= utf8_code_length
[ch
];
1939 errmsg
= "unexpected end of data";
1940 startinpos
= s
-starts
;
1941 endinpos
= startinpos
+1;
1942 for (k
=1; (k
< size
-startinpos
) && ((s
[k
]&0xC0) == 0x80); k
++)
1951 errmsg
= "invalid start byte";
1952 startinpos
= s
-starts
;
1953 endinpos
= startinpos
+1;
1957 errmsg
= "internal error";
1958 startinpos
= s
-starts
;
1959 endinpos
= startinpos
+1;
1963 if ((s
[1] & 0xc0) != 0x80) {
1964 errmsg
= "invalid continuation byte";
1965 startinpos
= s
-starts
;
1966 endinpos
= startinpos
+ 1;
1969 ch
= ((s
[0] & 0x1f) << 6) + (s
[1] & 0x3f);
1970 assert ((ch
> 0x007F) && (ch
<= 0x07FF));
1971 *p
++ = (Py_UNICODE
)ch
;
1975 /* XXX: surrogates shouldn't be valid UTF-8!
1976 see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
1977 (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
1978 Uncomment the 2 lines below to make them invalid,
1979 codepoints: d800-dfff; UTF-8: \xed\xa0\x80-\xed\xbf\xbf. */
1980 if ((s
[1] & 0xc0) != 0x80 ||
1981 (s
[2] & 0xc0) != 0x80 ||
1982 ((unsigned char)s
[0] == 0xE0 &&
1983 (unsigned char)s
[1] < 0xA0)/* ||
1984 ((unsigned char)s[0] == 0xED &&
1985 (unsigned char)s[1] > 0x9F)*/) {
1986 errmsg
= "invalid continuation byte";
1987 startinpos
= s
-starts
;
1988 endinpos
= startinpos
+ 1;
1990 /* if s[1] first two bits are 1 and 0, then the invalid
1991 continuation byte is s[2], so increment endinpos by 1,
1992 if not, s[1] is invalid and endinpos doesn't need to
1994 if ((s
[1] & 0xC0) == 0x80)
1998 ch
= ((s
[0] & 0x0f) << 12) + ((s
[1] & 0x3f) << 6) + (s
[2] & 0x3f);
1999 assert ((ch
> 0x07FF) && (ch
<= 0xFFFF));
2000 *p
++ = (Py_UNICODE
)ch
;
2004 if ((s
[1] & 0xc0) != 0x80 ||
2005 (s
[2] & 0xc0) != 0x80 ||
2006 (s
[3] & 0xc0) != 0x80 ||
2007 ((unsigned char)s
[0] == 0xF0 &&
2008 (unsigned char)s
[1] < 0x90) ||
2009 ((unsigned char)s
[0] == 0xF4 &&
2010 (unsigned char)s
[1] > 0x8F)) {
2011 errmsg
= "invalid continuation byte";
2012 startinpos
= s
-starts
;
2013 endinpos
= startinpos
+ 1;
2014 if ((s
[1] & 0xC0) == 0x80) {
2016 if ((s
[2] & 0xC0) == 0x80)
2021 ch
= ((s
[0] & 0x7) << 18) + ((s
[1] & 0x3f) << 12) +
2022 ((s
[2] & 0x3f) << 6) + (s
[3] & 0x3f);
2023 assert ((ch
> 0xFFFF) && (ch
<= 0x10ffff));
2025 #ifdef Py_UNICODE_WIDE
2026 *p
++ = (Py_UNICODE
)ch
;
2028 /* compute and append the two surrogates: */
2030 /* translate from 10000..10FFFF to 0..FFFF */
2033 /* high surrogate = top 10 bits added to D800 */
2034 *p
++ = (Py_UNICODE
)(0xD800 + (ch
>> 10));
2036 /* low surrogate = bottom 10 bits added to DC00 */
2037 *p
++ = (Py_UNICODE
)(0xDC00 + (ch
& 0x03FF));
2045 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
2046 if (unicode_decode_call_errorhandler(
2047 errors
, &errorHandler
,
2049 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2050 &unicode
, &outpos
, &p
))
2054 *consumed
= s
-starts
;
2057 if (_PyUnicode_Resize(&unicode
, p
- unicode
->str
) < 0)
2060 Py_XDECREF(errorHandler
);
2062 return (PyObject
*)unicode
;
2065 Py_XDECREF(errorHandler
);
2071 /* Allocation strategy: if the string is short, convert into a stack buffer
2072 and allocate exactly as much space needed at the end. Else allocate the
2073 maximum possible needed (4 result bytes per Unicode character), and return
2074 the excess memory at the end.
2077 PyUnicode_EncodeUTF8(const Py_UNICODE
*s
,
2081 #define MAX_SHORT_UNICHARS 300 /* largest size we'll do on the stack */
2083 Py_ssize_t i
; /* index into s of next input byte */
2084 PyObject
*v
; /* result string object */
2085 char *p
; /* next free byte in output buffer */
2086 Py_ssize_t nallocated
; /* number of result bytes allocated */
2087 Py_ssize_t nneeded
; /* number of result bytes needed */
2088 char stackbuf
[MAX_SHORT_UNICHARS
* 4];
2093 if (size
<= MAX_SHORT_UNICHARS
) {
2094 /* Write into the stack buffer; nallocated can't overflow.
2095 * At the end, we'll allocate exactly as much heap space as it
2096 * turns out we need.
2098 nallocated
= Py_SAFE_DOWNCAST(sizeof(stackbuf
), size_t, int);
2099 v
= NULL
; /* will allocate after we're done */
2103 /* Overallocate on the heap, and give the excess back at the end. */
2104 nallocated
= size
* 4;
2105 if (nallocated
/ 4 != size
) /* overflow! */
2106 return PyErr_NoMemory();
2107 v
= PyString_FromStringAndSize(NULL
, nallocated
);
2110 p
= PyString_AS_STRING(v
);
2113 for (i
= 0; i
< size
;) {
2114 Py_UCS4 ch
= s
[i
++];
2120 else if (ch
< 0x0800) {
2121 /* Encode as a two-byte sequence (code points below 0x0800) */
2122 *p
++ = (char)(0xc0 | (ch
>> 6));
2123 *p
++ = (char)(0x80 | (ch
& 0x3f));
2126 /* Encode UCS2 Unicode ordinals */
2128 /* Special case: check for high surrogate */
2129 if (0xD800 <= ch
&& ch
<= 0xDBFF && i
!= size
) {
2131 /* Check for low surrogate and combine the two to
2132 form a UCS4 value */
2133 if (0xDC00 <= ch2
&& ch2
<= 0xDFFF) {
2134 ch
= ((ch
- 0xD800) << 10 | (ch2
- 0xDC00)) + 0x10000;
2138 /* Fall through: handles isolated high surrogates */
2140 *p
++ = (char)(0xe0 | (ch
>> 12));
2141 *p
++ = (char)(0x80 | ((ch
>> 6) & 0x3f));
2142 *p
++ = (char)(0x80 | (ch
& 0x3f));
2146 /* Encode UCS4 Unicode ordinals */
2147 *p
++ = (char)(0xf0 | (ch
>> 18));
2148 *p
++ = (char)(0x80 | ((ch
>> 12) & 0x3f));
2149 *p
++ = (char)(0x80 | ((ch
>> 6) & 0x3f));
2150 *p
++ = (char)(0x80 | (ch
& 0x3f));
2155 /* This was stack allocated. */
2156 nneeded
= p
- stackbuf
;
2157 assert(nneeded
<= nallocated
);
2158 v
= PyString_FromStringAndSize(stackbuf
, nneeded
);
2161 /* Cut back to size actually needed. */
2162 nneeded
= p
- PyString_AS_STRING(v
);
2163 assert(nneeded
<= nallocated
);
2164 if (_PyString_Resize(&v
, nneeded
))
2169 #undef MAX_SHORT_UNICHARS
2172 PyObject
*PyUnicode_AsUTF8String(PyObject
*unicode
)
2174 if (!PyUnicode_Check(unicode
)) {
2175 PyErr_BadArgument();
2178 return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode
),
2179 PyUnicode_GET_SIZE(unicode
),
2183 /* --- UTF-32 Codec ------------------------------------------------------- */
2186 PyUnicode_DecodeUTF32(const char *s
,
2191 return PyUnicode_DecodeUTF32Stateful(s
, size
, errors
, byteorder
, NULL
);
2195 PyUnicode_DecodeUTF32Stateful(const char *s
,
2199 Py_ssize_t
*consumed
)
2201 const char *starts
= s
;
2202 Py_ssize_t startinpos
;
2203 Py_ssize_t endinpos
;
2205 PyUnicodeObject
*unicode
;
2207 #ifndef Py_UNICODE_WIDE
2209 const unsigned char *qq
;
2211 const int pairs
= 0;
2213 const unsigned char *q
, *e
;
2214 int bo
= 0; /* assume native ordering by default */
2215 const char *errmsg
= "";
2216 /* Offsets from q for retrieving bytes in the right order. */
2217 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2218 int iorder
[] = {0, 1, 2, 3};
2220 int iorder
[] = {3, 2, 1, 0};
2222 PyObject
*errorHandler
= NULL
;
2223 PyObject
*exc
= NULL
;
2225 q
= (unsigned char *)s
;
2231 /* Check for BOM marks (U+FEFF) in the input and adjust current
2232 byte order setting accordingly. In native mode, the leading BOM
2233 mark is skipped, in all other modes, it is copied to the output
2234 stream as-is (giving a ZWNBSP character). */
2237 const Py_UCS4 bom
= (q
[iorder
[3]] << 24) | (q
[iorder
[2]] << 16) |
2238 (q
[iorder
[1]] << 8) | q
[iorder
[0]];
2239 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2240 if (bom
== 0x0000FEFF) {
2244 else if (bom
== 0xFFFE0000) {
2249 if (bom
== 0x0000FEFF) {
2253 else if (bom
== 0xFFFE0000) {
2276 /* On narrow builds we split characters outside the BMP into two
2277 codepoints => count how much extra space we need. */
2278 #ifndef Py_UNICODE_WIDE
2279 for (qq
= q
; qq
< e
; qq
+= 4)
2280 if (qq
[iorder
[2]] != 0 || qq
[iorder
[3]] != 0)
2284 /* This might be one too many, because of a BOM */
2285 unicode
= _PyUnicode_New((size
+3)/4+pairs
);
2289 return (PyObject
*)unicode
;
2291 /* Unpack UTF-32 encoded data */
2296 /* remaining bytes at the end? (size should be divisible by 4) */
2300 errmsg
= "truncated data";
2301 startinpos
= ((const char *)q
)-starts
;
2302 endinpos
= ((const char *)e
)-starts
;
2304 /* The remaining input chars are ignored if the callback
2305 chooses to skip the input */
2307 ch
= (q
[iorder
[3]] << 24) | (q
[iorder
[2]] << 16) |
2308 (q
[iorder
[1]] << 8) | q
[iorder
[0]];
2312 errmsg
= "codepoint not in range(0x110000)";
2313 startinpos
= ((const char *)q
)-starts
;
2314 endinpos
= startinpos
+4;
2317 #ifndef Py_UNICODE_WIDE
2320 *p
++ = 0xD800 | ((ch
-0x10000) >> 10);
2321 *p
++ = 0xDC00 | ((ch
-0x10000) & 0x3FF);
2329 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
2330 if (unicode_decode_call_errorhandler(
2331 errors
, &errorHandler
,
2333 starts
, size
, &startinpos
, &endinpos
, &exc
, (const char **)&q
,
2334 &unicode
, &outpos
, &p
))
2342 *consumed
= (const char *)q
-starts
;
2345 if (_PyUnicode_Resize(&unicode
, p
- unicode
->str
) < 0)
2348 Py_XDECREF(errorHandler
);
2350 return (PyObject
*)unicode
;
2354 Py_XDECREF(errorHandler
);
2360 PyUnicode_EncodeUTF32(const Py_UNICODE
*s
,
2367 Py_ssize_t nsize
, bytesize
;
2368 #ifndef Py_UNICODE_WIDE
2369 Py_ssize_t i
, pairs
;
2371 const int pairs
= 0;
2373 /* Offsets from p for storing byte pairs in the right order. */
2374 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2375 int iorder
[] = {0, 1, 2, 3};
2377 int iorder
[] = {3, 2, 1, 0};
2380 #define STORECHAR(CH) \
2382 p[iorder[3]] = ((CH) >> 24) & 0xff; \
2383 p[iorder[2]] = ((CH) >> 16) & 0xff; \
2384 p[iorder[1]] = ((CH) >> 8) & 0xff; \
2385 p[iorder[0]] = (CH) & 0xff; \
2389 /* In narrow builds we can output surrogate pairs as one codepoint,
2390 so we need less space. */
2391 #ifndef Py_UNICODE_WIDE
2392 for (i
= pairs
= 0; i
< size
-1; i
++)
2393 if (0xD800 <= s
[i
] && s
[i
] <= 0xDBFF &&
2394 0xDC00 <= s
[i
+1] && s
[i
+1] <= 0xDFFF)
2397 nsize
= (size
- pairs
+ (byteorder
== 0));
2398 bytesize
= nsize
* 4;
2399 if (bytesize
/ 4 != nsize
)
2400 return PyErr_NoMemory();
2401 v
= PyString_FromStringAndSize(NULL
, bytesize
);
2405 p
= (unsigned char *)PyString_AS_STRING(v
);
2411 if (byteorder
== -1) {
2418 else if (byteorder
== 1) {
2426 while (size
-- > 0) {
2428 #ifndef Py_UNICODE_WIDE
2429 if (0xD800 <= ch
&& ch
<= 0xDBFF && size
> 0) {
2431 if (0xDC00 <= ch2
&& ch2
<= 0xDFFF) {
2432 ch
= (((ch
& 0x3FF)<<10) | (ch2
& 0x3FF)) + 0x10000;
2444 PyObject
*PyUnicode_AsUTF32String(PyObject
*unicode
)
2446 if (!PyUnicode_Check(unicode
)) {
2447 PyErr_BadArgument();
2450 return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode
),
2451 PyUnicode_GET_SIZE(unicode
),
2456 /* --- UTF-16 Codec ------------------------------------------------------- */
2459 PyUnicode_DecodeUTF16(const char *s
,
2464 return PyUnicode_DecodeUTF16Stateful(s
, size
, errors
, byteorder
, NULL
);
2468 PyUnicode_DecodeUTF16Stateful(const char *s
,
2472 Py_ssize_t
*consumed
)
2474 const char *starts
= s
;
2475 Py_ssize_t startinpos
;
2476 Py_ssize_t endinpos
;
2478 PyUnicodeObject
*unicode
;
2480 const unsigned char *q
, *e
;
2481 int bo
= 0; /* assume native ordering by default */
2482 const char *errmsg
= "";
2483 /* Offsets from q for retrieving byte pairs in the right order. */
2484 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2485 int ihi
= 1, ilo
= 0;
2487 int ihi
= 0, ilo
= 1;
2489 PyObject
*errorHandler
= NULL
;
2490 PyObject
*exc
= NULL
;
2492 /* Note: size will always be longer than the resulting Unicode
2494 unicode
= _PyUnicode_New(size
);
2498 return (PyObject
*)unicode
;
2500 /* Unpack UTF-16 encoded data */
2502 q
= (unsigned char *)s
;
2508 /* Check for BOM marks (U+FEFF) in the input and adjust current
2509 byte order setting accordingly. In native mode, the leading BOM
2510 mark is skipped, in all other modes, it is copied to the output
2511 stream as-is (giving a ZWNBSP character). */
2514 const Py_UNICODE bom
= (q
[ihi
] << 8) | q
[ilo
];
2515 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2516 if (bom
== 0xFEFF) {
2520 else if (bom
== 0xFFFE) {
2525 if (bom
== 0xFEFF) {
2529 else if (bom
== 0xFFFE) {
2550 /* remaining bytes at the end? (size should be even) */
2554 errmsg
= "truncated data";
2555 startinpos
= ((const char *)q
)-starts
;
2556 endinpos
= ((const char *)e
)-starts
;
2558 /* The remaining input chars are ignored if the callback
2559 chooses to skip the input */
2561 ch
= (q
[ihi
] << 8) | q
[ilo
];
2565 if (ch
< 0xD800 || ch
> 0xDFFF) {
2570 /* UTF-16 code pair: */
2572 errmsg
= "unexpected end of data";
2573 startinpos
= (((const char *)q
)-2)-starts
;
2574 endinpos
= ((const char *)e
)-starts
;
2577 if (0xD800 <= ch
&& ch
<= 0xDBFF) {
2578 Py_UNICODE ch2
= (q
[ihi
] << 8) | q
[ilo
];
2580 if (0xDC00 <= ch2
&& ch2
<= 0xDFFF) {
2581 #ifndef Py_UNICODE_WIDE
2585 *p
++ = (((ch
& 0x3FF)<<10) | (ch2
& 0x3FF)) + 0x10000;
2590 errmsg
= "illegal UTF-16 surrogate";
2591 startinpos
= (((const char *)q
)-4)-starts
;
2592 endinpos
= startinpos
+2;
2597 errmsg
= "illegal encoding";
2598 startinpos
= (((const char *)q
)-2)-starts
;
2599 endinpos
= startinpos
+2;
2600 /* Fall through to report the error */
2603 outpos
= p
-PyUnicode_AS_UNICODE(unicode
);
2604 if (unicode_decode_call_errorhandler(
2605 errors
, &errorHandler
,
2607 starts
, size
, &startinpos
, &endinpos
, &exc
, (const char **)&q
,
2608 &unicode
, &outpos
, &p
))
2616 *consumed
= (const char *)q
-starts
;
2619 if (_PyUnicode_Resize(&unicode
, p
- unicode
->str
) < 0)
2622 Py_XDECREF(errorHandler
);
2624 return (PyObject
*)unicode
;
2628 Py_XDECREF(errorHandler
);
2634 PyUnicode_EncodeUTF16(const Py_UNICODE
*s
,
2641 Py_ssize_t nsize
, bytesize
;
2642 #ifdef Py_UNICODE_WIDE
2643 Py_ssize_t i
, pairs
;
2645 const int pairs
= 0;
2647 /* Offsets from p for storing byte pairs in the right order. */
2648 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2649 int ihi
= 1, ilo
= 0;
2651 int ihi
= 0, ilo
= 1;
2654 #define STORECHAR(CH) \
2656 p[ihi] = ((CH) >> 8) & 0xff; \
2657 p[ilo] = (CH) & 0xff; \
2661 #ifdef Py_UNICODE_WIDE
2662 for (i
= pairs
= 0; i
< size
; i
++)
2663 if (s
[i
] >= 0x10000)
2666 /* 2 * (size + pairs + (byteorder == 0)) */
2667 if (size
> PY_SSIZE_T_MAX
||
2668 size
> PY_SSIZE_T_MAX
- pairs
- (byteorder
== 0))
2669 return PyErr_NoMemory();
2670 nsize
= size
+ pairs
+ (byteorder
== 0);
2671 bytesize
= nsize
* 2;
2672 if (bytesize
/ 2 != nsize
)
2673 return PyErr_NoMemory();
2674 v
= PyString_FromStringAndSize(NULL
, bytesize
);
2678 p
= (unsigned char *)PyString_AS_STRING(v
);
2684 if (byteorder
== -1) {
2689 else if (byteorder
== 1) {
2695 while (size
-- > 0) {
2696 Py_UNICODE ch
= *s
++;
2698 #ifdef Py_UNICODE_WIDE
2699 if (ch
>= 0x10000) {
2700 ch2
= 0xDC00 | ((ch
-0x10000) & 0x3FF);
2701 ch
= 0xD800 | ((ch
-0x10000) >> 10);
2712 PyObject
*PyUnicode_AsUTF16String(PyObject
*unicode
)
2714 if (!PyUnicode_Check(unicode
)) {
2715 PyErr_BadArgument();
2718 return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode
),
2719 PyUnicode_GET_SIZE(unicode
),
2724 /* --- Unicode Escape Codec ----------------------------------------------- */
2726 static _PyUnicode_Name_CAPI
*ucnhash_CAPI
= NULL
;
2728 PyObject
*PyUnicode_DecodeUnicodeEscape(const char *s
,
2732 const char *starts
= s
;
2733 Py_ssize_t startinpos
;
2734 Py_ssize_t endinpos
;
2741 Py_UCS4 chr
= 0xffffffff; /* in case 'getcode' messes up */
2742 PyObject
*errorHandler
= NULL
;
2743 PyObject
*exc
= NULL
;
2745 /* Escaped strings will always be longer than the resulting
2746 Unicode string, so we start with size here and then reduce the
2747 length after conversion to the true value.
2748 (but if the error callback returns a long replacement string
2749 we'll have to allocate more space) */
2750 v
= _PyUnicode_New(size
);
2754 return (PyObject
*)v
;
2756 p
= PyUnicode_AS_UNICODE(v
);
2764 /* Non-escape characters are interpreted as Unicode ordinals */
2766 *p
++ = (unsigned char) *s
++;
2770 startinpos
= s
-starts
;
2775 c
= '\0'; /* Invalid after \ */
2780 case '\\': *p
++ = '\\'; break;
2781 case '\'': *p
++ = '\''; break;
2782 case '\"': *p
++ = '\"'; break;
2783 case 'b': *p
++ = '\b'; break;
2784 case 'f': *p
++ = '\014'; break; /* FF */
2785 case 't': *p
++ = '\t'; break;
2786 case 'n': *p
++ = '\n'; break;
2787 case 'r': *p
++ = '\r'; break;
2788 case 'v': *p
++ = '\013'; break; /* VT */
2789 case 'a': *p
++ = '\007'; break; /* BEL, not classic C */
2791 /* \OOO (octal) escapes */
2792 case '0': case '1': case '2': case '3':
2793 case '4': case '5': case '6': case '7':
2795 if (s
< end
&& '0' <= *s
&& *s
<= '7') {
2796 x
= (x
<<3) + *s
++ - '0';
2797 if (s
< end
&& '0' <= *s
&& *s
<= '7')
2798 x
= (x
<<3) + *s
++ - '0';
2807 message
= "truncated \\xXX escape";
2813 message
= "truncated \\uXXXX escape";
2819 message
= "truncated \\UXXXXXXXX escape";
2822 outpos
= p
-PyUnicode_AS_UNICODE(v
);
2825 if (unicode_decode_call_errorhandler(
2826 errors
, &errorHandler
,
2827 "unicodeescape", "end of string in escape sequence",
2828 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2833 for (i
= 0; i
< digits
; ++i
) {
2834 c
= (unsigned char) s
[i
];
2836 endinpos
= (s
+i
+1)-starts
;
2837 if (unicode_decode_call_errorhandler(
2838 errors
, &errorHandler
,
2839 "unicodeescape", message
,
2840 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2845 chr
= (chr
<<4) & ~0xF;
2846 if (c
>= '0' && c
<= '9')
2848 else if (c
>= 'a' && c
<= 'f')
2849 chr
+= 10 + c
- 'a';
2851 chr
+= 10 + c
- 'A';
2854 if (chr
== 0xffffffff && PyErr_Occurred())
2855 /* _decoding_error will have already written into the
2859 /* when we get here, chr is a 32-bit unicode character */
2861 /* UCS-2 character */
2862 *p
++ = (Py_UNICODE
) chr
;
2863 else if (chr
<= 0x10ffff) {
2864 /* UCS-4 character. Either store directly, or as
2866 #ifdef Py_UNICODE_WIDE
2870 *p
++ = 0xD800 + (Py_UNICODE
) (chr
>> 10);
2871 *p
++ = 0xDC00 + (Py_UNICODE
) (chr
& 0x03FF);
2874 endinpos
= s
-starts
;
2875 outpos
= p
-PyUnicode_AS_UNICODE(v
);
2876 if (unicode_decode_call_errorhandler(
2877 errors
, &errorHandler
,
2878 "unicodeescape", "illegal Unicode character",
2879 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2887 message
= "malformed \\N character escape";
2888 if (ucnhash_CAPI
== NULL
) {
2889 /* load the unicode data module */
2890 ucnhash_CAPI
= (_PyUnicode_Name_CAPI
*)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME
, 1);
2891 if (ucnhash_CAPI
== NULL
)
2895 const char *start
= s
+1;
2896 /* look for the closing brace */
2897 while (*s
!= '}' && s
< end
)
2899 if (s
> start
&& s
< end
&& *s
== '}') {
2900 /* found a name. look it up in the unicode database */
2901 message
= "unknown Unicode character name";
2903 if (ucnhash_CAPI
->getcode(NULL
, start
, (int)(s
-start
-1), &chr
))
2907 endinpos
= s
-starts
;
2908 outpos
= p
-PyUnicode_AS_UNICODE(v
);
2909 if (unicode_decode_call_errorhandler(
2910 errors
, &errorHandler
,
2911 "unicodeescape", message
,
2912 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2919 message
= "\\ at end of string";
2921 endinpos
= s
-starts
;
2922 outpos
= p
-PyUnicode_AS_UNICODE(v
);
2923 if (unicode_decode_call_errorhandler(
2924 errors
, &errorHandler
,
2925 "unicodeescape", message
,
2926 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
2932 *p
++ = (unsigned char)s
[-1];
2939 if (_PyUnicode_Resize(&v
, p
- PyUnicode_AS_UNICODE(v
)) < 0)
2941 Py_XDECREF(errorHandler
);
2943 return (PyObject
*)v
;
2948 "\\N escapes not supported (can't load unicodedata module)"
2951 Py_XDECREF(errorHandler
);
2957 Py_XDECREF(errorHandler
);
2962 /* Return a Unicode-Escape string version of the Unicode object.
2964 If quotes is true, the string is enclosed in u"" or u'' quotes as
2969 Py_LOCAL_INLINE(const Py_UNICODE
*) findchar(const Py_UNICODE
*s
,
2973 /* like wcschr, but doesn't stop at NULL characters */
2975 while (size
-- > 0) {
2985 PyObject
*unicodeescape_string(const Py_UNICODE
*s
,
2992 static const char *hexdigit
= "0123456789abcdef";
2993 #ifdef Py_UNICODE_WIDE
2994 const Py_ssize_t expandsize
= 10;
2996 const Py_ssize_t expandsize
= 6;
2999 /* XXX(nnorwitz): rather than over-allocating, it would be
3000 better to choose a different scheme. Perhaps scan the
3001 first N-chars of the string and allocate based on that size.
3003 /* Initial allocation is based on the longest-possible unichr
3006 In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3007 unichr, so in this case it's the longest unichr escape. In
3008 narrow (UTF-16) builds this is five chars per source unichr
3009 since there are two unichrs in the surrogate pair, so in narrow
3010 (UTF-16) builds it's not the longest unichr escape.
3012 In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3013 so in the narrow (UTF-16) build case it's the longest unichr
3017 if (size
> (PY_SSIZE_T_MAX
- 2 - 1) / expandsize
)
3018 return PyErr_NoMemory();
3020 repr
= PyString_FromStringAndSize(NULL
,
3027 p
= PyString_AS_STRING(repr
);
3031 *p
++ = (findchar(s
, size
, '\'') &&
3032 !findchar(s
, size
, '"')) ? '"' : '\'';
3034 while (size
-- > 0) {
3035 Py_UNICODE ch
= *s
++;
3037 /* Escape quotes and backslashes */
3039 ch
== (Py_UNICODE
) PyString_AS_STRING(repr
)[1]) || ch
== '\\') {
3045 #ifdef Py_UNICODE_WIDE
3046 /* Map 21-bit characters to '\U00xxxxxx' */
3047 else if (ch
>= 0x10000) {
3050 *p
++ = hexdigit
[(ch
>> 28) & 0x0000000F];
3051 *p
++ = hexdigit
[(ch
>> 24) & 0x0000000F];
3052 *p
++ = hexdigit
[(ch
>> 20) & 0x0000000F];
3053 *p
++ = hexdigit
[(ch
>> 16) & 0x0000000F];
3054 *p
++ = hexdigit
[(ch
>> 12) & 0x0000000F];
3055 *p
++ = hexdigit
[(ch
>> 8) & 0x0000000F];
3056 *p
++ = hexdigit
[(ch
>> 4) & 0x0000000F];
3057 *p
++ = hexdigit
[ch
& 0x0000000F];
3061 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3062 else if (ch
>= 0xD800 && ch
< 0xDC00) {
3068 if (ch2
>= 0xDC00 && ch2
<= 0xDFFF) {
3069 ucs
= (((ch
& 0x03FF) << 10) | (ch2
& 0x03FF)) + 0x00010000;
3072 *p
++ = hexdigit
[(ucs
>> 28) & 0x0000000F];
3073 *p
++ = hexdigit
[(ucs
>> 24) & 0x0000000F];
3074 *p
++ = hexdigit
[(ucs
>> 20) & 0x0000000F];
3075 *p
++ = hexdigit
[(ucs
>> 16) & 0x0000000F];
3076 *p
++ = hexdigit
[(ucs
>> 12) & 0x0000000F];
3077 *p
++ = hexdigit
[(ucs
>> 8) & 0x0000000F];
3078 *p
++ = hexdigit
[(ucs
>> 4) & 0x0000000F];
3079 *p
++ = hexdigit
[ucs
& 0x0000000F];
3082 /* Fall through: isolated surrogates are copied as-is */
3088 /* Map 16-bit characters to '\uxxxx' */
3092 *p
++ = hexdigit
[(ch
>> 12) & 0x000F];
3093 *p
++ = hexdigit
[(ch
>> 8) & 0x000F];
3094 *p
++ = hexdigit
[(ch
>> 4) & 0x000F];
3095 *p
++ = hexdigit
[ch
& 0x000F];
3098 /* Map special whitespace to '\t', '\n', '\r' */
3099 else if (ch
== '\t') {
3103 else if (ch
== '\n') {
3107 else if (ch
== '\r') {
3112 /* Map non-printable US ASCII to '\xhh' */
3113 else if (ch
< ' ' || ch
>= 0x7F) {
3116 *p
++ = hexdigit
[(ch
>> 4) & 0x000F];
3117 *p
++ = hexdigit
[ch
& 0x000F];
3120 /* Copy everything else as-is */
3125 *p
++ = PyString_AS_STRING(repr
)[1];
3128 if (_PyString_Resize(&repr
, p
- PyString_AS_STRING(repr
)))
3133 PyObject
*PyUnicode_EncodeUnicodeEscape(const Py_UNICODE
*s
,
3136 return unicodeescape_string(s
, size
, 0);
3139 PyObject
*PyUnicode_AsUnicodeEscapeString(PyObject
*unicode
)
3141 if (!PyUnicode_Check(unicode
)) {
3142 PyErr_BadArgument();
3145 return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode
),
3146 PyUnicode_GET_SIZE(unicode
));
3149 /* --- Raw Unicode Escape Codec ------------------------------------------- */
3151 PyObject
*PyUnicode_DecodeRawUnicodeEscape(const char *s
,
3155 const char *starts
= s
;
3156 Py_ssize_t startinpos
;
3157 Py_ssize_t endinpos
;
3163 PyObject
*errorHandler
= NULL
;
3164 PyObject
*exc
= NULL
;
3166 /* Escaped strings will always be longer than the resulting
3167 Unicode string, so we start with size here and then reduce the
3168 length after conversion to the true value. (But decoding error
3169 handler might have to resize the string) */
3170 v
= _PyUnicode_New(size
);
3174 return (PyObject
*)v
;
3175 p
= PyUnicode_AS_UNICODE(v
);
3183 /* Non-escape characters are interpreted as Unicode ordinals */
3185 *p
++ = (unsigned char)*s
++;
3188 startinpos
= s
-starts
;
3190 /* \u-escapes are interpreted only if the number of leading
3191 backslashes is odd */
3196 *p
++ = (unsigned char)*s
++;
3198 if (((s
- bs
) & 1) == 0 ||
3200 (*s
!= 'u' && *s
!= 'U')) {
3204 count
= *s
=='u' ? 4 : 8;
3207 /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3208 outpos
= p
-PyUnicode_AS_UNICODE(v
);
3209 for (x
= 0, i
= 0; i
< count
; ++i
, ++s
) {
3210 c
= (unsigned char)*s
;
3212 endinpos
= s
-starts
;
3213 if (unicode_decode_call_errorhandler(
3214 errors
, &errorHandler
,
3215 "rawunicodeescape", "truncated \\uXXXX",
3216 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
3222 if (c
>= '0' && c
<= '9')
3224 else if (c
>= 'a' && c
<= 'f')
3230 /* UCS-2 character */
3231 *p
++ = (Py_UNICODE
) x
;
3232 else if (x
<= 0x10ffff) {
3233 /* UCS-4 character. Either store directly, or as
3235 #ifdef Py_UNICODE_WIDE
3236 *p
++ = (Py_UNICODE
) x
;
3239 *p
++ = 0xD800 + (Py_UNICODE
) (x
>> 10);
3240 *p
++ = 0xDC00 + (Py_UNICODE
) (x
& 0x03FF);
3243 endinpos
= s
-starts
;
3244 outpos
= p
-PyUnicode_AS_UNICODE(v
);
3245 if (unicode_decode_call_errorhandler(
3246 errors
, &errorHandler
,
3247 "rawunicodeescape", "\\Uxxxxxxxx out of range",
3248 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
3255 if (_PyUnicode_Resize(&v
, p
- PyUnicode_AS_UNICODE(v
)) < 0)
3257 Py_XDECREF(errorHandler
);
3259 return (PyObject
*)v
;
3263 Py_XDECREF(errorHandler
);
3268 PyObject
*PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE
*s
,
3275 static const char *hexdigit
= "0123456789abcdef";
3276 #ifdef Py_UNICODE_WIDE
3277 const Py_ssize_t expandsize
= 10;
3279 const Py_ssize_t expandsize
= 6;
3282 if (size
> PY_SSIZE_T_MAX
/ expandsize
)
3283 return PyErr_NoMemory();
3285 repr
= PyString_FromStringAndSize(NULL
, expandsize
* size
);
3291 p
= q
= PyString_AS_STRING(repr
);
3292 while (size
-- > 0) {
3293 Py_UNICODE ch
= *s
++;
3294 #ifdef Py_UNICODE_WIDE
3295 /* Map 32-bit characters to '\Uxxxxxxxx' */
3296 if (ch
>= 0x10000) {
3299 *p
++ = hexdigit
[(ch
>> 28) & 0xf];
3300 *p
++ = hexdigit
[(ch
>> 24) & 0xf];
3301 *p
++ = hexdigit
[(ch
>> 20) & 0xf];
3302 *p
++ = hexdigit
[(ch
>> 16) & 0xf];
3303 *p
++ = hexdigit
[(ch
>> 12) & 0xf];
3304 *p
++ = hexdigit
[(ch
>> 8) & 0xf];
3305 *p
++ = hexdigit
[(ch
>> 4) & 0xf];
3306 *p
++ = hexdigit
[ch
& 15];
3310 /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3311 if (ch
>= 0xD800 && ch
< 0xDC00) {
3317 if (ch2
>= 0xDC00 && ch2
<= 0xDFFF) {
3318 ucs
= (((ch
& 0x03FF) << 10) | (ch2
& 0x03FF)) + 0x00010000;
3321 *p
++ = hexdigit
[(ucs
>> 28) & 0xf];
3322 *p
++ = hexdigit
[(ucs
>> 24) & 0xf];
3323 *p
++ = hexdigit
[(ucs
>> 20) & 0xf];
3324 *p
++ = hexdigit
[(ucs
>> 16) & 0xf];
3325 *p
++ = hexdigit
[(ucs
>> 12) & 0xf];
3326 *p
++ = hexdigit
[(ucs
>> 8) & 0xf];
3327 *p
++ = hexdigit
[(ucs
>> 4) & 0xf];
3328 *p
++ = hexdigit
[ucs
& 0xf];
3331 /* Fall through: isolated surrogates are copied as-is */
3336 /* Map 16-bit characters to '\uxxxx' */
3340 *p
++ = hexdigit
[(ch
>> 12) & 0xf];
3341 *p
++ = hexdigit
[(ch
>> 8) & 0xf];
3342 *p
++ = hexdigit
[(ch
>> 4) & 0xf];
3343 *p
++ = hexdigit
[ch
& 15];
3345 /* Copy everything else as-is */
3350 if (_PyString_Resize(&repr
, p
- q
))
3355 PyObject
*PyUnicode_AsRawUnicodeEscapeString(PyObject
*unicode
)
3357 if (!PyUnicode_Check(unicode
)) {
3358 PyErr_BadArgument();
3361 return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode
),
3362 PyUnicode_GET_SIZE(unicode
));
3365 /* --- Unicode Internal Codec ------------------------------------------- */
3367 PyObject
*_PyUnicode_DecodeUnicodeInternal(const char *s
,
3371 const char *starts
= s
;
3372 Py_ssize_t startinpos
;
3373 Py_ssize_t endinpos
;
3379 PyObject
*errorHandler
= NULL
;
3380 PyObject
*exc
= NULL
;
3382 #ifdef Py_UNICODE_WIDE
3383 Py_UNICODE unimax
= PyUnicode_GetMax();
3386 /* XXX overflow detection missing */
3387 v
= _PyUnicode_New((size
+Py_UNICODE_SIZE
-1)/ Py_UNICODE_SIZE
);
3390 if (PyUnicode_GetSize((PyObject
*)v
) == 0)
3391 return (PyObject
*)v
;
3392 p
= PyUnicode_AS_UNICODE(v
);
3396 memcpy(p
, s
, sizeof(Py_UNICODE
));
3397 /* We have to sanity check the raw data, otherwise doom looms for
3398 some malformed UCS-4 data. */
3400 #ifdef Py_UNICODE_WIDE
3401 *p
> unimax
|| *p
< 0 ||
3403 end
-s
< Py_UNICODE_SIZE
3406 startinpos
= s
- starts
;
3407 if (end
-s
< Py_UNICODE_SIZE
) {
3408 endinpos
= end
-starts
;
3409 reason
= "truncated input";
3412 endinpos
= s
- starts
+ Py_UNICODE_SIZE
;
3413 reason
= "illegal code point (> 0x10FFFF)";
3415 outpos
= p
- PyUnicode_AS_UNICODE(v
);
3416 if (unicode_decode_call_errorhandler(
3417 errors
, &errorHandler
,
3418 "unicode_internal", reason
,
3419 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
3426 s
+= Py_UNICODE_SIZE
;
3430 if (_PyUnicode_Resize(&v
, p
- PyUnicode_AS_UNICODE(v
)) < 0)
3432 Py_XDECREF(errorHandler
);
3434 return (PyObject
*)v
;
3438 Py_XDECREF(errorHandler
);
3443 /* --- Latin-1 Codec ------------------------------------------------------ */
3445 PyObject
*PyUnicode_DecodeLatin1(const char *s
,
3452 /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
3454 Py_UNICODE r
= *(unsigned char*)s
;
3455 return PyUnicode_FromUnicode(&r
, 1);
3458 v
= _PyUnicode_New(size
);
3462 return (PyObject
*)v
;
3463 p
= PyUnicode_AS_UNICODE(v
);
3465 *p
++ = (unsigned char)*s
++;
3466 return (PyObject
*)v
;
3473 /* create or adjust a UnicodeEncodeError */
3474 static void make_encode_exception(PyObject
**exceptionObject
,
3475 const char *encoding
,
3476 const Py_UNICODE
*unicode
, Py_ssize_t size
,
3477 Py_ssize_t startpos
, Py_ssize_t endpos
,
3480 if (*exceptionObject
== NULL
) {
3481 *exceptionObject
= PyUnicodeEncodeError_Create(
3482 encoding
, unicode
, size
, startpos
, endpos
, reason
);
3485 if (PyUnicodeEncodeError_SetStart(*exceptionObject
, startpos
))
3487 if (PyUnicodeEncodeError_SetEnd(*exceptionObject
, endpos
))
3489 if (PyUnicodeEncodeError_SetReason(*exceptionObject
, reason
))
3493 Py_DECREF(*exceptionObject
);
3494 *exceptionObject
= NULL
;
3498 /* raises a UnicodeEncodeError */
3499 static void raise_encode_exception(PyObject
**exceptionObject
,
3500 const char *encoding
,
3501 const Py_UNICODE
*unicode
, Py_ssize_t size
,
3502 Py_ssize_t startpos
, Py_ssize_t endpos
,
3505 make_encode_exception(exceptionObject
,
3506 encoding
, unicode
, size
, startpos
, endpos
, reason
);
3507 if (*exceptionObject
!= NULL
)
3508 PyCodec_StrictErrors(*exceptionObject
);
3511 /* error handling callback helper:
3512 build arguments, call the callback and check its result,
3513 put the new position into newpos and return the replacement string,
3514 which is a new reference that has to be released by the caller */
3515 static PyObject
*unicode_encode_call_errorhandler(const char *errors
,
3516 PyObject
**errorHandler
,
3517 const char *encoding
, const char *reason
,
3518 const Py_UNICODE
*unicode
, Py_ssize_t size
, PyObject
**exceptionObject
,
3519 Py_ssize_t startpos
, Py_ssize_t endpos
,
3522 static char *argparse
= "O!n;encoding error handler must return (unicode, int) tuple";
3525 PyObject
*resunicode
;
3527 if (*errorHandler
== NULL
) {
3528 *errorHandler
= PyCodec_LookupError(errors
);
3529 if (*errorHandler
== NULL
)
3533 make_encode_exception(exceptionObject
,
3534 encoding
, unicode
, size
, startpos
, endpos
, reason
);
3535 if (*exceptionObject
== NULL
)
3538 restuple
= PyObject_CallFunctionObjArgs(
3539 *errorHandler
, *exceptionObject
, NULL
);
3540 if (restuple
== NULL
)
3542 if (!PyTuple_Check(restuple
)) {
3543 PyErr_SetString(PyExc_TypeError
, &argparse
[4]);
3544 Py_DECREF(restuple
);
3547 if (!PyArg_ParseTuple(restuple
, argparse
, &PyUnicode_Type
,
3548 &resunicode
, newpos
)) {
3549 Py_DECREF(restuple
);
3553 *newpos
= size
+*newpos
;
3554 if (*newpos
<0 || *newpos
>size
) {
3555 PyErr_Format(PyExc_IndexError
, "position %zd from error handler out of bounds", *newpos
);
3556 Py_DECREF(restuple
);
3559 Py_INCREF(resunicode
);
3560 Py_DECREF(restuple
);
3564 static PyObject
*unicode_encode_ucs1(const Py_UNICODE
*p
,
3571 /* pointers to the beginning and end+1 of input */
3572 const Py_UNICODE
*startp
= p
;
3573 const Py_UNICODE
*endp
= p
+ size
;
3574 /* pointer to the beginning of the unencodable characters */
3575 /* const Py_UNICODE *badp = NULL; */
3576 /* pointer into the output */
3578 /* current output position */
3579 Py_ssize_t respos
= 0;
3581 const char *encoding
= (limit
== 256) ? "latin-1" : "ascii";
3582 const char *reason
= (limit
== 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
3583 PyObject
*errorHandler
= NULL
;
3584 PyObject
*exc
= NULL
;
3585 /* the following variable is used for caching string comparisons
3586 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3587 int known_errorHandler
= -1;
3589 /* allocate enough for a simple encoding without
3590 replacements, if we need more, we'll resize */
3591 res
= PyString_FromStringAndSize(NULL
, size
);
3596 str
= PyString_AS_STRING(res
);
3602 /* can we encode this? */
3604 /* no overflow check, because we know that the space is enough */
3609 Py_ssize_t unicodepos
= p
-startp
;
3610 Py_ssize_t requiredsize
;
3611 PyObject
*repunicode
;
3616 /* startpos for collecting unencodable chars */
3617 const Py_UNICODE
*collstart
= p
;
3618 const Py_UNICODE
*collend
= p
;
3619 /* find all unencodable characters */
3620 while ((collend
< endp
) && ((*collend
)>=limit
))
3622 /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3623 if (known_errorHandler
==-1) {
3624 if ((errors
==NULL
) || (!strcmp(errors
, "strict")))
3625 known_errorHandler
= 1;
3626 else if (!strcmp(errors
, "replace"))
3627 known_errorHandler
= 2;
3628 else if (!strcmp(errors
, "ignore"))
3629 known_errorHandler
= 3;
3630 else if (!strcmp(errors
, "xmlcharrefreplace"))
3631 known_errorHandler
= 4;
3633 known_errorHandler
= 0;
3635 switch (known_errorHandler
) {
3636 case 1: /* strict */
3637 raise_encode_exception(&exc
, encoding
, startp
, size
, collstart
-startp
, collend
-startp
, reason
);
3639 case 2: /* replace */
3640 while (collstart
++<collend
)
3641 *str
++ = '?'; /* fall through */
3642 case 3: /* ignore */
3645 case 4: /* xmlcharrefreplace */
3646 respos
= str
-PyString_AS_STRING(res
);
3647 /* determine replacement size (temporarily (mis)uses p) */
3648 for (p
= collstart
, repsize
= 0; p
< collend
; ++p
) {
3657 #ifndef Py_UNICODE_WIDE
3663 else if (*p
<1000000)
3669 requiredsize
= respos
+repsize
+(endp
-collend
);
3670 if (requiredsize
> ressize
) {
3671 if (requiredsize
<2*ressize
)
3672 requiredsize
= 2*ressize
;
3673 if (_PyString_Resize(&res
, requiredsize
))
3675 str
= PyString_AS_STRING(res
) + respos
;
3676 ressize
= requiredsize
;
3678 /* generate replacement (temporarily (mis)uses p) */
3679 for (p
= collstart
; p
< collend
; ++p
) {
3680 str
+= sprintf(str
, "&#%d;", (int)*p
);
3685 repunicode
= unicode_encode_call_errorhandler(errors
, &errorHandler
,
3686 encoding
, reason
, startp
, size
, &exc
,
3687 collstart
-startp
, collend
-startp
, &newpos
);
3688 if (repunicode
== NULL
)
3690 /* need more space? (at least enough for what we have+the
3691 replacement+the rest of the string, so we won't have to
3692 check space for encodable characters) */
3693 respos
= str
-PyString_AS_STRING(res
);
3694 repsize
= PyUnicode_GET_SIZE(repunicode
);
3695 requiredsize
= respos
+repsize
+(endp
-collend
);
3696 if (requiredsize
> ressize
) {
3697 if (requiredsize
<2*ressize
)
3698 requiredsize
= 2*ressize
;
3699 if (_PyString_Resize(&res
, requiredsize
)) {
3700 Py_DECREF(repunicode
);
3703 str
= PyString_AS_STRING(res
) + respos
;
3704 ressize
= requiredsize
;
3706 /* check if there is anything unencodable in the replacement
3707 and copy it to the output */
3708 for (uni2
= PyUnicode_AS_UNICODE(repunicode
);repsize
-->0; ++uni2
, ++str
) {
3711 raise_encode_exception(&exc
, encoding
, startp
, size
,
3712 unicodepos
, unicodepos
+1, reason
);
3713 Py_DECREF(repunicode
);
3718 p
= startp
+ newpos
;
3719 Py_DECREF(repunicode
);
3723 /* Resize if we allocated too much */
3724 respos
= str
-PyString_AS_STRING(res
);
3726 /* If this fails res will be NULL */
3727 _PyString_Resize(&res
, respos
);
3728 Py_XDECREF(errorHandler
);
3734 Py_XDECREF(errorHandler
);
3739 PyObject
*PyUnicode_EncodeLatin1(const Py_UNICODE
*p
,
3743 return unicode_encode_ucs1(p
, size
, errors
, 256);
3746 PyObject
*PyUnicode_AsLatin1String(PyObject
*unicode
)
3748 if (!PyUnicode_Check(unicode
)) {
3749 PyErr_BadArgument();
3752 return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode
),
3753 PyUnicode_GET_SIZE(unicode
),
3757 /* --- 7-bit ASCII Codec -------------------------------------------------- */
3759 PyObject
*PyUnicode_DecodeASCII(const char *s
,
3763 const char *starts
= s
;
3766 Py_ssize_t startinpos
;
3767 Py_ssize_t endinpos
;
3770 PyObject
*errorHandler
= NULL
;
3771 PyObject
*exc
= NULL
;
3773 /* ASCII is equivalent to the first 128 ordinals in Unicode. */
3774 if (size
== 1 && *(unsigned char*)s
< 128) {
3775 Py_UNICODE r
= *(unsigned char*)s
;
3776 return PyUnicode_FromUnicode(&r
, 1);
3779 v
= _PyUnicode_New(size
);
3783 return (PyObject
*)v
;
3784 p
= PyUnicode_AS_UNICODE(v
);
3787 register unsigned char c
= (unsigned char)*s
;
3793 startinpos
= s
-starts
;
3794 endinpos
= startinpos
+ 1;
3795 outpos
= p
- (Py_UNICODE
*)PyUnicode_AS_UNICODE(v
);
3796 if (unicode_decode_call_errorhandler(
3797 errors
, &errorHandler
,
3798 "ascii", "ordinal not in range(128)",
3799 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
3804 if (p
- PyUnicode_AS_UNICODE(v
) < PyString_GET_SIZE(v
))
3805 if (_PyUnicode_Resize(&v
, p
- PyUnicode_AS_UNICODE(v
)) < 0)
3807 Py_XDECREF(errorHandler
);
3809 return (PyObject
*)v
;
3813 Py_XDECREF(errorHandler
);
3818 PyObject
*PyUnicode_EncodeASCII(const Py_UNICODE
*p
,
3822 return unicode_encode_ucs1(p
, size
, errors
, 128);
3825 PyObject
*PyUnicode_AsASCIIString(PyObject
*unicode
)
3827 if (!PyUnicode_Check(unicode
)) {
3828 PyErr_BadArgument();
3831 return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode
),
3832 PyUnicode_GET_SIZE(unicode
),
3836 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
3838 /* --- MBCS codecs for Windows -------------------------------------------- */
3840 #if SIZEOF_INT < SIZEOF_SIZE_T
3844 /* XXX This code is limited to "true" double-byte encodings, as
3845 a) it assumes an incomplete character consists of a single byte, and
3846 b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3847 encodings, see IsDBCSLeadByteEx documentation. */
3849 static int is_dbcs_lead_byte(const char *s
, int offset
)
3851 const char *curr
= s
+ offset
;
3853 if (IsDBCSLeadByte(*curr
)) {
3854 const char *prev
= CharPrev(s
, curr
);
3855 return (prev
== curr
) || !IsDBCSLeadByte(*prev
) || (curr
- prev
== 2);
3861 * Decode MBCS string into unicode object. If 'final' is set, converts
3862 * trailing lead-byte too. Returns the consumed size on success, -1 otherwise.
3864 static int decode_mbcs(PyUnicodeObject
**v
,
3865 const char *s
, /* MBCS string */
3866 int size
, /* sizeof MBCS string */
3875 /* Skip trailing lead-byte unless 'final' is set */
3876 if (!final
&& size
>= 1 && is_dbcs_lead_byte(s
, size
- 1))
3879 /* First get the size of the result */
3881 usize
= MultiByteToWideChar(CP_ACP
, 0, s
, size
, NULL
, 0);
3883 PyErr_SetFromWindowsErrWithFilename(0, NULL
);
3889 /* Create unicode object */
3890 *v
= _PyUnicode_New(usize
);
3895 /* Extend unicode object */
3896 n
= PyUnicode_GET_SIZE(*v
);
3897 if (_PyUnicode_Resize(v
, n
+ usize
) < 0)
3901 /* Do the conversion */
3903 p
= PyUnicode_AS_UNICODE(*v
) + n
;
3904 if (0 == MultiByteToWideChar(CP_ACP
, 0, s
, size
, p
, usize
)) {
3905 PyErr_SetFromWindowsErrWithFilename(0, NULL
);
3913 PyObject
*PyUnicode_DecodeMBCSStateful(const char *s
,
3916 Py_ssize_t
*consumed
)
3918 PyUnicodeObject
*v
= NULL
;
3927 done
= decode_mbcs(&v
, s
, INT_MAX
, 0);
3930 done
= decode_mbcs(&v
, s
, (int)size
, !consumed
);
3941 if (size
> INT_MAX
) {
3948 return (PyObject
*)v
;
3951 PyObject
*PyUnicode_DecodeMBCS(const char *s
,
3955 return PyUnicode_DecodeMBCSStateful(s
, size
, errors
, NULL
);
3959 * Convert unicode into string object (MBCS).
3960 * Returns 0 on success, -1 otherwise.
3962 static int encode_mbcs(PyObject
**repr
,
3963 const Py_UNICODE
*p
, /* unicode */
3964 int size
) /* size of unicode */
3971 /* First get the size of the result */
3973 mbcssize
= WideCharToMultiByte(CP_ACP
, 0, p
, size
, NULL
, 0, NULL
, NULL
);
3974 if (mbcssize
== 0) {
3975 PyErr_SetFromWindowsErrWithFilename(0, NULL
);
3980 if (*repr
== NULL
) {
3981 /* Create string object */
3982 *repr
= PyString_FromStringAndSize(NULL
, mbcssize
);
3987 /* Extend string object */
3988 n
= PyString_Size(*repr
);
3989 if (_PyString_Resize(repr
, n
+ mbcssize
) < 0)
3993 /* Do the conversion */
3995 char *s
= PyString_AS_STRING(*repr
) + n
;
3996 if (0 == WideCharToMultiByte(CP_ACP
, 0, p
, size
, s
, mbcssize
, NULL
, NULL
)) {
3997 PyErr_SetFromWindowsErrWithFilename(0, NULL
);
4005 PyObject
*PyUnicode_EncodeMBCS(const Py_UNICODE
*p
,
4009 PyObject
*repr
= NULL
;
4015 ret
= encode_mbcs(&repr
, p
, INT_MAX
);
4018 ret
= encode_mbcs(&repr
, p
, (int)size
);
4026 if (size
> INT_MAX
) {
4036 PyObject
*PyUnicode_AsMBCSString(PyObject
*unicode
)
4038 if (!PyUnicode_Check(unicode
)) {
4039 PyErr_BadArgument();
4042 return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode
),
4043 PyUnicode_GET_SIZE(unicode
),
4049 #endif /* MS_WINDOWS */
4051 /* --- Character Mapping Codec -------------------------------------------- */
4053 PyObject
*PyUnicode_DecodeCharmap(const char *s
,
4058 const char *starts
= s
;
4059 Py_ssize_t startinpos
;
4060 Py_ssize_t endinpos
;
4065 Py_ssize_t extrachars
= 0;
4066 PyObject
*errorHandler
= NULL
;
4067 PyObject
*exc
= NULL
;
4068 Py_UNICODE
*mapstring
= NULL
;
4069 Py_ssize_t maplen
= 0;
4071 /* Default to Latin-1 */
4072 if (mapping
== NULL
)
4073 return PyUnicode_DecodeLatin1(s
, size
, errors
);
4075 v
= _PyUnicode_New(size
);
4079 return (PyObject
*)v
;
4080 p
= PyUnicode_AS_UNICODE(v
);
4082 if (PyUnicode_CheckExact(mapping
)) {
4083 mapstring
= PyUnicode_AS_UNICODE(mapping
);
4084 maplen
= PyUnicode_GET_SIZE(mapping
);
4086 unsigned char ch
= *s
;
4087 Py_UNICODE x
= 0xfffe; /* illegal value */
4093 /* undefined mapping */
4094 outpos
= p
-PyUnicode_AS_UNICODE(v
);
4095 startinpos
= s
-starts
;
4096 endinpos
= startinpos
+1;
4097 if (unicode_decode_call_errorhandler(
4098 errors
, &errorHandler
,
4099 "charmap", "character maps to <undefined>",
4100 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
4112 unsigned char ch
= *s
;
4115 /* Get mapping (char ordinal -> integer, Unicode char or None) */
4116 w
= PyInt_FromLong((long)ch
);
4119 x
= PyObject_GetItem(mapping
, w
);
4122 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
4123 /* No mapping found means: mapping is undefined. */
4132 if (PyInt_Check(x
)) {
4133 long value
= PyInt_AS_LONG(x
);
4134 if (value
< 0 || value
> 65535) {
4135 PyErr_SetString(PyExc_TypeError
,
4136 "character mapping must be in range(65536)");
4140 *p
++ = (Py_UNICODE
)value
;
4142 else if (x
== Py_None
) {
4143 /* undefined mapping */
4144 outpos
= p
-PyUnicode_AS_UNICODE(v
);
4145 startinpos
= s
-starts
;
4146 endinpos
= startinpos
+1;
4147 if (unicode_decode_call_errorhandler(
4148 errors
, &errorHandler
,
4149 "charmap", "character maps to <undefined>",
4150 starts
, size
, &startinpos
, &endinpos
, &exc
, &s
,
4158 else if (PyUnicode_Check(x
)) {
4159 Py_ssize_t targetsize
= PyUnicode_GET_SIZE(x
);
4161 if (targetsize
== 1)
4163 *p
++ = *PyUnicode_AS_UNICODE(x
);
4165 else if (targetsize
> 1) {
4167 if (targetsize
> extrachars
) {
4169 Py_ssize_t oldpos
= p
- PyUnicode_AS_UNICODE(v
);
4170 Py_ssize_t needed
= (targetsize
- extrachars
) + \
4172 extrachars
+= needed
;
4173 /* XXX overflow detection missing */
4174 if (_PyUnicode_Resize(&v
,
4175 PyUnicode_GET_SIZE(v
) + needed
) < 0) {
4179 p
= PyUnicode_AS_UNICODE(v
) + oldpos
;
4182 PyUnicode_AS_UNICODE(x
),
4185 extrachars
-= targetsize
;
4187 /* 1-0 mapping: skip the character */
4190 /* wrong return value */
4191 PyErr_SetString(PyExc_TypeError
,
4192 "character mapping must return integer, None or unicode");
4200 if (p
- PyUnicode_AS_UNICODE(v
) < PyUnicode_GET_SIZE(v
))
4201 if (_PyUnicode_Resize(&v
, p
- PyUnicode_AS_UNICODE(v
)) < 0)
4203 Py_XDECREF(errorHandler
);
4205 return (PyObject
*)v
;
4208 Py_XDECREF(errorHandler
);
4214 /* Charmap encoding: the lookup table */
4216 struct encoding_map
{
4218 unsigned char level1
[32];
4220 unsigned char level23
[1];
4224 encoding_map_size(PyObject
*obj
, PyObject
* args
)
4226 struct encoding_map
*map
= (struct encoding_map
*)obj
;
4227 return PyInt_FromLong(sizeof(*map
) - 1 + 16*map
->count2
+
4231 static PyMethodDef encoding_map_methods
[] = {
4232 {"size", encoding_map_size
, METH_NOARGS
,
4233 PyDoc_STR("Return the size (in bytes) of this object") },
4238 encoding_map_dealloc(PyObject
* o
)
4243 static PyTypeObject EncodingMapType
= {
4244 PyVarObject_HEAD_INIT(NULL
, 0)
4245 "EncodingMap", /*tp_name*/
4246 sizeof(struct encoding_map
), /*tp_basicsize*/
4249 encoding_map_dealloc
, /*tp_dealloc*/
4256 0, /*tp_as_sequence*/
4257 0, /*tp_as_mapping*/
4264 Py_TPFLAGS_DEFAULT
, /*tp_flags*/
4268 0, /*tp_richcompare*/
4269 0, /*tp_weaklistoffset*/
4272 encoding_map_methods
, /*tp_methods*/
4279 0, /*tp_dictoffset*/
4288 PyUnicode_BuildEncodingMap(PyObject
* string
)
4292 struct encoding_map
*mresult
;
4295 unsigned char level1
[32];
4296 unsigned char level2
[512];
4297 unsigned char *mlevel1
, *mlevel2
, *mlevel3
;
4298 int count2
= 0, count3
= 0;
4300 if (!PyUnicode_Check(string
) || PyUnicode_GetSize(string
) != 256) {
4301 PyErr_BadArgument();
4304 decode
= PyUnicode_AS_UNICODE(string
);
4305 memset(level1
, 0xFF, sizeof level1
);
4306 memset(level2
, 0xFF, sizeof level2
);
4308 /* If there isn't a one-to-one mapping of NULL to \0,
4309 or if there are non-BMP characters, we need to use
4310 a mapping dictionary. */
4313 for (i
= 1; i
< 256; i
++) {
4316 #ifdef Py_UNICODE_WIDE
4317 || decode
[i
] > 0xFFFF
4323 if (decode
[i
] == 0xFFFE)
4324 /* unmapped character */
4326 l1
= decode
[i
] >> 11;
4327 l2
= decode
[i
] >> 7;
4328 if (level1
[l1
] == 0xFF)
4329 level1
[l1
] = count2
++;
4330 if (level2
[l2
] == 0xFF)
4331 level2
[l2
] = count3
++;
4334 if (count2
>= 0xFF || count3
>= 0xFF)
4338 PyObject
*result
= PyDict_New();
4339 PyObject
*key
, *value
;
4342 for (i
= 0; i
< 256; i
++) {
4344 key
= PyInt_FromLong(decode
[i
]);
4345 value
= PyInt_FromLong(i
);
4348 if (PyDict_SetItem(result
, key
, value
) == -1)
4361 /* Create a three-level trie */
4362 result
= PyObject_MALLOC(sizeof(struct encoding_map
) +
4363 16*count2
+ 128*count3
- 1);
4365 return PyErr_NoMemory();
4366 PyObject_Init(result
, &EncodingMapType
);
4367 mresult
= (struct encoding_map
*)result
;
4368 mresult
->count2
= count2
;
4369 mresult
->count3
= count3
;
4370 mlevel1
= mresult
->level1
;
4371 mlevel2
= mresult
->level23
;
4372 mlevel3
= mresult
->level23
+ 16*count2
;
4373 memcpy(mlevel1
, level1
, 32);
4374 memset(mlevel2
, 0xFF, 16*count2
);
4375 memset(mlevel3
, 0, 128*count3
);
4377 for (i
= 1; i
< 256; i
++) {
4378 int o1
, o2
, o3
, i2
, i3
;
4379 if (decode
[i
] == 0xFFFE)
4380 /* unmapped character */
4383 o2
= (decode
[i
]>>7) & 0xF;
4384 i2
= 16*mlevel1
[o1
] + o2
;
4385 if (mlevel2
[i2
] == 0xFF)
4386 mlevel2
[i2
] = count3
++;
4387 o3
= decode
[i
] & 0x7F;
4388 i3
= 128*mlevel2
[i2
] + o3
;
4395 encoding_map_lookup(Py_UNICODE c
, PyObject
*mapping
)
4397 struct encoding_map
*map
= (struct encoding_map
*)mapping
;
4399 int l2
= (c
>>7) & 0xF;
4403 #ifdef Py_UNICODE_WIDE
4411 i
= map
->level1
[l1
];
4416 i
= map
->level23
[16*i
+l2
];
4421 i
= map
->level23
[16*map
->count2
+ 128*i
+ l3
];
4428 /* Look up the character c in the mapping. If the character
4429 can't be found, Py_None is returned (or NULL, if another
4431 static PyObject
*charmapencode_lookup(Py_UNICODE c
, PyObject
*mapping
)
4433 PyObject
*w
= PyInt_FromLong((long)c
);
4438 x
= PyObject_GetItem(mapping
, w
);
4441 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
4442 /* No mapping found means: mapping is undefined. */
4450 else if (x
== Py_None
)
4452 else if (PyInt_Check(x
)) {
4453 long value
= PyInt_AS_LONG(x
);
4454 if (value
< 0 || value
> 255) {
4455 PyErr_SetString(PyExc_TypeError
,
4456 "character mapping must be in range(256)");
4462 else if (PyString_Check(x
))
4465 /* wrong return value */
4466 PyErr_SetString(PyExc_TypeError
,
4467 "character mapping must return integer, None or str");
4474 charmapencode_resize(PyObject
**outobj
, Py_ssize_t
*outpos
, Py_ssize_t requiredsize
)
4476 Py_ssize_t outsize
= PyString_GET_SIZE(*outobj
);
4477 /* exponentially overallocate to minimize reallocations */
4478 if (requiredsize
< 2*outsize
)
4479 requiredsize
= 2*outsize
;
4480 if (_PyString_Resize(outobj
, requiredsize
)) {
4486 typedef enum charmapencode_result
{
4487 enc_SUCCESS
, enc_FAILED
, enc_EXCEPTION
4488 }charmapencode_result
;
4489 /* lookup the character, put the result in the output string and adjust
4490 various state variables. Reallocate the output string if not enough
4491 space is available. Return enc_SUCCESS if the character was written,
4492 enc_FAILED if the mapping was undefined (in which case no character
4493 was written) or enc_EXCEPTION if an error occurred (e.g. a
4494 reallocation failure). */
4496 charmapencode_result
charmapencode_output(Py_UNICODE c
, PyObject
*mapping
,
4497 PyObject
**outobj
, Py_ssize_t
*outpos
)
4501 Py_ssize_t outsize
= PyString_GET_SIZE(*outobj
);
4503 if (Py_TYPE(mapping
) == &EncodingMapType
) {
4504 int res
= encoding_map_lookup(c
, mapping
);
4505 Py_ssize_t requiredsize
= *outpos
+1;
4508 if (outsize
<requiredsize
)
4509 if (!charmapencode_resize(outobj
, outpos
, requiredsize
))
4510 return enc_EXCEPTION
;
4511 outstart
= PyString_AS_STRING(*outobj
);
4512 outstart
[(*outpos
)++] = (char)res
;
4516 rep
= charmapencode_lookup(c
, mapping
);
4518 return enc_EXCEPTION
;
4519 else if (rep
==Py_None
) {
4523 if (PyInt_Check(rep
)) {
4524 Py_ssize_t requiredsize
= *outpos
+1;
4525 if (outsize
<requiredsize
)
4526 if (!charmapencode_resize(outobj
, outpos
, requiredsize
)) {
4528 return enc_EXCEPTION
;
4530 outstart
= PyString_AS_STRING(*outobj
);
4531 outstart
[(*outpos
)++] = (char)PyInt_AS_LONG(rep
);
4534 const char *repchars
= PyString_AS_STRING(rep
);
4535 Py_ssize_t repsize
= PyString_GET_SIZE(rep
);
4536 Py_ssize_t requiredsize
= *outpos
+repsize
;
4537 if (outsize
<requiredsize
)
4538 if (!charmapencode_resize(outobj
, outpos
, requiredsize
)) {
4540 return enc_EXCEPTION
;
4542 outstart
= PyString_AS_STRING(*outobj
);
4543 memcpy(outstart
+ *outpos
, repchars
, repsize
);
4551 /* handle an error in PyUnicode_EncodeCharmap
4552 Return 0 on success, -1 on error */
4554 int charmap_encoding_error(
4555 const Py_UNICODE
*p
, Py_ssize_t size
, Py_ssize_t
*inpos
, PyObject
*mapping
,
4556 PyObject
**exceptionObject
,
4557 int *known_errorHandler
, PyObject
**errorHandler
, const char *errors
,
4558 PyObject
**res
, Py_ssize_t
*respos
)
4560 PyObject
*repunicode
= NULL
; /* initialize to prevent gcc warning */
4564 /* startpos for collecting unencodable chars */
4565 Py_ssize_t collstartpos
= *inpos
;
4566 Py_ssize_t collendpos
= *inpos
+1;
4568 char *encoding
= "charmap";
4569 char *reason
= "character maps to <undefined>";
4570 charmapencode_result x
;
4572 /* find all unencodable characters */
4573 while (collendpos
< size
) {
4575 if (Py_TYPE(mapping
) == &EncodingMapType
) {
4576 int res
= encoding_map_lookup(p
[collendpos
], mapping
);
4583 rep
= charmapencode_lookup(p
[collendpos
], mapping
);
4586 else if (rep
!=Py_None
) {
4593 /* cache callback name lookup
4594 * (if not done yet, i.e. it's the first error) */
4595 if (*known_errorHandler
==-1) {
4596 if ((errors
==NULL
) || (!strcmp(errors
, "strict")))
4597 *known_errorHandler
= 1;
4598 else if (!strcmp(errors
, "replace"))
4599 *known_errorHandler
= 2;
4600 else if (!strcmp(errors
, "ignore"))
4601 *known_errorHandler
= 3;
4602 else if (!strcmp(errors
, "xmlcharrefreplace"))
4603 *known_errorHandler
= 4;
4605 *known_errorHandler
= 0;
4607 switch (*known_errorHandler
) {
4608 case 1: /* strict */
4609 raise_encode_exception(exceptionObject
, encoding
, p
, size
, collstartpos
, collendpos
, reason
);
4611 case 2: /* replace */
4612 for (collpos
= collstartpos
; collpos
<collendpos
; ++collpos
) {
4613 x
= charmapencode_output('?', mapping
, res
, respos
);
4614 if (x
==enc_EXCEPTION
) {
4617 else if (x
==enc_FAILED
) {
4618 raise_encode_exception(exceptionObject
, encoding
, p
, size
, collstartpos
, collendpos
, reason
);
4623 case 3: /* ignore */
4624 *inpos
= collendpos
;
4626 case 4: /* xmlcharrefreplace */
4627 /* generate replacement (one "&#NNN;" reference per unencodable char) */
4628 for (collpos
= collstartpos
; collpos
< collendpos
; ++collpos
) {
4629 char buffer
[2+29+1+1];
4631 sprintf(buffer
, "&#%d;", (int)p
[collpos
]);
4632 for (cp
= buffer
; *cp
; ++cp
) {
4633 x
= charmapencode_output(*cp
, mapping
, res
, respos
);
4634 if (x
==enc_EXCEPTION
)
4636 else if (x
==enc_FAILED
) {
4637 raise_encode_exception(exceptionObject
, encoding
, p
, size
, collstartpos
, collendpos
, reason
);
4642 *inpos
= collendpos
;
4645 repunicode
= unicode_encode_call_errorhandler(errors
, errorHandler
,
4646 encoding
, reason
, p
, size
, exceptionObject
,
4647 collstartpos
, collendpos
, &newpos
);
4648 if (repunicode
== NULL
)
4650 /* generate replacement */
4651 repsize
= PyUnicode_GET_SIZE(repunicode
);
4652 for (uni2
= PyUnicode_AS_UNICODE(repunicode
); repsize
-->0; ++uni2
) {
4653 x
= charmapencode_output(*uni2
, mapping
, res
, respos
);
4654 if (x
==enc_EXCEPTION
) {
4657 else if (x
==enc_FAILED
) {
4658 Py_DECREF(repunicode
);
4659 raise_encode_exception(exceptionObject
, encoding
, p
, size
, collstartpos
, collendpos
, reason
);
4664 Py_DECREF(repunicode
);
4669 PyObject
*PyUnicode_EncodeCharmap(const Py_UNICODE
*p
,
4675 PyObject
*res
= NULL
;
4676 /* current input position */
4677 Py_ssize_t inpos
= 0;
4678 /* current output position */
4679 Py_ssize_t respos
= 0;
4680 PyObject
*errorHandler
= NULL
;
4681 PyObject
*exc
= NULL
;
4682 /* the following variable is used for caching string comparisons
4683 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4684 * 3=ignore, 4=xmlcharrefreplace */
4685 int known_errorHandler
= -1;
4687 /* Default to Latin-1 */
4688 if (mapping
== NULL
)
4689 return PyUnicode_EncodeLatin1(p
, size
, errors
);
4691 /* allocate enough for a simple encoding without
4692 replacements, if we need more, we'll resize */
4693 res
= PyString_FromStringAndSize(NULL
, size
);
4699 while (inpos
<size
) {
4700 /* try to encode it */
4701 charmapencode_result x
= charmapencode_output(p
[inpos
], mapping
, &res
, &respos
);
4702 if (x
==enc_EXCEPTION
) /* error */
4704 if (x
==enc_FAILED
) { /* unencodable character */
4705 if (charmap_encoding_error(p
, size
, &inpos
, mapping
,
4707 &known_errorHandler
, &errorHandler
, errors
,
4713 /* done with this character => adjust input position */
4717 /* Resize if we allocated too much */
4718 if (respos
<PyString_GET_SIZE(res
)) {
4719 if (_PyString_Resize(&res
, respos
))
4723 Py_XDECREF(errorHandler
);
4729 Py_XDECREF(errorHandler
);
4733 PyObject
*PyUnicode_AsCharmapString(PyObject
*unicode
,
4736 if (!PyUnicode_Check(unicode
) || mapping
== NULL
) {
4737 PyErr_BadArgument();
4740 return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode
),
4741 PyUnicode_GET_SIZE(unicode
),
4746 /* create or adjust a UnicodeTranslateError */
4747 static void make_translate_exception(PyObject
**exceptionObject
,
4748 const Py_UNICODE
*unicode
, Py_ssize_t size
,
4749 Py_ssize_t startpos
, Py_ssize_t endpos
,
4752 if (*exceptionObject
== NULL
) {
4753 *exceptionObject
= PyUnicodeTranslateError_Create(
4754 unicode
, size
, startpos
, endpos
, reason
);
4757 if (PyUnicodeTranslateError_SetStart(*exceptionObject
, startpos
))
4759 if (PyUnicodeTranslateError_SetEnd(*exceptionObject
, endpos
))
4761 if (PyUnicodeTranslateError_SetReason(*exceptionObject
, reason
))
4765 Py_DECREF(*exceptionObject
);
4766 *exceptionObject
= NULL
;
4770 /* raises a UnicodeTranslateError */
4771 static void raise_translate_exception(PyObject
**exceptionObject
,
4772 const Py_UNICODE
*unicode
, Py_ssize_t size
,
4773 Py_ssize_t startpos
, Py_ssize_t endpos
,
4776 make_translate_exception(exceptionObject
,
4777 unicode
, size
, startpos
, endpos
, reason
);
4778 if (*exceptionObject
!= NULL
)
4779 PyCodec_StrictErrors(*exceptionObject
);
4782 /* error handling callback helper:
4783 build arguments, call the callback and check the arguments,
4784 put the result into newpos and return the replacement string, which
4785 has to be freed by the caller */
4786 static PyObject
*unicode_translate_call_errorhandler(const char *errors
,
4787 PyObject
**errorHandler
,
4789 const Py_UNICODE
*unicode
, Py_ssize_t size
, PyObject
**exceptionObject
,
4790 Py_ssize_t startpos
, Py_ssize_t endpos
,
4793 static char *argparse
= "O!n;translating error handler must return (unicode, int) tuple";
4795 Py_ssize_t i_newpos
;
4797 PyObject
*resunicode
;
4799 if (*errorHandler
== NULL
) {
4800 *errorHandler
= PyCodec_LookupError(errors
);
4801 if (*errorHandler
== NULL
)
4805 make_translate_exception(exceptionObject
,
4806 unicode
, size
, startpos
, endpos
, reason
);
4807 if (*exceptionObject
== NULL
)
4810 restuple
= PyObject_CallFunctionObjArgs(
4811 *errorHandler
, *exceptionObject
, NULL
);
4812 if (restuple
== NULL
)
4814 if (!PyTuple_Check(restuple
)) {
4815 PyErr_SetString(PyExc_TypeError
, &argparse
[4]);
4816 Py_DECREF(restuple
);
4819 if (!PyArg_ParseTuple(restuple
, argparse
, &PyUnicode_Type
,
4820 &resunicode
, &i_newpos
)) {
4821 Py_DECREF(restuple
);
4825 *newpos
= size
+i_newpos
;
4828 if (*newpos
<0 || *newpos
>size
) {
4829 PyErr_Format(PyExc_IndexError
, "position %zd from error handler out of bounds", *newpos
);
4830 Py_DECREF(restuple
);
4833 Py_INCREF(resunicode
);
4834 Py_DECREF(restuple
);
4838 /* Look up the character c in the mapping and put the result in result,
4839 which must be decrefed by the caller.
4840 Return 0 on success, -1 on error */
4842 int charmaptranslate_lookup(Py_UNICODE c
, PyObject
*mapping
, PyObject
**result
)
4844 PyObject
*w
= PyInt_FromLong((long)c
);
4849 x
= PyObject_GetItem(mapping
, w
);
4852 if (PyErr_ExceptionMatches(PyExc_LookupError
)) {
4853 /* No mapping found means: use 1:1 mapping. */
4860 else if (x
== Py_None
) {
4864 else if (PyInt_Check(x
)) {
4865 long value
= PyInt_AS_LONG(x
);
4866 long max
= PyUnicode_GetMax();
4867 if (value
< 0 || value
> max
) {
4868 PyErr_Format(PyExc_TypeError
,
4869 "character mapping must be in range(0x%lx)", max
+1);
4876 else if (PyUnicode_Check(x
)) {
4881 /* wrong return value */
4882 PyErr_SetString(PyExc_TypeError
,
4883 "character mapping must return integer, None or unicode");
4888 /* ensure that *outobj is at least requiredsize characters long,
4889 if not reallocate and adjust various state variables.
4890 Return 0 on success, -1 on error */
4892 int charmaptranslate_makespace(PyObject
**outobj
, Py_UNICODE
**outp
,
4893 Py_ssize_t requiredsize
)
4895 Py_ssize_t oldsize
= PyUnicode_GET_SIZE(*outobj
);
4896 if (requiredsize
> oldsize
) {
4897 /* remember old output position */
4898 Py_ssize_t outpos
= *outp
-PyUnicode_AS_UNICODE(*outobj
);
4899 /* exponentially overallocate to minimize reallocations */
4900 if (requiredsize
< 2 * oldsize
)
4901 requiredsize
= 2 * oldsize
;
4902 if (PyUnicode_Resize(outobj
, requiredsize
) < 0)
4904 *outp
= PyUnicode_AS_UNICODE(*outobj
) + outpos
;
4908 /* lookup the character, put the result in the output string and adjust
4909 various state variables. Return a new reference to the object that
4910 was put in the output buffer in *result, or Py_None, if the mapping was
4911 undefined (in which case no character was written).
4912 The caller must decref result.
4913 Return 0 on success, -1 on error. */
4915 int charmaptranslate_output(const Py_UNICODE
*startinp
, const Py_UNICODE
*curinp
,
4916 Py_ssize_t insize
, PyObject
*mapping
, PyObject
**outobj
, Py_UNICODE
**outp
,
4919 if (charmaptranslate_lookup(*curinp
, mapping
, res
))
4922 /* not found => default to 1:1 mapping */
4923 *(*outp
)++ = *curinp
;
4925 else if (*res
==Py_None
)
4927 else if (PyInt_Check(*res
)) {
4928 /* no overflow check, because we know that the space is enough */
4929 *(*outp
)++ = (Py_UNICODE
)PyInt_AS_LONG(*res
);
4931 else if (PyUnicode_Check(*res
)) {
4932 Py_ssize_t repsize
= PyUnicode_GET_SIZE(*res
);
4934 /* no overflow check, because we know that the space is enough */
4935 *(*outp
)++ = *PyUnicode_AS_UNICODE(*res
);
4937 else if (repsize
!=0) {
4938 /* more than one character */
4939 Py_ssize_t requiredsize
= (*outp
-PyUnicode_AS_UNICODE(*outobj
)) +
4940 (insize
- (curinp
-startinp
)) +
4942 if (charmaptranslate_makespace(outobj
, outp
, requiredsize
))
4944 memcpy(*outp
, PyUnicode_AS_UNICODE(*res
), sizeof(Py_UNICODE
)*repsize
);
4953 PyObject
*PyUnicode_TranslateCharmap(const Py_UNICODE
*p
,
4959 PyObject
*res
= NULL
;
4960 /* pointers to the beginning and end+1 of input */
4961 const Py_UNICODE
*startp
= p
;
4962 const Py_UNICODE
*endp
= p
+ size
;
4963 /* pointer into the output */
4965 /* current output position */
4966 Py_ssize_t respos
= 0;
4967 char *reason
= "character maps to <undefined>";
4968 PyObject
*errorHandler
= NULL
;
4969 PyObject
*exc
= NULL
;
4970 /* the following variable is used for caching string comparisons
4971 * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4972 * 3=ignore, 4=xmlcharrefreplace */
4973 int known_errorHandler
= -1;
4975 if (mapping
== NULL
) {
4976 PyErr_BadArgument();
4980 /* allocate enough for a simple 1:1 translation without
4981 replacements, if we need more, we'll resize */
4982 res
= PyUnicode_FromUnicode(NULL
, size
);
4987 str
= PyUnicode_AS_UNICODE(res
);
4990 /* try to translate it */
4992 if (charmaptranslate_output(startp
, p
, size
, mapping
, &res
, &str
, &x
)) {
4997 if (x
!=Py_None
) /* it worked => adjust input pointer */
4999 else { /* untranslatable character */
5000 PyObject
*repunicode
= NULL
; /* initialize to prevent gcc warning */
5004 /* startpos for collecting untranslatable chars */
5005 const Py_UNICODE
*collstart
= p
;
5006 const Py_UNICODE
*collend
= p
+1;
5007 const Py_UNICODE
*coll
;
5009 /* find all untranslatable characters */
5010 while (collend
< endp
) {
5011 if (charmaptranslate_lookup(*collend
, mapping
, &x
))
5018 /* cache callback name lookup
5019 * (if not done yet, i.e. it's the first error) */
5020 if (known_errorHandler
==-1) {
5021 if ((errors
==NULL
) || (!strcmp(errors
, "strict")))
5022 known_errorHandler
= 1;
5023 else if (!strcmp(errors
, "replace"))
5024 known_errorHandler
= 2;
5025 else if (!strcmp(errors
, "ignore"))
5026 known_errorHandler
= 3;
5027 else if (!strcmp(errors
, "xmlcharrefreplace"))
5028 known_errorHandler
= 4;
5030 known_errorHandler
= 0;
5032 switch (known_errorHandler
) {
5033 case 1: /* strict */
5034 raise_translate_exception(&exc
, startp
, size
, collstart
-startp
, collend
-startp
, reason
);
5036 case 2: /* replace */
5037 /* No need to check for space, this is a 1:1 replacement */
5038 for (coll
= collstart
; coll
<collend
; ++coll
)
5041 case 3: /* ignore */
5044 case 4: /* xmlcharrefreplace */
5045 /* generate replacement (temporarily (mis)uses p) */
5046 for (p
= collstart
; p
< collend
; ++p
) {
5047 char buffer
[2+29+1+1];
5049 sprintf(buffer
, "&#%d;", (int)*p
);
5050 if (charmaptranslate_makespace(&res
, &str
,
5051 (str
-PyUnicode_AS_UNICODE(res
))+strlen(buffer
)+(endp
-collend
)))
5053 for (cp
= buffer
; *cp
; ++cp
)
5059 repunicode
= unicode_translate_call_errorhandler(errors
, &errorHandler
,
5060 reason
, startp
, size
, &exc
,
5061 collstart
-startp
, collend
-startp
, &newpos
);
5062 if (repunicode
== NULL
)
5064 /* generate replacement */
5065 repsize
= PyUnicode_GET_SIZE(repunicode
);
5066 if (charmaptranslate_makespace(&res
, &str
,
5067 (str
-PyUnicode_AS_UNICODE(res
))+repsize
+(endp
-collend
))) {
5068 Py_DECREF(repunicode
);
5071 for (uni2
= PyUnicode_AS_UNICODE(repunicode
); repsize
-->0; ++uni2
)
5073 p
= startp
+ newpos
;
5074 Py_DECREF(repunicode
);
5078 /* Resize if we allocated too much */
5079 respos
= str
-PyUnicode_AS_UNICODE(res
);
5080 if (respos
<PyUnicode_GET_SIZE(res
)) {
5081 if (PyUnicode_Resize(&res
, respos
) < 0)
5085 Py_XDECREF(errorHandler
);
5091 Py_XDECREF(errorHandler
);
5095 PyObject
*PyUnicode_Translate(PyObject
*str
,
5101 str
= PyUnicode_FromObject(str
);
5104 result
= PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str
),
5105 PyUnicode_GET_SIZE(str
),
5116 /* --- Decimal Encoder ---------------------------------------------------- */
5118 int PyUnicode_EncodeDecimal(Py_UNICODE
*s
,
5123 Py_UNICODE
*p
, *end
;
5124 PyObject
*errorHandler
= NULL
;
5125 PyObject
*exc
= NULL
;
5126 const char *encoding
= "decimal";
5127 const char *reason
= "invalid decimal Unicode string";
5128 /* the following variable is used for caching string comparisons
5129 * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5130 int known_errorHandler
= -1;
5132 if (output
== NULL
) {
5133 PyErr_BadArgument();
5140 register Py_UNICODE ch
= *p
;
5142 PyObject
*repunicode
;
5146 Py_UNICODE
*collstart
;
5147 Py_UNICODE
*collend
;
5149 if (Py_UNICODE_ISSPACE(ch
)) {
5154 decimal
= Py_UNICODE_TODECIMAL(ch
);
5156 *output
++ = '0' + decimal
;
5160 if (0 < ch
&& ch
< 256) {
5161 *output
++ = (char)ch
;
5165 /* All other characters are considered unencodable */
5168 while (collend
< end
) {
5169 if ((0 < *collend
&& *collend
< 256) ||
5170 !Py_UNICODE_ISSPACE(*collend
) ||
5171 Py_UNICODE_TODECIMAL(*collend
))
5174 /* cache callback name lookup
5175 * (if not done yet, i.e. it's the first error) */
5176 if (known_errorHandler
==-1) {
5177 if ((errors
==NULL
) || (!strcmp(errors
, "strict")))
5178 known_errorHandler
= 1;
5179 else if (!strcmp(errors
, "replace"))
5180 known_errorHandler
= 2;
5181 else if (!strcmp(errors
, "ignore"))
5182 known_errorHandler
= 3;
5183 else if (!strcmp(errors
, "xmlcharrefreplace"))
5184 known_errorHandler
= 4;
5186 known_errorHandler
= 0;
5188 switch (known_errorHandler
) {
5189 case 1: /* strict */
5190 raise_encode_exception(&exc
, encoding
, s
, length
, collstart
-s
, collend
-s
, reason
);
5192 case 2: /* replace */
5193 for (p
= collstart
; p
< collend
; ++p
)
5196 case 3: /* ignore */
5199 case 4: /* xmlcharrefreplace */
5200 /* generate replacement (temporarily (mis)uses p) */
5201 for (p
= collstart
; p
< collend
; ++p
)
5202 output
+= sprintf(output
, "&#%d;", (int)*p
);
5206 repunicode
= unicode_encode_call_errorhandler(errors
, &errorHandler
,
5207 encoding
, reason
, s
, length
, &exc
,
5208 collstart
-s
, collend
-s
, &newpos
);
5209 if (repunicode
== NULL
)
5211 /* generate replacement */
5212 repsize
= PyUnicode_GET_SIZE(repunicode
);
5213 for (uni2
= PyUnicode_AS_UNICODE(repunicode
); repsize
-->0; ++uni2
) {
5214 Py_UNICODE ch
= *uni2
;
5215 if (Py_UNICODE_ISSPACE(ch
))
5218 decimal
= Py_UNICODE_TODECIMAL(ch
);
5220 *output
++ = '0' + decimal
;
5221 else if (0 < ch
&& ch
< 256)
5222 *output
++ = (char)ch
;
5224 Py_DECREF(repunicode
);
5225 raise_encode_exception(&exc
, encoding
,
5226 s
, length
, collstart
-s
, collend
-s
, reason
);
5232 Py_DECREF(repunicode
);
5235 /* 0-terminate the output string */
5238 Py_XDECREF(errorHandler
);
5243 Py_XDECREF(errorHandler
);
5247 /* --- Helpers ------------------------------------------------------------ */
5249 #include "stringlib/unicodedefs.h"
5250 #include "stringlib/fastsearch.h"
5252 #include "stringlib/count.h"
5253 #include "stringlib/find.h"
5254 #include "stringlib/partition.h"
5255 #include "stringlib/split.h"
5257 /* helper macro to fixup start/end slice values */
5258 #define ADJUST_INDICES(start, end, len) \
5261 else if (end < 0) { \
5272 Py_ssize_t
PyUnicode_Count(PyObject
*str
,
5278 PyUnicodeObject
* str_obj
;
5279 PyUnicodeObject
* sub_obj
;
5281 str_obj
= (PyUnicodeObject
*) PyUnicode_FromObject(str
);
5284 sub_obj
= (PyUnicodeObject
*) PyUnicode_FromObject(substr
);
5290 ADJUST_INDICES(start
, end
, str_obj
->length
);
5291 result
= stringlib_count(
5292 str_obj
->str
+ start
, end
- start
, sub_obj
->str
, sub_obj
->length
,
5302 Py_ssize_t
PyUnicode_Find(PyObject
*str
,
5310 str
= PyUnicode_FromObject(str
);
5313 sub
= PyUnicode_FromObject(sub
);
5320 result
= stringlib_find_slice(
5321 PyUnicode_AS_UNICODE(str
), PyUnicode_GET_SIZE(str
),
5322 PyUnicode_AS_UNICODE(sub
), PyUnicode_GET_SIZE(sub
),
5326 result
= stringlib_rfind_slice(
5327 PyUnicode_AS_UNICODE(str
), PyUnicode_GET_SIZE(str
),
5328 PyUnicode_AS_UNICODE(sub
), PyUnicode_GET_SIZE(sub
),
5339 int tailmatch(PyUnicodeObject
*self
,
5340 PyUnicodeObject
*substring
,
5345 if (substring
->length
== 0)
5348 ADJUST_INDICES(start
, end
, self
->length
);
5349 end
-= substring
->length
;
5353 if (direction
> 0) {
5354 if (Py_UNICODE_MATCH(self
, end
, substring
))
5357 if (Py_UNICODE_MATCH(self
, start
, substring
))
5364 Py_ssize_t
PyUnicode_Tailmatch(PyObject
*str
,
5372 str
= PyUnicode_FromObject(str
);
5375 substr
= PyUnicode_FromObject(substr
);
5376 if (substr
== NULL
) {
5381 result
= tailmatch((PyUnicodeObject
*)str
,
5382 (PyUnicodeObject
*)substr
,
5383 start
, end
, direction
);
5389 /* Apply fixfct filter to the Unicode object self and return a
5390 reference to the modified object */
5393 PyObject
*fixup(PyUnicodeObject
*self
,
5394 int (*fixfct
)(PyUnicodeObject
*s
))
5399 u
= (PyUnicodeObject
*) PyUnicode_FromUnicode(NULL
, self
->length
);
5403 Py_UNICODE_COPY(u
->str
, self
->str
, self
->length
);
5405 if (!fixfct(u
) && PyUnicode_CheckExact(self
)) {
5406 /* fixfct should return TRUE if it modified the buffer. If
5407 FALSE, return a reference to the original buffer instead
5408 (to save space, not time) */
5411 return (PyObject
*) self
;
5413 return (PyObject
*) u
;
5417 int fixupper(PyUnicodeObject
*self
)
5419 Py_ssize_t len
= self
->length
;
5420 Py_UNICODE
*s
= self
->str
;
5424 register Py_UNICODE ch
;
5426 ch
= Py_UNICODE_TOUPPER(*s
);
5438 int fixlower(PyUnicodeObject
*self
)
5440 Py_ssize_t len
= self
->length
;
5441 Py_UNICODE
*s
= self
->str
;
5445 register Py_UNICODE ch
;
5447 ch
= Py_UNICODE_TOLOWER(*s
);
5459 int fixswapcase(PyUnicodeObject
*self
)
5461 Py_ssize_t len
= self
->length
;
5462 Py_UNICODE
*s
= self
->str
;
5466 if (Py_UNICODE_ISUPPER(*s
)) {
5467 *s
= Py_UNICODE_TOLOWER(*s
);
5469 } else if (Py_UNICODE_ISLOWER(*s
)) {
5470 *s
= Py_UNICODE_TOUPPER(*s
);
5480 int fixcapitalize(PyUnicodeObject
*self
)
5482 Py_ssize_t len
= self
->length
;
5483 Py_UNICODE
*s
= self
->str
;
5488 if (Py_UNICODE_ISLOWER(*s
)) {
5489 *s
= Py_UNICODE_TOUPPER(*s
);
5494 if (Py_UNICODE_ISUPPER(*s
)) {
5495 *s
= Py_UNICODE_TOLOWER(*s
);
5504 int fixtitle(PyUnicodeObject
*self
)
5506 register Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
5507 register Py_UNICODE
*e
;
5508 int previous_is_cased
;
5510 /* Shortcut for single character strings */
5511 if (PyUnicode_GET_SIZE(self
) == 1) {
5512 Py_UNICODE ch
= Py_UNICODE_TOTITLE(*p
);
5521 e
= p
+ PyUnicode_GET_SIZE(self
);
5522 previous_is_cased
= 0;
5523 for (; p
< e
; p
++) {
5524 register const Py_UNICODE ch
= *p
;
5526 if (previous_is_cased
)
5527 *p
= Py_UNICODE_TOLOWER(ch
);
5529 *p
= Py_UNICODE_TOTITLE(ch
);
5531 if (Py_UNICODE_ISLOWER(ch
) ||
5532 Py_UNICODE_ISUPPER(ch
) ||
5533 Py_UNICODE_ISTITLE(ch
))
5534 previous_is_cased
= 1;
5536 previous_is_cased
= 0;
5542 PyUnicode_Join(PyObject
*separator
, PyObject
*seq
)
5544 PyObject
*internal_separator
= NULL
;
5545 const Py_UNICODE blank
= ' ';
5546 const Py_UNICODE
*sep
= &blank
;
5547 Py_ssize_t seplen
= 1;
5548 PyUnicodeObject
*res
= NULL
; /* the result */
5549 Py_ssize_t res_alloc
= 100; /* # allocated bytes for string in res */
5550 Py_ssize_t res_used
; /* # used bytes */
5551 Py_UNICODE
*res_p
; /* pointer to free byte in res's string area */
5552 PyObject
*fseq
; /* PySequence_Fast(seq) */
5553 Py_ssize_t seqlen
; /* len(fseq) -- number of items in sequence */
5557 fseq
= PySequence_Fast(seq
, "");
5562 /* Grrrr. A codec may be invoked to convert str objects to
5563 * Unicode, and so it's possible to call back into Python code
5564 * during PyUnicode_FromObject(), and so it's possible for a sick
5565 * codec to change the size of fseq (if seq is a list). Therefore
5566 * we have to keep refetching the size -- can't assume seqlen
5569 seqlen
= PySequence_Fast_GET_SIZE(fseq
);
5570 /* If empty sequence, return u"". */
5572 res
= _PyUnicode_New(0); /* empty sequence; return u"" */
5575 /* If singleton sequence with an exact Unicode, return that. */
5577 item
= PySequence_Fast_GET_ITEM(fseq
, 0);
5578 if (PyUnicode_CheckExact(item
)) {
5580 res
= (PyUnicodeObject
*)item
;
5585 /* At least two items to join, or one that isn't exact Unicode. */
5587 /* Set up sep and seplen -- they're needed. */
5588 if (separator
== NULL
) {
5593 internal_separator
= PyUnicode_FromObject(separator
);
5594 if (internal_separator
== NULL
)
5596 sep
= PyUnicode_AS_UNICODE(internal_separator
);
5597 seplen
= PyUnicode_GET_SIZE(internal_separator
);
5598 /* In case PyUnicode_FromObject() mutated seq. */
5599 seqlen
= PySequence_Fast_GET_SIZE(fseq
);
5604 res
= _PyUnicode_New(res_alloc
);
5607 res_p
= PyUnicode_AS_UNICODE(res
);
5610 for (i
= 0; i
< seqlen
; ++i
) {
5612 Py_ssize_t new_res_used
;
5614 item
= PySequence_Fast_GET_ITEM(fseq
, i
);
5615 /* Convert item to Unicode. */
5616 if (! PyUnicode_Check(item
) && ! PyString_Check(item
)) {
5617 PyErr_Format(PyExc_TypeError
,
5618 "sequence item %zd: expected string or Unicode,"
5620 i
, Py_TYPE(item
)->tp_name
);
5623 item
= PyUnicode_FromObject(item
);
5626 /* We own a reference to item from here on. */
5628 /* In case PyUnicode_FromObject() mutated seq. */
5629 seqlen
= PySequence_Fast_GET_SIZE(fseq
);
5631 /* Make sure we have enough space for the separator and the item. */
5632 itemlen
= PyUnicode_GET_SIZE(item
);
5633 new_res_used
= res_used
+ itemlen
;
5634 if (new_res_used
< 0)
5636 if (i
< seqlen
- 1) {
5637 new_res_used
+= seplen
;
5638 if (new_res_used
< 0)
5641 if (new_res_used
> res_alloc
) {
5642 /* double allocated size until it's big enough */
5644 res_alloc
+= res_alloc
;
5647 } while (new_res_used
> res_alloc
);
5648 if (_PyUnicode_Resize(&res
, res_alloc
) < 0) {
5652 res_p
= PyUnicode_AS_UNICODE(res
) + res_used
;
5655 /* Copy item, and maybe the separator. */
5656 Py_UNICODE_COPY(res_p
, PyUnicode_AS_UNICODE(item
), itemlen
);
5658 if (i
< seqlen
- 1) {
5659 Py_UNICODE_COPY(res_p
, sep
, seplen
);
5663 res_used
= new_res_used
;
5666 /* Shrink res to match the used area; this probably can't fail,
5667 * but it's cheap to check.
5669 if (_PyUnicode_Resize(&res
, res_used
) < 0)
5673 Py_XDECREF(internal_separator
);
5675 return (PyObject
*)res
;
5678 PyErr_SetString(PyExc_OverflowError
,
5679 "join() result is too long for a Python string");
5684 Py_XDECREF(internal_separator
);
5691 PyUnicodeObject
*pad(PyUnicodeObject
*self
,
5703 if (left
== 0 && right
== 0 && PyUnicode_CheckExact(self
)) {
5708 if (left
> PY_SSIZE_T_MAX
- self
->length
||
5709 right
> PY_SSIZE_T_MAX
- (left
+ self
->length
)) {
5710 PyErr_SetString(PyExc_OverflowError
, "padded string is too long");
5713 u
= _PyUnicode_New(left
+ self
->length
+ right
);
5716 Py_UNICODE_FILL(u
->str
, fill
, left
);
5717 Py_UNICODE_COPY(u
->str
+ left
, self
->str
, self
->length
);
5719 Py_UNICODE_FILL(u
->str
+ left
+ self
->length
, fill
, right
);
5725 PyObject
*PyUnicode_Splitlines(PyObject
*string
, int keepends
)
5729 string
= PyUnicode_FromObject(string
);
5733 list
= stringlib_splitlines(
5734 (PyObject
*) string
, PyUnicode_AS_UNICODE(string
),
5735 PyUnicode_GET_SIZE(string
), keepends
);
5742 PyObject
*split(PyUnicodeObject
*self
,
5743 PyUnicodeObject
*substring
,
5744 Py_ssize_t maxcount
)
5747 maxcount
= PY_SSIZE_T_MAX
;
5749 if (substring
== NULL
)
5750 return stringlib_split_whitespace(
5751 (PyObject
*) self
, self
->str
, self
->length
, maxcount
5754 return stringlib_split(
5755 (PyObject
*) self
, self
->str
, self
->length
,
5756 substring
->str
, substring
->length
,
5762 PyObject
*rsplit(PyUnicodeObject
*self
,
5763 PyUnicodeObject
*substring
,
5764 Py_ssize_t maxcount
)
5767 maxcount
= PY_SSIZE_T_MAX
;
5769 if (substring
== NULL
)
5770 return stringlib_rsplit_whitespace(
5771 (PyObject
*) self
, self
->str
, self
->length
, maxcount
5774 return stringlib_rsplit(
5775 (PyObject
*) self
, self
->str
, self
->length
,
5776 substring
->str
, substring
->length
,
5782 PyObject
*replace(PyUnicodeObject
*self
,
5783 PyUnicodeObject
*str1
,
5784 PyUnicodeObject
*str2
,
5785 Py_ssize_t maxcount
)
5790 maxcount
= PY_SSIZE_T_MAX
;
5791 else if (maxcount
== 0 || self
->length
== 0)
5794 if (str1
->length
== str2
->length
) {
5797 if (str1
->length
== 0)
5799 if (str1
->length
== 1) {
5800 /* replace characters */
5802 if (!findchar(self
->str
, self
->length
, str1
->str
[0]))
5804 u
= (PyUnicodeObject
*) PyUnicode_FromUnicode(NULL
, self
->length
);
5807 Py_UNICODE_COPY(u
->str
, self
->str
, self
->length
);
5810 for (i
= 0; i
< u
->length
; i
++)
5811 if (u
->str
[i
] == u1
) {
5818 self
->str
, self
->length
, str1
->str
, str1
->length
, 0
5822 u
= (PyUnicodeObject
*) PyUnicode_FromUnicode(NULL
, self
->length
);
5825 Py_UNICODE_COPY(u
->str
, self
->str
, self
->length
);
5827 /* change everything in-place, starting with this one */
5828 Py_UNICODE_COPY(u
->str
+i
, str2
->str
, str2
->length
);
5831 while ( --maxcount
> 0) {
5832 i
= stringlib_find(self
->str
+i
, self
->length
-i
,
5833 str1
->str
, str1
->length
,
5837 Py_UNICODE_COPY(u
->str
+i
, str2
->str
, str2
->length
);
5844 Py_ssize_t product
, new_size
, delta
;
5847 /* replace strings */
5848 n
= stringlib_count(self
->str
, self
->length
, str1
->str
, str1
->length
,
5852 /* new_size = self->length + n * (str2->length - str1->length)); */
5853 delta
= (str2
->length
- str1
->length
);
5855 new_size
= self
->length
;
5857 product
= n
* (str2
->length
- str1
->length
);
5858 if ((product
/ (str2
->length
- str1
->length
)) != n
) {
5859 PyErr_SetString(PyExc_OverflowError
,
5860 "replace string is too long");
5863 new_size
= self
->length
+ product
;
5865 PyErr_SetString(PyExc_OverflowError
,
5866 "replace string is too long");
5870 u
= _PyUnicode_New(new_size
);
5875 if (str1
->length
> 0) {
5877 /* look for next match */
5878 j
= stringlib_find(self
->str
+i
, self
->length
-i
,
5879 str1
->str
, str1
->length
,
5884 /* copy unchanged part [i:j] */
5885 Py_UNICODE_COPY(p
, self
->str
+i
, j
-i
);
5888 /* copy substitution string */
5889 if (str2
->length
> 0) {
5890 Py_UNICODE_COPY(p
, str2
->str
, str2
->length
);
5893 i
= j
+ str1
->length
;
5895 if (i
< self
->length
)
5896 /* copy tail [i:] */
5897 Py_UNICODE_COPY(p
, self
->str
+i
, self
->length
-i
);
5901 Py_UNICODE_COPY(p
, str2
->str
, str2
->length
);
5905 *p
++ = self
->str
[i
++];
5907 Py_UNICODE_COPY(p
, self
->str
+i
, self
->length
-i
);
5910 return (PyObject
*) u
;
5913 /* nothing to replace; return original string (when possible) */
5914 if (PyUnicode_CheckExact(self
)) {
5916 return (PyObject
*) self
;
5918 return PyUnicode_FromUnicode(self
->str
, self
->length
);
5921 /* --- Unicode Object Methods --------------------------------------------- */
5923 PyDoc_STRVAR(title__doc__
,
5924 "S.title() -> unicode\n\
5926 Return a titlecased version of S, i.e. words start with title case\n\
5927 characters, all remaining cased characters have lower case.");
5930 unicode_title(PyUnicodeObject
*self
)
5932 return fixup(self
, fixtitle
);
5935 PyDoc_STRVAR(capitalize__doc__
,
5936 "S.capitalize() -> unicode\n\
5938 Return a capitalized version of S, i.e. make the first character\n\
5939 have upper case and the rest lower case.");
5942 unicode_capitalize(PyUnicodeObject
*self
)
5944 return fixup(self
, fixcapitalize
);
5948 PyDoc_STRVAR(capwords__doc__
,
5949 "S.capwords() -> unicode\n\
5951 Apply .capitalize() to all words in S and return the result with\n\
5952 normalized whitespace (all whitespace strings are replaced by ' ').");
5955 unicode_capwords(PyUnicodeObject
*self
)
5961 /* Split into words */
5962 list
= split(self
, NULL
, -1);
5966 /* Capitalize each word */
5967 for (i
= 0; i
< PyList_GET_SIZE(list
); i
++) {
5968 item
= fixup((PyUnicodeObject
*)PyList_GET_ITEM(list
, i
),
5972 Py_DECREF(PyList_GET_ITEM(list
, i
));
5973 PyList_SET_ITEM(list
, i
, item
);
5976 /* Join the words to form a new string */
5977 item
= PyUnicode_Join(NULL
, list
);
5981 return (PyObject
*)item
;
5985 /* Argument converter. Coerces to a single unicode character */
5988 convert_uc(PyObject
*obj
, void *addr
)
5990 Py_UNICODE
*fillcharloc
= (Py_UNICODE
*)addr
;
5994 uniobj
= PyUnicode_FromObject(obj
);
5995 if (uniobj
== NULL
) {
5996 PyErr_SetString(PyExc_TypeError
,
5997 "The fill character cannot be converted to Unicode");
6000 if (PyUnicode_GET_SIZE(uniobj
) != 1) {
6001 PyErr_SetString(PyExc_TypeError
,
6002 "The fill character must be exactly one character long");
6006 unistr
= PyUnicode_AS_UNICODE(uniobj
);
6007 *fillcharloc
= unistr
[0];
6012 PyDoc_STRVAR(center__doc__
,
6013 "S.center(width[, fillchar]) -> unicode\n\
6015 Return S centered in a Unicode string of length width. Padding is\n\
6016 done using the specified fill character (default is a space)");
6019 unicode_center(PyUnicodeObject
*self
, PyObject
*args
)
6021 Py_ssize_t marg
, left
;
6023 Py_UNICODE fillchar
= ' ';
6025 if (!PyArg_ParseTuple(args
, "n|O&:center", &width
, convert_uc
, &fillchar
))
6028 if (self
->length
>= width
&& PyUnicode_CheckExact(self
)) {
6030 return (PyObject
*) self
;
6033 marg
= width
- self
->length
;
6034 left
= marg
/ 2 + (marg
& width
& 1);
6036 return (PyObject
*) pad(self
, left
, marg
- left
, fillchar
);
6041 /* This code should go into some future Unicode collation support
6042 module. The basic comparison should compare ordinals on a naive
6043 basis (this is what Java does and thus Jython too). */
6045 /* speedy UTF-16 code point order comparison */
6047 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6049 static short utf16Fixup
[32] =
6051 0, 0, 0, 0, 0, 0, 0, 0,
6052 0, 0, 0, 0, 0, 0, 0, 0,
6053 0, 0, 0, 0, 0, 0, 0, 0,
6054 0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
6058 unicode_compare(PyUnicodeObject
*str1
, PyUnicodeObject
*str2
)
6060 Py_ssize_t len1
, len2
;
6062 Py_UNICODE
*s1
= str1
->str
;
6063 Py_UNICODE
*s2
= str2
->str
;
6065 len1
= str1
->length
;
6066 len2
= str2
->length
;
6068 while (len1
> 0 && len2
> 0) {
6074 if (c1
> (1<<11) * 26)
6075 c1
+= utf16Fixup
[c1
>>11];
6076 if (c2
> (1<<11) * 26)
6077 c2
+= utf16Fixup
[c2
>>11];
6078 /* now c1 and c2 are in UTF-32-compatible order */
6081 return (c1
< c2
) ? -1 : 1;
6086 return (len1
< len2
) ? -1 : (len1
!= len2
);
6092 unicode_compare(PyUnicodeObject
*str1
, PyUnicodeObject
*str2
)
6094 register Py_ssize_t len1
, len2
;
6096 Py_UNICODE
*s1
= str1
->str
;
6097 Py_UNICODE
*s2
= str2
->str
;
6099 len1
= str1
->length
;
6100 len2
= str2
->length
;
6102 while (len1
> 0 && len2
> 0) {
6109 return (c1
< c2
) ? -1 : 1;
6114 return (len1
< len2
) ? -1 : (len1
!= len2
);
6119 int PyUnicode_Compare(PyObject
*left
,
6122 PyUnicodeObject
*u
= NULL
, *v
= NULL
;
6125 /* Coerce the two arguments */
6126 u
= (PyUnicodeObject
*)PyUnicode_FromObject(left
);
6129 v
= (PyUnicodeObject
*)PyUnicode_FromObject(right
);
6133 /* Shortcut for empty or interned objects */
6140 result
= unicode_compare(u
, v
);
6152 PyObject
*PyUnicode_RichCompare(PyObject
*left
,
6158 result
= PyUnicode_Compare(left
, right
);
6159 if (result
== -1 && PyErr_Occurred())
6162 /* Convert the return value to a Boolean */
6165 result
= (result
== 0);
6168 result
= (result
!= 0);
6171 result
= (result
<= 0);
6174 result
= (result
>= 0);
6177 result
= (result
== -1);
6180 result
= (result
== 1);
6183 return PyBool_FromLong(result
);
6189 Type errors mean that PyUnicode_FromObject() could not convert
6190 one of the arguments (usually the right hand side) to Unicode,
6191 i.e. we can't handle the comparison request. However, it is
6192 possible that the other object knows a comparison method, which
6193 is why we return Py_NotImplemented to give the other object a
6197 if (PyErr_ExceptionMatches(PyExc_TypeError
)) {
6199 Py_INCREF(Py_NotImplemented
);
6200 return Py_NotImplemented
;
6202 if (op
!= Py_EQ
&& op
!= Py_NE
)
6205 /* Equality comparison.
6207 This is a special case: we silence any PyExc_UnicodeDecodeError
6208 and instead turn it into a PyExc_UnicodeWarning.
6211 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError
))
6214 if (PyErr_Warn(PyExc_UnicodeWarning
,
6216 "Unicode equal comparison "
6217 "failed to convert both arguments to Unicode - "
6218 "interpreting them as being unequal" :
6219 "Unicode unequal comparison "
6220 "failed to convert both arguments to Unicode - "
6221 "interpreting them as being unequal"
6224 result
= (op
== Py_NE
);
6225 return PyBool_FromLong(result
);
6228 int PyUnicode_Contains(PyObject
*container
,
6231 PyObject
*str
, *sub
;
6234 /* Coerce the two arguments */
6235 sub
= PyUnicode_FromObject(element
);
6240 str
= PyUnicode_FromObject(container
);
6246 result
= stringlib_contains_obj(str
, sub
);
6254 /* Concat to string or Unicode object giving a new Unicode object. */
6256 PyObject
*PyUnicode_Concat(PyObject
*left
,
6259 PyUnicodeObject
*u
= NULL
, *v
= NULL
, *w
;
6261 /* Coerce the two arguments */
6262 u
= (PyUnicodeObject
*)PyUnicode_FromObject(left
);
6265 v
= (PyUnicodeObject
*)PyUnicode_FromObject(right
);
6270 if (v
== unicode_empty
) {
6272 return (PyObject
*)u
;
6274 if (u
== unicode_empty
) {
6276 return (PyObject
*)v
;
6279 /* Concat the two Unicode strings */
6280 w
= _PyUnicode_New(u
->length
+ v
->length
);
6283 Py_UNICODE_COPY(w
->str
, u
->str
, u
->length
);
6284 Py_UNICODE_COPY(w
->str
+ u
->length
, v
->str
, v
->length
);
6288 return (PyObject
*)w
;
6296 PyDoc_STRVAR(count__doc__
,
6297 "S.count(sub[, start[, end]]) -> int\n\
6299 Return the number of non-overlapping occurrences of substring sub in\n\
6300 Unicode string S[start:end]. Optional arguments start and end are\n\
6301 interpreted as in slice notation.");
6304 unicode_count(PyUnicodeObject
*self
, PyObject
*args
)
6306 PyUnicodeObject
*substring
;
6307 Py_ssize_t start
= 0;
6308 Py_ssize_t end
= PY_SSIZE_T_MAX
;
6311 if (!stringlib_parse_args_finds_unicode("count", args
, &substring
,
6315 ADJUST_INDICES(start
, end
, self
->length
);
6316 result
= PyInt_FromSsize_t(
6317 stringlib_count(self
->str
+ start
, end
- start
,
6318 substring
->str
, substring
->length
,
6322 Py_DECREF(substring
);
6327 PyDoc_STRVAR(encode__doc__
,
6328 "S.encode([encoding[,errors]]) -> string or unicode\n\
6330 Encodes S using the codec registered for encoding. encoding defaults\n\
6331 to the default encoding. errors may be given to set a different error\n\
6332 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6333 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6334 'xmlcharrefreplace' as well as any other name registered with\n\
6335 codecs.register_error that can handle UnicodeEncodeErrors.");
6338 unicode_encode(PyUnicodeObject
*self
, PyObject
*args
, PyObject
*kwargs
)
6340 static char *kwlist
[] = {"encoding", "errors", 0};
6341 char *encoding
= NULL
;
6342 char *errors
= NULL
;
6345 if (!PyArg_ParseTupleAndKeywords(args
, kwargs
, "|ss:encode",
6346 kwlist
, &encoding
, &errors
))
6348 v
= PyUnicode_AsEncodedObject((PyObject
*)self
, encoding
, errors
);
6351 if (!PyString_Check(v
) && !PyUnicode_Check(v
)) {
6352 PyErr_Format(PyExc_TypeError
,
6353 "encoder did not return a string/unicode object "
6355 Py_TYPE(v
)->tp_name
);
6365 PyDoc_STRVAR(decode__doc__
,
6366 "S.decode([encoding[,errors]]) -> string or unicode\n\
6368 Decodes S using the codec registered for encoding. encoding defaults\n\
6369 to the default encoding. errors may be given to set a different error\n\
6370 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6371 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6372 as well as any other name registerd with codecs.register_error that is\n\
6373 able to handle UnicodeDecodeErrors.");
6376 unicode_decode(PyUnicodeObject
*self
, PyObject
*args
, PyObject
*kwargs
)
6378 static char *kwlist
[] = {"encoding", "errors", 0};
6379 char *encoding
= NULL
;
6380 char *errors
= NULL
;
6383 if (!PyArg_ParseTupleAndKeywords(args
, kwargs
, "|ss:decode",
6384 kwlist
, &encoding
, &errors
))
6386 v
= PyUnicode_AsDecodedObject((PyObject
*)self
, encoding
, errors
);
6389 if (!PyString_Check(v
) && !PyUnicode_Check(v
)) {
6390 PyErr_Format(PyExc_TypeError
,
6391 "decoder did not return a string/unicode object "
6393 Py_TYPE(v
)->tp_name
);
6403 PyDoc_STRVAR(expandtabs__doc__
,
6404 "S.expandtabs([tabsize]) -> unicode\n\
6406 Return a copy of S where all tab characters are expanded using spaces.\n\
6407 If tabsize is not given, a tab size of 8 characters is assumed.");
6410 unicode_expandtabs(PyUnicodeObject
*self
, PyObject
*args
)
6416 Py_ssize_t i
, j
, incr
;
6420 if (!PyArg_ParseTuple(args
, "|i:expandtabs", &tabsize
))
6423 /* First pass: determine size of output string */
6424 i
= 0; /* chars up to and including most recent \n or \r */
6425 j
= 0; /* chars since most recent \n or \r (use in tab calculations) */
6426 e
= self
->str
+ self
->length
; /* end of input */
6427 for (p
= self
->str
; p
< e
; p
++)
6430 incr
= tabsize
- (j
% tabsize
); /* cannot overflow */
6431 if (j
> PY_SSIZE_T_MAX
- incr
)
6437 if (j
> PY_SSIZE_T_MAX
- 1)
6440 if (*p
== '\n' || *p
== '\r') {
6441 if (i
> PY_SSIZE_T_MAX
- j
)
6448 if (i
> PY_SSIZE_T_MAX
- j
)
6451 /* Second pass: create output string and fill it */
6452 u
= _PyUnicode_New(i
+ j
);
6456 j
= 0; /* same as in first pass */
6457 q
= u
->str
; /* next output char */
6458 qe
= u
->str
+ u
->length
; /* end of output */
6460 for (p
= self
->str
; p
< e
; p
++)
6463 i
= tabsize
- (j
% tabsize
);
6477 if (*p
== '\n' || *p
== '\r')
6481 return (PyObject
*) u
;
6486 PyErr_SetString(PyExc_OverflowError
, "new string is too long");
6490 PyDoc_STRVAR(find__doc__
,
6491 "S.find(sub [,start [,end]]) -> int\n\
6493 Return the lowest index in S where substring sub is found,\n\
6494 such that sub is contained within s[start:end]. Optional\n\
6495 arguments start and end are interpreted as in slice notation.\n\
6497 Return -1 on failure.");
6500 unicode_find(PyUnicodeObject
*self
, PyObject
*args
)
6502 PyUnicodeObject
*substring
;
6507 if (!stringlib_parse_args_finds_unicode("find", args
, &substring
,
6511 result
= stringlib_find_slice(
6512 PyUnicode_AS_UNICODE(self
), PyUnicode_GET_SIZE(self
),
6513 PyUnicode_AS_UNICODE(substring
), PyUnicode_GET_SIZE(substring
),
6517 Py_DECREF(substring
);
6519 return PyInt_FromSsize_t(result
);
6523 unicode_getitem(PyUnicodeObject
*self
, Py_ssize_t index
)
6525 if (index
< 0 || index
>= self
->length
) {
6526 PyErr_SetString(PyExc_IndexError
, "string index out of range");
6530 return (PyObject
*) PyUnicode_FromUnicode(&self
->str
[index
], 1);
6534 unicode_hash(PyUnicodeObject
*self
)
6536 /* Since Unicode objects compare equal to their ASCII string
6537 counterparts, they should use the individual character values
6538 as basis for their hash value. This is needed to assure that
6539 strings and Unicode objects behave in the same way as
6542 register Py_ssize_t len
;
6543 register Py_UNICODE
*p
;
6546 if (self
->hash
!= -1)
6548 len
= PyUnicode_GET_SIZE(self
);
6549 p
= PyUnicode_AS_UNICODE(self
);
6552 x
= (1000003*x
) ^ *p
++;
6553 x
^= PyUnicode_GET_SIZE(self
);
6560 PyDoc_STRVAR(index__doc__
,
6561 "S.index(sub [,start [,end]]) -> int\n\
6563 Like S.find() but raise ValueError when the substring is not found.");
6566 unicode_index(PyUnicodeObject
*self
, PyObject
*args
)
6569 PyUnicodeObject
*substring
;
6573 if (!stringlib_parse_args_finds_unicode("index", args
, &substring
,
6577 result
= stringlib_find_slice(
6578 PyUnicode_AS_UNICODE(self
), PyUnicode_GET_SIZE(self
),
6579 PyUnicode_AS_UNICODE(substring
), PyUnicode_GET_SIZE(substring
),
6583 Py_DECREF(substring
);
6586 PyErr_SetString(PyExc_ValueError
, "substring not found");
6590 return PyInt_FromSsize_t(result
);
6593 PyDoc_STRVAR(islower__doc__
,
6594 "S.islower() -> bool\n\
6596 Return True if all cased characters in S are lowercase and there is\n\
6597 at least one cased character in S, False otherwise.");
6600 unicode_islower(PyUnicodeObject
*self
)
6602 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6603 register const Py_UNICODE
*e
;
6606 /* Shortcut for single character strings */
6607 if (PyUnicode_GET_SIZE(self
) == 1)
6608 return PyBool_FromLong(Py_UNICODE_ISLOWER(*p
));
6610 /* Special case for empty strings */
6611 if (PyUnicode_GET_SIZE(self
) == 0)
6612 return PyBool_FromLong(0);
6614 e
= p
+ PyUnicode_GET_SIZE(self
);
6616 for (; p
< e
; p
++) {
6617 register const Py_UNICODE ch
= *p
;
6619 if (Py_UNICODE_ISUPPER(ch
) || Py_UNICODE_ISTITLE(ch
))
6620 return PyBool_FromLong(0);
6621 else if (!cased
&& Py_UNICODE_ISLOWER(ch
))
6624 return PyBool_FromLong(cased
);
6627 PyDoc_STRVAR(isupper__doc__
,
6628 "S.isupper() -> bool\n\
6630 Return True if all cased characters in S are uppercase and there is\n\
6631 at least one cased character in S, False otherwise.");
6634 unicode_isupper(PyUnicodeObject
*self
)
6636 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6637 register const Py_UNICODE
*e
;
6640 /* Shortcut for single character strings */
6641 if (PyUnicode_GET_SIZE(self
) == 1)
6642 return PyBool_FromLong(Py_UNICODE_ISUPPER(*p
) != 0);
6644 /* Special case for empty strings */
6645 if (PyUnicode_GET_SIZE(self
) == 0)
6646 return PyBool_FromLong(0);
6648 e
= p
+ PyUnicode_GET_SIZE(self
);
6650 for (; p
< e
; p
++) {
6651 register const Py_UNICODE ch
= *p
;
6653 if (Py_UNICODE_ISLOWER(ch
) || Py_UNICODE_ISTITLE(ch
))
6654 return PyBool_FromLong(0);
6655 else if (!cased
&& Py_UNICODE_ISUPPER(ch
))
6658 return PyBool_FromLong(cased
);
6661 PyDoc_STRVAR(istitle__doc__
,
6662 "S.istitle() -> bool\n\
6664 Return True if S is a titlecased string and there is at least one\n\
6665 character in S, i.e. upper- and titlecase characters may only\n\
6666 follow uncased characters and lowercase characters only cased ones.\n\
6667 Return False otherwise.");
6670 unicode_istitle(PyUnicodeObject
*self
)
6672 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6673 register const Py_UNICODE
*e
;
6674 int cased
, previous_is_cased
;
6676 /* Shortcut for single character strings */
6677 if (PyUnicode_GET_SIZE(self
) == 1)
6678 return PyBool_FromLong((Py_UNICODE_ISTITLE(*p
) != 0) ||
6679 (Py_UNICODE_ISUPPER(*p
) != 0));
6681 /* Special case for empty strings */
6682 if (PyUnicode_GET_SIZE(self
) == 0)
6683 return PyBool_FromLong(0);
6685 e
= p
+ PyUnicode_GET_SIZE(self
);
6687 previous_is_cased
= 0;
6688 for (; p
< e
; p
++) {
6689 register const Py_UNICODE ch
= *p
;
6691 if (Py_UNICODE_ISUPPER(ch
) || Py_UNICODE_ISTITLE(ch
)) {
6692 if (previous_is_cased
)
6693 return PyBool_FromLong(0);
6694 previous_is_cased
= 1;
6697 else if (Py_UNICODE_ISLOWER(ch
)) {
6698 if (!previous_is_cased
)
6699 return PyBool_FromLong(0);
6700 previous_is_cased
= 1;
6704 previous_is_cased
= 0;
6706 return PyBool_FromLong(cased
);
6709 PyDoc_STRVAR(isspace__doc__
,
6710 "S.isspace() -> bool\n\
6712 Return True if all characters in S are whitespace\n\
6713 and there is at least one character in S, False otherwise.");
6716 unicode_isspace(PyUnicodeObject
*self
)
6718 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6719 register const Py_UNICODE
*e
;
6721 /* Shortcut for single character strings */
6722 if (PyUnicode_GET_SIZE(self
) == 1 &&
6723 Py_UNICODE_ISSPACE(*p
))
6724 return PyBool_FromLong(1);
6726 /* Special case for empty strings */
6727 if (PyUnicode_GET_SIZE(self
) == 0)
6728 return PyBool_FromLong(0);
6730 e
= p
+ PyUnicode_GET_SIZE(self
);
6731 for (; p
< e
; p
++) {
6732 if (!Py_UNICODE_ISSPACE(*p
))
6733 return PyBool_FromLong(0);
6735 return PyBool_FromLong(1);
6738 PyDoc_STRVAR(isalpha__doc__
,
6739 "S.isalpha() -> bool\n\
6741 Return True if all characters in S are alphabetic\n\
6742 and there is at least one character in S, False otherwise.");
6745 unicode_isalpha(PyUnicodeObject
*self
)
6747 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6748 register const Py_UNICODE
*e
;
6750 /* Shortcut for single character strings */
6751 if (PyUnicode_GET_SIZE(self
) == 1 &&
6752 Py_UNICODE_ISALPHA(*p
))
6753 return PyBool_FromLong(1);
6755 /* Special case for empty strings */
6756 if (PyUnicode_GET_SIZE(self
) == 0)
6757 return PyBool_FromLong(0);
6759 e
= p
+ PyUnicode_GET_SIZE(self
);
6760 for (; p
< e
; p
++) {
6761 if (!Py_UNICODE_ISALPHA(*p
))
6762 return PyBool_FromLong(0);
6764 return PyBool_FromLong(1);
6767 PyDoc_STRVAR(isalnum__doc__
,
6768 "S.isalnum() -> bool\n\
6770 Return True if all characters in S are alphanumeric\n\
6771 and there is at least one character in S, False otherwise.");
6774 unicode_isalnum(PyUnicodeObject
*self
)
6776 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6777 register const Py_UNICODE
*e
;
6779 /* Shortcut for single character strings */
6780 if (PyUnicode_GET_SIZE(self
) == 1 &&
6781 Py_UNICODE_ISALNUM(*p
))
6782 return PyBool_FromLong(1);
6784 /* Special case for empty strings */
6785 if (PyUnicode_GET_SIZE(self
) == 0)
6786 return PyBool_FromLong(0);
6788 e
= p
+ PyUnicode_GET_SIZE(self
);
6789 for (; p
< e
; p
++) {
6790 if (!Py_UNICODE_ISALNUM(*p
))
6791 return PyBool_FromLong(0);
6793 return PyBool_FromLong(1);
6796 PyDoc_STRVAR(isdecimal__doc__
,
6797 "S.isdecimal() -> bool\n\
6799 Return True if there are only decimal characters in S,\n\
6803 unicode_isdecimal(PyUnicodeObject
*self
)
6805 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6806 register const Py_UNICODE
*e
;
6808 /* Shortcut for single character strings */
6809 if (PyUnicode_GET_SIZE(self
) == 1 &&
6810 Py_UNICODE_ISDECIMAL(*p
))
6811 return PyBool_FromLong(1);
6813 /* Special case for empty strings */
6814 if (PyUnicode_GET_SIZE(self
) == 0)
6815 return PyBool_FromLong(0);
6817 e
= p
+ PyUnicode_GET_SIZE(self
);
6818 for (; p
< e
; p
++) {
6819 if (!Py_UNICODE_ISDECIMAL(*p
))
6820 return PyBool_FromLong(0);
6822 return PyBool_FromLong(1);
6825 PyDoc_STRVAR(isdigit__doc__
,
6826 "S.isdigit() -> bool\n\
6828 Return True if all characters in S are digits\n\
6829 and there is at least one character in S, False otherwise.");
6832 unicode_isdigit(PyUnicodeObject
*self
)
6834 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6835 register const Py_UNICODE
*e
;
6837 /* Shortcut for single character strings */
6838 if (PyUnicode_GET_SIZE(self
) == 1 &&
6839 Py_UNICODE_ISDIGIT(*p
))
6840 return PyBool_FromLong(1);
6842 /* Special case for empty strings */
6843 if (PyUnicode_GET_SIZE(self
) == 0)
6844 return PyBool_FromLong(0);
6846 e
= p
+ PyUnicode_GET_SIZE(self
);
6847 for (; p
< e
; p
++) {
6848 if (!Py_UNICODE_ISDIGIT(*p
))
6849 return PyBool_FromLong(0);
6851 return PyBool_FromLong(1);
6854 PyDoc_STRVAR(isnumeric__doc__
,
6855 "S.isnumeric() -> bool\n\
6857 Return True if there are only numeric characters in S,\n\
6861 unicode_isnumeric(PyUnicodeObject
*self
)
6863 register const Py_UNICODE
*p
= PyUnicode_AS_UNICODE(self
);
6864 register const Py_UNICODE
*e
;
6866 /* Shortcut for single character strings */
6867 if (PyUnicode_GET_SIZE(self
) == 1 &&
6868 Py_UNICODE_ISNUMERIC(*p
))
6869 return PyBool_FromLong(1);
6871 /* Special case for empty strings */
6872 if (PyUnicode_GET_SIZE(self
) == 0)
6873 return PyBool_FromLong(0);
6875 e
= p
+ PyUnicode_GET_SIZE(self
);
6876 for (; p
< e
; p
++) {
6877 if (!Py_UNICODE_ISNUMERIC(*p
))
6878 return PyBool_FromLong(0);
6880 return PyBool_FromLong(1);
6883 PyDoc_STRVAR(join__doc__
,
6884 "S.join(iterable) -> unicode\n\
6886 Return a string which is the concatenation of the strings in the\n\
6887 iterable. The separator between elements is S.");
6890 unicode_join(PyObject
*self
, PyObject
*data
)
6892 return PyUnicode_Join(self
, data
);
6896 unicode_length(PyUnicodeObject
*self
)
6898 return self
->length
;
6901 PyDoc_STRVAR(ljust__doc__
,
6902 "S.ljust(width[, fillchar]) -> int\n\
6904 Return S left-justified in a Unicode string of length width. Padding is\n\
6905 done using the specified fill character (default is a space).");
6908 unicode_ljust(PyUnicodeObject
*self
, PyObject
*args
)
6911 Py_UNICODE fillchar
= ' ';
6913 if (!PyArg_ParseTuple(args
, "n|O&:ljust", &width
, convert_uc
, &fillchar
))
6916 if (self
->length
>= width
&& PyUnicode_CheckExact(self
)) {
6918 return (PyObject
*) self
;
6921 return (PyObject
*) pad(self
, 0, width
- self
->length
, fillchar
);
6924 PyDoc_STRVAR(lower__doc__
,
6925 "S.lower() -> unicode\n\
6927 Return a copy of the string S converted to lowercase.");
6930 unicode_lower(PyUnicodeObject
*self
)
6932 return fixup(self
, fixlower
);
6936 #define RIGHTSTRIP 1
6939 /* Arrays indexed by above */
6940 static const char *stripformat
[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};
6942 #define STRIPNAME(i) (stripformat[i]+3)
6944 /* externally visible for str.strip(unicode) */
6946 _PyUnicode_XStrip(PyUnicodeObject
*self
, int striptype
, PyObject
*sepobj
)
6948 Py_UNICODE
*s
= PyUnicode_AS_UNICODE(self
);
6949 Py_ssize_t len
= PyUnicode_GET_SIZE(self
);
6950 Py_UNICODE
*sep
= PyUnicode_AS_UNICODE(sepobj
);
6951 Py_ssize_t seplen
= PyUnicode_GET_SIZE(sepobj
);
6954 BLOOM_MASK sepmask
= make_bloom_mask(sep
, seplen
);
6957 if (striptype
!= RIGHTSTRIP
) {
6958 while (i
< len
&& BLOOM_MEMBER(sepmask
, s
[i
], sep
, seplen
)) {
6964 if (striptype
!= LEFTSTRIP
) {
6967 } while (j
>= i
&& BLOOM_MEMBER(sepmask
, s
[j
], sep
, seplen
));
6971 if (i
== 0 && j
== len
&& PyUnicode_CheckExact(self
)) {
6973 return (PyObject
*)self
;
6976 return PyUnicode_FromUnicode(s
+i
, j
-i
);
6981 do_strip(PyUnicodeObject
*self
, int striptype
)
6983 Py_UNICODE
*s
= PyUnicode_AS_UNICODE(self
);
6984 Py_ssize_t len
= PyUnicode_GET_SIZE(self
), i
, j
;
6987 if (striptype
!= RIGHTSTRIP
) {
6988 while (i
< len
&& Py_UNICODE_ISSPACE(s
[i
])) {
6994 if (striptype
!= LEFTSTRIP
) {
6997 } while (j
>= i
&& Py_UNICODE_ISSPACE(s
[j
]));
7001 if (i
== 0 && j
== len
&& PyUnicode_CheckExact(self
)) {
7003 return (PyObject
*)self
;
7006 return PyUnicode_FromUnicode(s
+i
, j
-i
);
7011 do_argstrip(PyUnicodeObject
*self
, int striptype
, PyObject
*args
)
7013 PyObject
*sep
= NULL
;
7015 if (!PyArg_ParseTuple(args
, (char *)stripformat
[striptype
], &sep
))
7018 if (sep
!= NULL
&& sep
!= Py_None
) {
7019 if (PyUnicode_Check(sep
))
7020 return _PyUnicode_XStrip(self
, striptype
, sep
);
7021 else if (PyString_Check(sep
)) {
7023 sep
= PyUnicode_FromObject(sep
);
7026 res
= _PyUnicode_XStrip(self
, striptype
, sep
);
7031 PyErr_Format(PyExc_TypeError
,
7032 "%s arg must be None, unicode or str",
7033 STRIPNAME(striptype
));
7038 return do_strip(self
, striptype
);
7042 PyDoc_STRVAR(strip__doc__
,
7043 "S.strip([chars]) -> unicode\n\
7045 Return a copy of the string S with leading and trailing\n\
7046 whitespace removed.\n\
7047 If chars is given and not None, remove characters in chars instead.\n\
7048 If chars is a str, it will be converted to unicode before stripping");
7051 unicode_strip(PyUnicodeObject
*self
, PyObject
*args
)
7053 if (PyTuple_GET_SIZE(args
) == 0)
7054 return do_strip(self
, BOTHSTRIP
); /* Common case */
7056 return do_argstrip(self
, BOTHSTRIP
, args
);
7060 PyDoc_STRVAR(lstrip__doc__
,
7061 "S.lstrip([chars]) -> unicode\n\
7063 Return a copy of the string S with leading whitespace removed.\n\
7064 If chars is given and not None, remove characters in chars instead.\n\
7065 If chars is a str, it will be converted to unicode before stripping");
7068 unicode_lstrip(PyUnicodeObject
*self
, PyObject
*args
)
7070 if (PyTuple_GET_SIZE(args
) == 0)
7071 return do_strip(self
, LEFTSTRIP
); /* Common case */
7073 return do_argstrip(self
, LEFTSTRIP
, args
);
7077 PyDoc_STRVAR(rstrip__doc__
,
7078 "S.rstrip([chars]) -> unicode\n\
7080 Return a copy of the string S with trailing whitespace removed.\n\
7081 If chars is given and not None, remove characters in chars instead.\n\
7082 If chars is a str, it will be converted to unicode before stripping");
7085 unicode_rstrip(PyUnicodeObject
*self
, PyObject
*args
)
7087 if (PyTuple_GET_SIZE(args
) == 0)
7088 return do_strip(self
, RIGHTSTRIP
); /* Common case */
7090 return do_argstrip(self
, RIGHTSTRIP
, args
);
7095 unicode_repeat(PyUnicodeObject
*str
, Py_ssize_t len
)
7105 if (len
== 1 && PyUnicode_CheckExact(str
)) {
7106 /* no repeat, return original string */
7108 return (PyObject
*) str
;
7111 /* ensure # of chars needed doesn't overflow int and # of bytes
7112 * needed doesn't overflow size_t
7114 nchars
= len
* str
->length
;
7115 if (len
&& nchars
/ len
!= str
->length
) {
7116 PyErr_SetString(PyExc_OverflowError
,
7117 "repeated string is too long");
7120 nbytes
= (nchars
+ 1) * sizeof(Py_UNICODE
);
7121 if (nbytes
/ sizeof(Py_UNICODE
) != (size_t)(nchars
+ 1)) {
7122 PyErr_SetString(PyExc_OverflowError
,
7123 "repeated string is too long");
7126 u
= _PyUnicode_New(nchars
);
7132 if (str
->length
== 1 && len
> 0) {
7133 Py_UNICODE_FILL(p
, str
->str
[0], len
);
7135 Py_ssize_t done
= 0; /* number of characters copied this far */
7136 if (done
< nchars
) {
7137 Py_UNICODE_COPY(p
, str
->str
, str
->length
);
7140 while (done
< nchars
) {
7141 Py_ssize_t n
= (done
<= nchars
-done
) ? done
: nchars
-done
;
7142 Py_UNICODE_COPY(p
+done
, p
, n
);
7147 return (PyObject
*) u
;
7150 PyObject
*PyUnicode_Replace(PyObject
*obj
,
7153 Py_ssize_t maxcount
)
7160 self
= PyUnicode_FromObject(obj
);
7163 str1
= PyUnicode_FromObject(subobj
);
7168 str2
= PyUnicode_FromObject(replobj
);
7174 result
= replace((PyUnicodeObject
*)self
,
7175 (PyUnicodeObject
*)str1
,
7176 (PyUnicodeObject
*)str2
,
7184 PyDoc_STRVAR(replace__doc__
,
7185 "S.replace(old, new[, count]) -> unicode\n\
7187 Return a copy of S with all occurrences of substring\n\
7188 old replaced by new. If the optional argument count is\n\
7189 given, only the first count occurrences are replaced.");
7192 unicode_replace(PyUnicodeObject
*self
, PyObject
*args
)
7194 PyUnicodeObject
*str1
;
7195 PyUnicodeObject
*str2
;
7196 Py_ssize_t maxcount
= -1;
7199 if (!PyArg_ParseTuple(args
, "OO|n:replace", &str1
, &str2
, &maxcount
))
7201 str1
= (PyUnicodeObject
*)PyUnicode_FromObject((PyObject
*)str1
);
7204 str2
= (PyUnicodeObject
*)PyUnicode_FromObject((PyObject
*)str2
);
7210 result
= replace(self
, str1
, str2
, maxcount
);
7218 PyObject
*unicode_repr(PyObject
*unicode
)
7220 return unicodeescape_string(PyUnicode_AS_UNICODE(unicode
),
7221 PyUnicode_GET_SIZE(unicode
),
7225 PyDoc_STRVAR(rfind__doc__
,
7226 "S.rfind(sub [,start [,end]]) -> int\n\
7228 Return the highest index in S where substring sub is found,\n\
7229 such that sub is contained within s[start:end]. Optional\n\
7230 arguments start and end are interpreted as in slice notation.\n\
7232 Return -1 on failure.");
7235 unicode_rfind(PyUnicodeObject
*self
, PyObject
*args
)
7237 PyUnicodeObject
*substring
;
7242 if (!stringlib_parse_args_finds_unicode("rfind", args
, &substring
,
7246 result
= stringlib_rfind_slice(
7247 PyUnicode_AS_UNICODE(self
), PyUnicode_GET_SIZE(self
),
7248 PyUnicode_AS_UNICODE(substring
), PyUnicode_GET_SIZE(substring
),
7252 Py_DECREF(substring
);
7254 return PyInt_FromSsize_t(result
);
7257 PyDoc_STRVAR(rindex__doc__
,
7258 "S.rindex(sub [,start [,end]]) -> int\n\
7260 Like S.rfind() but raise ValueError when the substring is not found.");
7263 unicode_rindex(PyUnicodeObject
*self
, PyObject
*args
)
7265 PyUnicodeObject
*substring
;
7270 if (!stringlib_parse_args_finds_unicode("rindex", args
, &substring
,
7274 result
= stringlib_rfind_slice(
7275 PyUnicode_AS_UNICODE(self
), PyUnicode_GET_SIZE(self
),
7276 PyUnicode_AS_UNICODE(substring
), PyUnicode_GET_SIZE(substring
),
7280 Py_DECREF(substring
);
7283 PyErr_SetString(PyExc_ValueError
, "substring not found");
7286 return PyInt_FromSsize_t(result
);
7289 PyDoc_STRVAR(rjust__doc__
,
7290 "S.rjust(width[, fillchar]) -> unicode\n\
7292 Return S right-justified in a Unicode string of length width. Padding is\n\
7293 done using the specified fill character (default is a space).");
7296 unicode_rjust(PyUnicodeObject
*self
, PyObject
*args
)
7299 Py_UNICODE fillchar
= ' ';
7301 if (!PyArg_ParseTuple(args
, "n|O&:rjust", &width
, convert_uc
, &fillchar
))
7304 if (self
->length
>= width
&& PyUnicode_CheckExact(self
)) {
7306 return (PyObject
*) self
;
7309 return (PyObject
*) pad(self
, width
- self
->length
, 0, fillchar
);
7313 unicode_slice(PyUnicodeObject
*self
, Py_ssize_t start
, Py_ssize_t end
)
7315 /* standard clamping */
7320 if (end
> self
->length
)
7322 if (start
== 0 && end
== self
->length
&& PyUnicode_CheckExact(self
)) {
7323 /* full slice, return original string */
7325 return (PyObject
*) self
;
7330 return (PyObject
*) PyUnicode_FromUnicode(self
->str
+ start
,
7334 PyObject
*PyUnicode_Split(PyObject
*s
,
7336 Py_ssize_t maxsplit
)
7340 s
= PyUnicode_FromObject(s
);
7344 sep
= PyUnicode_FromObject(sep
);
7351 result
= split((PyUnicodeObject
*)s
, (PyUnicodeObject
*)sep
, maxsplit
);
7358 PyDoc_STRVAR(split__doc__
,
7359 "S.split([sep [,maxsplit]]) -> list of strings\n\
7361 Return a list of the words in S, using sep as the\n\
7362 delimiter string. If maxsplit is given, at most maxsplit\n\
7363 splits are done. If sep is not specified or is None, any\n\
7364 whitespace string is a separator and empty strings are\n\
7365 removed from the result.");
7368 unicode_split(PyUnicodeObject
*self
, PyObject
*args
)
7370 PyObject
*substring
= Py_None
;
7371 Py_ssize_t maxcount
= -1;
7373 if (!PyArg_ParseTuple(args
, "|On:split", &substring
, &maxcount
))
7376 if (substring
== Py_None
)
7377 return split(self
, NULL
, maxcount
);
7378 else if (PyUnicode_Check(substring
))
7379 return split(self
, (PyUnicodeObject
*)substring
, maxcount
);
7381 return PyUnicode_Split((PyObject
*)self
, substring
, maxcount
);
7385 PyUnicode_Partition(PyObject
*str_in
, PyObject
*sep_in
)
7391 str_obj
= PyUnicode_FromObject(str_in
);
7394 sep_obj
= PyUnicode_FromObject(sep_in
);
7400 out
= stringlib_partition(
7401 str_obj
, PyUnicode_AS_UNICODE(str_obj
), PyUnicode_GET_SIZE(str_obj
),
7402 sep_obj
, PyUnicode_AS_UNICODE(sep_obj
), PyUnicode_GET_SIZE(sep_obj
)
7413 PyUnicode_RPartition(PyObject
*str_in
, PyObject
*sep_in
)
7419 str_obj
= PyUnicode_FromObject(str_in
);
7422 sep_obj
= PyUnicode_FromObject(sep_in
);
7428 out
= stringlib_rpartition(
7429 str_obj
, PyUnicode_AS_UNICODE(str_obj
), PyUnicode_GET_SIZE(str_obj
),
7430 sep_obj
, PyUnicode_AS_UNICODE(sep_obj
), PyUnicode_GET_SIZE(sep_obj
)
7439 PyDoc_STRVAR(partition__doc__
,
7440 "S.partition(sep) -> (head, sep, tail)\n\
7442 Search for the separator sep in S, and return the part before it,\n\
7443 the separator itself, and the part after it. If the separator is not\n\
7444 found, return S and two empty strings.");
7447 unicode_partition(PyUnicodeObject
*self
, PyObject
*separator
)
7449 return PyUnicode_Partition((PyObject
*)self
, separator
);
7452 PyDoc_STRVAR(rpartition__doc__
,
7453 "S.rpartition(sep) -> (head, sep, tail)\n\
7455 Search for the separator sep in S, starting at the end of S, and return\n\
7456 the part before it, the separator itself, and the part after it. If the\n\
7457 separator is not found, return two empty strings and S.");
7460 unicode_rpartition(PyUnicodeObject
*self
, PyObject
*separator
)
7462 return PyUnicode_RPartition((PyObject
*)self
, separator
);
7465 PyObject
*PyUnicode_RSplit(PyObject
*s
,
7467 Py_ssize_t maxsplit
)
7471 s
= PyUnicode_FromObject(s
);
7475 sep
= PyUnicode_FromObject(sep
);
7482 result
= rsplit((PyUnicodeObject
*)s
, (PyUnicodeObject
*)sep
, maxsplit
);
7489 PyDoc_STRVAR(rsplit__doc__
,
7490 "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7492 Return a list of the words in S, using sep as the\n\
7493 delimiter string, starting at the end of the string and\n\
7494 working to the front. If maxsplit is given, at most maxsplit\n\
7495 splits are done. If sep is not specified, any whitespace string\n\
7499 unicode_rsplit(PyUnicodeObject
*self
, PyObject
*args
)
7501 PyObject
*substring
= Py_None
;
7502 Py_ssize_t maxcount
= -1;
7504 if (!PyArg_ParseTuple(args
, "|On:rsplit", &substring
, &maxcount
))
7507 if (substring
== Py_None
)
7508 return rsplit(self
, NULL
, maxcount
);
7509 else if (PyUnicode_Check(substring
))
7510 return rsplit(self
, (PyUnicodeObject
*)substring
, maxcount
);
7512 return PyUnicode_RSplit((PyObject
*)self
, substring
, maxcount
);
7515 PyDoc_STRVAR(splitlines__doc__
,
7516 "S.splitlines([keepends]) -> list of strings\n\
7518 Return a list of the lines in S, breaking at line boundaries.\n\
7519 Line breaks are not included in the resulting list unless keepends\n\
7520 is given and true.");
7523 unicode_splitlines(PyUnicodeObject
*self
, PyObject
*args
)
7527 if (!PyArg_ParseTuple(args
, "|i:splitlines", &keepends
))
7530 return PyUnicode_Splitlines((PyObject
*)self
, keepends
);
7534 PyObject
*unicode_str(PyUnicodeObject
*self
)
7536 return PyUnicode_AsEncodedString((PyObject
*)self
, NULL
, NULL
);
7539 PyDoc_STRVAR(swapcase__doc__
,
7540 "S.swapcase() -> unicode\n\
7542 Return a copy of S with uppercase characters converted to lowercase\n\
7546 unicode_swapcase(PyUnicodeObject
*self
)
7548 return fixup(self
, fixswapcase
);
7551 PyDoc_STRVAR(translate__doc__
,
7552 "S.translate(table) -> unicode\n\
7554 Return a copy of the string S, where all characters have been mapped\n\
7555 through the given translation table, which must be a mapping of\n\
7556 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7557 Unmapped characters are left untouched. Characters mapped to None\n\
7561 unicode_translate(PyUnicodeObject
*self
, PyObject
*table
)
7563 return PyUnicode_TranslateCharmap(self
->str
,
7569 PyDoc_STRVAR(upper__doc__
,
7570 "S.upper() -> unicode\n\
7572 Return a copy of S converted to uppercase.");
7575 unicode_upper(PyUnicodeObject
*self
)
7577 return fixup(self
, fixupper
);
7580 PyDoc_STRVAR(zfill__doc__
,
7581 "S.zfill(width) -> unicode\n\
7583 Pad a numeric string S with zeros on the left, to fill a field\n\
7584 of the specified width. The string S is never truncated.");
7587 unicode_zfill(PyUnicodeObject
*self
, PyObject
*args
)
7593 if (!PyArg_ParseTuple(args
, "n:zfill", &width
))
7596 if (self
->length
>= width
) {
7597 if (PyUnicode_CheckExact(self
)) {
7599 return (PyObject
*) self
;
7602 return PyUnicode_FromUnicode(
7603 PyUnicode_AS_UNICODE(self
),
7604 PyUnicode_GET_SIZE(self
)
7608 fill
= width
- self
->length
;
7610 u
= pad(self
, fill
, 0, '0');
7615 if (u
->str
[fill
] == '+' || u
->str
[fill
] == '-') {
7616 /* move sign to beginning of string */
7617 u
->str
[0] = u
->str
[fill
];
7621 return (PyObject
*) u
;
7626 free_listsize(PyUnicodeObject
*self
)
7628 return PyInt_FromLong(numfree
);
7632 PyDoc_STRVAR(startswith__doc__
,
7633 "S.startswith(prefix[, start[, end]]) -> bool\n\
7635 Return True if S starts with the specified prefix, False otherwise.\n\
7636 With optional start, test S beginning at that position.\n\
7637 With optional end, stop comparing S at that position.\n\
7638 prefix can also be a tuple of strings to try.");
7641 unicode_startswith(PyUnicodeObject
*self
,
7645 PyUnicodeObject
*substring
;
7646 Py_ssize_t start
= 0;
7647 Py_ssize_t end
= PY_SSIZE_T_MAX
;
7650 if (!stringlib_parse_args_finds("startswith", args
, &subobj
, &start
, &end
))
7652 if (PyTuple_Check(subobj
)) {
7654 for (i
= 0; i
< PyTuple_GET_SIZE(subobj
); i
++) {
7655 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
7656 PyTuple_GET_ITEM(subobj
, i
));
7657 if (substring
== NULL
)
7659 result
= tailmatch(self
, substring
, start
, end
, -1);
7660 Py_DECREF(substring
);
7665 /* nothing matched */
7668 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(subobj
);
7669 if (substring
== NULL
) {
7670 if (PyErr_ExceptionMatches(PyExc_TypeError
))
7671 PyErr_Format(PyExc_TypeError
, "startswith first arg must be str, "
7672 "unicode, or tuple, not %s", Py_TYPE(subobj
)->tp_name
);
7675 result
= tailmatch(self
, substring
, start
, end
, -1);
7676 Py_DECREF(substring
);
7677 return PyBool_FromLong(result
);
7681 PyDoc_STRVAR(endswith__doc__
,
7682 "S.endswith(suffix[, start[, end]]) -> bool\n\
7684 Return True if S ends with the specified suffix, False otherwise.\n\
7685 With optional start, test S beginning at that position.\n\
7686 With optional end, stop comparing S at that position.\n\
7687 suffix can also be a tuple of strings to try.");
7690 unicode_endswith(PyUnicodeObject
*self
,
7694 PyUnicodeObject
*substring
;
7695 Py_ssize_t start
= 0;
7696 Py_ssize_t end
= PY_SSIZE_T_MAX
;
7699 if (!stringlib_parse_args_finds("endswith", args
, &subobj
, &start
, &end
))
7701 if (PyTuple_Check(subobj
)) {
7703 for (i
= 0; i
< PyTuple_GET_SIZE(subobj
); i
++) {
7704 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(
7705 PyTuple_GET_ITEM(subobj
, i
));
7706 if (substring
== NULL
)
7708 result
= tailmatch(self
, substring
, start
, end
, +1);
7709 Py_DECREF(substring
);
7716 substring
= (PyUnicodeObject
*)PyUnicode_FromObject(subobj
);
7717 if (substring
== NULL
) {
7718 if (PyErr_ExceptionMatches(PyExc_TypeError
))
7719 PyErr_Format(PyExc_TypeError
, "endswith first arg must be str, "
7720 "unicode, or tuple, not %s", Py_TYPE(subobj
)->tp_name
);
7723 result
= tailmatch(self
, substring
, start
, end
, +1);
7724 Py_DECREF(substring
);
7725 return PyBool_FromLong(result
);
7729 /* Implements do_string_format, which is unicode because of stringlib */
7730 #include "stringlib/string_format.h"
7732 PyDoc_STRVAR(format__doc__
,
7733 "S.format(*args, **kwargs) -> unicode\n\
7735 Return a formatted version of S, using substitutions from args and kwargs.\n\
7736 The substitutions are identified by braces ('{' and '}').");
7739 unicode__format__(PyObject
*self
, PyObject
*args
)
7741 PyObject
*format_spec
;
7742 PyObject
*result
= NULL
;
7743 PyObject
*tmp
= NULL
;
7745 /* If 2.x, convert format_spec to the same type as value */
7746 /* This is to allow things like u''.format('') */
7747 if (!PyArg_ParseTuple(args
, "O:__format__", &format_spec
))
7749 if (!(PyBytes_Check(format_spec
) || PyUnicode_Check(format_spec
))) {
7750 PyErr_Format(PyExc_TypeError
, "__format__ arg must be str "
7751 "or unicode, not %s", Py_TYPE(format_spec
)->tp_name
);
7754 tmp
= PyObject_Unicode(format_spec
);
7759 result
= _PyUnicode_FormatAdvanced(self
,
7760 PyUnicode_AS_UNICODE(format_spec
),
7761 PyUnicode_GET_SIZE(format_spec
));
7767 PyDoc_STRVAR(p_format__doc__
,
7768 "S.__format__(format_spec) -> unicode\n\
7770 Return a formatted version of S as described by format_spec.");
7773 unicode__sizeof__(PyUnicodeObject
*v
)
7775 return PyInt_FromSsize_t(sizeof(PyUnicodeObject
) +
7776 sizeof(Py_UNICODE
) * (v
->length
+ 1));
7779 PyDoc_STRVAR(sizeof__doc__
,
7780 "S.__sizeof__() -> size of S in memory, in bytes\n\
7785 unicode_getnewargs(PyUnicodeObject
*v
)
7787 return Py_BuildValue("(u#)", v
->str
, v
->length
);
7791 static PyMethodDef unicode_methods
[] = {
7793 /* Order is according to common usage: often used methods should
7794 appear first, since lookup is done sequentially. */
7796 {"encode", (PyCFunction
) unicode_encode
, METH_VARARGS
| METH_KEYWORDS
, encode__doc__
},
7797 {"replace", (PyCFunction
) unicode_replace
, METH_VARARGS
, replace__doc__
},
7798 {"split", (PyCFunction
) unicode_split
, METH_VARARGS
, split__doc__
},
7799 {"rsplit", (PyCFunction
) unicode_rsplit
, METH_VARARGS
, rsplit__doc__
},
7800 {"join", (PyCFunction
) unicode_join
, METH_O
, join__doc__
},
7801 {"capitalize", (PyCFunction
) unicode_capitalize
, METH_NOARGS
, capitalize__doc__
},
7802 {"title", (PyCFunction
) unicode_title
, METH_NOARGS
, title__doc__
},
7803 {"center", (PyCFunction
) unicode_center
, METH_VARARGS
, center__doc__
},
7804 {"count", (PyCFunction
) unicode_count
, METH_VARARGS
, count__doc__
},
7805 {"expandtabs", (PyCFunction
) unicode_expandtabs
, METH_VARARGS
, expandtabs__doc__
},
7806 {"find", (PyCFunction
) unicode_find
, METH_VARARGS
, find__doc__
},
7807 {"partition", (PyCFunction
) unicode_partition
, METH_O
, partition__doc__
},
7808 {"index", (PyCFunction
) unicode_index
, METH_VARARGS
, index__doc__
},
7809 {"ljust", (PyCFunction
) unicode_ljust
, METH_VARARGS
, ljust__doc__
},
7810 {"lower", (PyCFunction
) unicode_lower
, METH_NOARGS
, lower__doc__
},
7811 {"lstrip", (PyCFunction
) unicode_lstrip
, METH_VARARGS
, lstrip__doc__
},
7812 {"decode", (PyCFunction
) unicode_decode
, METH_VARARGS
| METH_KEYWORDS
, decode__doc__
},
7813 /* {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7814 {"rfind", (PyCFunction
) unicode_rfind
, METH_VARARGS
, rfind__doc__
},
7815 {"rindex", (PyCFunction
) unicode_rindex
, METH_VARARGS
, rindex__doc__
},
7816 {"rjust", (PyCFunction
) unicode_rjust
, METH_VARARGS
, rjust__doc__
},
7817 {"rstrip", (PyCFunction
) unicode_rstrip
, METH_VARARGS
, rstrip__doc__
},
7818 {"rpartition", (PyCFunction
) unicode_rpartition
, METH_O
, rpartition__doc__
},
7819 {"splitlines", (PyCFunction
) unicode_splitlines
, METH_VARARGS
, splitlines__doc__
},
7820 {"strip", (PyCFunction
) unicode_strip
, METH_VARARGS
, strip__doc__
},
7821 {"swapcase", (PyCFunction
) unicode_swapcase
, METH_NOARGS
, swapcase__doc__
},
7822 {"translate", (PyCFunction
) unicode_translate
, METH_O
, translate__doc__
},
7823 {"upper", (PyCFunction
) unicode_upper
, METH_NOARGS
, upper__doc__
},
7824 {"startswith", (PyCFunction
) unicode_startswith
, METH_VARARGS
, startswith__doc__
},
7825 {"endswith", (PyCFunction
) unicode_endswith
, METH_VARARGS
, endswith__doc__
},
7826 {"islower", (PyCFunction
) unicode_islower
, METH_NOARGS
, islower__doc__
},
7827 {"isupper", (PyCFunction
) unicode_isupper
, METH_NOARGS
, isupper__doc__
},
7828 {"istitle", (PyCFunction
) unicode_istitle
, METH_NOARGS
, istitle__doc__
},
7829 {"isspace", (PyCFunction
) unicode_isspace
, METH_NOARGS
, isspace__doc__
},
7830 {"isdecimal", (PyCFunction
) unicode_isdecimal
, METH_NOARGS
, isdecimal__doc__
},
7831 {"isdigit", (PyCFunction
) unicode_isdigit
, METH_NOARGS
, isdigit__doc__
},
7832 {"isnumeric", (PyCFunction
) unicode_isnumeric
, METH_NOARGS
, isnumeric__doc__
},
7833 {"isalpha", (PyCFunction
) unicode_isalpha
, METH_NOARGS
, isalpha__doc__
},
7834 {"isalnum", (PyCFunction
) unicode_isalnum
, METH_NOARGS
, isalnum__doc__
},
7835 {"zfill", (PyCFunction
) unicode_zfill
, METH_VARARGS
, zfill__doc__
},
7836 {"format", (PyCFunction
) do_string_format
, METH_VARARGS
| METH_KEYWORDS
, format__doc__
},
7837 {"__format__", (PyCFunction
) unicode__format__
, METH_VARARGS
, p_format__doc__
},
7838 {"_formatter_field_name_split", (PyCFunction
) formatter_field_name_split
, METH_NOARGS
},
7839 {"_formatter_parser", (PyCFunction
) formatter_parser
, METH_NOARGS
},
7840 {"__sizeof__", (PyCFunction
) unicode__sizeof__
, METH_NOARGS
, sizeof__doc__
},
7842 {"capwords", (PyCFunction
) unicode_capwords
, METH_NOARGS
, capwords__doc__
},
7846 /* This one is just used for debugging the implementation. */
7847 {"freelistsize", (PyCFunction
) free_listsize
, METH_NOARGS
},
7850 {"__getnewargs__", (PyCFunction
)unicode_getnewargs
, METH_NOARGS
},
7855 unicode_mod(PyObject
*v
, PyObject
*w
)
7857 if (!PyUnicode_Check(v
)) {
7858 Py_INCREF(Py_NotImplemented
);
7859 return Py_NotImplemented
;
7861 return PyUnicode_Format(v
, w
);
7864 static PyNumberMethods unicode_as_number
= {
7869 unicode_mod
, /*nb_remainder*/
7872 static PySequenceMethods unicode_as_sequence
= {
7873 (lenfunc
) unicode_length
, /* sq_length */
7874 PyUnicode_Concat
, /* sq_concat */
7875 (ssizeargfunc
) unicode_repeat
, /* sq_repeat */
7876 (ssizeargfunc
) unicode_getitem
, /* sq_item */
7877 (ssizessizeargfunc
) unicode_slice
, /* sq_slice */
7878 0, /* sq_ass_item */
7879 0, /* sq_ass_slice */
7880 PyUnicode_Contains
, /* sq_contains */
7884 unicode_subscript(PyUnicodeObject
* self
, PyObject
* item
)
7886 if (PyIndex_Check(item
)) {
7887 Py_ssize_t i
= PyNumber_AsSsize_t(item
, PyExc_IndexError
);
7888 if (i
== -1 && PyErr_Occurred())
7891 i
+= PyUnicode_GET_SIZE(self
);
7892 return unicode_getitem(self
, i
);
7893 } else if (PySlice_Check(item
)) {
7894 Py_ssize_t start
, stop
, step
, slicelength
, cur
, i
;
7895 Py_UNICODE
* source_buf
;
7896 Py_UNICODE
* result_buf
;
7899 if (PySlice_GetIndicesEx((PySliceObject
*)item
, PyUnicode_GET_SIZE(self
),
7900 &start
, &stop
, &step
, &slicelength
) < 0) {
7904 if (slicelength
<= 0) {
7905 return PyUnicode_FromUnicode(NULL
, 0);
7906 } else if (start
== 0 && step
== 1 && slicelength
== self
->length
&&
7907 PyUnicode_CheckExact(self
)) {
7909 return (PyObject
*)self
;
7910 } else if (step
== 1) {
7911 return PyUnicode_FromUnicode(self
->str
+ start
, slicelength
);
7913 source_buf
= PyUnicode_AS_UNICODE((PyObject
*)self
);
7914 result_buf
= (Py_UNICODE
*)PyObject_MALLOC(slicelength
*
7915 sizeof(Py_UNICODE
));
7917 if (result_buf
== NULL
)
7918 return PyErr_NoMemory();
7920 for (cur
= start
, i
= 0; i
< slicelength
; cur
+= step
, i
++) {
7921 result_buf
[i
] = source_buf
[cur
];
7924 result
= PyUnicode_FromUnicode(result_buf
, slicelength
);
7925 PyObject_FREE(result_buf
);
7929 PyErr_SetString(PyExc_TypeError
, "string indices must be integers");
7934 static PyMappingMethods unicode_as_mapping
= {
7935 (lenfunc
)unicode_length
, /* mp_length */
7936 (binaryfunc
)unicode_subscript
, /* mp_subscript */
7937 (objobjargproc
)0, /* mp_ass_subscript */
7941 unicode_buffer_getreadbuf(PyUnicodeObject
*self
,
7946 PyErr_SetString(PyExc_SystemError
,
7947 "accessing non-existent unicode segment");
7950 *ptr
= (void *) self
->str
;
7951 return PyUnicode_GET_DATA_SIZE(self
);
7955 unicode_buffer_getwritebuf(PyUnicodeObject
*self
, Py_ssize_t index
,
7958 PyErr_SetString(PyExc_TypeError
,
7959 "cannot use unicode as modifiable buffer");
7964 unicode_buffer_getsegcount(PyUnicodeObject
*self
,
7968 *lenp
= PyUnicode_GET_DATA_SIZE(self
);
7973 unicode_buffer_getcharbuf(PyUnicodeObject
*self
,
7980 PyErr_SetString(PyExc_SystemError
,
7981 "accessing non-existent unicode segment");
7984 str
= _PyUnicode_AsDefaultEncodedString((PyObject
*)self
, NULL
);
7987 *ptr
= (void *) PyString_AS_STRING(str
);
7988 return PyString_GET_SIZE(str
);
7991 /* Helpers for PyUnicode_Format() */
7994 getnextarg(PyObject
*args
, Py_ssize_t arglen
, Py_ssize_t
*p_argidx
)
7996 Py_ssize_t argidx
= *p_argidx
;
7997 if (argidx
< arglen
) {
8002 return PyTuple_GetItem(args
, argidx
);
8004 PyErr_SetString(PyExc_TypeError
,
8005 "not enough arguments for format string");
8009 #define F_LJUST (1<<0)
8010 #define F_SIGN (1<<1)
8011 #define F_BLANK (1<<2)
8012 #define F_ALT (1<<3)
8013 #define F_ZERO (1<<4)
8016 strtounicode(Py_UNICODE
*buffer
, const char *charbuffer
)
8018 register Py_ssize_t i
;
8019 Py_ssize_t len
= strlen(charbuffer
);
8020 for (i
= len
- 1; i
>= 0; i
--)
8021 buffer
[i
] = (Py_UNICODE
) charbuffer
[i
];
8027 longtounicode(Py_UNICODE
*buffer
, size_t len
, const char *format
, long x
)
8031 PyOS_snprintf((char *)buffer
, len
, format
, x
);
8032 result
= strtounicode(buffer
, (char *)buffer
);
8033 return Py_SAFE_DOWNCAST(result
, Py_ssize_t
, int);
8036 /* XXX To save some code duplication, formatfloat/long/int could have been
8037 shared with stringobject.c, converting from 8-bit to Unicode after the
8038 formatting is done. */
8040 /* Returns a new reference to a PyUnicode object, or NULL on failure. */
8043 formatfloat(PyObject
*v
, int flags
, int prec
, int type
)
8049 x
= PyFloat_AsDouble(v
);
8050 if (x
== -1.0 && PyErr_Occurred())
8056 p
= PyOS_double_to_string(x
, type
, prec
,
8057 (flags
& F_ALT
) ? Py_DTSF_ALT
: 0, NULL
);
8060 result
= PyUnicode_FromStringAndSize(p
, strlen(p
));
8066 formatlong(PyObject
*val
, int flags
, int prec
, int type
)
8070 PyObject
*str
; /* temporary string object. */
8071 PyUnicodeObject
*result
;
8073 str
= _PyString_FormatLong(val
, flags
, prec
, type
, &buf
, &len
);
8076 result
= _PyUnicode_New(len
);
8081 for (i
= 0; i
< len
; i
++)
8082 result
->str
[i
] = buf
[i
];
8083 result
->str
[len
] = 0;
8085 return (PyObject
*)result
;
8089 formatint(Py_UNICODE
*buf
,
8096 /* fmt = '%#.' + `prec` + 'l' + `type`
8097 * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8101 char fmt
[64]; /* plenty big enough! */
8105 x
= PyInt_AsLong(v
);
8106 if (x
== -1 && PyErr_Occurred())
8108 if (x
< 0 && type
== 'u') {
8111 if (x
< 0 && (type
== 'x' || type
== 'X' || type
== 'o'))
8118 /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8119 * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
8121 if (buflen
<= 14 || buflen
<= (size_t)3 + (size_t)prec
) {
8122 PyErr_SetString(PyExc_OverflowError
,
8123 "formatted integer is too long (precision too large?)");
8127 if ((flags
& F_ALT
) &&
8128 (type
== 'x' || type
== 'X')) {
8129 /* When converting under %#x or %#X, there are a number
8130 * of issues that cause pain:
8131 * - when 0 is being converted, the C standard leaves off
8132 * the '0x' or '0X', which is inconsistent with other
8133 * %#x/%#X conversions and inconsistent with Python's
8135 * - there are platforms that violate the standard and
8136 * convert 0 with the '0x' or '0X'
8137 * (Metrowerks, Compaq Tru64)
8138 * - there are platforms that give '0x' when converting
8139 * under %#X, but convert 0 in accordance with the
8140 * standard (OS/2 EMX)
8142 * We can achieve the desired consistency by inserting our
8143 * own '0x' or '0X' prefix, and substituting %x/%X in place
8146 * Note that this is the same approach as used in
8147 * formatint() in stringobject.c
8149 PyOS_snprintf(fmt
, sizeof(fmt
), "%s0%c%%.%dl%c",
8150 sign
, type
, prec
, type
);
8153 PyOS_snprintf(fmt
, sizeof(fmt
), "%s%%%s.%dl%c",
8154 sign
, (flags
&F_ALT
) ? "#" : "",
8158 return longtounicode(buf
, buflen
, fmt
, -x
);
8160 return longtounicode(buf
, buflen
, fmt
, x
);
8164 formatchar(Py_UNICODE
*buf
,
8170 /* presume that the buffer is at least 2 characters long */
8171 if (PyUnicode_Check(v
)) {
8172 if (PyUnicode_GET_SIZE(v
) != 1)
8174 buf
[0] = PyUnicode_AS_UNICODE(v
)[0];
8177 else if (PyString_Check(v
)) {
8178 if (PyString_GET_SIZE(v
) != 1)
8180 /* #7649: "u'%c' % char" should behave like "u'%s' % char" and fail
8181 with a UnicodeDecodeError if 'char' is not decodable with the
8182 default encoding (usually ASCII, but it might be something else) */
8183 str
= PyString_AS_STRING(v
);
8184 if ((unsigned char)str
[0] > 0x7F) {
8185 /* the char is not ASCII; try to decode the string using the
8186 default encoding and return -1 to let the UnicodeDecodeError
8187 be raised if the string can't be decoded */
8188 unistr
= PyUnicode_Decode(str
, 1, NULL
, "strict");
8191 buf
[0] = PyUnicode_AS_UNICODE(unistr
)[0];
8195 buf
[0] = (Py_UNICODE
)str
[0];
8199 /* Integer input truncated to a character */
8201 x
= PyInt_AsLong(v
);
8202 if (x
== -1 && PyErr_Occurred())
8204 #ifdef Py_UNICODE_WIDE
8205 if (x
< 0 || x
> 0x10ffff) {
8206 PyErr_SetString(PyExc_OverflowError
,
8207 "%c arg not in range(0x110000) "
8208 "(wide Python build)");
8212 if (x
< 0 || x
> 0xffff) {
8213 PyErr_SetString(PyExc_OverflowError
,
8214 "%c arg not in range(0x10000) "
8215 "(narrow Python build)");
8219 buf
[0] = (Py_UNICODE
) x
;
8225 PyErr_SetString(PyExc_TypeError
,
8226 "%c requires int or char");
8230 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8232 FORMATBUFLEN is the length of the buffer in which the ints &
8233 chars are formatted. XXX This is a magic number. Each formatting
8234 routine does bounds checking to ensure no overflow, but a better
8235 solution may be to malloc a buffer of appropriate size for each
8236 format. For now, the current solution is sufficient.
8238 #define FORMATBUFLEN (size_t)120
8240 PyObject
*PyUnicode_Format(PyObject
*format
,
8243 Py_UNICODE
*fmt
, *res
;
8244 Py_ssize_t fmtcnt
, rescnt
, reslen
, arglen
, argidx
;
8246 PyUnicodeObject
*result
= NULL
;
8247 PyObject
*dict
= NULL
;
8250 if (format
== NULL
|| args
== NULL
) {
8251 PyErr_BadInternalCall();
8254 uformat
= PyUnicode_FromObject(format
);
8255 if (uformat
== NULL
)
8257 fmt
= PyUnicode_AS_UNICODE(uformat
);
8258 fmtcnt
= PyUnicode_GET_SIZE(uformat
);
8260 reslen
= rescnt
= fmtcnt
+ 100;
8261 result
= _PyUnicode_New(reslen
);
8264 res
= PyUnicode_AS_UNICODE(result
);
8266 if (PyTuple_Check(args
)) {
8267 arglen
= PyTuple_Size(args
);
8274 if (Py_TYPE(args
)->tp_as_mapping
&& !PyTuple_Check(args
) &&
8275 !PyObject_TypeCheck(args
, &PyBaseString_Type
))
8278 while (--fmtcnt
>= 0) {
8281 rescnt
= fmtcnt
+ 100;
8283 if (_PyUnicode_Resize(&result
, reslen
) < 0)
8285 res
= PyUnicode_AS_UNICODE(result
) + reslen
- rescnt
;
8291 /* Got a format specifier */
8293 Py_ssize_t width
= -1;
8295 Py_UNICODE c
= '\0';
8299 PyObject
*temp
= NULL
;
8303 Py_UNICODE formatbuf
[FORMATBUFLEN
]; /* For format{int,char}() */
8307 Py_UNICODE
*keystart
;
8313 PyErr_SetString(PyExc_TypeError
,
8314 "format requires a mapping");
8320 /* Skip over balanced parentheses */
8321 while (pcount
> 0 && --fmtcnt
>= 0) {
8324 else if (*fmt
== '(')
8328 keylen
= fmt
- keystart
- 1;
8329 if (fmtcnt
< 0 || pcount
> 0) {
8330 PyErr_SetString(PyExc_ValueError
,
8331 "incomplete format key");
8335 /* keys are converted to strings using UTF-8 and
8336 then looked up since Python uses strings to hold
8337 variables names etc. in its namespaces and we
8338 wouldn't want to break common idioms. */
8339 key
= PyUnicode_EncodeUTF8(keystart
,
8343 key
= PyUnicode_FromUnicode(keystart
, keylen
);
8351 args
= PyObject_GetItem(dict
, key
);
8360 while (--fmtcnt
>= 0) {
8361 switch (c
= *fmt
++) {
8362 case '-': flags
|= F_LJUST
; continue;
8363 case '+': flags
|= F_SIGN
; continue;
8364 case ' ': flags
|= F_BLANK
; continue;
8365 case '#': flags
|= F_ALT
; continue;
8366 case '0': flags
|= F_ZERO
; continue;
8371 v
= getnextarg(args
, arglen
, &argidx
);
8374 if (!PyInt_Check(v
)) {
8375 PyErr_SetString(PyExc_TypeError
,
8379 width
= PyInt_AsLong(v
);
8387 else if (c
>= '0' && c
<= '9') {
8389 while (--fmtcnt
>= 0) {
8391 if (c
< '0' || c
> '9')
8393 if ((width
*10) / 10 != width
) {
8394 PyErr_SetString(PyExc_ValueError
,
8398 width
= width
*10 + (c
- '0');
8406 v
= getnextarg(args
, arglen
, &argidx
);
8409 if (!PyInt_Check(v
)) {
8410 PyErr_SetString(PyExc_TypeError
,
8414 prec
= PyInt_AsLong(v
);
8420 else if (c
>= '0' && c
<= '9') {
8422 while (--fmtcnt
>= 0) {
8424 if (c
< '0' || c
> '9')
8426 if ((prec
*10) / 10 != prec
) {
8427 PyErr_SetString(PyExc_ValueError
,
8431 prec
= prec
*10 + (c
- '0');
8436 if (c
== 'h' || c
== 'l' || c
== 'L') {
8442 PyErr_SetString(PyExc_ValueError
,
8443 "incomplete format");
8447 v
= getnextarg(args
, arglen
, &argidx
);
8457 /* presume that buffer length is at least 1 */
8464 if (PyUnicode_CheckExact(v
) && c
== 's') {
8471 temp
= PyObject_Unicode(v
);
8473 temp
= PyObject_Repr(v
);
8476 if (PyUnicode_Check(temp
))
8477 /* nothing to do */;
8478 else if (PyString_Check(temp
)) {
8479 /* convert to string to Unicode */
8480 unicode
= PyUnicode_Decode(PyString_AS_STRING(temp
),
8481 PyString_GET_SIZE(temp
),
8491 PyErr_SetString(PyExc_TypeError
,
8492 "%s argument has non-string str()");
8496 pbuf
= PyUnicode_AS_UNICODE(temp
);
8497 len
= PyUnicode_GET_SIZE(temp
);
8498 if (prec
>= 0 && len
> prec
)
8511 if (PyNumber_Check(v
)) {
8512 PyObject
*iobj
=NULL
;
8514 if (PyInt_Check(v
) || (PyLong_Check(v
))) {
8519 iobj
= PyNumber_Int(v
);
8520 if (iobj
==NULL
) iobj
= PyNumber_Long(v
);
8523 if (PyInt_Check(iobj
)) {
8526 len
= formatint(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
),
8527 flags
, prec
, c
, iobj
);
8533 else if (PyLong_Check(iobj
)) {
8535 temp
= formatlong(iobj
, flags
, prec
, c
);
8539 pbuf
= PyUnicode_AS_UNICODE(temp
);
8540 len
= PyUnicode_GET_SIZE(temp
);
8549 PyErr_Format(PyExc_TypeError
,
8550 "%%%c format: a number is required, "
8551 "not %.200s", (char)c
, Py_TYPE(v
)->tp_name
);
8564 temp
= formatfloat(v
, flags
, prec
, c
);
8567 pbuf
= PyUnicode_AS_UNICODE(temp
);
8568 len
= PyUnicode_GET_SIZE(temp
);
8576 len
= formatchar(pbuf
, sizeof(formatbuf
)/sizeof(Py_UNICODE
), v
);
8582 PyErr_Format(PyExc_ValueError
,
8583 "unsupported format character '%c' (0x%x) "
8585 (31<=c
&& c
<=126) ? (char)c
: '?',
8587 (Py_ssize_t
)(fmt
- 1 -
8588 PyUnicode_AS_UNICODE(uformat
)));
8592 if (*pbuf
== '-' || *pbuf
== '+') {
8596 else if (flags
& F_SIGN
)
8598 else if (flags
& F_BLANK
)
8605 if (rescnt
- (sign
!= 0) < width
) {
8607 rescnt
= width
+ fmtcnt
+ 100;
8614 if (_PyUnicode_Resize(&result
, reslen
) < 0) {
8618 res
= PyUnicode_AS_UNICODE(result
)
8628 if ((flags
& F_ALT
) && (c
== 'x' || c
== 'X')) {
8629 assert(pbuf
[0] == '0');
8630 assert(pbuf
[1] == c
);
8641 if (width
> len
&& !(flags
& F_LJUST
)) {
8645 } while (--width
> len
);
8650 if ((flags
& F_ALT
) && (c
== 'x' || c
== 'X')) {
8651 assert(pbuf
[0] == '0');
8652 assert(pbuf
[1] == c
);
8657 Py_UNICODE_COPY(res
, pbuf
, len
);
8660 while (--width
>= len
) {
8664 if (dict
&& (argidx
< arglen
) && c
!= '%') {
8665 PyErr_SetString(PyExc_TypeError
,
8666 "not all arguments converted during string formatting");
8673 if (argidx
< arglen
&& !dict
) {
8674 PyErr_SetString(PyExc_TypeError
,
8675 "not all arguments converted during string formatting");
8679 if (_PyUnicode_Resize(&result
, reslen
- rescnt
) < 0)
8685 return (PyObject
*)result
;
8696 static PyBufferProcs unicode_as_buffer
= {
8697 (readbufferproc
) unicode_buffer_getreadbuf
,
8698 (writebufferproc
) unicode_buffer_getwritebuf
,
8699 (segcountproc
) unicode_buffer_getsegcount
,
8700 (charbufferproc
) unicode_buffer_getcharbuf
,
8704 unicode_subtype_new(PyTypeObject
*type
, PyObject
*args
, PyObject
*kwds
);
8707 unicode_new(PyTypeObject
*type
, PyObject
*args
, PyObject
*kwds
)
8710 static char *kwlist
[] = {"string", "encoding", "errors", 0};
8711 char *encoding
= NULL
;
8712 char *errors
= NULL
;
8714 if (type
!= &PyUnicode_Type
)
8715 return unicode_subtype_new(type
, args
, kwds
);
8716 if (!PyArg_ParseTupleAndKeywords(args
, kwds
, "|Oss:unicode",
8717 kwlist
, &x
, &encoding
, &errors
))
8720 return (PyObject
*)_PyUnicode_New(0);
8721 if (encoding
== NULL
&& errors
== NULL
)
8722 return PyObject_Unicode(x
);
8724 return PyUnicode_FromEncodedObject(x
, encoding
, errors
);
8728 unicode_subtype_new(PyTypeObject
*type
, PyObject
*args
, PyObject
*kwds
)
8730 PyUnicodeObject
*tmp
, *pnew
;
8733 assert(PyType_IsSubtype(type
, &PyUnicode_Type
));
8734 tmp
= (PyUnicodeObject
*)unicode_new(&PyUnicode_Type
, args
, kwds
);
8737 assert(PyUnicode_Check(tmp
));
8738 pnew
= (PyUnicodeObject
*) type
->tp_alloc(type
, n
= tmp
->length
);
8743 pnew
->str
= (Py_UNICODE
*) PyObject_MALLOC(sizeof(Py_UNICODE
) * (n
+1));
8744 if (pnew
->str
== NULL
) {
8745 _Py_ForgetReference((PyObject
*)pnew
);
8748 return PyErr_NoMemory();
8750 Py_UNICODE_COPY(pnew
->str
, tmp
->str
, n
+1);
8752 pnew
->hash
= tmp
->hash
;
8754 return (PyObject
*)pnew
;
8757 PyDoc_STRVAR(unicode_doc
,
8758 "unicode(string [, encoding[, errors]]) -> object\n\
8760 Create a new Unicode object from the given encoded string.\n\
8761 encoding defaults to the current default string encoding.\n\
8762 errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
8764 PyTypeObject PyUnicode_Type
= {
8765 PyVarObject_HEAD_INIT(&PyType_Type
, 0)
8766 "unicode", /* tp_name */
8767 sizeof(PyUnicodeObject
), /* tp_size */
8768 0, /* tp_itemsize */
8770 (destructor
)unicode_dealloc
, /* tp_dealloc */
8775 unicode_repr
, /* tp_repr */
8776 &unicode_as_number
, /* tp_as_number */
8777 &unicode_as_sequence
, /* tp_as_sequence */
8778 &unicode_as_mapping
, /* tp_as_mapping */
8779 (hashfunc
) unicode_hash
, /* tp_hash*/
8781 (reprfunc
) unicode_str
, /* tp_str */
8782 PyObject_GenericGetAttr
, /* tp_getattro */
8783 0, /* tp_setattro */
8784 &unicode_as_buffer
, /* tp_as_buffer */
8785 Py_TPFLAGS_DEFAULT
| Py_TPFLAGS_CHECKTYPES
|
8786 Py_TPFLAGS_BASETYPE
| Py_TPFLAGS_UNICODE_SUBCLASS
, /* tp_flags */
8787 unicode_doc
, /* tp_doc */
8788 0, /* tp_traverse */
8790 PyUnicode_RichCompare
, /* tp_richcompare */
8791 0, /* tp_weaklistoffset */
8793 0, /* tp_iternext */
8794 unicode_methods
, /* tp_methods */
8797 &PyBaseString_Type
, /* tp_base */
8799 0, /* tp_descr_get */
8800 0, /* tp_descr_set */
8801 0, /* tp_dictoffset */
8804 unicode_new
, /* tp_new */
8805 PyObject_Del
, /* tp_free */
8808 /* Initialize the Unicode implementation */
8810 void _PyUnicode_Init(void)
8814 /* XXX - move this array to unicodectype.c ? */
8815 Py_UNICODE linebreak
[] = {
8816 0x000A, /* LINE FEED */
8817 0x000D, /* CARRIAGE RETURN */
8818 0x001C, /* FILE SEPARATOR */
8819 0x001D, /* GROUP SEPARATOR */
8820 0x001E, /* RECORD SEPARATOR */
8821 0x0085, /* NEXT LINE */
8822 0x2028, /* LINE SEPARATOR */
8823 0x2029, /* PARAGRAPH SEPARATOR */
8826 /* Init the implementation */
8829 unicode_empty
= _PyUnicode_New(0);
8833 strcpy(unicode_default_encoding
, "ascii");
8834 for (i
= 0; i
< 256; i
++)
8835 unicode_latin1
[i
] = NULL
;
8836 if (PyType_Ready(&PyUnicode_Type
) < 0)
8837 Py_FatalError("Can't initialize 'unicode'");
8839 /* initialize the linebreak bloom filter */
8840 bloom_linebreak
= make_bloom_mask(
8841 linebreak
, sizeof(linebreak
) / sizeof(linebreak
[0])
8844 PyType_Ready(&EncodingMapType
);
8847 /* Finalize the Unicode implementation */
8850 PyUnicode_ClearFreeList(void)
8852 int freelist_size
= numfree
;
8855 for (u
= free_list
; u
!= NULL
;) {
8856 PyUnicodeObject
*v
= u
;
8857 u
= *(PyUnicodeObject
**)u
;
8859 PyObject_DEL(v
->str
);
8860 Py_XDECREF(v
->defenc
);
8865 assert(numfree
== 0);
8866 return freelist_size
;
8870 _PyUnicode_Fini(void)
8874 Py_XDECREF(unicode_empty
);
8875 unicode_empty
= NULL
;
8877 for (i
= 0; i
< 256; i
++) {
8878 if (unicode_latin1
[i
]) {
8879 Py_DECREF(unicode_latin1
[i
]);
8880 unicode_latin1
[i
] = NULL
;
8883 (void)PyUnicode_ClearFreeList();