AppPkg/Applications/Python/Python-2.7.2/Objects/unicodeobject.c

   1 /*
   2
   3 Unicode implementation based on original code by Fredrik Lundh,
   4 modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
   5 Unicode Integration Proposal (see file Misc/unicode.txt).
   6
   7 Major speed upgrades to the method implementations at the Reykjavik
   8 NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
   9
  10 Copyright (c) Corporation for National Research Initiatives.
  11
  12 --------------------------------------------------------------------
  13 The original string type implementation is:
  14
  15   Copyright (c) 1999 by Secret Labs AB
  16   Copyright (c) 1999 by Fredrik Lundh
  17
  18 By obtaining, using, and/or copying this software and/or its
  19 associated documentation, you agree that you have read, understood,
  20 and will comply with the following terms and conditions:
  21
  22 Permission to use, copy, modify, and distribute this software and its
  23 associated documentation for any purpose and without fee is hereby
  24 granted, provided that the above copyright notice appears in all
  25 copies, and that both that copyright notice and this permission notice
  26 appear in supporting documentation, and that the name of Secret Labs
  27 AB or the author not be used in advertising or publicity pertaining to
  28 distribution of the software without specific, written prior
  29 permission.
  30
  31 SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
  32 THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
  33 FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
  34 ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  35 WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  36 ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
  37 OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  38 --------------------------------------------------------------------
  39
  40 */
  41
  42 #define PY_SSIZE_T_CLEAN
  43 #include "Python.h"
  44
  45 #include "unicodeobject.h"
  46 #include "ucnhash.h"
  47
  48 #ifdef MS_WINDOWS
  49 #include <windows.h>
  50 #endif
  51
  52 /* Limit for the Unicode object free list */
  53
  54 #define PyUnicode_MAXFREELIST       1024
  55
  56 /* Limit for the Unicode object free list stay alive optimization.
  57
  58    The implementation will keep allocated Unicode memory intact for
  59    all objects on the free list having a size less than this
  60    limit. This reduces malloc() overhead for small Unicode objects.
  61
  62    At worst this will result in PyUnicode_MAXFREELIST *
  63    (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
  64    malloc()-overhead) bytes of unused garbage.
  65
  66    Setting the limit to 0 effectively turns the feature off.
  67
  68    Note: This is an experimental feature ! If you get core dumps when
  69    using Unicode objects, turn this feature off.
  70
  71 */
  72
  73 #define KEEPALIVE_SIZE_LIMIT       9
  74
  75 /* Endianness switches; defaults to little endian */
  76
  77 #ifdef WORDS_BIGENDIAN
  78 # define BYTEORDER_IS_BIG_ENDIAN
  79 #else
  80 # define BYTEORDER_IS_LITTLE_ENDIAN
  81 #endif
  82
  83 /* --- Globals ------------------------------------------------------------
  84
  85    The globals are initialized by the _PyUnicode_Init() API and should
  86    not be used before calling that API.
  87
  88 */
  89
  90
  91 #ifdef __cplusplus
  92 extern "C" {
  93 #endif
  94
  95 /* Free list for Unicode objects */
  96 static PyUnicodeObject *free_list;
  97 static int numfree;
  98
  99 /* The empty Unicode object is shared to improve performance. */
 100 static PyUnicodeObject *unicode_empty;
 101
 102 /* Single character Unicode strings in the Latin-1 range are being
 103    shared as well. */
 104 static PyUnicodeObject *unicode_latin1[256];
 105
 106 /* Default encoding to use and assume when NULL is passed as encoding
 107    parameter; it is initialized by _PyUnicode_Init().
 108
 109    Always use the PyUnicode_SetDefaultEncoding() and
 110    PyUnicode_GetDefaultEncoding() APIs to access this global.
 111
 112 */
 113 static char unicode_default_encoding[100];
 114
 115 /* Fast detection of the most frequent whitespace characters */
 116 const unsigned char _Py_ascii_whitespace[] = {
 117     0, 0, 0, 0, 0, 0, 0, 0,
 118 /*     case 0x0009: * CHARACTER TABULATION */
 119 /*     case 0x000A: * LINE FEED */
 120 /*     case 0x000B: * LINE TABULATION */
 121 /*     case 0x000C: * FORM FEED */
 122 /*     case 0x000D: * CARRIAGE RETURN */
 123     0, 1, 1, 1, 1, 1, 0, 0,
 124     0, 0, 0, 0, 0, 0, 0, 0,
 125 /*     case 0x001C: * FILE SEPARATOR */
 126 /*     case 0x001D: * GROUP SEPARATOR */
 127 /*     case 0x001E: * RECORD SEPARATOR */
 128 /*     case 0x001F: * UNIT SEPARATOR */
 129     0, 0, 0, 0, 1, 1, 1, 1,
 130 /*     case 0x0020: * SPACE */
 131     1, 0, 0, 0, 0, 0, 0, 0,
 132     0, 0, 0, 0, 0, 0, 0, 0,
 133     0, 0, 0, 0, 0, 0, 0, 0,
 134     0, 0, 0, 0, 0, 0, 0, 0,
 135
 136     0, 0, 0, 0, 0, 0, 0, 0,
 137     0, 0, 0, 0, 0, 0, 0, 0,
 138     0, 0, 0, 0, 0, 0, 0, 0,
 139     0, 0, 0, 0, 0, 0, 0, 0,
 140     0, 0, 0, 0, 0, 0, 0, 0,
 141     0, 0, 0, 0, 0, 0, 0, 0,
 142     0, 0, 0, 0, 0, 0, 0, 0,
 143     0, 0, 0, 0, 0, 0, 0, 0
 144 };
 145
 146 /* Same for linebreaks */
 147 static unsigned char ascii_linebreak[] = {
 148     0, 0, 0, 0, 0, 0, 0, 0,
 149 /*         0x000A, * LINE FEED */
 150 /*         0x000B, * LINE TABULATION */
 151 /*         0x000C, * FORM FEED */
 152 /*         0x000D, * CARRIAGE RETURN */
 153     0, 0, 1, 1, 1, 1, 0, 0,
 154     0, 0, 0, 0, 0, 0, 0, 0,
 155 /*         0x001C, * FILE SEPARATOR */
 156 /*         0x001D, * GROUP SEPARATOR */
 157 /*         0x001E, * RECORD SEPARATOR */
 158     0, 0, 0, 0, 1, 1, 1, 0,
 159     0, 0, 0, 0, 0, 0, 0, 0,
 160     0, 0, 0, 0, 0, 0, 0, 0,
 161     0, 0, 0, 0, 0, 0, 0, 0,
 162     0, 0, 0, 0, 0, 0, 0, 0,
 163
 164     0, 0, 0, 0, 0, 0, 0, 0,
 165     0, 0, 0, 0, 0, 0, 0, 0,
 166     0, 0, 0, 0, 0, 0, 0, 0,
 167     0, 0, 0, 0, 0, 0, 0, 0,
 168     0, 0, 0, 0, 0, 0, 0, 0,
 169     0, 0, 0, 0, 0, 0, 0, 0,
 170     0, 0, 0, 0, 0, 0, 0, 0,
 171     0, 0, 0, 0, 0, 0, 0, 0
 172 };
 173
 174
 175 Py_UNICODE
 176 PyUnicode_GetMax(void)
 177 {
 178 #ifdef Py_UNICODE_WIDE
 179     return 0x10FFFF;
 180 #else
 181     /* This is actually an illegal character, so it should
 182        not be passed to unichr. */
 183     return 0xFFFF;
 184 #endif
 185 }
 186
 187 /* --- Bloom Filters ----------------------------------------------------- */
 188
 189 /* stuff to implement simple "bloom filters" for Unicode characters.
 190    to keep things simple, we use a single bitmask, using the least 5
 191    bits from each unicode characters as the bit index. */
 192
 193 /* the linebreak mask is set up by Unicode_Init below */
 194
 195 #if LONG_BIT >= 128
 196 #define BLOOM_WIDTH 128
 197 #elif LONG_BIT >= 64
 198 #define BLOOM_WIDTH 64
 199 #elif LONG_BIT >= 32
 200 #define BLOOM_WIDTH 32
 201 #else
 202 #error "LONG_BIT is smaller than 32"
 203 #endif
 204
 205 #define BLOOM_MASK unsigned long
 206
 207 static BLOOM_MASK bloom_linebreak;
 208
 209 #define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
 210 #define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
 211
 212 #define BLOOM_LINEBREAK(ch)                                             \
 213     ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
 214      (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
 215
 216 Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
 217 {
 218     /* calculate simple bloom-style bitmask for a given unicode string */
 219
 220     BLOOM_MASK mask;
 221     Py_ssize_t i;
 222
 223     mask = 0;
 224     for (i = 0; i < len; i++)
 225         BLOOM_ADD(mask, ptr[i]);
 226
 227     return mask;
 228 }
 229
 230 Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
 231 {
 232     Py_ssize_t i;
 233
 234     for (i = 0; i < setlen; i++)
 235         if (set[i] == chr)
 236             return 1;
 237
 238     return 0;
 239 }
 240
 241 #define BLOOM_MEMBER(mask, chr, set, setlen)                    \
 242     BLOOM(mask, chr) && unicode_member(chr, set, setlen)
 243
 244 /* --- Unicode Object ----------------------------------------------------- */
 245
 246 static
 247 int unicode_resize(register PyUnicodeObject *unicode,
 248                    Py_ssize_t length)
 249 {
 250     void *oldstr;
 251
 252     /* Shortcut if there's nothing much to do. */
 253     if (unicode->length == length)
 254         goto reset;
 255
 256     /* Resizing shared object (unicode_empty or single character
 257        objects) in-place is not allowed. Use PyUnicode_Resize()
 258        instead ! */
 259
 260     if (unicode == unicode_empty ||
 261         (unicode->length == 1 &&
 262          unicode->str[0] < 256U &&
 263          unicode_latin1[unicode->str[0]] == unicode)) {
 264         PyErr_SetString(PyExc_SystemError,
 265                         "can't resize shared unicode objects");
 266         return -1;
 267     }
 268
 269     /* We allocate one more byte to make sure the string is Ux0000 terminated.
 270        The overallocation is also used by fastsearch, which assumes that it's
 271        safe to look at str[length] (without making any assumptions about what
 272        it contains). */
 273
 274     oldstr = unicode->str;
 275     unicode->str = PyObject_REALLOC(unicode->str,
 276                                     sizeof(Py_UNICODE) * (length + 1));
 277     if (!unicode->str) {
 278         unicode->str = (Py_UNICODE *)oldstr;
 279         PyErr_NoMemory();
 280         return -1;
 281     }
 282     unicode->str[length] = 0;
 283     unicode->length = length;
 284
 285   reset:
 286     /* Reset the object caches */
 287     if (unicode->defenc) {
 288         Py_CLEAR(unicode->defenc);
 289     }
 290     unicode->hash = -1;
 291
 292     return 0;
 293 }
 294
 295 /* We allocate one more byte to make sure the string is
 296    Ux0000 terminated; some code relies on that.
 297
 298    XXX This allocator could further be enhanced by assuring that the
 299    free list never reduces its size below 1.
 300
 301 */
 302
 303 static
 304 PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
 305 {
 306     register PyUnicodeObject *unicode;
 307
 308     /* Optimization for empty strings */
 309     if (length == 0 && unicode_empty != NULL) {
 310         Py_INCREF(unicode_empty);
 311         return unicode_empty;
 312     }
 313
 314     /* Ensure we won't overflow the size. */
 315     if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
 316         return (PyUnicodeObject *)PyErr_NoMemory();
 317     }
 318
 319     /* Unicode freelist & memory allocation */
 320     if (free_list) {
 321         unicode = free_list;
 322         free_list = *(PyUnicodeObject **)unicode;
 323         numfree--;
 324         if (unicode->str) {
 325             /* Keep-Alive optimization: we only upsize the buffer,
 326                never downsize it. */
 327             if ((unicode->length < length) &&
 328                 unicode_resize(unicode, length) < 0) {
 329                 PyObject_DEL(unicode->str);
 330                 unicode->str = NULL;
 331             }
 332         }
 333         else {
 334             size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
 335             unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
 336         }
 337         PyObject_INIT(unicode, &PyUnicode_Type);
 338     }
 339     else {
 340         size_t new_size;
 341         unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
 342         if (unicode == NULL)
 343             return NULL;
 344         new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
 345         unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
 346     }
 347
 348     if (!unicode->str) {
 349         PyErr_NoMemory();
 350         goto onError;
 351     }
 352     /* Initialize the first element to guard against cases where
 353      * the caller fails before initializing str -- unicode_resize()
 354      * reads str[0], and the Keep-Alive optimization can keep memory
 355      * allocated for str alive across a call to unicode_dealloc(unicode).
 356      * We don't want unicode_resize to read uninitialized memory in
 357      * that case.
 358      */
 359     unicode->str[0] = 0;
 360     unicode->str[length] = 0;
 361     unicode->length = length;
 362     unicode->hash = -1;
 363     unicode->defenc = NULL;
 364     return unicode;
 365
 366   onError:
 367     /* XXX UNREF/NEWREF interface should be more symmetrical */
 368     _Py_DEC_REFTOTAL;
 369     _Py_ForgetReference((PyObject *)unicode);
 370     PyObject_Del(unicode);
 371     return NULL;
 372 }
 373
 374 static
 375 void unicode_dealloc(register PyUnicodeObject *unicode)
 376 {
 377     if (PyUnicode_CheckExact(unicode) &&
 378         numfree < PyUnicode_MAXFREELIST) {
 379         /* Keep-Alive optimization */
 380         if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
 381             PyObject_DEL(unicode->str);
 382             unicode->str = NULL;
 383             unicode->length = 0;
 384         }
 385         if (unicode->defenc) {
 386             Py_CLEAR(unicode->defenc);
 387         }
 388         /* Add to free list */
 389         *(PyUnicodeObject **)unicode = free_list;
 390         free_list = unicode;
 391         numfree++;
 392     }
 393     else {
 394         PyObject_DEL(unicode->str);
 395         Py_XDECREF(unicode->defenc);
 396         Py_TYPE(unicode)->tp_free((PyObject *)unicode);
 397     }
 398 }
 399
 400 static
 401 int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
 402 {
 403     register PyUnicodeObject *v;
 404
 405     /* Argument checks */
 406     if (unicode == NULL) {
 407         PyErr_BadInternalCall();
 408         return -1;
 409     }
 410     v = *unicode;
 411     if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
 412         PyErr_BadInternalCall();
 413         return -1;
 414     }
 415
 416     /* Resizing unicode_empty and single character objects is not
 417        possible since these are being shared. We simply return a fresh
 418        copy with the same Unicode content. */
 419     if (v->length != length &&
 420         (v == unicode_empty || v->length == 1)) {
 421         PyUnicodeObject *w = _PyUnicode_New(length);
 422         if (w == NULL)
 423             return -1;
 424         Py_UNICODE_COPY(w->str, v->str,
 425                         length < v->length ? length : v->length);
 426         Py_DECREF(*unicode);
 427         *unicode = w;
 428         return 0;
 429     }
 430
 431     /* Note that we don't have to modify *unicode for unshared Unicode
 432        objects, since we can modify them in-place. */
 433     return unicode_resize(v, length);
 434 }
 435
 436 int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
 437 {
 438     return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
 439 }
 440
 441 PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
 442                                 Py_ssize_t size)
 443 {
 444     PyUnicodeObject *unicode;
 445
 446     /* If the Unicode data is known at construction time, we can apply
 447        some optimizations which share commonly used objects. */
 448     if (u != NULL) {
 449
 450         /* Optimization for empty strings */
 451         if (size == 0 && unicode_empty != NULL) {
 452             Py_INCREF(unicode_empty);
 453             return (PyObject *)unicode_empty;
 454         }
 455
 456         /* Single character Unicode objects in the Latin-1 range are
 457            shared when using this constructor */
 458         if (size == 1 && *u < 256) {
 459             unicode = unicode_latin1[*u];
 460             if (!unicode) {
 461                 unicode = _PyUnicode_New(1);
 462                 if (!unicode)
 463                     return NULL;
 464                 unicode->str[0] = *u;
 465                 unicode_latin1[*u] = unicode;
 466             }
 467             Py_INCREF(unicode);
 468             return (PyObject *)unicode;
 469         }
 470     }
 471
 472     unicode = _PyUnicode_New(size);
 473     if (!unicode)
 474         return NULL;
 475
 476     /* Copy the Unicode data into the new object */
 477     if (u != NULL)
 478         Py_UNICODE_COPY(unicode->str, u, size);
 479
 480     return (PyObject *)unicode;
 481 }
 482
 483 PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
 484 {
 485     PyUnicodeObject *unicode;
 486
 487     if (size < 0) {
 488         PyErr_SetString(PyExc_SystemError,
 489                         "Negative size passed to PyUnicode_FromStringAndSize");
 490         return NULL;
 491     }
 492
 493     /* If the Unicode data is known at construction time, we can apply
 494        some optimizations which share commonly used objects.
 495        Also, this means the input must be UTF-8, so fall back to the
 496        UTF-8 decoder at the end. */
 497     if (u != NULL) {
 498
 499         /* Optimization for empty strings */
 500         if (size == 0 && unicode_empty != NULL) {
 501             Py_INCREF(unicode_empty);
 502             return (PyObject *)unicode_empty;
 503         }
 504
 505         /* Single characters are shared when using this constructor.
 506            Restrict to ASCII, since the input must be UTF-8. */
 507         if (size == 1 && Py_CHARMASK(*u) < 128) {
 508             unicode = unicode_latin1[Py_CHARMASK(*u)];
 509             if (!unicode) {
 510                 unicode = _PyUnicode_New(1);
 511                 if (!unicode)
 512                     return NULL;
 513                 unicode->str[0] = Py_CHARMASK(*u);
 514                 unicode_latin1[Py_CHARMASK(*u)] = unicode;
 515             }
 516             Py_INCREF(unicode);
 517             return (PyObject *)unicode;
 518         }
 519
 520         return PyUnicode_DecodeUTF8(u, size, NULL);
 521     }
 522
 523     unicode = _PyUnicode_New(size);
 524     if (!unicode)
 525         return NULL;
 526
 527     return (PyObject *)unicode;
 528 }
 529
 530 PyObject *PyUnicode_FromString(const char *u)
 531 {
 532     size_t size = strlen(u);
 533     if (size > PY_SSIZE_T_MAX) {
 534         PyErr_SetString(PyExc_OverflowError, "input too long");
 535         return NULL;
 536     }
 537
 538     return PyUnicode_FromStringAndSize(u, size);
 539 }
 540
 541 #ifdef HAVE_WCHAR_H
 542
 543 #if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
 544 # define CONVERT_WCHAR_TO_SURROGATES
 545 #endif
 546
 547 #ifdef CONVERT_WCHAR_TO_SURROGATES
 548
 549 /* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
 550    to convert from UTF32 to UTF16. */
 551
 552 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
 553                                  Py_ssize_t size)
 554 {
 555     PyUnicodeObject *unicode;
 556     register Py_ssize_t i;
 557     Py_ssize_t alloc;
 558     const wchar_t *orig_w;
 559
 560     if (w == NULL) {
 561         PyErr_BadInternalCall();
 562         return NULL;
 563     }
 564
 565     alloc = size;
 566     orig_w = w;
 567     for (i = size; i > 0; i--) {
 568         if (*w > 0xFFFF)
 569             alloc++;
 570         w++;
 571     }
 572     w = orig_w;
 573     unicode = _PyUnicode_New(alloc);
 574     if (!unicode)
 575         return NULL;
 576
 577     /* Copy the wchar_t data into the new object */
 578     {
 579         register Py_UNICODE *u;
 580         u = PyUnicode_AS_UNICODE(unicode);
 581         for (i = size; i > 0; i--) {
 582             if (*w > 0xFFFF) {
 583                 wchar_t ordinal = *w++;
 584                 ordinal -= 0x10000;
 585                 *u++ = 0xD800 | (ordinal >> 10);
 586                 *u++ = 0xDC00 | (ordinal & 0x3FF);
 587             }
 588             else
 589                 *u++ = *w++;
 590         }
 591     }
 592     return (PyObject *)unicode;
 593 }
 594
 595 #else
 596
 597 PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
 598                                  Py_ssize_t size)
 599 {
 600     PyUnicodeObject *unicode;
 601
 602     if (w == NULL) {
 603         PyErr_BadInternalCall();
 604         return NULL;
 605     }
 606
 607     unicode = _PyUnicode_New(size);
 608     if (!unicode)
 609         return NULL;
 610
 611     /* Copy the wchar_t data into the new object */
 612 #ifdef HAVE_USABLE_WCHAR_T
 613     memcpy(unicode->str, w, size * sizeof(wchar_t));
 614 #else
 615     {
 616         register Py_UNICODE *u;
 617         register Py_ssize_t i;
 618         u = PyUnicode_AS_UNICODE(unicode);
 619         for (i = size; i > 0; i--)
 620             *u++ = *w++;
 621     }
 622 #endif
 623
 624     return (PyObject *)unicode;
 625 }
 626
 627 #endif /* CONVERT_WCHAR_TO_SURROGATES */
 628
 629 #undef CONVERT_WCHAR_TO_SURROGATES
 630
 631 static void
 632 makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
 633 {
 634     *fmt++ = '%';
 635     if (width) {
 636         if (zeropad)
 637             *fmt++ = '0';
 638         fmt += sprintf(fmt, "%d", width);
 639     }
 640     if (precision)
 641         fmt += sprintf(fmt, ".%d", precision);
 642     if (longflag)
 643         *fmt++ = 'l';
 644     else if (size_tflag) {
 645         char *f = PY_FORMAT_SIZE_T;
 646         while (*f)
 647             *fmt++ = *f++;
 648     }
 649     *fmt++ = c;
 650     *fmt = '\0';
 651 }
 652
 653 #define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
 654
 655 PyObject *
 656 PyUnicode_FromFormatV(const char *format, va_list vargs)
 657 {
 658     va_list count;
 659     Py_ssize_t callcount = 0;
 660     PyObject **callresults = NULL;
 661     PyObject **callresult = NULL;
 662     Py_ssize_t n = 0;
 663     int width = 0;
 664     int precision = 0;
 665     int zeropad;
 666     const char* f;
 667     Py_UNICODE *s;
 668     PyObject *string;
 669     /* used by sprintf */
 670     char buffer[21];
 671     /* use abuffer instead of buffer, if we need more space
 672      * (which can happen if there's a format specifier with width). */
 673     char *abuffer = NULL;
 674     char *realbuffer;
 675     Py_ssize_t abuffersize = 0;
 676     char fmt[60]; /* should be enough for %0width.precisionld */
 677     const char *copy;
 678
 679 #ifdef VA_LIST_IS_ARRAY
 680     Py_MEMCPY(count, vargs, sizeof(va_list));
 681 #else
 682 #ifdef  __va_copy
 683     __va_copy(count, vargs);
 684 #else
 685     count = vargs;
 686 #endif
 687 #endif
 688      /* step 1: count the number of %S/%R/%s format specifications
 689       * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
 690       * objects once during step 3 and put the result in an array) */
 691     for (f = format; *f; f++) {
 692          if (*f == '%') {
 693              if (*(f+1)=='%')
 694                  continue;
 695              if (*(f+1)=='S' || *(f+1)=='R')
 696                  ++callcount;
 697              while (isdigit((unsigned)*f))
 698                  width = (width*10) + *f++ - '0';
 699              while (*++f && *f != '%' && !isalpha((unsigned)*f))
 700                  ;
 701              if (*f == 's')
 702                  ++callcount;
 703          }
 704     }
 705     /* step 2: allocate memory for the results of
 706      * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
 707     if (callcount) {
 708         callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
 709         if (!callresults) {
 710             PyErr_NoMemory();
 711             return NULL;
 712         }
 713         callresult = callresults;
 714     }
 715     /* step 3: figure out how large a buffer we need */
 716     for (f = format; *f; f++) {
 717         if (*f == '%') {
 718             const char* p = f;
 719             width = 0;
 720             while (isdigit((unsigned)*f))
 721                 width = (width*10) + *f++ - '0';
 722             while (*++f && *f != '%' && !isalpha((unsigned)*f))
 723                 ;
 724
 725             /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
 726              * they don't affect the amount of space we reserve.
 727              */
 728             if ((*f == 'l' || *f == 'z') &&
 729                 (f[1] == 'd' || f[1] == 'u'))
 730                 ++f;
 731
 732             switch (*f) {
 733             case 'c':
 734                 (void)va_arg(count, int);
 735                 /* fall through... */
 736             case '%':
 737                 n++;
 738                 break;
 739             case 'd': case 'u': case 'i': case 'x':
 740                 (void) va_arg(count, int);
 741                 /* 20 bytes is enough to hold a 64-bit
 742                    integer.  Decimal takes the most space.
 743                    This isn't enough for octal.
 744                    If a width is specified we need more
 745                    (which we allocate later). */
 746                 if (width < 20)
 747                     width = 20;
 748                 n += width;
 749                 if (abuffersize < width)
 750                     abuffersize = width;
 751                 break;
 752             case 's':
 753             {
 754                 /* UTF-8 */
 755                 const char *s = va_arg(count, const char*);
 756                 PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
 757                 if (!str)
 758                     goto fail;
 759                 n += PyUnicode_GET_SIZE(str);
 760                 /* Remember the str and switch to the next slot */
 761                 *callresult++ = str;
 762                 break;
 763             }
 764             case 'U':
 765             {
 766                 PyObject *obj = va_arg(count, PyObject *);
 767                 assert(obj && PyUnicode_Check(obj));
 768                 n += PyUnicode_GET_SIZE(obj);
 769                 break;
 770             }
 771             case 'V':
 772             {
 773                 PyObject *obj = va_arg(count, PyObject *);
 774                 const char *str = va_arg(count, const char *);
 775                 assert(obj || str);
 776                 assert(!obj || PyUnicode_Check(obj));
 777                 if (obj)
 778                     n += PyUnicode_GET_SIZE(obj);
 779                 else
 780                     n += strlen(str);
 781                 break;
 782             }
 783             case 'S':
 784             {
 785                 PyObject *obj = va_arg(count, PyObject *);
 786                 PyObject *str;
 787                 assert(obj);
 788                 str = PyObject_Str(obj);
 789                 if (!str)
 790                     goto fail;
 791                 n += PyUnicode_GET_SIZE(str);
 792                 /* Remember the str and switch to the next slot */
 793                 *callresult++ = str;
 794                 break;
 795             }
 796             case 'R':
 797             {
 798                 PyObject *obj = va_arg(count, PyObject *);
 799                 PyObject *repr;
 800                 assert(obj);
 801                 repr = PyObject_Repr(obj);
 802                 if (!repr)
 803                     goto fail;
 804                 n += PyUnicode_GET_SIZE(repr);
 805                 /* Remember the repr and switch to the next slot */
 806                 *callresult++ = repr;
 807                 break;
 808             }
 809             case 'p':
 810                 (void) va_arg(count, int);
 811                 /* maximum 64-bit pointer representation:
 812                  * 0xffffffffffffffff
 813                  * so 19 characters is enough.
 814                  * XXX I count 18 -- what's the extra for?
 815                  */
 816                 n += 19;
 817                 break;
 818             default:
 819                 /* if we stumble upon an unknown
 820                    formatting code, copy the rest of
 821                    the format string to the output
 822                    string. (we cannot just skip the
 823                    code, since there's no way to know
 824                    what's in the argument list) */
 825                 n += strlen(p);
 826                 goto expand;
 827             }
 828         } else
 829             n++;
 830     }
 831   expand:
 832     if (abuffersize > 20) {
 833         abuffer = PyObject_Malloc(abuffersize);
 834         if (!abuffer) {
 835             PyErr_NoMemory();
 836             goto fail;
 837         }
 838         realbuffer = abuffer;
 839     }
 840     else
 841         realbuffer = buffer;
 842     /* step 4: fill the buffer */
 843     /* Since we've analyzed how much space we need for the worst case,
 844        we don't have to resize the string.
 845        There can be no errors beyond this point. */
 846     string = PyUnicode_FromUnicode(NULL, n);
 847     if (!string)
 848         goto fail;
 849
 850     s = PyUnicode_AS_UNICODE(string);
 851     callresult = callresults;
 852
 853     for (f = format; *f; f++) {
 854         if (*f == '%') {
 855             const char* p = f++;
 856             int longflag = 0;
 857             int size_tflag = 0;
 858             zeropad = (*f == '0');
 859             /* parse the width.precision part */
 860             width = 0;
 861             while (isdigit((unsigned)*f))
 862                 width = (width*10) + *f++ - '0';
 863             precision = 0;
 864             if (*f == '.') {
 865                 f++;
 866                 while (isdigit((unsigned)*f))
 867                     precision = (precision*10) + *f++ - '0';
 868             }
 869             /* handle the long flag, but only for %ld and %lu.
 870                others can be added when necessary. */
 871             if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
 872                 longflag = 1;
 873                 ++f;
 874             }
 875             /* handle the size_t flag. */
 876             if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
 877                 size_tflag = 1;
 878                 ++f;
 879             }
 880
 881             switch (*f) {
 882             case 'c':
 883                 *s++ = va_arg(vargs, int);
 884                 break;
 885             case 'd':
 886                 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
 887                 if (longflag)
 888                     sprintf(realbuffer, fmt, va_arg(vargs, long));
 889                 else if (size_tflag)
 890                     sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
 891                 else
 892                     sprintf(realbuffer, fmt, va_arg(vargs, int));
 893                 appendstring(realbuffer);
 894                 break;
 895             case 'u':
 896                 makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
 897                 if (longflag)
 898                     sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
 899                 else if (size_tflag)
 900                     sprintf(realbuffer, fmt, va_arg(vargs, size_t));
 901                 else
 902                     sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
 903                 appendstring(realbuffer);
 904                 break;
 905             case 'i':
 906                 makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
 907                 sprintf(realbuffer, fmt, va_arg(vargs, int));
 908                 appendstring(realbuffer);
 909                 break;
 910             case 'x':
 911                 makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
 912                 sprintf(realbuffer, fmt, va_arg(vargs, int));
 913                 appendstring(realbuffer);
 914                 break;
 915             case 's':
 916             {
 917                 /* unused, since we already have the result */
 918                 (void) va_arg(vargs, char *);
 919                 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
 920                                 PyUnicode_GET_SIZE(*callresult));
 921                 s += PyUnicode_GET_SIZE(*callresult);
 922                 /* We're done with the unicode()/repr() => forget it */
 923                 Py_DECREF(*callresult);
 924                 /* switch to next unicode()/repr() result */
 925                 ++callresult;
 926                 break;
 927             }
 928             case 'U':
 929             {
 930                 PyObject *obj = va_arg(vargs, PyObject *);
 931                 Py_ssize_t size = PyUnicode_GET_SIZE(obj);
 932                 Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
 933                 s += size;
 934                 break;
 935             }
 936             case 'V':
 937             {
 938                 PyObject *obj = va_arg(vargs, PyObject *);
 939                 const char *str = va_arg(vargs, const char *);
 940                 if (obj) {
 941                     Py_ssize_t size = PyUnicode_GET_SIZE(obj);
 942                     Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
 943                     s += size;
 944                 } else {
 945                     appendstring(str);
 946                 }
 947                 break;
 948             }
 949             case 'S':
 950             case 'R':
 951             {
 952                 Py_UNICODE *ucopy;
 953                 Py_ssize_t usize;
 954                 Py_ssize_t upos;
 955                 /* unused, since we already have the result */
 956                 (void) va_arg(vargs, PyObject *);
 957                 ucopy = PyUnicode_AS_UNICODE(*callresult);
 958                 usize = PyUnicode_GET_SIZE(*callresult);
 959                 for (upos = 0; upos<usize;)
 960                     *s++ = ucopy[upos++];
 961                 /* We're done with the unicode()/repr() => forget it */
 962                 Py_DECREF(*callresult);
 963                 /* switch to next unicode()/repr() result */
 964                 ++callresult;
 965                 break;
 966             }
 967             case 'p':
 968                 sprintf(buffer, "%p", va_arg(vargs, void*));
 969                 /* %p is ill-defined:  ensure leading 0x. */
 970                 if (buffer[1] == 'X')
 971                     buffer[1] = 'x';
 972                 else if (buffer[1] != 'x') {
 973                     memmove(buffer+2, buffer, strlen(buffer)+1);
 974                     buffer[0] = '0';
 975                     buffer[1] = 'x';
 976                 }
 977                 appendstring(buffer);
 978                 break;
 979             case '%':
 980                 *s++ = '%';
 981                 break;
 982             default:
 983                 appendstring(p);
 984                 goto end;
 985             }
 986         } else
 987             *s++ = *f;
 988     }
 989
 990   end:
 991     if (callresults)
 992         PyObject_Free(callresults);
 993     if (abuffer)
 994         PyObject_Free(abuffer);
 995     PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
 996     return string;
 997   fail:
 998     if (callresults) {
 999         PyObject **callresult2 = callresults;
1000         while (callresult2 < callresult) {
1001             Py_DECREF(*callresult2);
1002             ++callresult2;
1003         }
1004         PyObject_Free(callresults);
1005     }
1006     if (abuffer)
1007         PyObject_Free(abuffer);
1008     return NULL;
1009 }
1010
1011 #undef appendstring
1012
1013 PyObject *
1014 PyUnicode_FromFormat(const char *format, ...)
1015 {
1016     PyObject* ret;
1017     va_list vargs;
1018
1019 #ifdef HAVE_STDARG_PROTOTYPES
1020     va_start(vargs, format);
1021 #else
1022     va_start(vargs);
1023 #endif
1024     ret = PyUnicode_FromFormatV(format, vargs);
1025     va_end(vargs);
1026     return ret;
1027 }
1028
1029 Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
1030                                 wchar_t *w,
1031                                 Py_ssize_t size)
1032 {
1033     if (unicode == NULL) {
1034         PyErr_BadInternalCall();
1035         return -1;
1036     }
1037
1038     /* If possible, try to copy the 0-termination as well */
1039     if (size > PyUnicode_GET_SIZE(unicode))
1040         size = PyUnicode_GET_SIZE(unicode) + 1;
1041
1042 #ifdef HAVE_USABLE_WCHAR_T
1043     memcpy(w, unicode->str, size * sizeof(wchar_t));
1044 #else
1045     {
1046         register Py_UNICODE *u;
1047         register Py_ssize_t i;
1048         u = PyUnicode_AS_UNICODE(unicode);
1049         for (i = size; i > 0; i--)
1050             *w++ = *u++;
1051     }
1052 #endif
1053
1054     if (size > PyUnicode_GET_SIZE(unicode))
1055         return PyUnicode_GET_SIZE(unicode);
1056     else
1057         return size;
1058 }
1059
1060 #endif
1061
1062 PyObject *PyUnicode_FromOrdinal(int ordinal)
1063 {
1064     Py_UNICODE s[1];
1065
1066 #ifdef Py_UNICODE_WIDE
1067     if (ordinal < 0 || ordinal > 0x10ffff) {
1068         PyErr_SetString(PyExc_ValueError,
1069                         "unichr() arg not in range(0x110000) "
1070                         "(wide Python build)");
1071         return NULL;
1072     }
1073 #else
1074     if (ordinal < 0 || ordinal > 0xffff) {
1075         PyErr_SetString(PyExc_ValueError,
1076                         "unichr() arg not in range(0x10000) "
1077                         "(narrow Python build)");
1078         return NULL;
1079     }
1080 #endif
1081
1082     s[0] = (Py_UNICODE)ordinal;
1083     return PyUnicode_FromUnicode(s, 1);
1084 }
1085
1086 PyObject *PyUnicode_FromObject(register PyObject *obj)
1087 {
1088     /* XXX Perhaps we should make this API an alias of
1089        PyObject_Unicode() instead ?! */
1090     if (PyUnicode_CheckExact(obj)) {
1091         Py_INCREF(obj);
1092         return obj;
1093     }
1094     if (PyUnicode_Check(obj)) {
1095         /* For a Unicode subtype that's not a Unicode object,
1096            return a true Unicode object with the same data. */
1097         return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1098                                      PyUnicode_GET_SIZE(obj));
1099     }
1100     return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1101 }
1102
1103 PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1104                                       const char *encoding,
1105                                       const char *errors)
1106 {
1107     const char *s = NULL;
1108     Py_ssize_t len;
1109     PyObject *v;
1110
1111     if (obj == NULL) {
1112         PyErr_BadInternalCall();
1113         return NULL;
1114     }
1115
1116 #if 0
1117     /* For b/w compatibility we also accept Unicode objects provided
1118        that no encodings is given and then redirect to
1119        PyObject_Unicode() which then applies the additional logic for
1120        Unicode subclasses.
1121
1122        NOTE: This API should really only be used for object which
1123        represent *encoded* Unicode !
1124
1125     */
1126     if (PyUnicode_Check(obj)) {
1127         if (encoding) {
1128             PyErr_SetString(PyExc_TypeError,
1129                             "decoding Unicode is not supported");
1130             return NULL;
1131         }
1132         return PyObject_Unicode(obj);
1133     }
1134 #else
1135     if (PyUnicode_Check(obj)) {
1136         PyErr_SetString(PyExc_TypeError,
1137                         "decoding Unicode is not supported");
1138         return NULL;
1139     }
1140 #endif
1141
1142     /* Coerce object */
1143     if (PyString_Check(obj)) {
1144         s = PyString_AS_STRING(obj);
1145         len = PyString_GET_SIZE(obj);
1146     }
1147     else if (PyByteArray_Check(obj)) {
1148         /* Python 2.x specific */
1149         PyErr_Format(PyExc_TypeError,
1150                      "decoding bytearray is not supported");
1151         return NULL;
1152     }
1153     else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1154         /* Overwrite the error message with something more useful in
1155            case of a TypeError. */
1156         if (PyErr_ExceptionMatches(PyExc_TypeError))
1157             PyErr_Format(PyExc_TypeError,
1158                          "coercing to Unicode: need string or buffer, "
1159                          "%.80s found",
1160                          Py_TYPE(obj)->tp_name);
1161         goto onError;
1162     }
1163
1164     /* Convert to Unicode */
1165     if (len == 0) {
1166         Py_INCREF(unicode_empty);
1167         v = (PyObject *)unicode_empty;
1168     }
1169     else
1170         v = PyUnicode_Decode(s, len, encoding, errors);
1171
1172     return v;
1173
1174   onError:
1175     return NULL;
1176 }
1177
1178 PyObject *PyUnicode_Decode(const char *s,
1179                            Py_ssize_t size,
1180                            const char *encoding,
1181                            const char *errors)
1182 {
1183     PyObject *buffer = NULL, *unicode;
1184
1185     if (encoding == NULL)
1186         encoding = PyUnicode_GetDefaultEncoding();
1187
1188     /* Shortcuts for common default encodings */
1189     if (strcmp(encoding, "utf-8") == 0)
1190         return PyUnicode_DecodeUTF8(s, size, errors);
1191     else if (strcmp(encoding, "latin-1") == 0)
1192         return PyUnicode_DecodeLatin1(s, size, errors);
1193 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1194     else if (strcmp(encoding, "mbcs") == 0)
1195         return PyUnicode_DecodeMBCS(s, size, errors);
1196 #endif
1197     else if (strcmp(encoding, "ascii") == 0)
1198         return PyUnicode_DecodeASCII(s, size, errors);
1199
1200     /* Decode via the codec registry */
1201     buffer = PyBuffer_FromMemory((void *)s, size);
1202     if (buffer == NULL)
1203         goto onError;
1204     unicode = PyCodec_Decode(buffer, encoding, errors);
1205     if (unicode == NULL)
1206         goto onError;
1207     if (!PyUnicode_Check(unicode)) {
1208         PyErr_Format(PyExc_TypeError,
1209                      "decoder did not return an unicode object (type=%.400s)",
1210                      Py_TYPE(unicode)->tp_name);
1211         Py_DECREF(unicode);
1212         goto onError;
1213     }
1214     Py_DECREF(buffer);
1215     return unicode;
1216
1217   onError:
1218     Py_XDECREF(buffer);
1219     return NULL;
1220 }
1221
1222 PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1223                                     const char *encoding,
1224                                     const char *errors)
1225 {
1226     PyObject *v;
1227
1228     if (!PyUnicode_Check(unicode)) {
1229         PyErr_BadArgument();
1230         goto onError;
1231     }
1232
1233     if (encoding == NULL)
1234         encoding = PyUnicode_GetDefaultEncoding();
1235
1236     /* Decode via the codec registry */
1237     v = PyCodec_Decode(unicode, encoding, errors);
1238     if (v == NULL)
1239         goto onError;
1240     return v;
1241
1242   onError:
1243     return NULL;
1244 }
1245
1246 PyObject *PyUnicode_Encode(const Py_UNICODE *s,
1247                            Py_ssize_t size,
1248                            const char *encoding,
1249                            const char *errors)
1250 {
1251     PyObject *v, *unicode;
1252
1253     unicode = PyUnicode_FromUnicode(s, size);
1254     if (unicode == NULL)
1255         return NULL;
1256     v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1257     Py_DECREF(unicode);
1258     return v;
1259 }
1260
1261 PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1262                                     const char *encoding,
1263                                     const char *errors)
1264 {
1265     PyObject *v;
1266
1267     if (!PyUnicode_Check(unicode)) {
1268         PyErr_BadArgument();
1269         goto onError;
1270     }
1271
1272     if (encoding == NULL)
1273         encoding = PyUnicode_GetDefaultEncoding();
1274
1275     /* Encode via the codec registry */
1276     v = PyCodec_Encode(unicode, encoding, errors);
1277     if (v == NULL)
1278         goto onError;
1279     return v;
1280
1281   onError:
1282     return NULL;
1283 }
1284
1285 PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1286                                     const char *encoding,
1287                                     const char *errors)
1288 {
1289     PyObject *v;
1290
1291     if (!PyUnicode_Check(unicode)) {
1292         PyErr_BadArgument();
1293         goto onError;
1294     }
1295
1296     if (encoding == NULL)
1297         encoding = PyUnicode_GetDefaultEncoding();
1298
1299     /* Shortcuts for common default encodings */
1300     if (errors == NULL) {
1301         if (strcmp(encoding, "utf-8") == 0)
1302             return PyUnicode_AsUTF8String(unicode);
1303         else if (strcmp(encoding, "latin-1") == 0)
1304             return PyUnicode_AsLatin1String(unicode);
1305 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1306         else if (strcmp(encoding, "mbcs") == 0)
1307             return PyUnicode_AsMBCSString(unicode);
1308 #endif
1309         else if (strcmp(encoding, "ascii") == 0)
1310             return PyUnicode_AsASCIIString(unicode);
1311     }
1312
1313     /* Encode via the codec registry */
1314     v = PyCodec_Encode(unicode, encoding, errors);
1315     if (v == NULL)
1316         goto onError;
1317     if (!PyString_Check(v)) {
1318         PyErr_Format(PyExc_TypeError,
1319                      "encoder did not return a string object (type=%.400s)",
1320                      Py_TYPE(v)->tp_name);
1321         Py_DECREF(v);
1322         goto onError;
1323     }
1324     return v;
1325
1326   onError:
1327     return NULL;
1328 }
1329
1330 PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1331                                             const char *errors)
1332 {
1333     PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1334
1335     if (v)
1336         return v;
1337     v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1338     if (v && errors == NULL)
1339         ((PyUnicodeObject *)unicode)->defenc = v;
1340     return v;
1341 }
1342
1343 Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1344 {
1345     if (!PyUnicode_Check(unicode)) {
1346         PyErr_BadArgument();
1347         goto onError;
1348     }
1349     return PyUnicode_AS_UNICODE(unicode);
1350
1351   onError:
1352     return NULL;
1353 }
1354
1355 Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
1356 {
1357     if (!PyUnicode_Check(unicode)) {
1358         PyErr_BadArgument();
1359         goto onError;
1360     }
1361     return PyUnicode_GET_SIZE(unicode);
1362
1363   onError:
1364     return -1;
1365 }
1366
1367 const char *PyUnicode_GetDefaultEncoding(void)
1368 {
1369     return unicode_default_encoding;
1370 }
1371
1372 int PyUnicode_SetDefaultEncoding(const char *encoding)
1373 {
1374     PyObject *v;
1375
1376     /* Make sure the encoding is valid. As side effect, this also
1377        loads the encoding into the codec registry cache. */
1378     v = _PyCodec_Lookup(encoding);
1379     if (v == NULL)
1380         goto onError;
1381     Py_DECREF(v);
1382     strncpy(unicode_default_encoding,
1383             encoding,
1384             sizeof(unicode_default_encoding));
1385     return 0;
1386
1387   onError:
1388     return -1;
1389 }
1390
1391 /* error handling callback helper:
1392    build arguments, call the callback and check the arguments,
1393    if no exception occurred, copy the replacement to the output
1394    and adjust various state variables.
1395    return 0 on success, -1 on error
1396 */
1397
1398 static
1399 int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1400                                      const char *encoding, const char *reason,
1401                                      const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1402                                      Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1403                                      PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
1404 {
1405     static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
1406
1407     PyObject *restuple = NULL;
1408     PyObject *repunicode = NULL;
1409     Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1410     Py_ssize_t requiredsize;
1411     Py_ssize_t newpos;
1412     Py_UNICODE *repptr;
1413     Py_ssize_t repsize;
1414     int res = -1;
1415
1416     if (*errorHandler == NULL) {
1417         *errorHandler = PyCodec_LookupError(errors);
1418         if (*errorHandler == NULL)
1419             goto onError;
1420     }
1421
1422     if (*exceptionObject == NULL) {
1423         *exceptionObject = PyUnicodeDecodeError_Create(
1424             encoding, input, insize, *startinpos, *endinpos, reason);
1425         if (*exceptionObject == NULL)
1426             goto onError;
1427     }
1428     else {
1429         if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1430             goto onError;
1431         if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1432             goto onError;
1433         if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1434             goto onError;
1435     }
1436
1437     restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1438     if (restuple == NULL)
1439         goto onError;
1440     if (!PyTuple_Check(restuple)) {
1441         PyErr_SetString(PyExc_TypeError, &argparse[4]);
1442         goto onError;
1443     }
1444     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1445         goto onError;
1446     if (newpos<0)
1447         newpos = insize+newpos;
1448     if (newpos<0 || newpos>insize) {
1449         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1450         goto onError;
1451     }
1452
1453     /* need more space? (at least enough for what we
1454        have+the replacement+the rest of the string (starting
1455        at the new input position), so we won't have to check space
1456        when there are no errors in the rest of the string) */
1457     repptr = PyUnicode_AS_UNICODE(repunicode);
1458     repsize = PyUnicode_GET_SIZE(repunicode);
1459     requiredsize = *outpos + repsize + insize-newpos;
1460     if (requiredsize > outsize) {
1461         if (requiredsize<2*outsize)
1462             requiredsize = 2*outsize;
1463         if (_PyUnicode_Resize(output, requiredsize) < 0)
1464             goto onError;
1465         *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1466     }
1467     *endinpos = newpos;
1468     *inptr = input + newpos;
1469     Py_UNICODE_COPY(*outptr, repptr, repsize);
1470     *outptr += repsize;
1471     *outpos += repsize;
1472     /* we made it! */
1473     res = 0;
1474
1475   onError:
1476     Py_XDECREF(restuple);
1477     return res;
1478 }
1479
1480 /* --- UTF-7 Codec -------------------------------------------------------- */
1481
1482 /* See RFC2152 for details.  We encode conservatively and decode liberally. */
1483
1484 /* Three simple macros defining base-64. */
1485
/* Is c a base-64 character?
 *
 * The test is spelled out with explicit ASCII ranges instead of
 * isalnum(): isalnum() depends on the current locale (it may accept
 * bytes >= 0x80) and is undefined for arguments outside the range of
 * unsigned char, whereas the base-64 alphabet is fixed. */

#define IS_BASE64(c)                    \
    (((c) >= 'A' && (c) <= 'Z') ||      \
     ((c) >= 'a' && (c) <= 'z') ||      \
     ((c) >= '0' && (c) <= '9') ||      \
     (c) == '+' || (c) == '/')
1490
/* Map a base-64 character c to its 6-bit value (A-Z: 0-25, a-z: 26-51,
   0-9: 52-61, '+': 62).  Any other argument, including '/', yields 63. */

#define FROM_BASE64(c)                                                  \
    ((c) == '+' ? 62 :                                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     63)
1498
/* Base-64 character for the bottom 6 bits of n (higher bits are masked
   off). */

#define TO_BASE64(n)                            \
    (("ABCDEFGHIJKLMNOPQRSTUVWXYZ"              \
      "abcdefghijklmnopqrstuvwxyz"              \
      "0123456789+/")[(n) & 0x3f])
1503
/* DECODE_DIRECT: true when a byte found in a UTF-7 string outside a
 * shift sequence stands for itself.  Decoding is permissive: every
 * ASCII byte (value <= 127) qualifies except '+', which opens a base64
 * section. */

#define DECODE_DIRECT(c)                                \
    (!((c) > 127 || (c) == '+'))
1511
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by character code (0..127) and is only ever read,
 * hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c must be in 1..127 to qualify at all (the range test also guards the
 * table lookup).  directO and directWS are parenthesized in the
 * expansion so that compound argument expressions such as `a || b` keep
 * their meaning. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
1557
1558 PyObject *PyUnicode_DecodeUTF7(const char *s,
1559                                Py_ssize_t size,
1560                                const char *errors)
1561 {
1562     return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1563 }
1564
1565 /* The decoder.  The only state we preserve is our read position,
1566  * i.e. how many characters we have consumed.  So if we end in the
1567  * middle of a shift sequence we have to back off the read position
1568  * and the output to the beginning of the sequence, otherwise we lose
1569  * all the shift state (seen bits, number of bits seen, high
1570  * surrogate). */
1571
1572 PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1573                                        Py_ssize_t size,
1574                                        const char *errors,
1575                                        Py_ssize_t *consumed)
1576 {
1577     const char *starts = s;
1578     Py_ssize_t startinpos;
1579     Py_ssize_t endinpos;
1580     Py_ssize_t outpos;
1581     const char *e;
1582     PyUnicodeObject *unicode;
1583     Py_UNICODE *p;
1584     const char *errmsg = "";
1585     int inShift = 0;
1586     Py_UNICODE *shiftOutStart;
1587     unsigned int base64bits = 0;
1588     unsigned long base64buffer = 0;
1589     Py_UNICODE surrogate = 0;
1590     PyObject *errorHandler = NULL;
1591     PyObject *exc = NULL;
1592
1593     unicode = _PyUnicode_New(size);
1594     if (!unicode)
1595         return NULL;
1596     if (size == 0) {
1597         if (consumed)
1598             *consumed = 0;
1599         return (PyObject *)unicode;
1600     }
1601
1602     p = unicode->str;
1603     shiftOutStart = p;
1604     e = s + size;
1605
1606     while (s < e) {
1607         Py_UNICODE ch = (unsigned char) *s;
1608
1609         if (inShift) { /* in a base-64 section */
1610             if (IS_BASE64(ch)) { /* consume a base-64 character */
1611                 base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1612                 base64bits += 6;
1613                 s++;
1614                 if (base64bits >= 16) {
1615                     /* we have enough bits for a UTF-16 value */
1616                     Py_UNICODE outCh = (Py_UNICODE)
1617                                        (base64buffer >> (base64bits-16));
1618                     base64bits -= 16;
1619                     base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1620                     if (surrogate) {
1621                         /* expecting a second surrogate */
1622                         if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1623 #ifdef Py_UNICODE_WIDE
1624                             *p++ = (((surrogate & 0x3FF)<<10)
1625                                     | (outCh & 0x3FF)) + 0x10000;
1626 #else
1627                             *p++ = surrogate;
1628                             *p++ = outCh;
1629 #endif
1630                             surrogate = 0;
1631                         }
1632                         else {
1633                             surrogate = 0;
1634                             errmsg = "second surrogate missing";
1635                             goto utf7Error;
1636                         }
1637                     }
1638                     else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
1639                         /* first surrogate */
1640                         surrogate = outCh;
1641                     }
1642                     else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1643                         errmsg = "unexpected second surrogate";
1644                         goto utf7Error;
1645                     }
1646                     else {
1647                         *p++ = outCh;
1648                     }
1649                 }
1650             }
1651             else { /* now leaving a base-64 section */
1652                 inShift = 0;
1653                 s++;
1654                 if (surrogate) {
1655                     errmsg = "second surrogate missing at end of shift sequence";
1656                     goto utf7Error;
1657                 }
1658                 if (base64bits > 0) { /* left-over bits */
1659                     if (base64bits >= 6) {
1660                         /* We've seen at least one base-64 character */
1661                         errmsg = "partial character in shift sequence";
1662                         goto utf7Error;
1663                     }
1664                     else {
1665                         /* Some bits remain; they should be zero */
1666                         if (base64buffer != 0) {
1667                             errmsg = "non-zero padding bits in shift sequence";
1668                             goto utf7Error;
1669                         }
1670                     }
1671                 }
1672                 if (ch != '-') {
1673                     /* '-' is absorbed; other terminating
1674                        characters are preserved */
1675                     *p++ = ch;
1676                 }
1677             }
1678         }
1679         else if ( ch == '+' ) {
1680             startinpos = s-starts;
1681             s++; /* consume '+' */
1682             if (s < e && *s == '-') { /* '+-' encodes '+' */
1683                 s++;
1684                 *p++ = '+';
1685             }
1686             else { /* begin base64-encoded section */
1687                 inShift = 1;
1688                 shiftOutStart = p;
1689                 base64bits = 0;
1690             }
1691         }
1692         else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
1693             *p++ = ch;
1694             s++;
1695         }
1696         else {
1697             startinpos = s-starts;
1698             s++;
1699             errmsg = "unexpected special character";
1700             goto utf7Error;
1701         }
1702         continue;
1703 utf7Error:
1704         outpos = p-PyUnicode_AS_UNICODE(unicode);
1705         endinpos = s-starts;
1706         if (unicode_decode_call_errorhandler(
1707                 errors, &errorHandler,
1708                 "utf7", errmsg,
1709                 starts, size, &startinpos, &endinpos, &exc, &s,
1710                 &unicode, &outpos, &p))
1711             goto onError;
1712     }
1713
1714     /* end of string */
1715
1716     if (inShift && !consumed) { /* in shift sequence, no more to follow */
1717         /* if we're in an inconsistent state, that's an error */
1718         if (surrogate ||
1719                 (base64bits >= 6) ||
1720                 (base64bits > 0 && base64buffer != 0)) {
1721             outpos = p-PyUnicode_AS_UNICODE(unicode);
1722             endinpos = size;
1723             if (unicode_decode_call_errorhandler(
1724                     errors, &errorHandler,
1725                     "utf7", "unterminated shift sequence",
1726                     starts, size, &startinpos, &endinpos, &exc, &s,
1727                     &unicode, &outpos, &p))
1728                 goto onError;
1729         }
1730     }
1731
1732     /* return state */
1733     if (consumed) {
1734         if (inShift) {
1735             p = shiftOutStart; /* back off output */
1736             *consumed = startinpos;
1737         }
1738         else {
1739             *consumed = s-starts;
1740         }
1741     }
1742
1743     if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
1744         goto onError;
1745
1746     Py_XDECREF(errorHandler);
1747     Py_XDECREF(exc);
1748     return (PyObject *)unicode;
1749
1750   onError:
1751     Py_XDECREF(errorHandler);
1752     Py_XDECREF(exc);
1753     Py_DECREF(unicode);
1754     return NULL;
1755 }
1756
1757
1758 PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1759                                Py_ssize_t size,
1760                                int base64SetO,
1761                                int base64WhiteSpace,
1762                                const char *errors)
1763 {
1764     PyObject *v;
1765     /* It might be possible to tighten this worst case */
1766     Py_ssize_t allocated = 8 * size;
1767     int inShift = 0;
1768     Py_ssize_t i = 0;
1769     unsigned int base64bits = 0;
1770     unsigned long base64buffer = 0;
1771     char * out;
1772     char * start;
1773
1774     if (allocated / 8 != size)
1775         return PyErr_NoMemory();
1776
1777     if (size == 0)
1778         return PyString_FromStringAndSize(NULL, 0);
1779
1780     v = PyString_FromStringAndSize(NULL, allocated);
1781     if (v == NULL)
1782         return NULL;
1783
1784     start = out = PyString_AS_STRING(v);
1785     for (;i < size; ++i) {
1786         Py_UNICODE ch = s[i];
1787
1788         if (inShift) {
1789             if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1790                 /* shifting out */
1791                 if (base64bits) { /* output remaining bits */
1792                     *out++ = TO_BASE64(base64buffer << (6-base64bits));
1793                     base64buffer = 0;
1794                     base64bits = 0;
1795                 }
1796                 inShift = 0;
1797                 /* Characters not in the BASE64 set implicitly unshift the sequence
1798                    so no '-' is required, except if the character is itself a '-' */
1799                 if (IS_BASE64(ch) || ch == '-') {
1800                     *out++ = '-';
1801                 }
1802                 *out++ = (char) ch;
1803             }
1804             else {
1805                 goto encode_char;
1806             }
1807         }
1808         else { /* not in a shift sequence */
1809             if (ch == '+') {
1810                 *out++ = '+';
1811                         *out++ = '-';
1812             }
1813             else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1814                 *out++ = (char) ch;
1815             }
1816             else {
1817                 *out++ = '+';
1818                 inShift = 1;
1819                 goto encode_char;
1820             }
1821         }
1822         continue;
1823 encode_char:
1824 #ifdef Py_UNICODE_WIDE
1825         if (ch >= 0x10000) {
1826             /* code first surrogate */
1827             base64bits += 16;
1828             base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
1829             while (base64bits >= 6) {
1830                 *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1831                 base64bits -= 6;
1832             }
1833             /* prepare second surrogate */
1834             ch =  0xDC00 | ((ch-0x10000) & 0x3FF);
1835         }
1836 #endif
1837         base64bits += 16;
1838         base64buffer = (base64buffer << 16) | ch;
1839         while (base64bits >= 6) {
1840             *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1841             base64bits -= 6;
1842         }
1843     }
1844     if (base64bits)
1845         *out++= TO_BASE64(base64buffer << (6-base64bits) );
1846     if (inShift)
1847         *out++ = '-';
1848
1849     if (_PyString_Resize(&v, out - start))
1850         return NULL;
1851     return v;
1852 }
1853
1854 #undef IS_BASE64
1855 #undef FROM_BASE64
1856 #undef TO_BASE64
1857 #undef DECODE_DIRECT
1858 #undef ENCODE_DIRECT
1859
1860 /* --- UTF-8 Codec -------------------------------------------------------- */
1861
/* Lookup table indexed by the first byte of a UTF-8 sequence.  The entry is
   the total number of bytes in the sequence that byte introduces; zero
   marks a byte that cannot start a sequence.  The table is only ever read,
   so it is declared const. */
static const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
1883
1884 PyObject *PyUnicode_DecodeUTF8(const char *s,
1885                                Py_ssize_t size,
1886                                const char *errors)
1887 {
1888     return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1889 }
1890
1891 PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
1892                                        Py_ssize_t size,
1893                                        const char *errors,
1894                                        Py_ssize_t *consumed)
1895 {
1896     const char *starts = s;
1897     int n;
1898     int k;
1899     Py_ssize_t startinpos;
1900     Py_ssize_t endinpos;
1901     Py_ssize_t outpos;
1902     const char *e;
1903     PyUnicodeObject *unicode;
1904     Py_UNICODE *p;
1905     const char *errmsg = "";
1906     PyObject *errorHandler = NULL;
1907     PyObject *exc = NULL;
1908
1909     /* Note: size will always be longer than the resulting Unicode
1910        character count */
1911     unicode = _PyUnicode_New(size);
1912     if (!unicode)
1913         return NULL;
1914     if (size == 0) {
1915         if (consumed)
1916             *consumed = 0;
1917         return (PyObject *)unicode;
1918     }
1919
1920     /* Unpack UTF-8 encoded data */
1921     p = unicode->str;
1922     e = s + size;
1923
1924     while (s < e) {
1925         Py_UCS4 ch = (unsigned char)*s;
1926
1927         if (ch < 0x80) {
1928             *p++ = (Py_UNICODE)ch;
1929             s++;
1930             continue;
1931         }
1932
1933         n = utf8_code_length[ch];
1934
1935         if (s + n > e) {
1936             if (consumed)
1937                 break;
1938             else {
1939                 errmsg = "unexpected end of data";
1940                 startinpos = s-starts;
1941                 endinpos = startinpos+1;
1942                 for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
1943                     endinpos++;
1944                 goto utf8Error;
1945             }
1946         }
1947
1948         switch (n) {
1949
1950         case 0:
1951             errmsg = "invalid start byte";
1952             startinpos = s-starts;
1953             endinpos = startinpos+1;
1954             goto utf8Error;
1955
1956         case 1:
1957             errmsg = "internal error";
1958             startinpos = s-starts;
1959             endinpos = startinpos+1;
1960             goto utf8Error;
1961
1962         case 2:
1963             if ((s[1] & 0xc0) != 0x80) {
1964                 errmsg = "invalid continuation byte";
1965                 startinpos = s-starts;
1966                 endinpos = startinpos + 1;
1967                 goto utf8Error;
1968             }
1969             ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
1970             assert ((ch > 0x007F) && (ch <= 0x07FF));
1971             *p++ = (Py_UNICODE)ch;
1972             break;
1973
1974         case 3:
1975             /* XXX: surrogates shouldn't be valid UTF-8!
1976                see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
1977                (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
1978                Uncomment the 2 lines below to make them invalid,
1979                codepoints: d800-dfff; UTF-8: \xed\xa0\x80-\xed\xbf\xbf. */
1980             if ((s[1] & 0xc0) != 0x80 ||
1981                 (s[2] & 0xc0) != 0x80 ||
1982                 ((unsigned char)s[0] == 0xE0 &&
1983                  (unsigned char)s[1] < 0xA0)/* ||
1984                 ((unsigned char)s[0] == 0xED &&
1985                  (unsigned char)s[1] > 0x9F)*/) {
1986                 errmsg = "invalid continuation byte";
1987                 startinpos = s-starts;
1988                 endinpos = startinpos + 1;
1989
1990                 /* if s[1] first two bits are 1 and 0, then the invalid
1991                    continuation byte is s[2], so increment endinpos by 1,
1992                    if not, s[1] is invalid and endinpos doesn't need to
1993                    be incremented. */
1994                 if ((s[1] & 0xC0) == 0x80)
1995                     endinpos++;
1996                 goto utf8Error;
1997             }
1998             ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
1999             assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2000             *p++ = (Py_UNICODE)ch;
2001             break;
2002
2003         case 4:
2004             if ((s[1] & 0xc0) != 0x80 ||
2005                 (s[2] & 0xc0) != 0x80 ||
2006                 (s[3] & 0xc0) != 0x80 ||
2007                 ((unsigned char)s[0] == 0xF0 &&
2008                  (unsigned char)s[1] < 0x90) ||
2009                 ((unsigned char)s[0] == 0xF4 &&
2010                  (unsigned char)s[1] > 0x8F)) {
2011                 errmsg = "invalid continuation byte";
2012                 startinpos = s-starts;
2013                 endinpos = startinpos + 1;
2014                 if ((s[1] & 0xC0) == 0x80) {
2015                     endinpos++;
2016                     if ((s[2] & 0xC0) == 0x80)
2017                         endinpos++;
2018                 }
2019                 goto utf8Error;
2020             }
2021             ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2022                  ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2023             assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2024
2025 #ifdef Py_UNICODE_WIDE
2026             *p++ = (Py_UNICODE)ch;
2027 #else
2028             /*  compute and append the two surrogates: */
2029
2030             /*  translate from 10000..10FFFF to 0..FFFF */
2031             ch -= 0x10000;
2032
2033             /*  high surrogate = top 10 bits added to D800 */
2034             *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
2035
2036             /*  low surrogate = bottom 10 bits added to DC00 */
2037             *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
2038 #endif
2039             break;
2040         }
2041         s += n;
2042         continue;
2043
2044       utf8Error:
2045         outpos = p-PyUnicode_AS_UNICODE(unicode);
2046         if (unicode_decode_call_errorhandler(
2047                 errors, &errorHandler,
2048                 "utf8", errmsg,
2049                 starts, size, &startinpos, &endinpos, &exc, &s,
2050                 &unicode, &outpos, &p))
2051             goto onError;
2052     }
2053     if (consumed)
2054         *consumed = s-starts;
2055
2056     /* Adjust length */
2057     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2058         goto onError;
2059
2060     Py_XDECREF(errorHandler);
2061     Py_XDECREF(exc);
2062     return (PyObject *)unicode;
2063
2064   onError:
2065     Py_XDECREF(errorHandler);
2066     Py_XDECREF(exc);
2067     Py_DECREF(unicode);
2068     return NULL;
2069 }
2070
2071 /* Allocation strategy:  if the string is short, convert into a stack buffer
2072    and allocate exactly as much space needed at the end.  Else allocate the
2073    maximum possible needed (4 result bytes per Unicode character), and return
2074    the excess memory at the end.
2075 */
2076 PyObject *
2077 PyUnicode_EncodeUTF8(const Py_UNICODE *s,
2078                      Py_ssize_t size,
2079                      const char *errors)
2080 {
2081 #define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
2082
2083     Py_ssize_t i;           /* index into s of next input byte */
2084     PyObject *v;        /* result string object */
2085     char *p;            /* next free byte in output buffer */
2086     Py_ssize_t nallocated;  /* number of result bytes allocated */
2087     Py_ssize_t nneeded;        /* number of result bytes needed */
2088     char stackbuf[MAX_SHORT_UNICHARS * 4];
2089
2090     assert(s != NULL);
2091     assert(size >= 0);
2092
2093     if (size <= MAX_SHORT_UNICHARS) {
2094         /* Write into the stack buffer; nallocated can't overflow.
2095          * At the end, we'll allocate exactly as much heap space as it
2096          * turns out we need.
2097          */
2098         nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2099         v = NULL;   /* will allocate after we're done */
2100         p = stackbuf;
2101     }
2102     else {
2103         /* Overallocate on the heap, and give the excess back at the end. */
2104         nallocated = size * 4;
2105         if (nallocated / 4 != size)  /* overflow! */
2106             return PyErr_NoMemory();
2107         v = PyString_FromStringAndSize(NULL, nallocated);
2108         if (v == NULL)
2109             return NULL;
2110         p = PyString_AS_STRING(v);
2111     }
2112
2113     for (i = 0; i < size;) {
2114         Py_UCS4 ch = s[i++];
2115
2116         if (ch < 0x80)
2117             /* Encode ASCII */
2118             *p++ = (char) ch;
2119
2120         else if (ch < 0x0800) {
2121             /* Encode Latin-1 */
2122             *p++ = (char)(0xc0 | (ch >> 6));
2123             *p++ = (char)(0x80 | (ch & 0x3f));
2124         }
2125         else {
2126             /* Encode UCS2 Unicode ordinals */
2127             if (ch < 0x10000) {
2128                 /* Special case: check for high surrogate */
2129                 if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2130                     Py_UCS4 ch2 = s[i];
2131                     /* Check for low surrogate and combine the two to
2132                        form a UCS4 value */
2133                     if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2134                         ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2135                         i++;
2136                         goto encodeUCS4;
2137                     }
2138                     /* Fall through: handles isolated high surrogates */
2139                 }
2140                 *p++ = (char)(0xe0 | (ch >> 12));
2141                 *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2142                 *p++ = (char)(0x80 | (ch & 0x3f));
2143                 continue;
2144             }
2145           encodeUCS4:
2146             /* Encode UCS4 Unicode ordinals */
2147             *p++ = (char)(0xf0 | (ch >> 18));
2148             *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2149             *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2150             *p++ = (char)(0x80 | (ch & 0x3f));
2151         }
2152     }
2153
2154     if (v == NULL) {
2155         /* This was stack allocated. */
2156         nneeded = p - stackbuf;
2157         assert(nneeded <= nallocated);
2158         v = PyString_FromStringAndSize(stackbuf, nneeded);
2159     }
2160     else {
2161         /* Cut back to size actually needed. */
2162         nneeded = p - PyString_AS_STRING(v);
2163         assert(nneeded <= nallocated);
2164         if (_PyString_Resize(&v, nneeded))
2165             return NULL;
2166     }
2167     return v;
2168
2169 #undef MAX_SHORT_UNICHARS
2170 }
2171
2172 PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2173 {
2174     if (!PyUnicode_Check(unicode)) {
2175         PyErr_BadArgument();
2176         return NULL;
2177     }
2178     return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2179                                 PyUnicode_GET_SIZE(unicode),
2180                                 NULL);
2181 }
2182
2183 /* --- UTF-32 Codec ------------------------------------------------------- */
2184
2185 PyObject *
2186 PyUnicode_DecodeUTF32(const char *s,
2187                       Py_ssize_t size,
2188                       const char *errors,
2189                       int *byteorder)
2190 {
2191     return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2192 }
2193
2194 PyObject *
2195 PyUnicode_DecodeUTF32Stateful(const char *s,
2196                               Py_ssize_t size,
2197                               const char *errors,
2198                               int *byteorder,
2199                               Py_ssize_t *consumed)
2200 {
2201     const char *starts = s;
2202     Py_ssize_t startinpos;
2203     Py_ssize_t endinpos;
2204     Py_ssize_t outpos;
2205     PyUnicodeObject *unicode;
2206     Py_UNICODE *p;
2207 #ifndef Py_UNICODE_WIDE
2208     int pairs = 0;
2209     const unsigned char *qq;
2210 #else
2211     const int pairs = 0;
2212 #endif
2213     const unsigned char *q, *e;
2214     int bo = 0;       /* assume native ordering by default */
2215     const char *errmsg = "";
2216     /* Offsets from q for retrieving bytes in the right order. */
2217 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2218     int iorder[] = {0, 1, 2, 3};
2219 #else
2220     int iorder[] = {3, 2, 1, 0};
2221 #endif
2222     PyObject *errorHandler = NULL;
2223     PyObject *exc = NULL;
2224
2225     q = (unsigned char *)s;
2226     e = q + size;
2227
2228     if (byteorder)
2229         bo = *byteorder;
2230
2231     /* Check for BOM marks (U+FEFF) in the input and adjust current
2232        byte order setting accordingly. In native mode, the leading BOM
2233        mark is skipped, in all other modes, it is copied to the output
2234        stream as-is (giving a ZWNBSP character). */
2235     if (bo == 0) {
2236         if (size >= 4) {
2237             const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2238                 (q[iorder[1]] << 8) | q[iorder[0]];
2239 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2240             if (bom == 0x0000FEFF) {
2241                 q += 4;
2242                 bo = -1;
2243             }
2244             else if (bom == 0xFFFE0000) {
2245                 q += 4;
2246                 bo = 1;
2247             }
2248 #else
2249             if (bom == 0x0000FEFF) {
2250                 q += 4;
2251                 bo = 1;
2252             }
2253             else if (bom == 0xFFFE0000) {
2254                 q += 4;
2255                 bo = -1;
2256             }
2257 #endif
2258         }
2259     }
2260
2261     if (bo == -1) {
2262         /* force LE */
2263         iorder[0] = 0;
2264         iorder[1] = 1;
2265         iorder[2] = 2;
2266         iorder[3] = 3;
2267     }
2268     else if (bo == 1) {
2269         /* force BE */
2270         iorder[0] = 3;
2271         iorder[1] = 2;
2272         iorder[2] = 1;
2273         iorder[3] = 0;
2274     }
2275
2276     /* On narrow builds we split characters outside the BMP into two
2277        codepoints => count how much extra space we need. */
2278 #ifndef Py_UNICODE_WIDE
2279     for (qq = q; qq < e; qq += 4)
2280         if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2281             pairs++;
2282 #endif
2283
2284     /* This might be one to much, because of a BOM */
2285     unicode = _PyUnicode_New((size+3)/4+pairs);
2286     if (!unicode)
2287         return NULL;
2288     if (size == 0)
2289         return (PyObject *)unicode;
2290
2291     /* Unpack UTF-32 encoded data */
2292     p = unicode->str;
2293
2294     while (q < e) {
2295         Py_UCS4 ch;
2296         /* remaining bytes at the end? (size should be divisible by 4) */
2297         if (e-q<4) {
2298             if (consumed)
2299                 break;
2300             errmsg = "truncated data";
2301             startinpos = ((const char *)q)-starts;
2302             endinpos = ((const char *)e)-starts;
2303             goto utf32Error;
2304             /* The remaining input chars are ignored if the callback
2305                chooses to skip the input */
2306         }
2307         ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2308             (q[iorder[1]] << 8) | q[iorder[0]];
2309
2310         if (ch >= 0x110000)
2311         {
2312             errmsg = "codepoint not in range(0x110000)";
2313             startinpos = ((const char *)q)-starts;
2314             endinpos = startinpos+4;
2315             goto utf32Error;
2316         }
2317 #ifndef Py_UNICODE_WIDE
2318         if (ch >= 0x10000)
2319         {
2320             *p++ = 0xD800 | ((ch-0x10000) >> 10);
2321             *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2322         }
2323         else
2324 #endif
2325             *p++ = ch;
2326         q += 4;
2327         continue;
2328       utf32Error:
2329         outpos = p-PyUnicode_AS_UNICODE(unicode);
2330         if (unicode_decode_call_errorhandler(
2331                 errors, &errorHandler,
2332                 "utf32", errmsg,
2333                 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2334                 &unicode, &outpos, &p))
2335             goto onError;
2336     }
2337
2338     if (byteorder)
2339         *byteorder = bo;
2340
2341     if (consumed)
2342         *consumed = (const char *)q-starts;
2343
2344     /* Adjust length */
2345     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2346         goto onError;
2347
2348     Py_XDECREF(errorHandler);
2349     Py_XDECREF(exc);
2350     return (PyObject *)unicode;
2351
2352   onError:
2353     Py_DECREF(unicode);
2354     Py_XDECREF(errorHandler);
2355     Py_XDECREF(exc);
2356     return NULL;
2357 }
2358
2359 PyObject *
2360 PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2361                       Py_ssize_t size,
2362                       const char *errors,
2363                       int byteorder)
2364 {
2365     PyObject *v;
2366     unsigned char *p;
2367     Py_ssize_t nsize, bytesize;
2368 #ifndef Py_UNICODE_WIDE
2369     Py_ssize_t i, pairs;
2370 #else
2371     const int pairs = 0;
2372 #endif
2373     /* Offsets from p for storing byte pairs in the right order. */
2374 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2375     int iorder[] = {0, 1, 2, 3};
2376 #else
2377     int iorder[] = {3, 2, 1, 0};
2378 #endif
2379
2380 #define STORECHAR(CH)                           \
2381     do {                                        \
2382         p[iorder[3]] = ((CH) >> 24) & 0xff;     \
2383         p[iorder[2]] = ((CH) >> 16) & 0xff;     \
2384         p[iorder[1]] = ((CH) >> 8) & 0xff;      \
2385         p[iorder[0]] = (CH) & 0xff;             \
2386         p += 4;                                 \
2387     } while(0)
2388
2389     /* In narrow builds we can output surrogate pairs as one codepoint,
2390        so we need less space. */
2391 #ifndef Py_UNICODE_WIDE
2392     for (i = pairs = 0; i < size-1; i++)
2393         if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2394             0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2395             pairs++;
2396 #endif
2397     nsize = (size - pairs + (byteorder == 0));
2398     bytesize = nsize * 4;
2399     if (bytesize / 4 != nsize)
2400         return PyErr_NoMemory();
2401     v = PyString_FromStringAndSize(NULL, bytesize);
2402     if (v == NULL)
2403         return NULL;
2404
2405     p = (unsigned char *)PyString_AS_STRING(v);
2406     if (byteorder == 0)
2407         STORECHAR(0xFEFF);
2408     if (size == 0)
2409         return v;
2410
2411     if (byteorder == -1) {
2412         /* force LE */
2413         iorder[0] = 0;
2414         iorder[1] = 1;
2415         iorder[2] = 2;
2416         iorder[3] = 3;
2417     }
2418     else if (byteorder == 1) {
2419         /* force BE */
2420         iorder[0] = 3;
2421         iorder[1] = 2;
2422         iorder[2] = 1;
2423         iorder[3] = 0;
2424     }
2425
2426     while (size-- > 0) {
2427         Py_UCS4 ch = *s++;
2428 #ifndef Py_UNICODE_WIDE
2429         if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2430             Py_UCS4 ch2 = *s;
2431             if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2432                 ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2433                 s++;
2434                 size--;
2435             }
2436         }
2437 #endif
2438         STORECHAR(ch);
2439     }
2440     return v;
2441 #undef STORECHAR
2442 }
2443
2444 PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2445 {
2446     if (!PyUnicode_Check(unicode)) {
2447         PyErr_BadArgument();
2448         return NULL;
2449     }
2450     return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2451                                  PyUnicode_GET_SIZE(unicode),
2452                                  NULL,
2453                                  0);
2454 }
2455
2456 /* --- UTF-16 Codec ------------------------------------------------------- */
2457
2458 PyObject *
2459 PyUnicode_DecodeUTF16(const char *s,
2460                       Py_ssize_t size,
2461                       const char *errors,
2462                       int *byteorder)
2463 {
2464     return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2465 }
2466
2467 PyObject *
2468 PyUnicode_DecodeUTF16Stateful(const char *s,
2469                               Py_ssize_t size,
2470                               const char *errors,
2471                               int *byteorder,
2472                               Py_ssize_t *consumed)
2473 {
2474     const char *starts = s;
2475     Py_ssize_t startinpos;
2476     Py_ssize_t endinpos;
2477     Py_ssize_t outpos;
2478     PyUnicodeObject *unicode;
2479     Py_UNICODE *p;
2480     const unsigned char *q, *e;
2481     int bo = 0;       /* assume native ordering by default */
2482     const char *errmsg = "";
2483     /* Offsets from q for retrieving byte pairs in the right order. */
2484 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2485     int ihi = 1, ilo = 0;
2486 #else
2487     int ihi = 0, ilo = 1;
2488 #endif
2489     PyObject *errorHandler = NULL;
2490     PyObject *exc = NULL;
2491
2492     /* Note: size will always be longer than the resulting Unicode
2493        character count */
2494     unicode = _PyUnicode_New(size);
2495     if (!unicode)
2496         return NULL;
2497     if (size == 0)
2498         return (PyObject *)unicode;
2499
2500     /* Unpack UTF-16 encoded data */
2501     p = unicode->str;
2502     q = (unsigned char *)s;
2503     e = q + size;
2504
2505     if (byteorder)
2506         bo = *byteorder;
2507
2508     /* Check for BOM marks (U+FEFF) in the input and adjust current
2509        byte order setting accordingly. In native mode, the leading BOM
2510        mark is skipped, in all other modes, it is copied to the output
2511        stream as-is (giving a ZWNBSP character). */
2512     if (bo == 0) {
2513         if (size >= 2) {
2514             const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
2515 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2516             if (bom == 0xFEFF) {
2517                 q += 2;
2518                 bo = -1;
2519             }
2520             else if (bom == 0xFFFE) {
2521                 q += 2;
2522                 bo = 1;
2523             }
2524 #else
2525             if (bom == 0xFEFF) {
2526                 q += 2;
2527                 bo = 1;
2528             }
2529             else if (bom == 0xFFFE) {
2530                 q += 2;
2531                 bo = -1;
2532             }
2533 #endif
2534         }
2535     }
2536
2537     if (bo == -1) {
2538         /* force LE */
2539         ihi = 1;
2540         ilo = 0;
2541     }
2542     else if (bo == 1) {
2543         /* force BE */
2544         ihi = 0;
2545         ilo = 1;
2546     }
2547
2548     while (q < e) {
2549         Py_UNICODE ch;
2550         /* remaining bytes at the end? (size should be even) */
2551         if (e-q<2) {
2552             if (consumed)
2553                 break;
2554             errmsg = "truncated data";
2555             startinpos = ((const char *)q)-starts;
2556             endinpos = ((const char *)e)-starts;
2557             goto utf16Error;
2558             /* The remaining input chars are ignored if the callback
2559                chooses to skip the input */
2560         }
2561         ch = (q[ihi] << 8) | q[ilo];
2562
2563         q += 2;
2564
2565         if (ch < 0xD800 || ch > 0xDFFF) {
2566             *p++ = ch;
2567             continue;
2568         }
2569
2570         /* UTF-16 code pair: */
2571         if (q >= e) {
2572             errmsg = "unexpected end of data";
2573             startinpos = (((const char *)q)-2)-starts;
2574             endinpos = ((const char *)e)-starts;
2575             goto utf16Error;
2576         }
2577         if (0xD800 <= ch && ch <= 0xDBFF) {
2578             Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2579             q += 2;
2580             if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2581 #ifndef Py_UNICODE_WIDE
2582                 *p++ = ch;
2583                 *p++ = ch2;
2584 #else
2585                 *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2586 #endif
2587                 continue;
2588             }
2589             else {
2590                 errmsg = "illegal UTF-16 surrogate";
2591                 startinpos = (((const char *)q)-4)-starts;
2592                 endinpos = startinpos+2;
2593                 goto utf16Error;
2594             }
2595
2596         }
2597         errmsg = "illegal encoding";
2598         startinpos = (((const char *)q)-2)-starts;
2599         endinpos = startinpos+2;
2600         /* Fall through to report the error */
2601
2602       utf16Error:
2603         outpos = p-PyUnicode_AS_UNICODE(unicode);
2604         if (unicode_decode_call_errorhandler(
2605                 errors, &errorHandler,
2606                 "utf16", errmsg,
2607                 starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2608                 &unicode, &outpos, &p))
2609             goto onError;
2610     }
2611
2612     if (byteorder)
2613         *byteorder = bo;
2614
2615     if (consumed)
2616         *consumed = (const char *)q-starts;
2617
2618     /* Adjust length */
2619     if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2620         goto onError;
2621
2622     Py_XDECREF(errorHandler);
2623     Py_XDECREF(exc);
2624     return (PyObject *)unicode;
2625
2626   onError:
2627     Py_DECREF(unicode);
2628     Py_XDECREF(errorHandler);
2629     Py_XDECREF(exc);
2630     return NULL;
2631 }
2632
2633 PyObject *
2634 PyUnicode_EncodeUTF16(const Py_UNICODE *s,
2635                       Py_ssize_t size,
2636                       const char *errors,
2637                       int byteorder)
2638 {
2639     PyObject *v;
2640     unsigned char *p;
2641     Py_ssize_t nsize, bytesize;
2642 #ifdef Py_UNICODE_WIDE
2643     Py_ssize_t i, pairs;
2644 #else
2645     const int pairs = 0;
2646 #endif
2647     /* Offsets from p for storing byte pairs in the right order. */
2648 #ifdef BYTEORDER_IS_LITTLE_ENDIAN
2649     int ihi = 1, ilo = 0;
2650 #else
2651     int ihi = 0, ilo = 1;
2652 #endif
2653
2654 #define STORECHAR(CH)                           \
2655     do {                                        \
2656         p[ihi] = ((CH) >> 8) & 0xff;            \
2657         p[ilo] = (CH) & 0xff;                   \
2658         p += 2;                                 \
2659     } while(0)
2660
2661 #ifdef Py_UNICODE_WIDE
2662     for (i = pairs = 0; i < size; i++)
2663         if (s[i] >= 0x10000)
2664             pairs++;
2665 #endif
2666     /* 2 * (size + pairs + (byteorder == 0)) */
2667     if (size > PY_SSIZE_T_MAX ||
2668         size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
2669         return PyErr_NoMemory();
2670     nsize = size + pairs + (byteorder == 0);
2671     bytesize = nsize * 2;
2672     if (bytesize / 2 != nsize)
2673         return PyErr_NoMemory();
2674     v = PyString_FromStringAndSize(NULL, bytesize);
2675     if (v == NULL)
2676         return NULL;
2677
2678     p = (unsigned char *)PyString_AS_STRING(v);
2679     if (byteorder == 0)
2680         STORECHAR(0xFEFF);
2681     if (size == 0)
2682         return v;
2683
2684     if (byteorder == -1) {
2685         /* force LE */
2686         ihi = 1;
2687         ilo = 0;
2688     }
2689     else if (byteorder == 1) {
2690         /* force BE */
2691         ihi = 0;
2692         ilo = 1;
2693     }
2694
2695     while (size-- > 0) {
2696         Py_UNICODE ch = *s++;
2697         Py_UNICODE ch2 = 0;
2698 #ifdef Py_UNICODE_WIDE
2699         if (ch >= 0x10000) {
2700             ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2701             ch  = 0xD800 | ((ch-0x10000) >> 10);
2702         }
2703 #endif
2704         STORECHAR(ch);
2705         if (ch2)
2706             STORECHAR(ch2);
2707     }
2708     return v;
2709 #undef STORECHAR
2710 }
2711
2712 PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2713 {
2714     if (!PyUnicode_Check(unicode)) {
2715         PyErr_BadArgument();
2716         return NULL;
2717     }
2718     return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2719                                  PyUnicode_GET_SIZE(unicode),
2720                                  NULL,
2721                                  0);
2722 }
2723
2724 /* --- Unicode Escape Codec ----------------------------------------------- */
2725
2726 static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
2727
2728 PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
2729                                         Py_ssize_t size,
2730                                         const char *errors)
2731 {
2732     const char *starts = s;
2733     Py_ssize_t startinpos;
2734     Py_ssize_t endinpos;
2735     Py_ssize_t outpos;
2736     int i;
2737     PyUnicodeObject *v;
2738     Py_UNICODE *p;
2739     const char *end;
2740     char* message;
2741     Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
2742     PyObject *errorHandler = NULL;
2743     PyObject *exc = NULL;
2744
2745     /* Escaped strings will always be longer than the resulting
2746        Unicode string, so we start with size here and then reduce the
2747        length after conversion to the true value.
2748        (but if the error callback returns a long replacement string
2749        we'll have to allocate more space) */
2750     v = _PyUnicode_New(size);
2751     if (v == NULL)
2752         goto onError;
2753     if (size == 0)
2754         return (PyObject *)v;
2755
2756     p = PyUnicode_AS_UNICODE(v);
2757     end = s + size;
2758
2759     while (s < end) {
2760         unsigned char c;
2761         Py_UNICODE x;
2762         int digits;
2763
2764         /* Non-escape characters are interpreted as Unicode ordinals */
2765         if (*s != '\\') {
2766             *p++ = (unsigned char) *s++;
2767             continue;
2768         }
2769
2770         startinpos = s-starts;
2771         /* \ - Escapes */
2772         s++;
2773         c = *s++;
2774         if (s > end)
2775             c = '\0'; /* Invalid after \ */
2776         switch (c) {
2777
2778             /* \x escapes */
2779         case '\n': break;
2780         case '\\': *p++ = '\\'; break;
2781         case '\'': *p++ = '\''; break;
2782         case '\"': *p++ = '\"'; break;
2783         case 'b': *p++ = '\b'; break;
2784         case 'f': *p++ = '\014'; break; /* FF */
2785         case 't': *p++ = '\t'; break;
2786         case 'n': *p++ = '\n'; break;
2787         case 'r': *p++ = '\r'; break;
2788         case 'v': *p++ = '\013'; break; /* VT */
2789         case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2790
2791             /* \OOO (octal) escapes */
2792         case '0': case '1': case '2': case '3':
2793         case '4': case '5': case '6': case '7':
2794             x = s[-1] - '0';
2795             if (s < end && '0' <= *s && *s <= '7') {
2796                 x = (x<<3) + *s++ - '0';
2797                 if (s < end && '0' <= *s && *s <= '7')
2798                     x = (x<<3) + *s++ - '0';
2799             }
2800             *p++ = x;
2801             break;
2802
2803             /* hex escapes */
2804             /* \xXX */
2805         case 'x':
2806             digits = 2;
2807             message = "truncated \\xXX escape";
2808             goto hexescape;
2809
2810             /* \uXXXX */
2811         case 'u':
2812             digits = 4;
2813             message = "truncated \\uXXXX escape";
2814             goto hexescape;
2815
2816             /* \UXXXXXXXX */
2817         case 'U':
2818             digits = 8;
2819             message = "truncated \\UXXXXXXXX escape";
2820         hexescape:
2821             chr = 0;
2822             outpos = p-PyUnicode_AS_UNICODE(v);
2823             if (s+digits>end) {
2824                 endinpos = size;
2825                 if (unicode_decode_call_errorhandler(
2826                         errors, &errorHandler,
2827                         "unicodeescape", "end of string in escape sequence",
2828                         starts, size, &startinpos, &endinpos, &exc, &s,
2829                         &v, &outpos, &p))
2830                     goto onError;
2831                 goto nextByte;
2832             }
2833             for (i = 0; i < digits; ++i) {
2834                 c = (unsigned char) s[i];
2835                 if (!isxdigit(c)) {
2836                     endinpos = (s+i+1)-starts;
2837                     if (unicode_decode_call_errorhandler(
2838                             errors, &errorHandler,
2839                             "unicodeescape", message,
2840                             starts, size, &startinpos, &endinpos, &exc, &s,
2841                             &v, &outpos, &p))
2842                         goto onError;
2843                     goto nextByte;
2844                 }
2845                 chr = (chr<<4) & ~0xF;
2846                 if (c >= '0' && c <= '9')
2847                     chr += c - '0';
2848                 else if (c >= 'a' && c <= 'f')
2849                     chr += 10 + c - 'a';
2850                 else
2851                     chr += 10 + c - 'A';
2852             }
2853             s += i;
2854             if (chr == 0xffffffff && PyErr_Occurred())
2855                 /* _decoding_error will have already written into the
2856                    target buffer. */
2857                 break;
2858         store:
2859             /* when we get here, chr is a 32-bit unicode character */
2860             if (chr <= 0xffff)
2861                 /* UCS-2 character */
2862                 *p++ = (Py_UNICODE) chr;
2863             else if (chr <= 0x10ffff) {
2864                 /* UCS-4 character. Either store directly, or as
2865                    surrogate pair. */
2866 #ifdef Py_UNICODE_WIDE
2867                 *p++ = chr;
2868 #else
2869                 chr -= 0x10000L;
2870                 *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
2871                 *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
2872 #endif
2873             } else {
2874                 endinpos = s-starts;
2875                 outpos = p-PyUnicode_AS_UNICODE(v);
2876                 if (unicode_decode_call_errorhandler(
2877                         errors, &errorHandler,
2878                         "unicodeescape", "illegal Unicode character",
2879                         starts, size, &startinpos, &endinpos, &exc, &s,
2880                         &v, &outpos, &p))
2881                     goto onError;
2882             }
2883             break;
2884
2885             /* \N{name} */
2886         case 'N':
2887             message = "malformed \\N character escape";
2888             if (ucnhash_CAPI == NULL) {
2889                 /* load the unicode data module */
2890                 ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
2891                 if (ucnhash_CAPI == NULL)
2892                     goto ucnhashError;
2893             }
2894             if (*s == '{') {
2895                 const char *start = s+1;
2896                 /* look for the closing brace */
2897                 while (*s != '}' && s < end)
2898                     s++;
2899                 if (s > start && s < end && *s == '}') {
2900                     /* found a name.  look it up in the unicode database */
2901                     message = "unknown Unicode character name";
2902                     s++;
2903                     if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
2904                         goto store;
2905                 }
2906             }
2907             endinpos = s-starts;
2908             outpos = p-PyUnicode_AS_UNICODE(v);
2909             if (unicode_decode_call_errorhandler(
2910                     errors, &errorHandler,
2911                     "unicodeescape", message,
2912                     starts, size, &startinpos, &endinpos, &exc, &s,
2913                     &v, &outpos, &p))
2914                 goto onError;
2915             break;
2916
2917         default:
2918             if (s > end) {
2919                 message = "\\ at end of string";
2920                 s--;
2921                 endinpos = s-starts;
2922                 outpos = p-PyUnicode_AS_UNICODE(v);
2923                 if (unicode_decode_call_errorhandler(
2924                         errors, &errorHandler,
2925                         "unicodeescape", message,
2926                         starts, size, &startinpos, &endinpos, &exc, &s,
2927                         &v, &outpos, &p))
2928                     goto onError;
2929             }
2930             else {
2931                 *p++ = '\\';
2932                 *p++ = (unsigned char)s[-1];
2933             }
2934             break;
2935         }
2936       nextByte:
2937         ;
2938     }
2939     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2940         goto onError;
2941     Py_XDECREF(errorHandler);
2942     Py_XDECREF(exc);
2943     return (PyObject *)v;
2944
2945   ucnhashError:
2946     PyErr_SetString(
2947         PyExc_UnicodeError,
2948         "\\N escapes not supported (can't load unicodedata module)"
2949         );
2950     Py_XDECREF(v);
2951     Py_XDECREF(errorHandler);
2952     Py_XDECREF(exc);
2953     return NULL;
2954
2955   onError:
2956     Py_XDECREF(v);
2957     Py_XDECREF(errorHandler);
2958     Py_XDECREF(exc);
2959     return NULL;
2960 }
2961
2962 /* Return a Unicode-Escape string version of the Unicode object.
2963
2964    If quotes is true, the string is enclosed in u"" or u'' quotes as
2965    appropriate.
2966
2967 */
2968
2969 Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2970                                              Py_ssize_t size,
2971                                              Py_UNICODE ch)
2972 {
2973     /* like wcschr, but doesn't stop at NULL characters */
2974
2975     while (size-- > 0) {
2976         if (*s == ch)
2977             return s;
2978         s++;
2979     }
2980
2981     return NULL;
2982 }
2983
2984 static
2985 PyObject *unicodeescape_string(const Py_UNICODE *s,
2986                                Py_ssize_t size,
2987                                int quotes)
2988 {
2989     PyObject *repr;
2990     char *p;
2991
2992     static const char *hexdigit = "0123456789abcdef";
2993 #ifdef Py_UNICODE_WIDE
2994     const Py_ssize_t expandsize = 10;
2995 #else
2996     const Py_ssize_t expandsize = 6;
2997 #endif
2998
2999     /* XXX(nnorwitz): rather than over-allocating, it would be
3000        better to choose a different scheme.  Perhaps scan the
3001        first N-chars of the string and allocate based on that size.
3002     */
3003     /* Initial allocation is based on the longest-possible unichr
3004        escape.
3005
3006        In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3007        unichr, so in this case it's the longest unichr escape. In
3008        narrow (UTF-16) builds this is five chars per source unichr
3009        since there are two unichrs in the surrogate pair, so in narrow
3010        (UTF-16) builds it's not the longest unichr escape.
3011
3012        In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3013        so in the narrow (UTF-16) build case it's the longest unichr
3014        escape.
3015     */
3016
3017     if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
3018         return PyErr_NoMemory();
3019
3020     repr = PyString_FromStringAndSize(NULL,
3021                                       2
3022                                       + expandsize*size
3023                                       + 1);
3024     if (repr == NULL)
3025         return NULL;
3026
3027     p = PyString_AS_STRING(repr);
3028
3029     if (quotes) {
3030         *p++ = 'u';
3031         *p++ = (findchar(s, size, '\'') &&
3032                 !findchar(s, size, '"')) ? '"' : '\'';
3033     }
3034     while (size-- > 0) {
3035         Py_UNICODE ch = *s++;
3036
3037         /* Escape quotes and backslashes */
3038         if ((quotes &&
3039              ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
3040             *p++ = '\\';
3041             *p++ = (char) ch;
3042             continue;
3043         }
3044
3045 #ifdef Py_UNICODE_WIDE
3046         /* Map 21-bit characters to '\U00xxxxxx' */
3047         else if (ch >= 0x10000) {
3048             *p++ = '\\';
3049             *p++ = 'U';
3050             *p++ = hexdigit[(ch >> 28) & 0x0000000F];
3051             *p++ = hexdigit[(ch >> 24) & 0x0000000F];
3052             *p++ = hexdigit[(ch >> 20) & 0x0000000F];
3053             *p++ = hexdigit[(ch >> 16) & 0x0000000F];
3054             *p++ = hexdigit[(ch >> 12) & 0x0000000F];
3055             *p++ = hexdigit[(ch >> 8) & 0x0000000F];
3056             *p++ = hexdigit[(ch >> 4) & 0x0000000F];
3057             *p++ = hexdigit[ch & 0x0000000F];
3058             continue;
3059         }
3060 #else
3061         /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3062         else if (ch >= 0xD800 && ch < 0xDC00) {
3063             Py_UNICODE ch2;
3064             Py_UCS4 ucs;
3065
3066             ch2 = *s++;
3067             size--;
3068             if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3069                 ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3070                 *p++ = '\\';
3071                 *p++ = 'U';
3072                 *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
3073                 *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
3074                 *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
3075                 *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
3076                 *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
3077                 *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
3078                 *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
3079                 *p++ = hexdigit[ucs & 0x0000000F];
3080                 continue;
3081             }
3082             /* Fall through: isolated surrogates are copied as-is */
3083             s--;
3084             size++;
3085         }
3086 #endif
3087
3088         /* Map 16-bit characters to '\uxxxx' */
3089         if (ch >= 256) {
3090             *p++ = '\\';
3091             *p++ = 'u';
3092             *p++ = hexdigit[(ch >> 12) & 0x000F];
3093             *p++ = hexdigit[(ch >> 8) & 0x000F];
3094             *p++ = hexdigit[(ch >> 4) & 0x000F];
3095             *p++ = hexdigit[ch & 0x000F];
3096         }
3097
3098         /* Map special whitespace to '\t', \n', '\r' */
3099         else if (ch == '\t') {
3100             *p++ = '\\';
3101             *p++ = 't';
3102         }
3103         else if (ch == '\n') {
3104             *p++ = '\\';
3105             *p++ = 'n';
3106         }
3107         else if (ch == '\r') {
3108             *p++ = '\\';
3109             *p++ = 'r';
3110         }
3111
3112         /* Map non-printable US ASCII to '\xhh' */
3113         else if (ch < ' ' || ch >= 0x7F) {
3114             *p++ = '\\';
3115             *p++ = 'x';
3116             *p++ = hexdigit[(ch >> 4) & 0x000F];
3117             *p++ = hexdigit[ch & 0x000F];
3118         }
3119
3120         /* Copy everything else as-is */
3121         else
3122             *p++ = (char) ch;
3123     }
3124     if (quotes)
3125         *p++ = PyString_AS_STRING(repr)[1];
3126
3127     *p = '\0';
3128     if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
3129         return NULL;
3130     return repr;
3131 }
3132
3133 PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
3134                                         Py_ssize_t size)
3135 {
3136     return unicodeescape_string(s, size, 0);
3137 }
3138
3139 PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3140 {
3141     if (!PyUnicode_Check(unicode)) {
3142         PyErr_BadArgument();
3143         return NULL;
3144     }
3145     return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3146                                          PyUnicode_GET_SIZE(unicode));
3147 }
3148
3149 /* --- Raw Unicode Escape Codec ------------------------------------------- */
3150
3151 PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
3152                                            Py_ssize_t size,
3153                                            const char *errors)
3154 {
3155     const char *starts = s;
3156     Py_ssize_t startinpos;
3157     Py_ssize_t endinpos;
3158     Py_ssize_t outpos;
3159     PyUnicodeObject *v;
3160     Py_UNICODE *p;
3161     const char *end;
3162     const char *bs;
3163     PyObject *errorHandler = NULL;
3164     PyObject *exc = NULL;
3165
3166     /* Escaped strings will always be longer than the resulting
3167        Unicode string, so we start with size here and then reduce the
3168        length after conversion to the true value. (But decoding error
3169        handler might have to resize the string) */
3170     v = _PyUnicode_New(size);
3171     if (v == NULL)
3172         goto onError;
3173     if (size == 0)
3174         return (PyObject *)v;
3175     p = PyUnicode_AS_UNICODE(v);
3176     end = s + size;
3177     while (s < end) {
3178         unsigned char c;
3179         Py_UCS4 x;
3180         int i;
3181         int count;
3182
3183         /* Non-escape characters are interpreted as Unicode ordinals */
3184         if (*s != '\\') {
3185             *p++ = (unsigned char)*s++;
3186             continue;
3187         }
3188         startinpos = s-starts;
3189
3190         /* \u-escapes are only interpreted iff the number of leading
3191            backslashes if odd */
3192         bs = s;
3193         for (;s < end;) {
3194             if (*s != '\\')
3195                 break;
3196             *p++ = (unsigned char)*s++;
3197         }
3198         if (((s - bs) & 1) == 0 ||
3199             s >= end ||
3200             (*s != 'u' && *s != 'U')) {
3201             continue;
3202         }
3203         p--;
3204         count = *s=='u' ? 4 : 8;
3205         s++;
3206
3207         /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3208         outpos = p-PyUnicode_AS_UNICODE(v);
3209         for (x = 0, i = 0; i < count; ++i, ++s) {
3210             c = (unsigned char)*s;
3211             if (!isxdigit(c)) {
3212                 endinpos = s-starts;
3213                 if (unicode_decode_call_errorhandler(
3214                         errors, &errorHandler,
3215                         "rawunicodeescape", "truncated \\uXXXX",
3216                         starts, size, &startinpos, &endinpos, &exc, &s,
3217                         &v, &outpos, &p))
3218                     goto onError;
3219                 goto nextByte;
3220             }
3221             x = (x<<4) & ~0xF;
3222             if (c >= '0' && c <= '9')
3223                 x += c - '0';
3224             else if (c >= 'a' && c <= 'f')
3225                 x += 10 + c - 'a';
3226             else
3227                 x += 10 + c - 'A';
3228         }
3229         if (x <= 0xffff)
3230             /* UCS-2 character */
3231             *p++ = (Py_UNICODE) x;
3232         else if (x <= 0x10ffff) {
3233             /* UCS-4 character. Either store directly, or as
3234                surrogate pair. */
3235 #ifdef Py_UNICODE_WIDE
3236             *p++ = (Py_UNICODE) x;
3237 #else
3238             x -= 0x10000L;
3239             *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3240             *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3241 #endif
3242         } else {
3243             endinpos = s-starts;
3244             outpos = p-PyUnicode_AS_UNICODE(v);
3245             if (unicode_decode_call_errorhandler(
3246                     errors, &errorHandler,
3247                     "rawunicodeescape", "\\Uxxxxxxxx out of range",
3248                     starts, size, &startinpos, &endinpos, &exc, &s,
3249                     &v, &outpos, &p))
3250                 goto onError;
3251         }
3252       nextByte:
3253         ;
3254     }
3255     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3256         goto onError;
3257     Py_XDECREF(errorHandler);
3258     Py_XDECREF(exc);
3259     return (PyObject *)v;
3260
3261   onError:
3262     Py_XDECREF(v);
3263     Py_XDECREF(errorHandler);
3264     Py_XDECREF(exc);
3265     return NULL;
3266 }
3267
3268 PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
3269                                            Py_ssize_t size)
3270 {
3271     PyObject *repr;
3272     char *p;
3273     char *q;
3274
3275     static const char *hexdigit = "0123456789abcdef";
3276 #ifdef Py_UNICODE_WIDE
3277     const Py_ssize_t expandsize = 10;
3278 #else
3279     const Py_ssize_t expandsize = 6;
3280 #endif
3281
3282     if (size > PY_SSIZE_T_MAX / expandsize)
3283         return PyErr_NoMemory();
3284
3285     repr = PyString_FromStringAndSize(NULL, expandsize * size);
3286     if (repr == NULL)
3287         return NULL;
3288     if (size == 0)
3289         return repr;
3290
3291     p = q = PyString_AS_STRING(repr);
3292     while (size-- > 0) {
3293         Py_UNICODE ch = *s++;
3294 #ifdef Py_UNICODE_WIDE
3295         /* Map 32-bit characters to '\Uxxxxxxxx' */
3296         if (ch >= 0x10000) {
3297             *p++ = '\\';
3298             *p++ = 'U';
3299             *p++ = hexdigit[(ch >> 28) & 0xf];
3300             *p++ = hexdigit[(ch >> 24) & 0xf];
3301             *p++ = hexdigit[(ch >> 20) & 0xf];
3302             *p++ = hexdigit[(ch >> 16) & 0xf];
3303             *p++ = hexdigit[(ch >> 12) & 0xf];
3304             *p++ = hexdigit[(ch >> 8) & 0xf];
3305             *p++ = hexdigit[(ch >> 4) & 0xf];
3306             *p++ = hexdigit[ch & 15];
3307         }
3308         else
3309 #else
3310             /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3311             if (ch >= 0xD800 && ch < 0xDC00) {
3312                 Py_UNICODE ch2;
3313                 Py_UCS4 ucs;
3314
3315                 ch2 = *s++;
3316                 size--;
3317                 if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3318                     ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3319                     *p++ = '\\';
3320                     *p++ = 'U';
3321                     *p++ = hexdigit[(ucs >> 28) & 0xf];
3322                     *p++ = hexdigit[(ucs >> 24) & 0xf];
3323                     *p++ = hexdigit[(ucs >> 20) & 0xf];
3324                     *p++ = hexdigit[(ucs >> 16) & 0xf];
3325                     *p++ = hexdigit[(ucs >> 12) & 0xf];
3326                     *p++ = hexdigit[(ucs >> 8) & 0xf];
3327                     *p++ = hexdigit[(ucs >> 4) & 0xf];
3328                     *p++ = hexdigit[ucs & 0xf];
3329                     continue;
3330                 }
3331                 /* Fall through: isolated surrogates are copied as-is */
3332                 s--;
3333                 size++;
3334             }
3335 #endif
3336         /* Map 16-bit characters to '\uxxxx' */
3337         if (ch >= 256) {
3338             *p++ = '\\';
3339             *p++ = 'u';
3340             *p++ = hexdigit[(ch >> 12) & 0xf];
3341             *p++ = hexdigit[(ch >> 8) & 0xf];
3342             *p++ = hexdigit[(ch >> 4) & 0xf];
3343             *p++ = hexdigit[ch & 15];
3344         }
3345         /* Copy everything else as-is */
3346         else
3347             *p++ = (char) ch;
3348     }
3349     *p = '\0';
3350     if (_PyString_Resize(&repr, p - q))
3351         return NULL;
3352     return repr;
3353 }
3354
3355 PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3356 {
3357     if (!PyUnicode_Check(unicode)) {
3358         PyErr_BadArgument();
3359         return NULL;
3360     }
3361     return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3362                                             PyUnicode_GET_SIZE(unicode));
3363 }
3364
3365 /* --- Unicode Internal Codec ------------------------------------------- */
3366
3367 PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
3368                                            Py_ssize_t size,
3369                                            const char *errors)
3370 {
3371     const char *starts = s;
3372     Py_ssize_t startinpos;
3373     Py_ssize_t endinpos;
3374     Py_ssize_t outpos;
3375     PyUnicodeObject *v;
3376     Py_UNICODE *p;
3377     const char *end;
3378     const char *reason;
3379     PyObject *errorHandler = NULL;
3380     PyObject *exc = NULL;
3381
3382 #ifdef Py_UNICODE_WIDE
3383     Py_UNICODE unimax = PyUnicode_GetMax();
3384 #endif
3385
3386     /* XXX overflow detection missing */
3387     v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3388     if (v == NULL)
3389         goto onError;
3390     if (PyUnicode_GetSize((PyObject *)v) == 0)
3391         return (PyObject *)v;
3392     p = PyUnicode_AS_UNICODE(v);
3393     end = s + size;
3394
3395     while (s < end) {
3396         memcpy(p, s, sizeof(Py_UNICODE));
3397         /* We have to sanity check the raw data, otherwise doom looms for
3398            some malformed UCS-4 data. */
3399         if (
3400 #ifdef Py_UNICODE_WIDE
3401             *p > unimax || *p < 0 ||
3402 #endif
3403             end-s < Py_UNICODE_SIZE
3404             )
3405         {
3406             startinpos = s - starts;
3407             if (end-s < Py_UNICODE_SIZE) {
3408                 endinpos = end-starts;
3409                 reason = "truncated input";
3410             }
3411             else {
3412                 endinpos = s - starts + Py_UNICODE_SIZE;
3413                 reason = "illegal code point (> 0x10FFFF)";
3414             }
3415             outpos = p - PyUnicode_AS_UNICODE(v);
3416             if (unicode_decode_call_errorhandler(
3417                     errors, &errorHandler,
3418                     "unicode_internal", reason,
3419                     starts, size, &startinpos, &endinpos, &exc, &s,
3420                     &v, &outpos, &p)) {
3421                 goto onError;
3422             }
3423         }
3424         else {
3425             p++;
3426             s += Py_UNICODE_SIZE;
3427         }
3428     }
3429
3430     if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3431         goto onError;
3432     Py_XDECREF(errorHandler);
3433     Py_XDECREF(exc);
3434     return (PyObject *)v;
3435
3436   onError:
3437     Py_XDECREF(v);
3438     Py_XDECREF(errorHandler);
3439     Py_XDECREF(exc);
3440     return NULL;
3441 }
3442
3443 /* --- Latin-1 Codec ------------------------------------------------------ */
3444
3445 PyObject *PyUnicode_DecodeLatin1(const char *s,
3446                                  Py_ssize_t size,
3447                                  const char *errors)
3448 {
3449     PyUnicodeObject *v;
3450     Py_UNICODE *p;
3451
3452     /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
3453     if (size == 1) {
3454         Py_UNICODE r = *(unsigned char*)s;
3455         return PyUnicode_FromUnicode(&r, 1);
3456     }
3457
3458     v = _PyUnicode_New(size);
3459     if (v == NULL)
3460         goto onError;
3461     if (size == 0)
3462         return (PyObject *)v;
3463     p = PyUnicode_AS_UNICODE(v);
3464     while (size-- > 0)
3465         *p++ = (unsigned char)*s++;
3466     return (PyObject *)v;
3467
3468   onError:
3469     Py_XDECREF(v);
3470     return NULL;
3471 }
3472
3473 /* create or adjust a UnicodeEncodeError */
3474 static void make_encode_exception(PyObject **exceptionObject,
3475                                   const char *encoding,
3476                                   const Py_UNICODE *unicode, Py_ssize_t size,
3477                                   Py_ssize_t startpos, Py_ssize_t endpos,
3478                                   const char *reason)
3479 {
3480     if (*exceptionObject == NULL) {
3481         *exceptionObject = PyUnicodeEncodeError_Create(
3482             encoding, unicode, size, startpos, endpos, reason);
3483     }
3484     else {
3485         if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3486             goto onError;
3487         if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3488             goto onError;
3489         if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3490             goto onError;
3491         return;
3492       onError:
3493         Py_DECREF(*exceptionObject);
3494         *exceptionObject = NULL;
3495     }
3496 }
3497
3498 /* raises a UnicodeEncodeError */
3499 static void raise_encode_exception(PyObject **exceptionObject,
3500                                    const char *encoding,
3501                                    const Py_UNICODE *unicode, Py_ssize_t size,
3502                                    Py_ssize_t startpos, Py_ssize_t endpos,
3503                                    const char *reason)
3504 {
3505     make_encode_exception(exceptionObject,
3506                           encoding, unicode, size, startpos, endpos, reason);
3507     if (*exceptionObject != NULL)
3508         PyCodec_StrictErrors(*exceptionObject);
3509 }
3510
3511 /* error handling callback helper:
3512    build arguments, call the callback and check the arguments,
3513    put the result into newpos and return the replacement string, which
3514    has to be freed by the caller */
3515 static PyObject *unicode_encode_call_errorhandler(const char *errors,
3516                                                   PyObject **errorHandler,
3517                                                   const char *encoding, const char *reason,
3518                                                   const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3519                                                   Py_ssize_t startpos, Py_ssize_t endpos,
3520                                                   Py_ssize_t *newpos)
3521 {
3522     static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
3523
3524     PyObject *restuple;
3525     PyObject *resunicode;
3526
3527     if (*errorHandler == NULL) {
3528         *errorHandler = PyCodec_LookupError(errors);
3529         if (*errorHandler == NULL)
3530             return NULL;
3531     }
3532
3533     make_encode_exception(exceptionObject,
3534                           encoding, unicode, size, startpos, endpos, reason);
3535     if (*exceptionObject == NULL)
3536         return NULL;
3537
3538     restuple = PyObject_CallFunctionObjArgs(
3539         *errorHandler, *exceptionObject, NULL);
3540     if (restuple == NULL)
3541         return NULL;
3542     if (!PyTuple_Check(restuple)) {
3543         PyErr_SetString(PyExc_TypeError, &argparse[4]);
3544         Py_DECREF(restuple);
3545         return NULL;
3546     }
3547     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3548                           &resunicode, newpos)) {
3549         Py_DECREF(restuple);
3550         return NULL;
3551     }
3552     if (*newpos<0)
3553         *newpos = size+*newpos;
3554     if (*newpos<0 || *newpos>size) {
3555         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3556         Py_DECREF(restuple);
3557         return NULL;
3558     }
3559     Py_INCREF(resunicode);
3560     Py_DECREF(restuple);
3561     return resunicode;
3562 }
3563
3564 static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
3565                                      Py_ssize_t size,
3566                                      const char *errors,
3567                                      int limit)
3568 {
3569     /* output object */
3570     PyObject *res;
3571     /* pointers to the beginning and end+1 of input */
3572     const Py_UNICODE *startp = p;
3573     const Py_UNICODE *endp = p + size;
3574     /* pointer to the beginning of the unencodable characters */
3575     /* const Py_UNICODE *badp = NULL; */
3576     /* pointer into the output */
3577     char *str;
3578     /* current output position */
3579     Py_ssize_t respos = 0;
3580     Py_ssize_t ressize;
3581     const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3582     const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
3583     PyObject *errorHandler = NULL;
3584     PyObject *exc = NULL;
3585     /* the following variable is used for caching string comparisons
3586      * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3587     int known_errorHandler = -1;
3588
3589     /* allocate enough for a simple encoding without
3590        replacements, if we need more, we'll resize */
3591     res = PyString_FromStringAndSize(NULL, size);
3592     if (res == NULL)
3593         goto onError;
3594     if (size == 0)
3595         return res;
3596     str = PyString_AS_STRING(res);
3597     ressize = size;
3598
3599     while (p<endp) {
3600         Py_UNICODE c = *p;
3601
3602         /* can we encode this? */
3603         if (c<limit) {
3604             /* no overflow check, because we know that the space is enough */
3605             *str++ = (char)c;
3606             ++p;
3607         }
3608         else {
3609             Py_ssize_t unicodepos = p-startp;
3610             Py_ssize_t requiredsize;
3611             PyObject *repunicode;
3612             Py_ssize_t repsize;
3613             Py_ssize_t newpos;
3614             Py_ssize_t respos;
3615             Py_UNICODE *uni2;
3616             /* startpos for collecting unencodable chars */
3617             const Py_UNICODE *collstart = p;
3618             const Py_UNICODE *collend = p;
3619             /* find all unecodable characters */
3620             while ((collend < endp) && ((*collend)>=limit))
3621                 ++collend;
3622             /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3623             if (known_errorHandler==-1) {
3624                 if ((errors==NULL) || (!strcmp(errors, "strict")))
3625                     known_errorHandler = 1;
3626                 else if (!strcmp(errors, "replace"))
3627                     known_errorHandler = 2;
3628                 else if (!strcmp(errors, "ignore"))
3629                     known_errorHandler = 3;
3630                 else if (!strcmp(errors, "xmlcharrefreplace"))
3631                     known_errorHandler = 4;
3632                 else
3633                     known_errorHandler = 0;
3634             }
3635             switch (known_errorHandler) {
3636             case 1: /* strict */
3637                 raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3638                 goto onError;
3639             case 2: /* replace */
3640                 while (collstart++<collend)
3641                     *str++ = '?'; /* fall through */
3642             case 3: /* ignore */
3643                 p = collend;
3644                 break;
3645             case 4: /* xmlcharrefreplace */
3646                 respos = str-PyString_AS_STRING(res);
3647                 /* determine replacement size (temporarily (mis)uses p) */
3648                 for (p = collstart, repsize = 0; p < collend; ++p) {
3649                     if (*p<10)
3650                         repsize += 2+1+1;
3651                     else if (*p<100)
3652                         repsize += 2+2+1;
3653                     else if (*p<1000)
3654                         repsize += 2+3+1;
3655                     else if (*p<10000)
3656                         repsize += 2+4+1;
3657 #ifndef Py_UNICODE_WIDE
3658                     else
3659                         repsize += 2+5+1;
3660 #else
3661                     else if (*p<100000)
3662                         repsize += 2+5+1;
3663                     else if (*p<1000000)
3664                         repsize += 2+6+1;
3665                     else
3666                         repsize += 2+7+1;
3667 #endif
3668                 }
3669                 requiredsize = respos+repsize+(endp-collend);
3670                 if (requiredsize > ressize) {
3671                     if (requiredsize<2*ressize)
3672                         requiredsize = 2*ressize;
3673                     if (_PyString_Resize(&res, requiredsize))
3674                         goto onError;
3675                     str = PyString_AS_STRING(res) + respos;
3676                     ressize = requiredsize;
3677                 }
3678                 /* generate replacement (temporarily (mis)uses p) */
3679                 for (p = collstart; p < collend; ++p) {
3680                     str += sprintf(str, "&#%d;", (int)*p);
3681                 }
3682                 p = collend;
3683                 break;
3684             default:
3685                 repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3686                                                               encoding, reason, startp, size, &exc,
3687                                                               collstart-startp, collend-startp, &newpos);
3688                 if (repunicode == NULL)
3689                     goto onError;
3690                 /* need more space? (at least enough for what we have+the
3691                    replacement+the rest of the string, so we won't have to
3692                    check space for encodable characters) */
3693                 respos = str-PyString_AS_STRING(res);
3694                 repsize = PyUnicode_GET_SIZE(repunicode);
3695                 requiredsize = respos+repsize+(endp-collend);
3696                 if (requiredsize > ressize) {
3697                     if (requiredsize<2*ressize)
3698                         requiredsize = 2*ressize;
3699                     if (_PyString_Resize(&res, requiredsize)) {
3700                         Py_DECREF(repunicode);
3701                         goto onError;
3702                     }
3703                     str = PyString_AS_STRING(res) + respos;
3704                     ressize = requiredsize;
3705                 }
3706                 /* check if there is anything unencodable in the replacement
3707                    and copy it to the output */
3708                 for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3709                     c = *uni2;
3710                     if (c >= limit) {
3711                         raise_encode_exception(&exc, encoding, startp, size,
3712                                                unicodepos, unicodepos+1, reason);
3713                         Py_DECREF(repunicode);
3714                         goto onError;
3715                     }
3716                     *str = (char)c;
3717                 }
3718                 p = startp + newpos;
3719                 Py_DECREF(repunicode);
3720             }
3721         }
3722     }
3723     /* Resize if we allocated to much */
3724     respos = str-PyString_AS_STRING(res);
3725     if (respos<ressize)
3726         /* If this falls res will be NULL */
3727         _PyString_Resize(&res, respos);
3728     Py_XDECREF(errorHandler);
3729     Py_XDECREF(exc);
3730     return res;
3731
3732   onError:
3733     Py_XDECREF(res);
3734     Py_XDECREF(errorHandler);
3735     Py_XDECREF(exc);
3736     return NULL;
3737 }
3738
3739 PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
3740                                  Py_ssize_t size,
3741                                  const char *errors)
3742 {
3743     return unicode_encode_ucs1(p, size, errors, 256);
3744 }
3745
3746 PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3747 {
3748     if (!PyUnicode_Check(unicode)) {
3749         PyErr_BadArgument();
3750         return NULL;
3751     }
3752     return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3753                                   PyUnicode_GET_SIZE(unicode),
3754                                   NULL);
3755 }
3756
3757 /* --- 7-bit ASCII Codec -------------------------------------------------- */
3758
3759 PyObject *PyUnicode_DecodeASCII(const char *s,
3760                                 Py_ssize_t size,
3761                                 const char *errors)
3762 {
3763     const char *starts = s;
3764     PyUnicodeObject *v;
3765     Py_UNICODE *p;
3766     Py_ssize_t startinpos;
3767     Py_ssize_t endinpos;
3768     Py_ssize_t outpos;
3769     const char *e;
3770     PyObject *errorHandler = NULL;
3771     PyObject *exc = NULL;
3772
3773     /* ASCII is equivalent to the first 128 ordinals in Unicode. */
3774     if (size == 1 && *(unsigned char*)s < 128) {
3775         Py_UNICODE r = *(unsigned char*)s;
3776         return PyUnicode_FromUnicode(&r, 1);
3777     }
3778
3779     v = _PyUnicode_New(size);
3780     if (v == NULL)
3781         goto onError;
3782     if (size == 0)
3783         return (PyObject *)v;
3784     p = PyUnicode_AS_UNICODE(v);
3785     e = s + size;
3786     while (s < e) {
3787         register unsigned char c = (unsigned char)*s;
3788         if (c < 128) {
3789             *p++ = c;
3790             ++s;
3791         }
3792         else {
3793             startinpos = s-starts;
3794             endinpos = startinpos + 1;
3795             outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3796             if (unicode_decode_call_errorhandler(
3797                     errors, &errorHandler,
3798                     "ascii", "ordinal not in range(128)",
3799                     starts, size, &startinpos, &endinpos, &exc, &s,
3800                     &v, &outpos, &p))
3801                 goto onError;
3802         }
3803     }
3804     if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
3805         if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3806             goto onError;
3807     Py_XDECREF(errorHandler);
3808     Py_XDECREF(exc);
3809     return (PyObject *)v;
3810
3811   onError:
3812     Py_XDECREF(v);
3813     Py_XDECREF(errorHandler);
3814     Py_XDECREF(exc);
3815     return NULL;
3816 }
3817
3818 PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
3819                                 Py_ssize_t size,
3820                                 const char *errors)
3821 {
3822     return unicode_encode_ucs1(p, size, errors, 128);
3823 }
3824
3825 PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3826 {
3827     if (!PyUnicode_Check(unicode)) {
3828         PyErr_BadArgument();
3829         return NULL;
3830     }
3831     return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3832                                  PyUnicode_GET_SIZE(unicode),
3833                                  NULL);
3834 }
3835
3836 #if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
3837
3838 /* --- MBCS codecs for Windows -------------------------------------------- */
3839
3840 #if SIZEOF_INT < SIZEOF_SIZE_T
3841 #define NEED_RETRY
3842 #endif
3843
3844 /* XXX This code is limited to "true" double-byte encodings, as
3845    a) it assumes an incomplete character consists of a single byte, and
3846    b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3847    encodings, see IsDBCSLeadByteEx documentation. */
3848
/* Return non-zero if the byte at s[offset] is a DBCS lead byte, judged
   both by IsDBCSLeadByte() on the byte itself and by the character that
   CharPrev() reports immediately before it; return 0 otherwise. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *here = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*here))
        return 0;

    before = CharPrev(s, here);
    if (before == here)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return (here - before == 2);
}
3859
3860 /*
3861  * Decode MBCS string into unicode object. If 'final' is set, converts
3862  * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3863  */
3864 static int decode_mbcs(PyUnicodeObject **v,
3865                        const char *s, /* MBCS string */
3866                        int size, /* sizeof MBCS string */
3867                        int final)
3868 {
3869     Py_UNICODE *p;
3870     Py_ssize_t n = 0;
3871     int usize = 0;
3872
3873     assert(size >= 0);
3874
3875     /* Skip trailing lead-byte unless 'final' is set */
3876     if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3877         --size;
3878
3879     /* First get the size of the result */
3880     if (size > 0) {
3881         usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3882         if (usize == 0) {
3883             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3884             return -1;
3885         }
3886     }
3887
3888     if (*v == NULL) {
3889         /* Create unicode object */
3890         *v = _PyUnicode_New(usize);
3891         if (*v == NULL)
3892             return -1;
3893     }
3894     else {
3895         /* Extend unicode object */
3896         n = PyUnicode_GET_SIZE(*v);
3897         if (_PyUnicode_Resize(v, n + usize) < 0)
3898             return -1;
3899     }
3900
3901     /* Do the conversion */
3902     if (size > 0) {
3903         p = PyUnicode_AS_UNICODE(*v) + n;
3904         if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3905             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3906             return -1;
3907         }
3908     }
3909
3910     return size;
3911 }
3912
3913 PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3914                                        Py_ssize_t size,
3915                                        const char *errors,
3916                                        Py_ssize_t *consumed)
3917 {
3918     PyUnicodeObject *v = NULL;
3919     int done;
3920
3921     if (consumed)
3922         *consumed = 0;
3923
3924 #ifdef NEED_RETRY
3925   retry:
3926     if (size > INT_MAX)
3927         done = decode_mbcs(&v, s, INT_MAX, 0);
3928     else
3929 #endif
3930         done = decode_mbcs(&v, s, (int)size, !consumed);
3931
3932     if (done < 0) {
3933         Py_XDECREF(v);
3934         return NULL;
3935     }
3936
3937     if (consumed)
3938         *consumed += done;
3939
3940 #ifdef NEED_RETRY
3941     if (size > INT_MAX) {
3942         s += done;
3943         size -= done;
3944         goto retry;
3945     }
3946 #endif
3947
3948     return (PyObject *)v;
3949 }
3950
3951 PyObject *PyUnicode_DecodeMBCS(const char *s,
3952                                Py_ssize_t size,
3953                                const char *errors)
3954 {
3955     return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
3956 }
3957
3958 /*
3959  * Convert unicode into string object (MBCS).
3960  * Returns 0 if succeed, -1 otherwise.
3961  */
3962 static int encode_mbcs(PyObject **repr,
3963                        const Py_UNICODE *p, /* unicode */
3964                        int size) /* size of unicode */
3965 {
3966     int mbcssize = 0;
3967     Py_ssize_t n = 0;
3968
3969     assert(size >= 0);
3970
3971     /* First get the size of the result */
3972     if (size > 0) {
3973         mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3974         if (mbcssize == 0) {
3975             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3976             return -1;
3977         }
3978     }
3979
3980     if (*repr == NULL) {
3981         /* Create string object */
3982         *repr = PyString_FromStringAndSize(NULL, mbcssize);
3983         if (*repr == NULL)
3984             return -1;
3985     }
3986     else {
3987         /* Extend string object */
3988         n = PyString_Size(*repr);
3989         if (_PyString_Resize(repr, n + mbcssize) < 0)
3990             return -1;
3991     }
3992
3993     /* Do the conversion */
3994     if (size > 0) {
3995         char *s = PyString_AS_STRING(*repr) + n;
3996         if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3997             PyErr_SetFromWindowsErrWithFilename(0, NULL);
3998             return -1;
3999         }
4000     }
4001
4002     return 0;
4003 }
4004
4005 PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
4006                                Py_ssize_t size,
4007                                const char *errors)
4008 {
4009     PyObject *repr = NULL;
4010     int ret;
4011
4012 #ifdef NEED_RETRY
4013   retry:
4014     if (size > INT_MAX)
4015         ret = encode_mbcs(&repr, p, INT_MAX);
4016     else
4017 #endif
4018         ret = encode_mbcs(&repr, p, (int)size);
4019
4020     if (ret < 0) {
4021         Py_XDECREF(repr);
4022         return NULL;
4023     }
4024
4025 #ifdef NEED_RETRY
4026     if (size > INT_MAX) {
4027         p += INT_MAX;
4028         size -= INT_MAX;
4029         goto retry;
4030     }
4031 #endif
4032
4033     return repr;
4034 }
4035
4036 PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4037 {
4038     if (!PyUnicode_Check(unicode)) {
4039         PyErr_BadArgument();
4040         return NULL;
4041     }
4042     return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
4043                                 PyUnicode_GET_SIZE(unicode),
4044                                 NULL);
4045 }
4046
4047 #undef NEED_RETRY
4048
4049 #endif /* MS_WINDOWS */
4050
4051 /* --- Character Mapping Codec -------------------------------------------- */
4052
4053 PyObject *PyUnicode_DecodeCharmap(const char *s,
4054                                   Py_ssize_t size,
4055                                   PyObject *mapping,
4056                                   const char *errors)
4057 {
4058     const char *starts = s;
4059     Py_ssize_t startinpos;
4060     Py_ssize_t endinpos;
4061     Py_ssize_t outpos;
4062     const char *e;
4063     PyUnicodeObject *v;
4064     Py_UNICODE *p;
4065     Py_ssize_t extrachars = 0;
4066     PyObject *errorHandler = NULL;
4067     PyObject *exc = NULL;
4068     Py_UNICODE *mapstring = NULL;
4069     Py_ssize_t maplen = 0;
4070
4071     /* Default to Latin-1 */
4072     if (mapping == NULL)
4073         return PyUnicode_DecodeLatin1(s, size, errors);
4074
4075     v = _PyUnicode_New(size);
4076     if (v == NULL)
4077         goto onError;
4078     if (size == 0)
4079         return (PyObject *)v;
4080     p = PyUnicode_AS_UNICODE(v);
4081     e = s + size;
4082     if (PyUnicode_CheckExact(mapping)) {
4083         mapstring = PyUnicode_AS_UNICODE(mapping);
4084         maplen = PyUnicode_GET_SIZE(mapping);
4085         while (s < e) {
4086             unsigned char ch = *s;
4087             Py_UNICODE x = 0xfffe; /* illegal value */
4088
4089             if (ch < maplen)
4090                 x = mapstring[ch];
4091
4092             if (x == 0xfffe) {
4093                 /* undefined mapping */
4094                 outpos = p-PyUnicode_AS_UNICODE(v);
4095                 startinpos = s-starts;
4096                 endinpos = startinpos+1;
4097                 if (unicode_decode_call_errorhandler(
4098                         errors, &errorHandler,
4099                         "charmap", "character maps to <undefined>",
4100                         starts, size, &startinpos, &endinpos, &exc, &s,
4101                         &v, &outpos, &p)) {
4102                     goto onError;
4103                 }
4104                 continue;
4105             }
4106             *p++ = x;
4107             ++s;
4108         }
4109     }
4110     else {
4111         while (s < e) {
4112             unsigned char ch = *s;
4113             PyObject *w, *x;
4114
4115             /* Get mapping (char ordinal -> integer, Unicode char or None) */
4116             w = PyInt_FromLong((long)ch);
4117             if (w == NULL)
4118                 goto onError;
4119             x = PyObject_GetItem(mapping, w);
4120             Py_DECREF(w);
4121             if (x == NULL) {
4122                 if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4123                     /* No mapping found means: mapping is undefined. */
4124                     PyErr_Clear();
4125                     x = Py_None;
4126                     Py_INCREF(x);
4127                 } else
4128                     goto onError;
4129             }
4130
4131             /* Apply mapping */
4132             if (PyInt_Check(x)) {
4133                 long value = PyInt_AS_LONG(x);
4134                 if (value < 0 || value > 65535) {
4135                     PyErr_SetString(PyExc_TypeError,
4136                                     "character mapping must be in range(65536)");
4137                     Py_DECREF(x);
4138                     goto onError;
4139                 }
4140                 *p++ = (Py_UNICODE)value;
4141             }
4142             else if (x == Py_None) {
4143                 /* undefined mapping */
4144                 outpos = p-PyUnicode_AS_UNICODE(v);
4145                 startinpos = s-starts;
4146                 endinpos = startinpos+1;
4147                 if (unicode_decode_call_errorhandler(
4148                         errors, &errorHandler,
4149                         "charmap", "character maps to <undefined>",
4150                         starts, size, &startinpos, &endinpos, &exc, &s,
4151                         &v, &outpos, &p)) {
4152                     Py_DECREF(x);
4153                     goto onError;
4154                 }
4155                 Py_DECREF(x);
4156                 continue;
4157             }
4158             else if (PyUnicode_Check(x)) {
4159                 Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
4160
4161                 if (targetsize == 1)
4162                     /* 1-1 mapping */
4163                     *p++ = *PyUnicode_AS_UNICODE(x);
4164
4165                 else if (targetsize > 1) {
4166                     /* 1-n mapping */
4167                     if (targetsize > extrachars) {
4168                         /* resize first */
4169                         Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4170                         Py_ssize_t needed = (targetsize - extrachars) + \
4171                             (targetsize << 2);
4172                         extrachars += needed;
4173                         /* XXX overflow detection missing */
4174                         if (_PyUnicode_Resize(&v,
4175                                               PyUnicode_GET_SIZE(v) + needed) < 0) {
4176                             Py_DECREF(x);
4177                             goto onError;
4178                         }
4179                         p = PyUnicode_AS_UNICODE(v) + oldpos;
4180                     }
4181                     Py_UNICODE_COPY(p,
4182                                     PyUnicode_AS_UNICODE(x),
4183                                     targetsize);
4184                     p += targetsize;
4185                     extrachars -= targetsize;
4186                 }
4187                 /* 1-0 mapping: skip the character */
4188             }
4189             else {
4190                 /* wrong return value */
4191                 PyErr_SetString(PyExc_TypeError,
4192                                 "character mapping must return integer, None or unicode");
4193                 Py_DECREF(x);
4194                 goto onError;
4195             }
4196             Py_DECREF(x);
4197             ++s;
4198         }
4199     }
4200     if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
4201         if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4202             goto onError;
4203     Py_XDECREF(errorHandler);
4204     Py_XDECREF(exc);
4205     return (PyObject *)v;
4206
4207   onError:
4208     Py_XDECREF(errorHandler);
4209     Py_XDECREF(exc);
4210     Py_XDECREF(v);
4211     return NULL;
4212 }
4213
4214 /* Charmap encoding: the lookup table */
4215
4216 struct encoding_map{
4217     PyObject_HEAD
4218     unsigned char level1[32];
4219     int count2, count3;
4220     unsigned char level23[1];
4221 };
4222
4223 static PyObject*
4224 encoding_map_size(PyObject *obj, PyObject* args)
4225 {
4226     struct encoding_map *map = (struct encoding_map*)obj;
4227     return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4228                           128*map->count3);
4229 }
4230
4231 static PyMethodDef encoding_map_methods[] = {
4232     {"size", encoding_map_size, METH_NOARGS,
4233      PyDoc_STR("Return the size (in bytes) of this object") },
4234     { 0 }
4235 };
4236
4237 static void
4238 encoding_map_dealloc(PyObject* o)
4239 {
4240     PyObject_FREE(o);
4241 }
4242
/* Type object for the charmap encoding trie (struct encoding_map).
   Only tp_dealloc and tp_methods (the "size" method) are filled in;
   tp_new is left empty, and the instances visible in this file are
   allocated by PyUnicode_BuildEncodingMap() with PyObject_MALLOC() and
   PyObject_Init().  tp_itemsize is 0 even though instances are
   over-allocated for the level-2/level-3 tables. */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_compare*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
4286
4287 PyObject*
4288 PyUnicode_BuildEncodingMap(PyObject* string)
4289 {
4290     Py_UNICODE *decode;
4291     PyObject *result;
4292     struct encoding_map *mresult;
4293     int i;
4294     int need_dict = 0;
4295     unsigned char level1[32];
4296     unsigned char level2[512];
4297     unsigned char *mlevel1, *mlevel2, *mlevel3;
4298     int count2 = 0, count3 = 0;
4299
4300     if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4301         PyErr_BadArgument();
4302         return NULL;
4303     }
4304     decode = PyUnicode_AS_UNICODE(string);
4305     memset(level1, 0xFF, sizeof level1);
4306     memset(level2, 0xFF, sizeof level2);
4307
4308     /* If there isn't a one-to-one mapping of NULL to \0,
4309        or if there are non-BMP characters, we need to use
4310        a mapping dictionary. */
4311     if (decode[0] != 0)
4312         need_dict = 1;
4313     for (i = 1; i < 256; i++) {
4314         int l1, l2;
4315         if (decode[i] == 0
4316 #ifdef Py_UNICODE_WIDE
4317             || decode[i] > 0xFFFF
4318 #endif
4319             ) {
4320             need_dict = 1;
4321             break;
4322         }
4323         if (decode[i] == 0xFFFE)
4324             /* unmapped character */
4325             continue;
4326         l1 = decode[i] >> 11;
4327         l2 = decode[i] >> 7;
4328         if (level1[l1] == 0xFF)
4329             level1[l1] = count2++;
4330         if (level2[l2] == 0xFF)
4331             level2[l2] = count3++;
4332     }
4333
4334     if (count2 >= 0xFF || count3 >= 0xFF)
4335         need_dict = 1;
4336
4337     if (need_dict) {
4338         PyObject *result = PyDict_New();
4339         PyObject *key, *value;
4340         if (!result)
4341             return NULL;
4342         for (i = 0; i < 256; i++) {
4343             value = NULL;
4344             key = PyInt_FromLong(decode[i]);
4345             value = PyInt_FromLong(i);
4346             if (!key || !value)
4347                 goto failed1;
4348             if (PyDict_SetItem(result, key, value) == -1)
4349                 goto failed1;
4350             Py_DECREF(key);
4351             Py_DECREF(value);
4352         }
4353         return result;
4354       failed1:
4355         Py_XDECREF(key);
4356         Py_XDECREF(value);
4357         Py_DECREF(result);
4358         return NULL;
4359     }
4360
4361     /* Create a three-level trie */
4362     result = PyObject_MALLOC(sizeof(struct encoding_map) +
4363                              16*count2 + 128*count3 - 1);
4364     if (!result)
4365         return PyErr_NoMemory();
4366     PyObject_Init(result, &EncodingMapType);
4367     mresult = (struct encoding_map*)result;
4368     mresult->count2 = count2;
4369     mresult->count3 = count3;
4370     mlevel1 = mresult->level1;
4371     mlevel2 = mresult->level23;
4372     mlevel3 = mresult->level23 + 16*count2;
4373     memcpy(mlevel1, level1, 32);
4374     memset(mlevel2, 0xFF, 16*count2);
4375     memset(mlevel3, 0, 128*count3);
4376     count3 = 0;
4377     for (i = 1; i < 256; i++) {
4378         int o1, o2, o3, i2, i3;
4379         if (decode[i] == 0xFFFE)
4380             /* unmapped character */
4381             continue;
4382         o1 = decode[i]>>11;
4383         o2 = (decode[i]>>7) & 0xF;
4384         i2 = 16*mlevel1[o1] + o2;
4385         if (mlevel2[i2] == 0xFF)
4386             mlevel2[i2] = count3++;
4387         o3 = decode[i] & 0x7F;
4388         i3 = 128*mlevel2[i2] + o3;
4389         mlevel3[i3] = i;
4390     }
4391     return result;
4392 }
4393
4394 static int
4395 encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4396 {
4397     struct encoding_map *map = (struct encoding_map*)mapping;
4398     int l1 = c>>11;
4399     int l2 = (c>>7) & 0xF;
4400     int l3 = c & 0x7F;
4401     int i;
4402
4403 #ifdef Py_UNICODE_WIDE
4404     if (c > 0xFFFF) {
4405         return -1;
4406     }
4407 #endif
4408     if (c == 0)
4409         return 0;
4410     /* level 1*/
4411     i = map->level1[l1];
4412     if (i == 0xFF) {
4413         return -1;
4414     }
4415     /* level 2*/
4416     i = map->level23[16*i+l2];
4417     if (i == 0xFF) {
4418         return -1;
4419     }
4420     /* level 3 */
4421     i = map->level23[16*map->count2 + 128*i + l3];
4422     if (i == 0) {
4423         return -1;
4424     }
4425     return i;
4426 }
4427
4428 /* Lookup the character ch in the mapping. If the character
4429    can't be found, Py_None is returned (or NULL, if another
4430    error occurred). */
4431 static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
4432 {
4433     PyObject *w = PyInt_FromLong((long)c);
4434     PyObject *x;
4435
4436     if (w == NULL)
4437         return NULL;
4438     x = PyObject_GetItem(mapping, w);
4439     Py_DECREF(w);
4440     if (x == NULL) {
4441         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4442             /* No mapping found means: mapping is undefined. */
4443             PyErr_Clear();
4444             x = Py_None;
4445             Py_INCREF(x);
4446             return x;
4447         } else
4448             return NULL;
4449     }
4450     else if (x == Py_None)
4451         return x;
4452     else if (PyInt_Check(x)) {
4453         long value = PyInt_AS_LONG(x);
4454         if (value < 0 || value > 255) {
4455             PyErr_SetString(PyExc_TypeError,
4456                             "character mapping must be in range(256)");
4457             Py_DECREF(x);
4458             return NULL;
4459         }
4460         return x;
4461     }
4462     else if (PyString_Check(x))
4463         return x;
4464     else {
4465         /* wrong return value */
4466         PyErr_SetString(PyExc_TypeError,
4467                         "character mapping must return integer, None or str");
4468         Py_DECREF(x);
4469         return NULL;
4470     }
4471 }
4472
4473 static int
4474 charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4475 {
4476     Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4477     /* exponentially overallocate to minimize reallocations */
4478     if (requiredsize < 2*outsize)
4479         requiredsize = 2*outsize;
4480     if (_PyString_Resize(outobj, requiredsize)) {
4481         return 0;
4482     }
4483     return 1;
4484 }
4485
/* Outcome of a single charmapencode_output() call. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the encoded byte(s) were appended to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* the lookup or a reallocation failed */
} charmapencode_result;
4489 /* lookup the character, put the result in the output string and adjust
4490    various state variables. Reallocate the output string if not enough
4491    space is available. Return a new reference to the object that
4492    was put in the output buffer, or Py_None, if the mapping was undefined
4493    (in which case no character was written) or NULL, if a
4494    reallocation error occurred. The caller must decref the result */
4495 static
4496 charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
4497                                           PyObject **outobj, Py_ssize_t *outpos)
4498 {
4499     PyObject *rep;
4500     char *outstart;
4501     Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4502
4503     if (Py_TYPE(mapping) == &EncodingMapType) {
4504         int res = encoding_map_lookup(c, mapping);
4505         Py_ssize_t requiredsize = *outpos+1;
4506         if (res == -1)
4507             return enc_FAILED;
4508         if (outsize<requiredsize)
4509             if (!charmapencode_resize(outobj, outpos, requiredsize))
4510                 return enc_EXCEPTION;
4511         outstart = PyString_AS_STRING(*outobj);
4512         outstart[(*outpos)++] = (char)res;
4513         return enc_SUCCESS;
4514     }
4515
4516     rep = charmapencode_lookup(c, mapping);
4517     if (rep==NULL)
4518         return enc_EXCEPTION;
4519     else if (rep==Py_None) {
4520         Py_DECREF(rep);
4521         return enc_FAILED;
4522     } else {
4523         if (PyInt_Check(rep)) {
4524             Py_ssize_t requiredsize = *outpos+1;
4525             if (outsize<requiredsize)
4526                 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4527                     Py_DECREF(rep);
4528                     return enc_EXCEPTION;
4529                 }
4530             outstart = PyString_AS_STRING(*outobj);
4531             outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4532         }
4533         else {
4534             const char *repchars = PyString_AS_STRING(rep);
4535             Py_ssize_t repsize = PyString_GET_SIZE(rep);
4536             Py_ssize_t requiredsize = *outpos+repsize;
4537             if (outsize<requiredsize)
4538                 if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4539                     Py_DECREF(rep);
4540                     return enc_EXCEPTION;
4541                 }
4542             outstart = PyString_AS_STRING(*outobj);
4543             memcpy(outstart + *outpos, repchars, repsize);
4544             *outpos += repsize;
4545         }
4546     }
4547     Py_DECREF(rep);
4548     return enc_SUCCESS;
4549 }
4550
4551 /* handle an error in PyUnicode_EncodeCharmap
4552    Return 0 on success, -1 on error */
4553 static
4554 int charmap_encoding_error(
4555     const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
4556     PyObject **exceptionObject,
4557     int *known_errorHandler, PyObject **errorHandler, const char *errors,
4558     PyObject **res, Py_ssize_t *respos)
4559 {
4560     PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4561     Py_ssize_t repsize;
4562     Py_ssize_t newpos;
4563     Py_UNICODE *uni2;
4564     /* startpos for collecting unencodable chars */
4565     Py_ssize_t collstartpos = *inpos;
4566     Py_ssize_t collendpos = *inpos+1;
4567     Py_ssize_t collpos;
4568     char *encoding = "charmap";
4569     char *reason = "character maps to <undefined>";
4570     charmapencode_result x;
4571
4572     /* find all unencodable characters */
4573     while (collendpos < size) {
4574         PyObject *rep;
4575         if (Py_TYPE(mapping) == &EncodingMapType) {
4576             int res = encoding_map_lookup(p[collendpos], mapping);
4577             if (res != -1)
4578                 break;
4579             ++collendpos;
4580             continue;
4581         }
4582
4583         rep = charmapencode_lookup(p[collendpos], mapping);
4584         if (rep==NULL)
4585             return -1;
4586         else if (rep!=Py_None) {
4587             Py_DECREF(rep);
4588             break;
4589         }
4590         Py_DECREF(rep);
4591         ++collendpos;
4592     }
4593     /* cache callback name lookup
4594      * (if not done yet, i.e. it's the first error) */
4595     if (*known_errorHandler==-1) {
4596         if ((errors==NULL) || (!strcmp(errors, "strict")))
4597             *known_errorHandler = 1;
4598         else if (!strcmp(errors, "replace"))
4599             *known_errorHandler = 2;
4600         else if (!strcmp(errors, "ignore"))
4601             *known_errorHandler = 3;
4602         else if (!strcmp(errors, "xmlcharrefreplace"))
4603             *known_errorHandler = 4;
4604         else
4605             *known_errorHandler = 0;
4606     }
4607     switch (*known_errorHandler) {
4608     case 1: /* strict */
4609         raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4610         return -1;
4611     case 2: /* replace */
4612         for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4613             x = charmapencode_output('?', mapping, res, respos);
4614             if (x==enc_EXCEPTION) {
4615                 return -1;
4616             }
4617             else if (x==enc_FAILED) {
4618                 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4619                 return -1;
4620             }
4621         }
4622         /* fall through */
4623     case 3: /* ignore */
4624         *inpos = collendpos;
4625         break;
4626     case 4: /* xmlcharrefreplace */
4627         /* generate replacement (temporarily (mis)uses p) */
4628         for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4629             char buffer[2+29+1+1];
4630             char *cp;
4631             sprintf(buffer, "&#%d;", (int)p[collpos]);
4632             for (cp = buffer; *cp; ++cp) {
4633                 x = charmapencode_output(*cp, mapping, res, respos);
4634                 if (x==enc_EXCEPTION)
4635                     return -1;
4636                 else if (x==enc_FAILED) {
4637                     raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4638                     return -1;
4639                 }
4640             }
4641         }
4642         *inpos = collendpos;
4643         break;
4644     default:
4645         repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
4646                                                       encoding, reason, p, size, exceptionObject,
4647                                                       collstartpos, collendpos, &newpos);
4648         if (repunicode == NULL)
4649             return -1;
4650         /* generate replacement  */
4651         repsize = PyUnicode_GET_SIZE(repunicode);
4652         for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4653             x = charmapencode_output(*uni2, mapping, res, respos);
4654             if (x==enc_EXCEPTION) {
4655                 return -1;
4656             }
4657             else if (x==enc_FAILED) {
4658                 Py_DECREF(repunicode);
4659                 raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4660                 return -1;
4661             }
4662         }
4663         *inpos = newpos;
4664         Py_DECREF(repunicode);
4665     }
4666     return 0;
4667 }
4668
4669 PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
4670                                   Py_ssize_t size,
4671                                   PyObject *mapping,
4672                                   const char *errors)
4673 {
4674     /* output object */
4675     PyObject *res = NULL;
4676     /* current input position */
4677     Py_ssize_t inpos = 0;
4678     /* current output position */
4679     Py_ssize_t respos = 0;
4680     PyObject *errorHandler = NULL;
4681     PyObject *exc = NULL;
4682     /* the following variable is used for caching string comparisons
4683      * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4684      * 3=ignore, 4=xmlcharrefreplace */
4685     int known_errorHandler = -1;
4686
4687     /* Default to Latin-1 */
4688     if (mapping == NULL)
4689         return PyUnicode_EncodeLatin1(p, size, errors);
4690
4691     /* allocate enough for a simple encoding without
4692        replacements, if we need more, we'll resize */
4693     res = PyString_FromStringAndSize(NULL, size);
4694     if (res == NULL)
4695         goto onError;
4696     if (size == 0)
4697         return res;
4698
4699     while (inpos<size) {
4700         /* try to encode it */
4701         charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4702         if (x==enc_EXCEPTION) /* error */
4703             goto onError;
4704         if (x==enc_FAILED) { /* unencodable character */
4705             if (charmap_encoding_error(p, size, &inpos, mapping,
4706                                        &exc,
4707                                        &known_errorHandler, &errorHandler, errors,
4708                                        &res, &respos)) {
4709                 goto onError;
4710             }
4711         }
4712         else
4713             /* done with this character => adjust input position */
4714             ++inpos;
4715     }
4716
4717     /* Resize if we allocated to much */
4718     if (respos<PyString_GET_SIZE(res)) {
4719         if (_PyString_Resize(&res, respos))
4720             goto onError;
4721     }
4722     Py_XDECREF(exc);
4723     Py_XDECREF(errorHandler);
4724     return res;
4725
4726   onError:
4727     Py_XDECREF(res);
4728     Py_XDECREF(exc);
4729     Py_XDECREF(errorHandler);
4730     return NULL;
4731 }
4732
4733 PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4734                                     PyObject *mapping)
4735 {
4736     if (!PyUnicode_Check(unicode) || mapping == NULL) {
4737         PyErr_BadArgument();
4738         return NULL;
4739     }
4740     return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4741                                    PyUnicode_GET_SIZE(unicode),
4742                                    mapping,
4743                                    NULL);
4744 }
4745
4746 /* create or adjust a UnicodeTranslateError */
4747 static void make_translate_exception(PyObject **exceptionObject,
4748                                      const Py_UNICODE *unicode, Py_ssize_t size,
4749                                      Py_ssize_t startpos, Py_ssize_t endpos,
4750                                      const char *reason)
4751 {
4752     if (*exceptionObject == NULL) {
4753         *exceptionObject = PyUnicodeTranslateError_Create(
4754             unicode, size, startpos, endpos, reason);
4755     }
4756     else {
4757         if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4758             goto onError;
4759         if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4760             goto onError;
4761         if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4762             goto onError;
4763         return;
4764       onError:
4765         Py_DECREF(*exceptionObject);
4766         *exceptionObject = NULL;
4767     }
4768 }
4769
4770 /* raises a UnicodeTranslateError */
4771 static void raise_translate_exception(PyObject **exceptionObject,
4772                                       const Py_UNICODE *unicode, Py_ssize_t size,
4773                                       Py_ssize_t startpos, Py_ssize_t endpos,
4774                                       const char *reason)
4775 {
4776     make_translate_exception(exceptionObject,
4777                              unicode, size, startpos, endpos, reason);
4778     if (*exceptionObject != NULL)
4779         PyCodec_StrictErrors(*exceptionObject);
4780 }
4781
4782 /* error handling callback helper:
4783    build arguments, call the callback and check the arguments,
4784    put the result into newpos and return the replacement string, which
4785    has to be freed by the caller */
4786 static PyObject *unicode_translate_call_errorhandler(const char *errors,
4787                                                      PyObject **errorHandler,
4788                                                      const char *reason,
4789                                                      const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4790                                                      Py_ssize_t startpos, Py_ssize_t endpos,
4791                                                      Py_ssize_t *newpos)
4792 {
4793     static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
4794
4795     Py_ssize_t i_newpos;
4796     PyObject *restuple;
4797     PyObject *resunicode;
4798
4799     if (*errorHandler == NULL) {
4800         *errorHandler = PyCodec_LookupError(errors);
4801         if (*errorHandler == NULL)
4802             return NULL;
4803     }
4804
4805     make_translate_exception(exceptionObject,
4806                              unicode, size, startpos, endpos, reason);
4807     if (*exceptionObject == NULL)
4808         return NULL;
4809
4810     restuple = PyObject_CallFunctionObjArgs(
4811         *errorHandler, *exceptionObject, NULL);
4812     if (restuple == NULL)
4813         return NULL;
4814     if (!PyTuple_Check(restuple)) {
4815         PyErr_SetString(PyExc_TypeError, &argparse[4]);
4816         Py_DECREF(restuple);
4817         return NULL;
4818     }
4819     if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
4820                           &resunicode, &i_newpos)) {
4821         Py_DECREF(restuple);
4822         return NULL;
4823     }
4824     if (i_newpos<0)
4825         *newpos = size+i_newpos;
4826     else
4827         *newpos = i_newpos;
4828     if (*newpos<0 || *newpos>size) {
4829         PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4830         Py_DECREF(restuple);
4831         return NULL;
4832     }
4833     Py_INCREF(resunicode);
4834     Py_DECREF(restuple);
4835     return resunicode;
4836 }
4837
4838 /* Lookup the character ch in the mapping and put the result in result,
4839    which must be decrefed by the caller.
4840    Return 0 on success, -1 on error */
4841 static
4842 int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4843 {
4844     PyObject *w = PyInt_FromLong((long)c);
4845     PyObject *x;
4846
4847     if (w == NULL)
4848         return -1;
4849     x = PyObject_GetItem(mapping, w);
4850     Py_DECREF(w);
4851     if (x == NULL) {
4852         if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4853             /* No mapping found means: use 1:1 mapping. */
4854             PyErr_Clear();
4855             *result = NULL;
4856             return 0;
4857         } else
4858             return -1;
4859     }
4860     else if (x == Py_None) {
4861         *result = x;
4862         return 0;
4863     }
4864     else if (PyInt_Check(x)) {
4865         long value = PyInt_AS_LONG(x);
4866         long max = PyUnicode_GetMax();
4867         if (value < 0 || value > max) {
4868             PyErr_Format(PyExc_TypeError,
4869                          "character mapping must be in range(0x%lx)", max+1);
4870             Py_DECREF(x);
4871             return -1;
4872         }
4873         *result = x;
4874         return 0;
4875     }
4876     else if (PyUnicode_Check(x)) {
4877         *result = x;
4878         return 0;
4879     }
4880     else {
4881         /* wrong return value */
4882         PyErr_SetString(PyExc_TypeError,
4883                         "character mapping must return integer, None or unicode");
4884         Py_DECREF(x);
4885         return -1;
4886     }
4887 }
4888 /* ensure that *outobj is at least requiredsize characters long,
4889    if not reallocate and adjust various state variables.
4890    Return 0 on success, -1 on error */
4891 static
4892 int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
4893                                Py_ssize_t requiredsize)
4894 {
4895     Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
4896     if (requiredsize > oldsize) {
4897         /* remember old output position */
4898         Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4899         /* exponentially overallocate to minimize reallocations */
4900         if (requiredsize < 2 * oldsize)
4901             requiredsize = 2 * oldsize;
4902         if (PyUnicode_Resize(outobj, requiredsize) < 0)
4903             return -1;
4904         *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
4905     }
4906     return 0;
4907 }
4908 /* lookup the character, put the result in the output string and adjust
4909    various state variables. Return a new reference to the object that
4910    was put in the output buffer in *result, or Py_None, if the mapping was
4911    undefined (in which case no character was written).
4912    The called must decref result.
4913    Return 0 on success, -1 on error. */
4914 static
4915 int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
4916                             Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4917                             PyObject **res)
4918 {
4919     if (charmaptranslate_lookup(*curinp, mapping, res))
4920         return -1;
4921     if (*res==NULL) {
4922         /* not found => default to 1:1 mapping */
4923         *(*outp)++ = *curinp;
4924     }
4925     else if (*res==Py_None)
4926         ;
4927     else if (PyInt_Check(*res)) {
4928         /* no overflow check, because we know that the space is enough */
4929         *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4930     }
4931     else if (PyUnicode_Check(*res)) {
4932         Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4933         if (repsize==1) {
4934             /* no overflow check, because we know that the space is enough */
4935             *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4936         }
4937         else if (repsize!=0) {
4938             /* more than one character */
4939             Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
4940                 (insize - (curinp-startinp)) +
4941                 repsize - 1;
4942             if (charmaptranslate_makespace(outobj, outp, requiredsize))
4943                 return -1;
4944             memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4945             *outp += repsize;
4946         }
4947     }
4948     else
4949         return -1;
4950     return 0;
4951 }
4952
4953 PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
4954                                      Py_ssize_t size,
4955                                      PyObject *mapping,
4956                                      const char *errors)
4957 {
4958     /* output object */
4959     PyObject *res = NULL;
4960     /* pointers to the beginning and end+1 of input */
4961     const Py_UNICODE *startp = p;
4962     const Py_UNICODE *endp = p + size;
4963     /* pointer into the output */
4964     Py_UNICODE *str;
4965     /* current output position */
4966     Py_ssize_t respos = 0;
4967     char *reason = "character maps to <undefined>";
4968     PyObject *errorHandler = NULL;
4969     PyObject *exc = NULL;
4970     /* the following variable is used for caching string comparisons
4971      * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4972      * 3=ignore, 4=xmlcharrefreplace */
4973     int known_errorHandler = -1;
4974
4975     if (mapping == NULL) {
4976         PyErr_BadArgument();
4977         return NULL;
4978     }
4979
4980     /* allocate enough for a simple 1:1 translation without
4981        replacements, if we need more, we'll resize */
4982     res = PyUnicode_FromUnicode(NULL, size);
4983     if (res == NULL)
4984         goto onError;
4985     if (size == 0)
4986         return res;
4987     str = PyUnicode_AS_UNICODE(res);
4988
4989     while (p<endp) {
4990         /* try to encode it */
4991         PyObject *x = NULL;
4992         if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
4993             Py_XDECREF(x);
4994             goto onError;
4995         }
4996         Py_XDECREF(x);
4997         if (x!=Py_None) /* it worked => adjust input pointer */
4998             ++p;
4999         else { /* untranslatable character */
5000             PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5001             Py_ssize_t repsize;
5002             Py_ssize_t newpos;
5003             Py_UNICODE *uni2;
5004             /* startpos for collecting untranslatable chars */
5005             const Py_UNICODE *collstart = p;
5006             const Py_UNICODE *collend = p+1;
5007             const Py_UNICODE *coll;
5008
5009             /* find all untranslatable characters */
5010             while (collend < endp) {
5011                 if (charmaptranslate_lookup(*collend, mapping, &x))
5012                     goto onError;
5013                 Py_XDECREF(x);
5014                 if (x!=Py_None)
5015                     break;
5016                 ++collend;
5017             }
5018             /* cache callback name lookup
5019              * (if not done yet, i.e. it's the first error) */
5020             if (known_errorHandler==-1) {
5021                 if ((errors==NULL) || (!strcmp(errors, "strict")))
5022                     known_errorHandler = 1;
5023                 else if (!strcmp(errors, "replace"))
5024                     known_errorHandler = 2;
5025                 else if (!strcmp(errors, "ignore"))
5026                     known_errorHandler = 3;
5027                 else if (!strcmp(errors, "xmlcharrefreplace"))
5028                     known_errorHandler = 4;
5029                 else
5030                     known_errorHandler = 0;
5031             }
5032             switch (known_errorHandler) {
5033             case 1: /* strict */
5034                 raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
5035                 goto onError;
5036             case 2: /* replace */
5037                 /* No need to check for space, this is a 1:1 replacement */
5038                 for (coll = collstart; coll<collend; ++coll)
5039                     *str++ = '?';
5040                 /* fall through */
5041             case 3: /* ignore */
5042                 p = collend;
5043                 break;
5044             case 4: /* xmlcharrefreplace */
5045                 /* generate replacement (temporarily (mis)uses p) */
5046                 for (p = collstart; p < collend; ++p) {
5047                     char buffer[2+29+1+1];
5048                     char *cp;
5049                     sprintf(buffer, "&#%d;", (int)*p);
5050                     if (charmaptranslate_makespace(&res, &str,
5051                                                    (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5052                         goto onError;
5053                     for (cp = buffer; *cp; ++cp)
5054                         *str++ = *cp;
5055                 }
5056                 p = collend;
5057                 break;
5058             default:
5059                 repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5060                                                                  reason, startp, size, &exc,
5061                                                                  collstart-startp, collend-startp, &newpos);
5062                 if (repunicode == NULL)
5063                     goto onError;
5064                 /* generate replacement  */
5065                 repsize = PyUnicode_GET_SIZE(repunicode);
5066                 if (charmaptranslate_makespace(&res, &str,
5067                                                (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5068                     Py_DECREF(repunicode);
5069                     goto onError;
5070                 }
5071                 for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5072                     *str++ = *uni2;
5073                 p = startp + newpos;
5074                 Py_DECREF(repunicode);
5075             }
5076         }
5077     }
5078     /* Resize if we allocated to much */
5079     respos = str-PyUnicode_AS_UNICODE(res);
5080     if (respos<PyUnicode_GET_SIZE(res)) {
5081         if (PyUnicode_Resize(&res, respos) < 0)
5082             goto onError;
5083     }
5084     Py_XDECREF(exc);
5085     Py_XDECREF(errorHandler);
5086     return res;
5087
5088   onError:
5089     Py_XDECREF(res);
5090     Py_XDECREF(exc);
5091     Py_XDECREF(errorHandler);
5092     return NULL;
5093 }
5094
5095 PyObject *PyUnicode_Translate(PyObject *str,
5096                               PyObject *mapping,
5097                               const char *errors)
5098 {
5099     PyObject *result;
5100
5101     str = PyUnicode_FromObject(str);
5102     if (str == NULL)
5103         goto onError;
5104     result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
5105                                         PyUnicode_GET_SIZE(str),
5106                                         mapping,
5107                                         errors);
5108     Py_DECREF(str);
5109     return result;
5110
5111   onError:
5112     Py_XDECREF(str);
5113     return NULL;
5114 }
5115
5116 /* --- Decimal Encoder ---------------------------------------------------- */
5117
5118 int PyUnicode_EncodeDecimal(Py_UNICODE *s,
5119                             Py_ssize_t length,
5120                             char *output,
5121                             const char *errors)
5122 {
5123     Py_UNICODE *p, *end;
5124     PyObject *errorHandler = NULL;
5125     PyObject *exc = NULL;
5126     const char *encoding = "decimal";
5127     const char *reason = "invalid decimal Unicode string";
5128     /* the following variable is used for caching string comparisons
5129      * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5130     int known_errorHandler = -1;
5131
5132     if (output == NULL) {
5133         PyErr_BadArgument();
5134         return -1;
5135     }
5136
5137     p = s;
5138     end = s + length;
5139     while (p < end) {
5140         register Py_UNICODE ch = *p;
5141         int decimal;
5142         PyObject *repunicode;
5143         Py_ssize_t repsize;
5144         Py_ssize_t newpos;
5145         Py_UNICODE *uni2;
5146         Py_UNICODE *collstart;
5147         Py_UNICODE *collend;
5148
5149         if (Py_UNICODE_ISSPACE(ch)) {
5150             *output++ = ' ';
5151             ++p;
5152             continue;
5153         }
5154         decimal = Py_UNICODE_TODECIMAL(ch);
5155         if (decimal >= 0) {
5156             *output++ = '0' + decimal;
5157             ++p;
5158             continue;
5159         }
5160         if (0 < ch && ch < 256) {
5161             *output++ = (char)ch;
5162             ++p;
5163             continue;
5164         }
5165         /* All other characters are considered unencodable */
5166         collstart = p;
5167         collend = p+1;
5168         while (collend < end) {
5169             if ((0 < *collend && *collend < 256) ||
5170                 !Py_UNICODE_ISSPACE(*collend) ||
5171                 Py_UNICODE_TODECIMAL(*collend))
5172                 break;
5173         }
5174         /* cache callback name lookup
5175          * (if not done yet, i.e. it's the first error) */
5176         if (known_errorHandler==-1) {
5177             if ((errors==NULL) || (!strcmp(errors, "strict")))
5178                 known_errorHandler = 1;
5179             else if (!strcmp(errors, "replace"))
5180                 known_errorHandler = 2;
5181             else if (!strcmp(errors, "ignore"))
5182                 known_errorHandler = 3;
5183             else if (!strcmp(errors, "xmlcharrefreplace"))
5184                 known_errorHandler = 4;
5185             else
5186                 known_errorHandler = 0;
5187         }
5188         switch (known_errorHandler) {
5189         case 1: /* strict */
5190             raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5191             goto onError;
5192         case 2: /* replace */
5193             for (p = collstart; p < collend; ++p)
5194                 *output++ = '?';
5195             /* fall through */
5196         case 3: /* ignore */
5197             p = collend;
5198             break;
5199         case 4: /* xmlcharrefreplace */
5200             /* generate replacement (temporarily (mis)uses p) */
5201             for (p = collstart; p < collend; ++p)
5202                 output += sprintf(output, "&#%d;", (int)*p);
5203             p = collend;
5204             break;
5205         default:
5206             repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5207                                                           encoding, reason, s, length, &exc,
5208                                                           collstart-s, collend-s, &newpos);
5209             if (repunicode == NULL)
5210                 goto onError;
5211             /* generate replacement  */
5212             repsize = PyUnicode_GET_SIZE(repunicode);
5213             for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5214                 Py_UNICODE ch = *uni2;
5215                 if (Py_UNICODE_ISSPACE(ch))
5216                     *output++ = ' ';
5217                 else {
5218                     decimal = Py_UNICODE_TODECIMAL(ch);
5219                     if (decimal >= 0)
5220                         *output++ = '0' + decimal;
5221                     else if (0 < ch && ch < 256)
5222                         *output++ = (char)ch;
5223                     else {
5224                         Py_DECREF(repunicode);
5225                         raise_encode_exception(&exc, encoding,
5226                                                s, length, collstart-s, collend-s, reason);
5227                         goto onError;
5228                     }
5229                 }
5230             }
5231             p = s + newpos;
5232             Py_DECREF(repunicode);
5233         }
5234     }
5235     /* 0-terminate the output string */
5236     *output++ = '\0';
5237     Py_XDECREF(exc);
5238     Py_XDECREF(errorHandler);
5239     return 0;
5240
5241   onError:
5242     Py_XDECREF(exc);
5243     Py_XDECREF(errorHandler);
5244     return -1;
5245 }
5246
5247 /* --- Helpers ------------------------------------------------------------ */
5248
5249 #include "stringlib/unicodedefs.h"
5250 #include "stringlib/fastsearch.h"
5251
5252 #include "stringlib/count.h"
5253 #include "stringlib/find.h"
5254 #include "stringlib/partition.h"
5255 #include "stringlib/split.h"
5256
/* helper macro to fixup start/end slice values

   Clamps end to len, and interprets negative start/end values as offsets
   from len, clamping them at 0.  start and end must be modifiable lvalues
   and are evaluated more than once.  The body is wrapped in
   do { } while (0) so that the macro behaves like a single statement and
   must be followed by a semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
5271
5272 Py_ssize_t PyUnicode_Count(PyObject *str,
5273                            PyObject *substr,
5274                            Py_ssize_t start,
5275                            Py_ssize_t end)
5276 {
5277     Py_ssize_t result;
5278     PyUnicodeObject* str_obj;
5279     PyUnicodeObject* sub_obj;
5280
5281     str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5282     if (!str_obj)
5283         return -1;
5284     sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5285     if (!sub_obj) {
5286         Py_DECREF(str_obj);
5287         return -1;
5288     }
5289
5290     ADJUST_INDICES(start, end, str_obj->length);
5291     result = stringlib_count(
5292         str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
5293         PY_SSIZE_T_MAX
5294         );
5295
5296     Py_DECREF(sub_obj);
5297     Py_DECREF(str_obj);
5298
5299     return result;
5300 }
5301
5302 Py_ssize_t PyUnicode_Find(PyObject *str,
5303                           PyObject *sub,
5304                           Py_ssize_t start,
5305                           Py_ssize_t end,
5306                           int direction)
5307 {
5308     Py_ssize_t result;
5309
5310     str = PyUnicode_FromObject(str);
5311     if (!str)
5312         return -2;
5313     sub = PyUnicode_FromObject(sub);
5314     if (!sub) {
5315         Py_DECREF(str);
5316         return -2;
5317     }
5318
5319     if (direction > 0)
5320         result = stringlib_find_slice(
5321             PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5322             PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5323             start, end
5324             );
5325     else
5326         result = stringlib_rfind_slice(
5327             PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5328             PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5329             start, end
5330             );
5331
5332     Py_DECREF(str);
5333     Py_DECREF(sub);
5334
5335     return result;
5336 }
5337
5338 static
5339 int tailmatch(PyUnicodeObject *self,
5340               PyUnicodeObject *substring,
5341               Py_ssize_t start,
5342               Py_ssize_t end,
5343               int direction)
5344 {
5345     if (substring->length == 0)
5346         return 1;
5347
5348     ADJUST_INDICES(start, end, self->length);
5349     end -= substring->length;
5350     if (end < start)
5351         return 0;
5352
5353     if (direction > 0) {
5354         if (Py_UNICODE_MATCH(self, end, substring))
5355             return 1;
5356     } else {
5357         if (Py_UNICODE_MATCH(self, start, substring))
5358             return 1;
5359     }
5360
5361     return 0;
5362 }
5363
5364 Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
5365                                PyObject *substr,
5366                                Py_ssize_t start,
5367                                Py_ssize_t end,
5368                                int direction)
5369 {
5370     Py_ssize_t result;
5371
5372     str = PyUnicode_FromObject(str);
5373     if (str == NULL)
5374         return -1;
5375     substr = PyUnicode_FromObject(substr);
5376     if (substr == NULL) {
5377         Py_DECREF(str);
5378         return -1;
5379     }
5380
5381     result = tailmatch((PyUnicodeObject *)str,
5382                        (PyUnicodeObject *)substr,
5383                        start, end, direction);
5384     Py_DECREF(str);
5385     Py_DECREF(substr);
5386     return result;
5387 }
5388
5389 /* Apply fixfct filter to the Unicode object self and return a
5390    reference to the modified object */
5391
5392 static
5393 PyObject *fixup(PyUnicodeObject *self,
5394                 int (*fixfct)(PyUnicodeObject *s))
5395 {
5396
5397     PyUnicodeObject *u;
5398
5399     u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5400     if (u == NULL)
5401         return NULL;
5402
5403     Py_UNICODE_COPY(u->str, self->str, self->length);
5404
5405     if (!fixfct(u) && PyUnicode_CheckExact(self)) {
5406         /* fixfct should return TRUE if it modified the buffer. If
5407            FALSE, return a reference to the original buffer instead
5408            (to save space, not time) */
5409         Py_INCREF(self);
5410         Py_DECREF(u);
5411         return (PyObject*) self;
5412     }
5413     return (PyObject*) u;
5414 }
5415
5416 static
5417 int fixupper(PyUnicodeObject *self)
5418 {
5419     Py_ssize_t len = self->length;
5420     Py_UNICODE *s = self->str;
5421     int status = 0;
5422
5423     while (len-- > 0) {
5424         register Py_UNICODE ch;
5425
5426         ch = Py_UNICODE_TOUPPER(*s);
5427         if (ch != *s) {
5428             status = 1;
5429             *s = ch;
5430         }
5431         s++;
5432     }
5433
5434     return status;
5435 }
5436
5437 static
5438 int fixlower(PyUnicodeObject *self)
5439 {
5440     Py_ssize_t len = self->length;
5441     Py_UNICODE *s = self->str;
5442     int status = 0;
5443
5444     while (len-- > 0) {
5445         register Py_UNICODE ch;
5446
5447         ch = Py_UNICODE_TOLOWER(*s);
5448         if (ch != *s) {
5449             status = 1;
5450             *s = ch;
5451         }
5452         s++;
5453     }
5454
5455     return status;
5456 }
5457
5458 static
5459 int fixswapcase(PyUnicodeObject *self)
5460 {
5461     Py_ssize_t len = self->length;
5462     Py_UNICODE *s = self->str;
5463     int status = 0;
5464
5465     while (len-- > 0) {
5466         if (Py_UNICODE_ISUPPER(*s)) {
5467             *s = Py_UNICODE_TOLOWER(*s);
5468             status = 1;
5469         } else if (Py_UNICODE_ISLOWER(*s)) {
5470             *s = Py_UNICODE_TOUPPER(*s);
5471             status = 1;
5472         }
5473         s++;
5474     }
5475
5476     return status;
5477 }
5478
5479 static
5480 int fixcapitalize(PyUnicodeObject *self)
5481 {
5482     Py_ssize_t len = self->length;
5483     Py_UNICODE *s = self->str;
5484     int status = 0;
5485
5486     if (len == 0)
5487         return 0;
5488     if (Py_UNICODE_ISLOWER(*s)) {
5489         *s = Py_UNICODE_TOUPPER(*s);
5490         status = 1;
5491     }
5492     s++;
5493     while (--len > 0) {
5494         if (Py_UNICODE_ISUPPER(*s)) {
5495             *s = Py_UNICODE_TOLOWER(*s);
5496             status = 1;
5497         }
5498         s++;
5499     }
5500     return status;
5501 }
5502
5503 static
5504 int fixtitle(PyUnicodeObject *self)
5505 {
5506     register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5507     register Py_UNICODE *e;
5508     int previous_is_cased;
5509
5510     /* Shortcut for single character strings */
5511     if (PyUnicode_GET_SIZE(self) == 1) {
5512         Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5513         if (*p != ch) {
5514             *p = ch;
5515             return 1;
5516         }
5517         else
5518             return 0;
5519     }
5520
5521     e = p + PyUnicode_GET_SIZE(self);
5522     previous_is_cased = 0;
5523     for (; p < e; p++) {
5524         register const Py_UNICODE ch = *p;
5525
5526         if (previous_is_cased)
5527             *p = Py_UNICODE_TOLOWER(ch);
5528         else
5529             *p = Py_UNICODE_TOTITLE(ch);
5530
5531         if (Py_UNICODE_ISLOWER(ch) ||
5532             Py_UNICODE_ISUPPER(ch) ||
5533             Py_UNICODE_ISTITLE(ch))
5534             previous_is_cased = 1;
5535         else
5536             previous_is_cased = 0;
5537     }
5538     return 1;
5539 }
5540
5541 PyObject *
5542 PyUnicode_Join(PyObject *separator, PyObject *seq)
5543 {
5544     PyObject *internal_separator = NULL;
5545     const Py_UNICODE blank = ' ';
5546     const Py_UNICODE *sep = &blank;
5547     Py_ssize_t seplen = 1;
5548     PyUnicodeObject *res = NULL; /* the result */
5549     Py_ssize_t res_alloc = 100;  /* # allocated bytes for string in res */
5550     Py_ssize_t res_used;         /* # used bytes */
5551     Py_UNICODE *res_p;       /* pointer to free byte in res's string area */
5552     PyObject *fseq;          /* PySequence_Fast(seq) */
5553     Py_ssize_t seqlen;              /* len(fseq) -- number of items in sequence */
5554     PyObject *item;
5555     Py_ssize_t i;
5556
5557     fseq = PySequence_Fast(seq, "");
5558     if (fseq == NULL) {
5559         return NULL;
5560     }
5561
5562     /* Grrrr.  A codec may be invoked to convert str objects to
5563      * Unicode, and so it's possible to call back into Python code
5564      * during PyUnicode_FromObject(), and so it's possible for a sick
5565      * codec to change the size of fseq (if seq is a list).  Therefore
5566      * we have to keep refetching the size -- can't assume seqlen
5567      * is invariant.
5568      */
5569     seqlen = PySequence_Fast_GET_SIZE(fseq);
5570     /* If empty sequence, return u"". */
5571     if (seqlen == 0) {
5572         res = _PyUnicode_New(0);  /* empty sequence; return u"" */
5573         goto Done;
5574     }
5575     /* If singleton sequence with an exact Unicode, return that. */
5576     if (seqlen == 1) {
5577         item = PySequence_Fast_GET_ITEM(fseq, 0);
5578         if (PyUnicode_CheckExact(item)) {
5579             Py_INCREF(item);
5580             res = (PyUnicodeObject *)item;
5581             goto Done;
5582         }
5583     }
5584
5585     /* At least two items to join, or one that isn't exact Unicode. */
5586     if (seqlen > 1) {
5587         /* Set up sep and seplen -- they're needed. */
5588         if (separator == NULL) {
5589             sep = &blank;
5590             seplen = 1;
5591         }
5592         else {
5593             internal_separator = PyUnicode_FromObject(separator);
5594             if (internal_separator == NULL)
5595                 goto onError;
5596             sep = PyUnicode_AS_UNICODE(internal_separator);
5597             seplen = PyUnicode_GET_SIZE(internal_separator);
5598             /* In case PyUnicode_FromObject() mutated seq. */
5599             seqlen = PySequence_Fast_GET_SIZE(fseq);
5600         }
5601     }
5602
5603     /* Get space. */
5604     res = _PyUnicode_New(res_alloc);
5605     if (res == NULL)
5606         goto onError;
5607     res_p = PyUnicode_AS_UNICODE(res);
5608     res_used = 0;
5609
5610     for (i = 0; i < seqlen; ++i) {
5611         Py_ssize_t itemlen;
5612         Py_ssize_t new_res_used;
5613
5614         item = PySequence_Fast_GET_ITEM(fseq, i);
5615         /* Convert item to Unicode. */
5616         if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5617             PyErr_Format(PyExc_TypeError,
5618                          "sequence item %zd: expected string or Unicode,"
5619                          " %.80s found",
5620                          i, Py_TYPE(item)->tp_name);
5621             goto onError;
5622         }
5623         item = PyUnicode_FromObject(item);
5624         if (item == NULL)
5625             goto onError;
5626         /* We own a reference to item from here on. */
5627
5628         /* In case PyUnicode_FromObject() mutated seq. */
5629         seqlen = PySequence_Fast_GET_SIZE(fseq);
5630
5631         /* Make sure we have enough space for the separator and the item. */
5632         itemlen = PyUnicode_GET_SIZE(item);
5633         new_res_used = res_used + itemlen;
5634         if (new_res_used < 0)
5635             goto Overflow;
5636         if (i < seqlen - 1) {
5637             new_res_used += seplen;
5638             if (new_res_used < 0)
5639                 goto Overflow;
5640         }
5641         if (new_res_used > res_alloc) {
5642             /* double allocated size until it's big enough */
5643             do {
5644                 res_alloc += res_alloc;
5645                 if (res_alloc <= 0)
5646                     goto Overflow;
5647             } while (new_res_used > res_alloc);
5648             if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5649                 Py_DECREF(item);
5650                 goto onError;
5651             }
5652             res_p = PyUnicode_AS_UNICODE(res) + res_used;
5653         }
5654
5655         /* Copy item, and maybe the separator. */
5656         Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5657         res_p += itemlen;
5658         if (i < seqlen - 1) {
5659             Py_UNICODE_COPY(res_p, sep, seplen);
5660             res_p += seplen;
5661         }
5662         Py_DECREF(item);
5663         res_used = new_res_used;
5664     }
5665
5666     /* Shrink res to match the used area; this probably can't fail,
5667      * but it's cheap to check.
5668      */
5669     if (_PyUnicode_Resize(&res, res_used) < 0)
5670         goto onError;
5671
5672   Done:
5673     Py_XDECREF(internal_separator);
5674     Py_DECREF(fseq);
5675     return (PyObject *)res;
5676
5677   Overflow:
5678     PyErr_SetString(PyExc_OverflowError,
5679                     "join() result is too long for a Python string");
5680     Py_DECREF(item);
5681     /* fall through */
5682
5683   onError:
5684     Py_XDECREF(internal_separator);
5685     Py_DECREF(fseq);
5686     Py_XDECREF(res);
5687     return NULL;
5688 }
5689
5690 static
5691 PyUnicodeObject *pad(PyUnicodeObject *self,
5692                      Py_ssize_t left,
5693                      Py_ssize_t right,
5694                      Py_UNICODE fill)
5695 {
5696     PyUnicodeObject *u;
5697
5698     if (left < 0)
5699         left = 0;
5700     if (right < 0)
5701         right = 0;
5702
5703     if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
5704         Py_INCREF(self);
5705         return self;
5706     }
5707
5708     if (left > PY_SSIZE_T_MAX - self->length ||
5709         right > PY_SSIZE_T_MAX - (left + self->length)) {
5710         PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5711         return NULL;
5712     }
5713     u = _PyUnicode_New(left + self->length + right);
5714     if (u) {
5715         if (left)
5716             Py_UNICODE_FILL(u->str, fill, left);
5717         Py_UNICODE_COPY(u->str + left, self->str, self->length);
5718         if (right)
5719             Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5720     }
5721
5722     return u;
5723 }
5724
5725 PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
5726 {
5727     PyObject *list;
5728
5729     string = PyUnicode_FromObject(string);
5730     if (string == NULL)
5731         return NULL;
5732
5733     list = stringlib_splitlines(
5734         (PyObject*) string, PyUnicode_AS_UNICODE(string),
5735         PyUnicode_GET_SIZE(string), keepends);
5736
5737     Py_DECREF(string);
5738     return list;
5739 }
5740
5741 static
5742 PyObject *split(PyUnicodeObject *self,
5743                 PyUnicodeObject *substring,
5744                 Py_ssize_t maxcount)
5745 {
5746     if (maxcount < 0)
5747         maxcount = PY_SSIZE_T_MAX;
5748
5749     if (substring == NULL)
5750         return stringlib_split_whitespace(
5751             (PyObject*) self,  self->str, self->length, maxcount
5752             );
5753
5754     return stringlib_split(
5755         (PyObject*) self,  self->str, self->length,
5756         substring->str, substring->length,
5757         maxcount
5758         );
5759 }
5760
5761 static
5762 PyObject *rsplit(PyUnicodeObject *self,
5763                  PyUnicodeObject *substring,
5764                  Py_ssize_t maxcount)
5765 {
5766     if (maxcount < 0)
5767         maxcount = PY_SSIZE_T_MAX;
5768
5769     if (substring == NULL)
5770         return stringlib_rsplit_whitespace(
5771             (PyObject*) self,  self->str, self->length, maxcount
5772             );
5773
5774     return stringlib_rsplit(
5775         (PyObject*) self,  self->str, self->length,
5776         substring->str, substring->length,
5777         maxcount
5778         );
5779 }
5780
5781 static
5782 PyObject *replace(PyUnicodeObject *self,
5783                   PyUnicodeObject *str1,
5784                   PyUnicodeObject *str2,
5785                   Py_ssize_t maxcount)
5786 {
5787     PyUnicodeObject *u;
5788
5789     if (maxcount < 0)
5790         maxcount = PY_SSIZE_T_MAX;
5791     else if (maxcount == 0 || self->length == 0)
5792         goto nothing;
5793
5794     if (str1->length == str2->length) {
5795         Py_ssize_t i;
5796         /* same length */
5797         if (str1->length == 0)
5798             goto nothing;
5799         if (str1->length == 1) {
5800             /* replace characters */
5801             Py_UNICODE u1, u2;
5802             if (!findchar(self->str, self->length, str1->str[0]))
5803                 goto nothing;
5804             u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5805             if (!u)
5806                 return NULL;
5807             Py_UNICODE_COPY(u->str, self->str, self->length);
5808             u1 = str1->str[0];
5809             u2 = str2->str[0];
5810             for (i = 0; i < u->length; i++)
5811                 if (u->str[i] == u1) {
5812                     if (--maxcount < 0)
5813                         break;
5814                     u->str[i] = u2;
5815                 }
5816         } else {
5817             i = stringlib_find(
5818                 self->str, self->length, str1->str, str1->length, 0
5819                 );
5820             if (i < 0)
5821                 goto nothing;
5822             u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5823             if (!u)
5824                 return NULL;
5825             Py_UNICODE_COPY(u->str, self->str, self->length);
5826
5827             /* change everything in-place, starting with this one */
5828             Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5829             i += str1->length;
5830
5831             while ( --maxcount > 0) {
5832                 i = stringlib_find(self->str+i, self->length-i,
5833                                    str1->str, str1->length,
5834                                    i);
5835                 if (i == -1)
5836                     break;
5837                 Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5838                 i += str1->length;
5839             }
5840         }
5841     } else {
5842
5843         Py_ssize_t n, i, j;
5844         Py_ssize_t product, new_size, delta;
5845         Py_UNICODE *p;
5846
5847         /* replace strings */
5848         n = stringlib_count(self->str, self->length, str1->str, str1->length,
5849                             maxcount);
5850         if (n == 0)
5851             goto nothing;
5852         /* new_size = self->length + n * (str2->length - str1->length)); */
5853         delta = (str2->length - str1->length);
5854         if (delta == 0) {
5855             new_size = self->length;
5856         } else {
5857             product = n * (str2->length - str1->length);
5858             if ((product / (str2->length - str1->length)) != n) {
5859                 PyErr_SetString(PyExc_OverflowError,
5860                                 "replace string is too long");
5861                 return NULL;
5862             }
5863             new_size = self->length + product;
5864             if (new_size < 0) {
5865                 PyErr_SetString(PyExc_OverflowError,
5866                                 "replace string is too long");
5867                 return NULL;
5868             }
5869         }
5870         u = _PyUnicode_New(new_size);
5871         if (!u)
5872             return NULL;
5873         i = 0;
5874         p = u->str;
5875         if (str1->length > 0) {
5876             while (n-- > 0) {
5877                 /* look for next match */
5878                 j = stringlib_find(self->str+i, self->length-i,
5879                                    str1->str, str1->length,
5880                                    i);
5881                 if (j == -1)
5882                     break;
5883                 else if (j > i) {
5884                     /* copy unchanged part [i:j] */
5885                     Py_UNICODE_COPY(p, self->str+i, j-i);
5886                     p += j - i;
5887                 }
5888                 /* copy substitution string */
5889                 if (str2->length > 0) {
5890                     Py_UNICODE_COPY(p, str2->str, str2->length);
5891                     p += str2->length;
5892                 }
5893                 i = j + str1->length;
5894             }
5895             if (i < self->length)
5896                 /* copy tail [i:] */
5897                 Py_UNICODE_COPY(p, self->str+i, self->length-i);
5898         } else {
5899             /* interleave */
5900             while (n > 0) {
5901                 Py_UNICODE_COPY(p, str2->str, str2->length);
5902                 p += str2->length;
5903                 if (--n <= 0)
5904                     break;
5905                 *p++ = self->str[i++];
5906             }
5907             Py_UNICODE_COPY(p, self->str+i, self->length-i);
5908         }
5909     }
5910     return (PyObject *) u;
5911
5912   nothing:
5913     /* nothing to replace; return original string (when possible) */
5914     if (PyUnicode_CheckExact(self)) {
5915         Py_INCREF(self);
5916         return (PyObject *) self;
5917     }
5918     return PyUnicode_FromUnicode(self->str, self->length);
5919 }
5920
5921 /* --- Unicode Object Methods --------------------------------------------- */
5922
5923 PyDoc_STRVAR(title__doc__,
5924              "S.title() -> unicode\n\
5925 \n\
5926 Return a titlecased version of S, i.e. words start with title case\n\
5927 characters, all remaining cased characters have lower case.");
5928
5929 static PyObject*
5930 unicode_title(PyUnicodeObject *self)
5931 {
5932     return fixup(self, fixtitle);
5933 }
5934
5935 PyDoc_STRVAR(capitalize__doc__,
5936              "S.capitalize() -> unicode\n\
5937 \n\
5938 Return a capitalized version of S, i.e. make the first character\n\
5939 have upper case and the rest lower case.");
5940
5941 static PyObject*
5942 unicode_capitalize(PyUnicodeObject *self)
5943 {
5944     return fixup(self, fixcapitalize);
5945 }
5946
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled: split self on whitespace, capitalize every word and join
   the words again with PyUnicode_Join(NULL, ...). */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entries in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *old_word = PyList_GET_ITEM(words, idx);
        PyObject *new_word = fixup((PyUnicodeObject *)old_word,
                                   fixcapitalize);

        if (new_word == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(old_word);
        PyList_SET_ITEM(words, idx, new_word);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
5984
5985 /* Argument converter.  Coerces to a single unicode character */
5986
5987 static int
5988 convert_uc(PyObject *obj, void *addr)
5989 {
5990     Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5991     PyObject *uniobj;
5992     Py_UNICODE *unistr;
5993
5994     uniobj = PyUnicode_FromObject(obj);
5995     if (uniobj == NULL) {
5996         PyErr_SetString(PyExc_TypeError,
5997                         "The fill character cannot be converted to Unicode");
5998         return 0;
5999     }
6000     if (PyUnicode_GET_SIZE(uniobj) != 1) {
6001         PyErr_SetString(PyExc_TypeError,
6002                         "The fill character must be exactly one character long");
6003         Py_DECREF(uniobj);
6004         return 0;
6005     }
6006     unistr = PyUnicode_AS_UNICODE(uniobj);
6007     *fillcharloc = unistr[0];
6008     Py_DECREF(uniobj);
6009     return 1;
6010 }
6011
6012 PyDoc_STRVAR(center__doc__,
6013              "S.center(width[, fillchar]) -> unicode\n\
6014 \n\
6015 Return S centered in a Unicode string of length width. Padding is\n\
6016 done using the specified fill character (default is a space)");
6017
6018 static PyObject *
6019 unicode_center(PyUnicodeObject *self, PyObject *args)
6020 {
6021     Py_ssize_t marg, left;
6022     Py_ssize_t width;
6023     Py_UNICODE fillchar = ' ';
6024
6025     if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
6026         return NULL;
6027
6028     if (self->length >= width && PyUnicode_CheckExact(self)) {
6029         Py_INCREF(self);
6030         return (PyObject*) self;
6031     }
6032
6033     marg = width - self->length;
6034     left = marg / 2 + (marg & width & 1);
6035
6036     return (PyObject*) pad(self, left, marg - left, fillchar);
6037 }
6038
6039 #if 0
6040
6041 /* This code should go into some future Unicode collation support
6042    module. The basic comparison should compare ordinals on a naive
6043    basis (this is what Java does and thus Jython too). */
6044
6045 /* speedy UTF-16 code point order comparison */
6046 /* gleaned from: */
6047 /* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6048
6049 static short utf16Fixup[32] =
6050 {
6051     0, 0, 0, 0, 0, 0, 0, 0,
6052     0, 0, 0, 0, 0, 0, 0, 0,
6053     0, 0, 0, 0, 0, 0, 0, 0,
6054     0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
6055 };
6056
6057 static int
6058 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6059 {
6060     Py_ssize_t len1, len2;
6061
6062     Py_UNICODE *s1 = str1->str;
6063     Py_UNICODE *s2 = str2->str;
6064
6065     len1 = str1->length;
6066     len2 = str2->length;
6067
6068     while (len1 > 0 && len2 > 0) {
6069         Py_UNICODE c1, c2;
6070
6071         c1 = *s1++;
6072         c2 = *s2++;
6073
6074         if (c1 > (1<<11) * 26)
6075             c1 += utf16Fixup[c1>>11];
6076         if (c2 > (1<<11) * 26)
6077             c2 += utf16Fixup[c2>>11];
6078         /* now c1 and c2 are in UTF-32-compatible order */
6079
6080         if (c1 != c2)
6081             return (c1 < c2) ? -1 : 1;
6082
6083         len1--; len2--;
6084     }
6085
6086     return (len1 < len2) ? -1 : (len1 != len2);
6087 }
6088
6089 #else
6090
6091 static int
6092 unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6093 {
6094     register Py_ssize_t len1, len2;
6095
6096     Py_UNICODE *s1 = str1->str;
6097     Py_UNICODE *s2 = str2->str;
6098
6099     len1 = str1->length;
6100     len2 = str2->length;
6101
6102     while (len1 > 0 && len2 > 0) {
6103         Py_UNICODE c1, c2;
6104
6105         c1 = *s1++;
6106         c2 = *s2++;
6107
6108         if (c1 != c2)
6109             return (c1 < c2) ? -1 : 1;
6110
6111         len1--; len2--;
6112     }
6113
6114     return (len1 < len2) ? -1 : (len1 != len2);
6115 }
6116
6117 #endif
6118
6119 int PyUnicode_Compare(PyObject *left,
6120                       PyObject *right)
6121 {
6122     PyUnicodeObject *u = NULL, *v = NULL;
6123     int result;
6124
6125     /* Coerce the two arguments */
6126     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6127     if (u == NULL)
6128         goto onError;
6129     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6130     if (v == NULL)
6131         goto onError;
6132
6133     /* Shortcut for empty or interned objects */
6134     if (v == u) {
6135         Py_DECREF(u);
6136         Py_DECREF(v);
6137         return 0;
6138     }
6139
6140     result = unicode_compare(u, v);
6141
6142     Py_DECREF(u);
6143     Py_DECREF(v);
6144     return result;
6145
6146   onError:
6147     Py_XDECREF(u);
6148     Py_XDECREF(v);
6149     return -1;
6150 }
6151
6152 PyObject *PyUnicode_RichCompare(PyObject *left,
6153                                 PyObject *right,
6154                                 int op)
6155 {
6156     int result;
6157
6158     result = PyUnicode_Compare(left, right);
6159     if (result == -1 && PyErr_Occurred())
6160         goto onError;
6161
6162     /* Convert the return value to a Boolean */
6163     switch (op) {
6164     case Py_EQ:
6165         result = (result == 0);
6166         break;
6167     case Py_NE:
6168         result = (result != 0);
6169         break;
6170     case Py_LE:
6171         result = (result <= 0);
6172         break;
6173     case Py_GE:
6174         result = (result >= 0);
6175         break;
6176     case Py_LT:
6177         result = (result == -1);
6178         break;
6179     case Py_GT:
6180         result = (result == 1);
6181         break;
6182     }
6183     return PyBool_FromLong(result);
6184
6185   onError:
6186
6187     /* Standard case
6188
6189        Type errors mean that PyUnicode_FromObject() could not convert
6190        one of the arguments (usually the right hand side) to Unicode,
6191        ie. we can't handle the comparison request. However, it is
6192        possible that the other object knows a comparison method, which
6193        is why we return Py_NotImplemented to give the other object a
6194        chance.
6195
6196     */
6197     if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6198         PyErr_Clear();
6199         Py_INCREF(Py_NotImplemented);
6200         return Py_NotImplemented;
6201     }
6202     if (op != Py_EQ && op != Py_NE)
6203         return NULL;
6204
6205     /* Equality comparison.
6206
6207        This is a special case: we silence any PyExc_UnicodeDecodeError
6208        and instead turn it into a PyErr_UnicodeWarning.
6209
6210     */
6211     if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6212         return NULL;
6213     PyErr_Clear();
6214     if (PyErr_Warn(PyExc_UnicodeWarning,
6215                    (op == Py_EQ) ?
6216                    "Unicode equal comparison "
6217                    "failed to convert both arguments to Unicode - "
6218                    "interpreting them as being unequal" :
6219                    "Unicode unequal comparison "
6220                    "failed to convert both arguments to Unicode - "
6221                    "interpreting them as being unequal"
6222             ) < 0)
6223         return NULL;
6224     result = (op == Py_NE);
6225     return PyBool_FromLong(result);
6226 }
6227
6228 int PyUnicode_Contains(PyObject *container,
6229                        PyObject *element)
6230 {
6231     PyObject *str, *sub;
6232     int result;
6233
6234     /* Coerce the two arguments */
6235     sub = PyUnicode_FromObject(element);
6236     if (!sub) {
6237         return -1;
6238     }
6239
6240     str = PyUnicode_FromObject(container);
6241     if (!str) {
6242         Py_DECREF(sub);
6243         return -1;
6244     }
6245
6246     result = stringlib_contains_obj(str, sub);
6247
6248     Py_DECREF(str);
6249     Py_DECREF(sub);
6250
6251     return result;
6252 }
6253
6254 /* Concat to string or Unicode object giving a new Unicode object. */
6255
6256 PyObject *PyUnicode_Concat(PyObject *left,
6257                            PyObject *right)
6258 {
6259     PyUnicodeObject *u = NULL, *v = NULL, *w;
6260
6261     /* Coerce the two arguments */
6262     u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6263     if (u == NULL)
6264         goto onError;
6265     v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6266     if (v == NULL)
6267         goto onError;
6268
6269     /* Shortcuts */
6270     if (v == unicode_empty) {
6271         Py_DECREF(v);
6272         return (PyObject *)u;
6273     }
6274     if (u == unicode_empty) {
6275         Py_DECREF(u);
6276         return (PyObject *)v;
6277     }
6278
6279     /* Concat the two Unicode strings */
6280     w = _PyUnicode_New(u->length + v->length);
6281     if (w == NULL)
6282         goto onError;
6283     Py_UNICODE_COPY(w->str, u->str, u->length);
6284     Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6285
6286     Py_DECREF(u);
6287     Py_DECREF(v);
6288     return (PyObject *)w;
6289
6290   onError:
6291     Py_XDECREF(u);
6292     Py_XDECREF(v);
6293     return NULL;
6294 }
6295
6296 PyDoc_STRVAR(count__doc__,
6297              "S.count(sub[, start[, end]]) -> int\n\
6298 \n\
6299 Return the number of non-overlapping occurrences of substring sub in\n\
6300 Unicode string S[start:end].  Optional arguments start and end are\n\
6301 interpreted as in slice notation.");
6302
6303 static PyObject *
6304 unicode_count(PyUnicodeObject *self, PyObject *args)
6305 {
6306     PyUnicodeObject *substring;
6307     Py_ssize_t start = 0;
6308     Py_ssize_t end = PY_SSIZE_T_MAX;
6309     PyObject *result;
6310
6311     if (!stringlib_parse_args_finds_unicode("count", args, &substring,
6312                                             &start, &end))
6313         return NULL;
6314
6315     ADJUST_INDICES(start, end, self->length);
6316     result = PyInt_FromSsize_t(
6317         stringlib_count(self->str + start, end - start,
6318                         substring->str, substring->length,
6319                         PY_SSIZE_T_MAX)
6320         );
6321
6322     Py_DECREF(substring);
6323
6324     return result;
6325 }
6326
6327 PyDoc_STRVAR(encode__doc__,
6328              "S.encode([encoding[,errors]]) -> string or unicode\n\
6329 \n\
6330 Encodes S using the codec registered for encoding. encoding defaults\n\
6331 to the default encoding. errors may be given to set a different error\n\
6332 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6333 a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6334 'xmlcharrefreplace' as well as any other name registered with\n\
6335 codecs.register_error that can handle UnicodeEncodeErrors.");
6336
6337 static PyObject *
6338 unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
6339 {
6340     static char *kwlist[] = {"encoding", "errors", 0};
6341     char *encoding = NULL;
6342     char *errors = NULL;
6343     PyObject *v;
6344
6345     if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
6346                                      kwlist, &encoding, &errors))
6347         return NULL;
6348     v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
6349     if (v == NULL)
6350         goto onError;
6351     if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6352         PyErr_Format(PyExc_TypeError,
6353                      "encoder did not return a string/unicode object "
6354                      "(type=%.400s)",
6355                      Py_TYPE(v)->tp_name);
6356         Py_DECREF(v);
6357         return NULL;
6358     }
6359     return v;
6360
6361   onError:
6362     return NULL;
6363 }
6364
6365 PyDoc_STRVAR(decode__doc__,
6366              "S.decode([encoding[,errors]]) -> string or unicode\n\
6367 \n\
6368 Decodes S using the codec registered for encoding. encoding defaults\n\
6369 to the default encoding. errors may be given to set a different error\n\
6370 handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6371 a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6372 as well as any other name registerd with codecs.register_error that is\n\
6373 able to handle UnicodeDecodeErrors.");
6374
6375 static PyObject *
6376 unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
6377 {
6378     static char *kwlist[] = {"encoding", "errors", 0};
6379     char *encoding = NULL;
6380     char *errors = NULL;
6381     PyObject *v;
6382
6383     if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
6384                                      kwlist, &encoding, &errors))
6385         return NULL;
6386     v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
6387     if (v == NULL)
6388         goto onError;
6389     if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6390         PyErr_Format(PyExc_TypeError,
6391                      "decoder did not return a string/unicode object "
6392                      "(type=%.400s)",
6393                      Py_TYPE(v)->tp_name);
6394         Py_DECREF(v);
6395         return NULL;
6396     }
6397     return v;
6398
6399   onError:
6400     return NULL;
6401 }
6402
6403 PyDoc_STRVAR(expandtabs__doc__,
6404              "S.expandtabs([tabsize]) -> unicode\n\
6405 \n\
6406 Return a copy of S where all tab characters are expanded using spaces.\n\
6407 If tabsize is not given, a tab size of 8 characters is assumed.");
6408
6409 static PyObject*
6410 unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6411 {
6412     Py_UNICODE *e;
6413     Py_UNICODE *p;
6414     Py_UNICODE *q;
6415     Py_UNICODE *qe;
6416     Py_ssize_t i, j, incr;
6417     PyUnicodeObject *u;
6418     int tabsize = 8;
6419
6420     if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6421         return NULL;
6422
6423     /* First pass: determine size of output string */
6424     i = 0; /* chars up to and including most recent \n or \r */
6425     j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6426     e = self->str + self->length; /* end of input */
6427     for (p = self->str; p < e; p++)
6428         if (*p == '\t') {
6429             if (tabsize > 0) {
6430                 incr = tabsize - (j % tabsize); /* cannot overflow */
6431                 if (j > PY_SSIZE_T_MAX - incr)
6432                     goto overflow1;
6433                 j += incr;
6434             }
6435         }
6436         else {
6437             if (j > PY_SSIZE_T_MAX - 1)
6438                 goto overflow1;
6439             j++;
6440             if (*p == '\n' || *p == '\r') {
6441                 if (i > PY_SSIZE_T_MAX - j)
6442                     goto overflow1;
6443                 i += j;
6444                 j = 0;
6445             }
6446         }
6447
6448     if (i > PY_SSIZE_T_MAX - j)
6449         goto overflow1;
6450
6451     /* Second pass: create output string and fill it */
6452     u = _PyUnicode_New(i + j);
6453     if (!u)
6454         return NULL;
6455
6456     j = 0; /* same as in first pass */
6457     q = u->str; /* next output char */
6458     qe = u->str + u->length; /* end of output */
6459
6460     for (p = self->str; p < e; p++)
6461         if (*p == '\t') {
6462             if (tabsize > 0) {
6463                 i = tabsize - (j % tabsize);
6464                 j += i;
6465                 while (i--) {
6466                     if (q >= qe)
6467                         goto overflow2;
6468                     *q++ = ' ';
6469                 }
6470             }
6471         }
6472         else {
6473             if (q >= qe)
6474                 goto overflow2;
6475             *q++ = *p;
6476             j++;
6477             if (*p == '\n' || *p == '\r')
6478                 j = 0;
6479         }
6480
6481     return (PyObject*) u;
6482
6483   overflow2:
6484     Py_DECREF(u);
6485   overflow1:
6486     PyErr_SetString(PyExc_OverflowError, "new string is too long");
6487     return NULL;
6488 }
6489
6490 PyDoc_STRVAR(find__doc__,
6491              "S.find(sub [,start [,end]]) -> int\n\
6492 \n\
6493 Return the lowest index in S where substring sub is found,\n\
6494 such that sub is contained within s[start:end].  Optional\n\
6495 arguments start and end are interpreted as in slice notation.\n\
6496 \n\
6497 Return -1 on failure.");
6498
6499 static PyObject *
6500 unicode_find(PyUnicodeObject *self, PyObject *args)
6501 {
6502     PyUnicodeObject *substring;
6503     Py_ssize_t start;
6504     Py_ssize_t end;
6505     Py_ssize_t result;
6506
6507     if (!stringlib_parse_args_finds_unicode("find", args, &substring,
6508                                             &start, &end))
6509         return NULL;
6510
6511     result = stringlib_find_slice(
6512         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6513         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6514         start, end
6515         );
6516
6517     Py_DECREF(substring);
6518
6519     return PyInt_FromSsize_t(result);
6520 }
6521
6522 static PyObject *
6523 unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
6524 {
6525     if (index < 0 || index >= self->length) {
6526         PyErr_SetString(PyExc_IndexError, "string index out of range");
6527         return NULL;
6528     }
6529
6530     return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6531 }
6532
6533 static long
6534 unicode_hash(PyUnicodeObject *self)
6535 {
6536     /* Since Unicode objects compare equal to their ASCII string
6537        counterparts, they should use the individual character values
6538        as basis for their hash value.  This is needed to assure that
6539        strings and Unicode objects behave in the same way as
6540        dictionary keys. */
6541
6542     register Py_ssize_t len;
6543     register Py_UNICODE *p;
6544     register long x;
6545
6546     if (self->hash != -1)
6547         return self->hash;
6548     len = PyUnicode_GET_SIZE(self);
6549     p = PyUnicode_AS_UNICODE(self);
6550     x = *p << 7;
6551     while (--len >= 0)
6552         x = (1000003*x) ^ *p++;
6553     x ^= PyUnicode_GET_SIZE(self);
6554     if (x == -1)
6555         x = -2;
6556     self->hash = x;
6557     return x;
6558 }
6559
6560 PyDoc_STRVAR(index__doc__,
6561              "S.index(sub [,start [,end]]) -> int\n\
6562 \n\
6563 Like S.find() but raise ValueError when the substring is not found.");
6564
6565 static PyObject *
6566 unicode_index(PyUnicodeObject *self, PyObject *args)
6567 {
6568     Py_ssize_t result;
6569     PyUnicodeObject *substring;
6570     Py_ssize_t start;
6571     Py_ssize_t end;
6572
6573     if (!stringlib_parse_args_finds_unicode("index", args, &substring,
6574                                             &start, &end))
6575         return NULL;
6576
6577     result = stringlib_find_slice(
6578         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6579         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6580         start, end
6581         );
6582
6583     Py_DECREF(substring);
6584
6585     if (result < 0) {
6586         PyErr_SetString(PyExc_ValueError, "substring not found");
6587         return NULL;
6588     }
6589
6590     return PyInt_FromSsize_t(result);
6591 }
6592
6593 PyDoc_STRVAR(islower__doc__,
6594              "S.islower() -> bool\n\
6595 \n\
6596 Return True if all cased characters in S are lowercase and there is\n\
6597 at least one cased character in S, False otherwise.");
6598
6599 static PyObject*
6600 unicode_islower(PyUnicodeObject *self)
6601 {
6602     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6603     register const Py_UNICODE *e;
6604     int cased;
6605
6606     /* Shortcut for single character strings */
6607     if (PyUnicode_GET_SIZE(self) == 1)
6608         return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
6609
6610     /* Special case for empty strings */
6611     if (PyUnicode_GET_SIZE(self) == 0)
6612         return PyBool_FromLong(0);
6613
6614     e = p + PyUnicode_GET_SIZE(self);
6615     cased = 0;
6616     for (; p < e; p++) {
6617         register const Py_UNICODE ch = *p;
6618
6619         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6620             return PyBool_FromLong(0);
6621         else if (!cased && Py_UNICODE_ISLOWER(ch))
6622             cased = 1;
6623     }
6624     return PyBool_FromLong(cased);
6625 }
6626
6627 PyDoc_STRVAR(isupper__doc__,
6628              "S.isupper() -> bool\n\
6629 \n\
6630 Return True if all cased characters in S are uppercase and there is\n\
6631 at least one cased character in S, False otherwise.");
6632
6633 static PyObject*
6634 unicode_isupper(PyUnicodeObject *self)
6635 {
6636     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6637     register const Py_UNICODE *e;
6638     int cased;
6639
6640     /* Shortcut for single character strings */
6641     if (PyUnicode_GET_SIZE(self) == 1)
6642         return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
6643
6644     /* Special case for empty strings */
6645     if (PyUnicode_GET_SIZE(self) == 0)
6646         return PyBool_FromLong(0);
6647
6648     e = p + PyUnicode_GET_SIZE(self);
6649     cased = 0;
6650     for (; p < e; p++) {
6651         register const Py_UNICODE ch = *p;
6652
6653         if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6654             return PyBool_FromLong(0);
6655         else if (!cased && Py_UNICODE_ISUPPER(ch))
6656             cased = 1;
6657     }
6658     return PyBool_FromLong(cased);
6659 }
6660
6661 PyDoc_STRVAR(istitle__doc__,
6662              "S.istitle() -> bool\n\
6663 \n\
6664 Return True if S is a titlecased string and there is at least one\n\
6665 character in S, i.e. upper- and titlecase characters may only\n\
6666 follow uncased characters and lowercase characters only cased ones.\n\
6667 Return False otherwise.");
6668
6669 static PyObject*
6670 unicode_istitle(PyUnicodeObject *self)
6671 {
6672     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6673     register const Py_UNICODE *e;
6674     int cased, previous_is_cased;
6675
6676     /* Shortcut for single character strings */
6677     if (PyUnicode_GET_SIZE(self) == 1)
6678         return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6679                                (Py_UNICODE_ISUPPER(*p) != 0));
6680
6681     /* Special case for empty strings */
6682     if (PyUnicode_GET_SIZE(self) == 0)
6683         return PyBool_FromLong(0);
6684
6685     e = p + PyUnicode_GET_SIZE(self);
6686     cased = 0;
6687     previous_is_cased = 0;
6688     for (; p < e; p++) {
6689         register const Py_UNICODE ch = *p;
6690
6691         if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6692             if (previous_is_cased)
6693                 return PyBool_FromLong(0);
6694             previous_is_cased = 1;
6695             cased = 1;
6696         }
6697         else if (Py_UNICODE_ISLOWER(ch)) {
6698             if (!previous_is_cased)
6699                 return PyBool_FromLong(0);
6700             previous_is_cased = 1;
6701             cased = 1;
6702         }
6703         else
6704             previous_is_cased = 0;
6705     }
6706     return PyBool_FromLong(cased);
6707 }
6708
6709 PyDoc_STRVAR(isspace__doc__,
6710              "S.isspace() -> bool\n\
6711 \n\
6712 Return True if all characters in S are whitespace\n\
6713 and there is at least one character in S, False otherwise.");
6714
6715 static PyObject*
6716 unicode_isspace(PyUnicodeObject *self)
6717 {
6718     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6719     register const Py_UNICODE *e;
6720
6721     /* Shortcut for single character strings */
6722     if (PyUnicode_GET_SIZE(self) == 1 &&
6723         Py_UNICODE_ISSPACE(*p))
6724         return PyBool_FromLong(1);
6725
6726     /* Special case for empty strings */
6727     if (PyUnicode_GET_SIZE(self) == 0)
6728         return PyBool_FromLong(0);
6729
6730     e = p + PyUnicode_GET_SIZE(self);
6731     for (; p < e; p++) {
6732         if (!Py_UNICODE_ISSPACE(*p))
6733             return PyBool_FromLong(0);
6734     }
6735     return PyBool_FromLong(1);
6736 }
6737
6738 PyDoc_STRVAR(isalpha__doc__,
6739              "S.isalpha() -> bool\n\
6740 \n\
6741 Return True if all characters in S are alphabetic\n\
6742 and there is at least one character in S, False otherwise.");
6743
6744 static PyObject*
6745 unicode_isalpha(PyUnicodeObject *self)
6746 {
6747     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6748     register const Py_UNICODE *e;
6749
6750     /* Shortcut for single character strings */
6751     if (PyUnicode_GET_SIZE(self) == 1 &&
6752         Py_UNICODE_ISALPHA(*p))
6753         return PyBool_FromLong(1);
6754
6755     /* Special case for empty strings */
6756     if (PyUnicode_GET_SIZE(self) == 0)
6757         return PyBool_FromLong(0);
6758
6759     e = p + PyUnicode_GET_SIZE(self);
6760     for (; p < e; p++) {
6761         if (!Py_UNICODE_ISALPHA(*p))
6762             return PyBool_FromLong(0);
6763     }
6764     return PyBool_FromLong(1);
6765 }
6766
6767 PyDoc_STRVAR(isalnum__doc__,
6768              "S.isalnum() -> bool\n\
6769 \n\
6770 Return True if all characters in S are alphanumeric\n\
6771 and there is at least one character in S, False otherwise.");
6772
6773 static PyObject*
6774 unicode_isalnum(PyUnicodeObject *self)
6775 {
6776     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6777     register const Py_UNICODE *e;
6778
6779     /* Shortcut for single character strings */
6780     if (PyUnicode_GET_SIZE(self) == 1 &&
6781         Py_UNICODE_ISALNUM(*p))
6782         return PyBool_FromLong(1);
6783
6784     /* Special case for empty strings */
6785     if (PyUnicode_GET_SIZE(self) == 0)
6786         return PyBool_FromLong(0);
6787
6788     e = p + PyUnicode_GET_SIZE(self);
6789     for (; p < e; p++) {
6790         if (!Py_UNICODE_ISALNUM(*p))
6791             return PyBool_FromLong(0);
6792     }
6793     return PyBool_FromLong(1);
6794 }
6795
6796 PyDoc_STRVAR(isdecimal__doc__,
6797              "S.isdecimal() -> bool\n\
6798 \n\
6799 Return True if there are only decimal characters in S,\n\
6800 False otherwise.");
6801
6802 static PyObject*
6803 unicode_isdecimal(PyUnicodeObject *self)
6804 {
6805     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6806     register const Py_UNICODE *e;
6807
6808     /* Shortcut for single character strings */
6809     if (PyUnicode_GET_SIZE(self) == 1 &&
6810         Py_UNICODE_ISDECIMAL(*p))
6811         return PyBool_FromLong(1);
6812
6813     /* Special case for empty strings */
6814     if (PyUnicode_GET_SIZE(self) == 0)
6815         return PyBool_FromLong(0);
6816
6817     e = p + PyUnicode_GET_SIZE(self);
6818     for (; p < e; p++) {
6819         if (!Py_UNICODE_ISDECIMAL(*p))
6820             return PyBool_FromLong(0);
6821     }
6822     return PyBool_FromLong(1);
6823 }
6824
6825 PyDoc_STRVAR(isdigit__doc__,
6826              "S.isdigit() -> bool\n\
6827 \n\
6828 Return True if all characters in S are digits\n\
6829 and there is at least one character in S, False otherwise.");
6830
6831 static PyObject*
6832 unicode_isdigit(PyUnicodeObject *self)
6833 {
6834     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6835     register const Py_UNICODE *e;
6836
6837     /* Shortcut for single character strings */
6838     if (PyUnicode_GET_SIZE(self) == 1 &&
6839         Py_UNICODE_ISDIGIT(*p))
6840         return PyBool_FromLong(1);
6841
6842     /* Special case for empty strings */
6843     if (PyUnicode_GET_SIZE(self) == 0)
6844         return PyBool_FromLong(0);
6845
6846     e = p + PyUnicode_GET_SIZE(self);
6847     for (; p < e; p++) {
6848         if (!Py_UNICODE_ISDIGIT(*p))
6849             return PyBool_FromLong(0);
6850     }
6851     return PyBool_FromLong(1);
6852 }
6853
6854 PyDoc_STRVAR(isnumeric__doc__,
6855              "S.isnumeric() -> bool\n\
6856 \n\
6857 Return True if there are only numeric characters in S,\n\
6858 False otherwise.");
6859
6860 static PyObject*
6861 unicode_isnumeric(PyUnicodeObject *self)
6862 {
6863     register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6864     register const Py_UNICODE *e;
6865
6866     /* Shortcut for single character strings */
6867     if (PyUnicode_GET_SIZE(self) == 1 &&
6868         Py_UNICODE_ISNUMERIC(*p))
6869         return PyBool_FromLong(1);
6870
6871     /* Special case for empty strings */
6872     if (PyUnicode_GET_SIZE(self) == 0)
6873         return PyBool_FromLong(0);
6874
6875     e = p + PyUnicode_GET_SIZE(self);
6876     for (; p < e; p++) {
6877         if (!Py_UNICODE_ISNUMERIC(*p))
6878             return PyBool_FromLong(0);
6879     }
6880     return PyBool_FromLong(1);
6881 }
6882
6883 PyDoc_STRVAR(join__doc__,
6884              "S.join(iterable) -> unicode\n\
6885 \n\
6886 Return a string which is the concatenation of the strings in the\n\
6887 iterable.  The separator between elements is S.");
6888
6889 static PyObject*
6890 unicode_join(PyObject *self, PyObject *data)
6891 {
6892     return PyUnicode_Join(self, data);
6893 }
6894
6895 static Py_ssize_t
6896 unicode_length(PyUnicodeObject *self)
6897 {
6898     return self->length;
6899 }
6900
6901 PyDoc_STRVAR(ljust__doc__,
6902              "S.ljust(width[, fillchar]) -> int\n\
6903 \n\
6904 Return S left-justified in a Unicode string of length width. Padding is\n\
6905 done using the specified fill character (default is a space).");
6906
6907 static PyObject *
6908 unicode_ljust(PyUnicodeObject *self, PyObject *args)
6909 {
6910     Py_ssize_t width;
6911     Py_UNICODE fillchar = ' ';
6912
6913     if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
6914         return NULL;
6915
6916     if (self->length >= width && PyUnicode_CheckExact(self)) {
6917         Py_INCREF(self);
6918         return (PyObject*) self;
6919     }
6920
6921     return (PyObject*) pad(self, 0, width - self->length, fillchar);
6922 }
6923
6924 PyDoc_STRVAR(lower__doc__,
6925              "S.lower() -> unicode\n\
6926 \n\
6927 Return a copy of the string S converted to lowercase.");
6928
6929 static PyObject*
6930 unicode_lower(PyUnicodeObject *self)
6931 {
6932     return fixup(self, fixlower);
6933 }
6934
/* Strip modes passed as "striptype" to the strip helpers below; the
   values double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* PyArg_ParseTuple format strings, indexed by the strip modes above */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: the format string with its leading
   "|O:" (three characters) skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
6943
6944 /* externally visible for str.strip(unicode) */
6945 PyObject *
6946 _PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6947 {
6948     Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6949     Py_ssize_t len = PyUnicode_GET_SIZE(self);
6950     Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
6951     Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6952     Py_ssize_t i, j;
6953
6954     BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6955
6956     i = 0;
6957     if (striptype != RIGHTSTRIP) {
6958         while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6959             i++;
6960         }
6961     }
6962
6963     j = len;
6964     if (striptype != LEFTSTRIP) {
6965         do {
6966             j--;
6967         } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6968         j++;
6969     }
6970
6971     if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6972         Py_INCREF(self);
6973         return (PyObject*)self;
6974     }
6975     else
6976         return PyUnicode_FromUnicode(s+i, j-i);
6977 }
6978
6979
6980 static PyObject *
6981 do_strip(PyUnicodeObject *self, int striptype)
6982 {
6983     Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6984     Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
6985
6986     i = 0;
6987     if (striptype != RIGHTSTRIP) {
6988         while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6989             i++;
6990         }
6991     }
6992
6993     j = len;
6994     if (striptype != LEFTSTRIP) {
6995         do {
6996             j--;
6997         } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6998         j++;
6999     }
7000
7001     if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7002         Py_INCREF(self);
7003         return (PyObject*)self;
7004     }
7005     else
7006         return PyUnicode_FromUnicode(s+i, j-i);
7007 }
7008
7009
7010 static PyObject *
7011 do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7012 {
7013     PyObject *sep = NULL;
7014
7015     if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7016         return NULL;
7017
7018     if (sep != NULL && sep != Py_None) {
7019         if (PyUnicode_Check(sep))
7020             return _PyUnicode_XStrip(self, striptype, sep);
7021         else if (PyString_Check(sep)) {
7022             PyObject *res;
7023             sep = PyUnicode_FromObject(sep);
7024             if (sep==NULL)
7025                 return NULL;
7026             res = _PyUnicode_XStrip(self, striptype, sep);
7027             Py_DECREF(sep);
7028             return res;
7029         }
7030         else {
7031             PyErr_Format(PyExc_TypeError,
7032                          "%s arg must be None, unicode or str",
7033                          STRIPNAME(striptype));
7034             return NULL;
7035         }
7036     }
7037
7038     return do_strip(self, striptype);
7039 }
7040
7041
7042 PyDoc_STRVAR(strip__doc__,
7043              "S.strip([chars]) -> unicode\n\
7044 \n\
7045 Return a copy of the string S with leading and trailing\n\
7046 whitespace removed.\n\
7047 If chars is given and not None, remove characters in chars instead.\n\
7048 If chars is a str, it will be converted to unicode before stripping");
7049
7050 static PyObject *
7051 unicode_strip(PyUnicodeObject *self, PyObject *args)
7052 {
7053     if (PyTuple_GET_SIZE(args) == 0)
7054         return do_strip(self, BOTHSTRIP); /* Common case */
7055     else
7056         return do_argstrip(self, BOTHSTRIP, args);
7057 }
7058
7059
7060 PyDoc_STRVAR(lstrip__doc__,
7061              "S.lstrip([chars]) -> unicode\n\
7062 \n\
7063 Return a copy of the string S with leading whitespace removed.\n\
7064 If chars is given and not None, remove characters in chars instead.\n\
7065 If chars is a str, it will be converted to unicode before stripping");
7066
7067 static PyObject *
7068 unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7069 {
7070     if (PyTuple_GET_SIZE(args) == 0)
7071         return do_strip(self, LEFTSTRIP); /* Common case */
7072     else
7073         return do_argstrip(self, LEFTSTRIP, args);
7074 }
7075
7076
7077 PyDoc_STRVAR(rstrip__doc__,
7078              "S.rstrip([chars]) -> unicode\n\
7079 \n\
7080 Return a copy of the string S with trailing whitespace removed.\n\
7081 If chars is given and not None, remove characters in chars instead.\n\
7082 If chars is a str, it will be converted to unicode before stripping");
7083
7084 static PyObject *
7085 unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7086 {
7087     if (PyTuple_GET_SIZE(args) == 0)
7088         return do_strip(self, RIGHTSTRIP); /* Common case */
7089     else
7090         return do_argstrip(self, RIGHTSTRIP, args);
7091 }
7092
7093
7094 static PyObject*
7095 unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
7096 {
7097     PyUnicodeObject *u;
7098     Py_UNICODE *p;
7099     Py_ssize_t nchars;
7100     size_t nbytes;
7101
7102     if (len < 0)
7103         len = 0;
7104
7105     if (len == 1 && PyUnicode_CheckExact(str)) {
7106         /* no repeat, return original string */
7107         Py_INCREF(str);
7108         return (PyObject*) str;
7109     }
7110
7111     /* ensure # of chars needed doesn't overflow int and # of bytes
7112      * needed doesn't overflow size_t
7113      */
7114     nchars = len * str->length;
7115     if (len && nchars / len != str->length) {
7116         PyErr_SetString(PyExc_OverflowError,
7117                         "repeated string is too long");
7118         return NULL;
7119     }
7120     nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7121     if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7122         PyErr_SetString(PyExc_OverflowError,
7123                         "repeated string is too long");
7124         return NULL;
7125     }
7126     u = _PyUnicode_New(nchars);
7127     if (!u)
7128         return NULL;
7129
7130     p = u->str;
7131
7132     if (str->length == 1 && len > 0) {
7133         Py_UNICODE_FILL(p, str->str[0], len);
7134     } else {
7135         Py_ssize_t done = 0; /* number of characters copied this far */
7136         if (done < nchars) {
7137             Py_UNICODE_COPY(p, str->str, str->length);
7138             done = str->length;
7139         }
7140         while (done < nchars) {
7141             Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
7142             Py_UNICODE_COPY(p+done, p, n);
7143             done += n;
7144         }
7145     }
7146
7147     return (PyObject*) u;
7148 }
7149
7150 PyObject *PyUnicode_Replace(PyObject *obj,
7151                             PyObject *subobj,
7152                             PyObject *replobj,
7153                             Py_ssize_t maxcount)
7154 {
7155     PyObject *self;
7156     PyObject *str1;
7157     PyObject *str2;
7158     PyObject *result;
7159
7160     self = PyUnicode_FromObject(obj);
7161     if (self == NULL)
7162         return NULL;
7163     str1 = PyUnicode_FromObject(subobj);
7164     if (str1 == NULL) {
7165         Py_DECREF(self);
7166         return NULL;
7167     }
7168     str2 = PyUnicode_FromObject(replobj);
7169     if (str2 == NULL) {
7170         Py_DECREF(self);
7171         Py_DECREF(str1);
7172         return NULL;
7173     }
7174     result = replace((PyUnicodeObject *)self,
7175                      (PyUnicodeObject *)str1,
7176                      (PyUnicodeObject *)str2,
7177                      maxcount);
7178     Py_DECREF(self);
7179     Py_DECREF(str1);
7180     Py_DECREF(str2);
7181     return result;
7182 }
7183
7184 PyDoc_STRVAR(replace__doc__,
7185              "S.replace(old, new[, count]) -> unicode\n\
7186 \n\
7187 Return a copy of S with all occurrences of substring\n\
7188 old replaced by new.  If the optional argument count is\n\
7189 given, only the first count occurrences are replaced.");
7190
7191 static PyObject*
7192 unicode_replace(PyUnicodeObject *self, PyObject *args)
7193 {
7194     PyUnicodeObject *str1;
7195     PyUnicodeObject *str2;
7196     Py_ssize_t maxcount = -1;
7197     PyObject *result;
7198
7199     if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
7200         return NULL;
7201     str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7202     if (str1 == NULL)
7203         return NULL;
7204     str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
7205     if (str2 == NULL) {
7206         Py_DECREF(str1);
7207         return NULL;
7208     }
7209
7210     result = replace(self, str1, str2, maxcount);
7211
7212     Py_DECREF(str1);
7213     Py_DECREF(str2);
7214     return result;
7215 }
7216
7217 static
7218 PyObject *unicode_repr(PyObject *unicode)
7219 {
7220     return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
7221                                 PyUnicode_GET_SIZE(unicode),
7222                                 1);
7223 }
7224
7225 PyDoc_STRVAR(rfind__doc__,
7226              "S.rfind(sub [,start [,end]]) -> int\n\
7227 \n\
7228 Return the highest index in S where substring sub is found,\n\
7229 such that sub is contained within s[start:end].  Optional\n\
7230 arguments start and end are interpreted as in slice notation.\n\
7231 \n\
7232 Return -1 on failure.");
7233
7234 static PyObject *
7235 unicode_rfind(PyUnicodeObject *self, PyObject *args)
7236 {
7237     PyUnicodeObject *substring;
7238     Py_ssize_t start;
7239     Py_ssize_t end;
7240     Py_ssize_t result;
7241
7242     if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
7243                                             &start, &end))
7244         return NULL;
7245
7246     result = stringlib_rfind_slice(
7247         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7248         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7249         start, end
7250         );
7251
7252     Py_DECREF(substring);
7253
7254     return PyInt_FromSsize_t(result);
7255 }
7256
7257 PyDoc_STRVAR(rindex__doc__,
7258              "S.rindex(sub [,start [,end]]) -> int\n\
7259 \n\
7260 Like S.rfind() but raise ValueError when the substring is not found.");
7261
7262 static PyObject *
7263 unicode_rindex(PyUnicodeObject *self, PyObject *args)
7264 {
7265     PyUnicodeObject *substring;
7266     Py_ssize_t start;
7267     Py_ssize_t end;
7268     Py_ssize_t result;
7269
7270     if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
7271                                             &start, &end))
7272         return NULL;
7273
7274     result = stringlib_rfind_slice(
7275         PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7276         PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7277         start, end
7278         );
7279
7280     Py_DECREF(substring);
7281
7282     if (result < 0) {
7283         PyErr_SetString(PyExc_ValueError, "substring not found");
7284         return NULL;
7285     }
7286     return PyInt_FromSsize_t(result);
7287 }
7288
7289 PyDoc_STRVAR(rjust__doc__,
7290              "S.rjust(width[, fillchar]) -> unicode\n\
7291 \n\
7292 Return S right-justified in a Unicode string of length width. Padding is\n\
7293 done using the specified fill character (default is a space).");
7294
7295 static PyObject *
7296 unicode_rjust(PyUnicodeObject *self, PyObject *args)
7297 {
7298     Py_ssize_t width;
7299     Py_UNICODE fillchar = ' ';
7300
7301     if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
7302         return NULL;
7303
7304     if (self->length >= width && PyUnicode_CheckExact(self)) {
7305         Py_INCREF(self);
7306         return (PyObject*) self;
7307     }
7308
7309     return (PyObject*) pad(self, width - self->length, 0, fillchar);
7310 }
7311
7312 static PyObject*
7313 unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
7314 {
7315     /* standard clamping */
7316     if (start < 0)
7317         start = 0;
7318     if (end < 0)
7319         end = 0;
7320     if (end > self->length)
7321         end = self->length;
7322     if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
7323         /* full slice, return original string */
7324         Py_INCREF(self);
7325         return (PyObject*) self;
7326     }
7327     if (start > end)
7328         start = end;
7329     /* copy slice */
7330     return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7331                                              end - start);
7332 }
7333
7334 PyObject *PyUnicode_Split(PyObject *s,
7335                           PyObject *sep,
7336                           Py_ssize_t maxsplit)
7337 {
7338     PyObject *result;
7339
7340     s = PyUnicode_FromObject(s);
7341     if (s == NULL)
7342         return NULL;
7343     if (sep != NULL) {
7344         sep = PyUnicode_FromObject(sep);
7345         if (sep == NULL) {
7346             Py_DECREF(s);
7347             return NULL;
7348         }
7349     }
7350
7351     result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7352
7353     Py_DECREF(s);
7354     Py_XDECREF(sep);
7355     return result;
7356 }
7357
7358 PyDoc_STRVAR(split__doc__,
7359              "S.split([sep [,maxsplit]]) -> list of strings\n\
7360 \n\
7361 Return a list of the words in S, using sep as the\n\
7362 delimiter string.  If maxsplit is given, at most maxsplit\n\
7363 splits are done. If sep is not specified or is None, any\n\
7364 whitespace string is a separator and empty strings are\n\
7365 removed from the result.");
7366
7367 static PyObject*
7368 unicode_split(PyUnicodeObject *self, PyObject *args)
7369 {
7370     PyObject *substring = Py_None;
7371     Py_ssize_t maxcount = -1;
7372
7373     if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
7374         return NULL;
7375
7376     if (substring == Py_None)
7377         return split(self, NULL, maxcount);
7378     else if (PyUnicode_Check(substring))
7379         return split(self, (PyUnicodeObject *)substring, maxcount);
7380     else
7381         return PyUnicode_Split((PyObject *)self, substring, maxcount);
7382 }
7383
7384 PyObject *
7385 PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7386 {
7387     PyObject* str_obj;
7388     PyObject* sep_obj;
7389     PyObject* out;
7390
7391     str_obj = PyUnicode_FromObject(str_in);
7392     if (!str_obj)
7393         return NULL;
7394     sep_obj = PyUnicode_FromObject(sep_in);
7395     if (!sep_obj) {
7396         Py_DECREF(str_obj);
7397         return NULL;
7398     }
7399
7400     out = stringlib_partition(
7401         str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7402         sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7403         );
7404
7405     Py_DECREF(sep_obj);
7406     Py_DECREF(str_obj);
7407
7408     return out;
7409 }
7410
7411
7412 PyObject *
7413 PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7414 {
7415     PyObject* str_obj;
7416     PyObject* sep_obj;
7417     PyObject* out;
7418
7419     str_obj = PyUnicode_FromObject(str_in);
7420     if (!str_obj)
7421         return NULL;
7422     sep_obj = PyUnicode_FromObject(sep_in);
7423     if (!sep_obj) {
7424         Py_DECREF(str_obj);
7425         return NULL;
7426     }
7427
7428     out = stringlib_rpartition(
7429         str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7430         sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7431         );
7432
7433     Py_DECREF(sep_obj);
7434     Py_DECREF(str_obj);
7435
7436     return out;
7437 }
7438
7439 PyDoc_STRVAR(partition__doc__,
7440              "S.partition(sep) -> (head, sep, tail)\n\
7441 \n\
7442 Search for the separator sep in S, and return the part before it,\n\
7443 the separator itself, and the part after it.  If the separator is not\n\
7444 found, return S and two empty strings.");
7445
7446 static PyObject*
7447 unicode_partition(PyUnicodeObject *self, PyObject *separator)
7448 {
7449     return PyUnicode_Partition((PyObject *)self, separator);
7450 }
7451
7452 PyDoc_STRVAR(rpartition__doc__,
7453              "S.rpartition(sep) -> (head, sep, tail)\n\
7454 \n\
7455 Search for the separator sep in S, starting at the end of S, and return\n\
7456 the part before it, the separator itself, and the part after it.  If the\n\
7457 separator is not found, return two empty strings and S.");
7458
7459 static PyObject*
7460 unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7461 {
7462     return PyUnicode_RPartition((PyObject *)self, separator);
7463 }
7464
7465 PyObject *PyUnicode_RSplit(PyObject *s,
7466                            PyObject *sep,
7467                            Py_ssize_t maxsplit)
7468 {
7469     PyObject *result;
7470
7471     s = PyUnicode_FromObject(s);
7472     if (s == NULL)
7473         return NULL;
7474     if (sep != NULL) {
7475         sep = PyUnicode_FromObject(sep);
7476         if (sep == NULL) {
7477             Py_DECREF(s);
7478             return NULL;
7479         }
7480     }
7481
7482     result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7483
7484     Py_DECREF(s);
7485     Py_XDECREF(sep);
7486     return result;
7487 }
7488
7489 PyDoc_STRVAR(rsplit__doc__,
7490              "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7491 \n\
7492 Return a list of the words in S, using sep as the\n\
7493 delimiter string, starting at the end of the string and\n\
7494 working to the front.  If maxsplit is given, at most maxsplit\n\
7495 splits are done. If sep is not specified, any whitespace string\n\
7496 is a separator.");
7497
7498 static PyObject*
7499 unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7500 {
7501     PyObject *substring = Py_None;
7502     Py_ssize_t maxcount = -1;
7503
7504     if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
7505         return NULL;
7506
7507     if (substring == Py_None)
7508         return rsplit(self, NULL, maxcount);
7509     else if (PyUnicode_Check(substring))
7510         return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7511     else
7512         return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7513 }
7514
7515 PyDoc_STRVAR(splitlines__doc__,
7516              "S.splitlines([keepends]) -> list of strings\n\
7517 \n\
7518 Return a list of the lines in S, breaking at line boundaries.\n\
7519 Line breaks are not included in the resulting list unless keepends\n\
7520 is given and true.");
7521
7522 static PyObject*
7523 unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7524 {
7525     int keepends = 0;
7526
7527     if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
7528         return NULL;
7529
7530     return PyUnicode_Splitlines((PyObject *)self, keepends);
7531 }
7532
7533 static
7534 PyObject *unicode_str(PyUnicodeObject *self)
7535 {
7536     return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
7537 }
7538
7539 PyDoc_STRVAR(swapcase__doc__,
7540              "S.swapcase() -> unicode\n\
7541 \n\
7542 Return a copy of S with uppercase characters converted to lowercase\n\
7543 and vice versa.");
7544
7545 static PyObject*
7546 unicode_swapcase(PyUnicodeObject *self)
7547 {
7548     return fixup(self, fixswapcase);
7549 }
7550
7551 PyDoc_STRVAR(translate__doc__,
7552              "S.translate(table) -> unicode\n\
7553 \n\
7554 Return a copy of the string S, where all characters have been mapped\n\
7555 through the given translation table, which must be a mapping of\n\
7556 Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7557 Unmapped characters are left untouched. Characters mapped to None\n\
7558 are deleted.");
7559
7560 static PyObject*
7561 unicode_translate(PyUnicodeObject *self, PyObject *table)
7562 {
7563     return PyUnicode_TranslateCharmap(self->str,
7564                                       self->length,
7565                                       table,
7566                                       "ignore");
7567 }
7568
7569 PyDoc_STRVAR(upper__doc__,
7570              "S.upper() -> unicode\n\
7571 \n\
7572 Return a copy of S converted to uppercase.");
7573
7574 static PyObject*
7575 unicode_upper(PyUnicodeObject *self)
7576 {
7577     return fixup(self, fixupper);
7578 }
7579
7580 PyDoc_STRVAR(zfill__doc__,
7581              "S.zfill(width) -> unicode\n\
7582 \n\
7583 Pad a numeric string S with zeros on the left, to fill a field\n\
7584 of the specified width. The string S is never truncated.");
7585
7586 static PyObject *
7587 unicode_zfill(PyUnicodeObject *self, PyObject *args)
7588 {
7589     Py_ssize_t fill;
7590     PyUnicodeObject *u;
7591
7592     Py_ssize_t width;
7593     if (!PyArg_ParseTuple(args, "n:zfill", &width))
7594         return NULL;
7595
7596     if (self->length >= width) {
7597         if (PyUnicode_CheckExact(self)) {
7598             Py_INCREF(self);
7599             return (PyObject*) self;
7600         }
7601         else
7602             return PyUnicode_FromUnicode(
7603                 PyUnicode_AS_UNICODE(self),
7604                 PyUnicode_GET_SIZE(self)
7605                 );
7606     }
7607
7608     fill = width - self->length;
7609
7610     u = pad(self, fill, 0, '0');
7611
7612     if (u == NULL)
7613         return NULL;
7614
7615     if (u->str[fill] == '+' || u->str[fill] == '-') {
7616         /* move sign to beginning of string */
7617         u->str[0] = u->str[fill];
7618         u->str[fill] = '0';
7619     }
7620
7621     return (PyObject*) u;
7622 }
7623
#if 0
/* Compiled out.  Reports the current value of numfree as a Python int;
   the matching "freelistsize" method-table entry (also compiled out)
   describes it as a debugging aid for the implementation. */
static PyObject *
free_listsize(PyUnicodeObject *self)
{
    long count = numfree;

    return PyInt_FromLong(count);
}
#endif
7631
7632 PyDoc_STRVAR(startswith__doc__,
7633              "S.startswith(prefix[, start[, end]]) -> bool\n\
7634 \n\
7635 Return True if S starts with the specified prefix, False otherwise.\n\
7636 With optional start, test S beginning at that position.\n\
7637 With optional end, stop comparing S at that position.\n\
7638 prefix can also be a tuple of strings to try.");
7639
7640 static PyObject *
7641 unicode_startswith(PyUnicodeObject *self,
7642                    PyObject *args)
7643 {
7644     PyObject *subobj;
7645     PyUnicodeObject *substring;
7646     Py_ssize_t start = 0;
7647     Py_ssize_t end = PY_SSIZE_T_MAX;
7648     int result;
7649
7650     if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
7651         return NULL;
7652     if (PyTuple_Check(subobj)) {
7653         Py_ssize_t i;
7654         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7655             substring = (PyUnicodeObject *)PyUnicode_FromObject(
7656                 PyTuple_GET_ITEM(subobj, i));
7657             if (substring == NULL)
7658                 return NULL;
7659             result = tailmatch(self, substring, start, end, -1);
7660             Py_DECREF(substring);
7661             if (result) {
7662                 Py_RETURN_TRUE;
7663             }
7664         }
7665         /* nothing matched */
7666         Py_RETURN_FALSE;
7667     }
7668     substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7669     if (substring == NULL) {
7670         if (PyErr_ExceptionMatches(PyExc_TypeError))
7671             PyErr_Format(PyExc_TypeError, "startswith first arg must be str, "
7672                          "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
7673         return NULL;
7674     }
7675     result = tailmatch(self, substring, start, end, -1);
7676     Py_DECREF(substring);
7677     return PyBool_FromLong(result);
7678 }
7679
7680
7681 PyDoc_STRVAR(endswith__doc__,
7682              "S.endswith(suffix[, start[, end]]) -> bool\n\
7683 \n\
7684 Return True if S ends with the specified suffix, False otherwise.\n\
7685 With optional start, test S beginning at that position.\n\
7686 With optional end, stop comparing S at that position.\n\
7687 suffix can also be a tuple of strings to try.");
7688
7689 static PyObject *
7690 unicode_endswith(PyUnicodeObject *self,
7691                  PyObject *args)
7692 {
7693     PyObject *subobj;
7694     PyUnicodeObject *substring;
7695     Py_ssize_t start = 0;
7696     Py_ssize_t end = PY_SSIZE_T_MAX;
7697     int result;
7698
7699     if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
7700         return NULL;
7701     if (PyTuple_Check(subobj)) {
7702         Py_ssize_t i;
7703         for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7704             substring = (PyUnicodeObject *)PyUnicode_FromObject(
7705                 PyTuple_GET_ITEM(subobj, i));
7706             if (substring == NULL)
7707                 return NULL;
7708             result = tailmatch(self, substring, start, end, +1);
7709             Py_DECREF(substring);
7710             if (result) {
7711                 Py_RETURN_TRUE;
7712             }
7713         }
7714         Py_RETURN_FALSE;
7715     }
7716     substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7717     if (substring == NULL) {
7718         if (PyErr_ExceptionMatches(PyExc_TypeError))
7719             PyErr_Format(PyExc_TypeError, "endswith first arg must be str, "
7720                          "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
7721         return NULL;
7722     }
7723     result = tailmatch(self, substring, start, end, +1);
7724     Py_DECREF(substring);
7725     return PyBool_FromLong(result);
7726 }
7727
7728
7729 /* Implements do_string_format, which is unicode because of stringlib */
7730 #include "stringlib/string_format.h"
7731
7732 PyDoc_STRVAR(format__doc__,
7733              "S.format(*args, **kwargs) -> unicode\n\
7734 \n\
7735 Return a formatted version of S, using substitutions from args and kwargs.\n\
7736 The substitutions are identified by braces ('{' and '}').");
7737
7738 static PyObject *
7739 unicode__format__(PyObject *self, PyObject *args)
7740 {
7741     PyObject *format_spec;
7742     PyObject *result = NULL;
7743     PyObject *tmp = NULL;
7744
7745     /* If 2.x, convert format_spec to the same type as value */
7746     /* This is to allow things like u''.format('') */
7747     if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7748         goto done;
7749     if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7750         PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
7751                      "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
7752         goto done;
7753     }
7754     tmp = PyObject_Unicode(format_spec);
7755     if (tmp == NULL)
7756         goto done;
7757     format_spec = tmp;
7758
7759     result = _PyUnicode_FormatAdvanced(self,
7760                                        PyUnicode_AS_UNICODE(format_spec),
7761                                        PyUnicode_GET_SIZE(format_spec));
7762   done:
7763     Py_XDECREF(tmp);
7764     return result;
7765 }
7766
7767 PyDoc_STRVAR(p_format__doc__,
7768              "S.__format__(format_spec) -> unicode\n\
7769 \n\
7770 Return a formatted version of S as described by format_spec.");
7771
7772 static PyObject *
7773 unicode__sizeof__(PyUnicodeObject *v)
7774 {
7775     return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7776                              sizeof(Py_UNICODE) * (v->length + 1));
7777 }
7778
7779 PyDoc_STRVAR(sizeof__doc__,
7780              "S.__sizeof__() -> size of S in memory, in bytes\n\
7781 \n\
7782 ");
7783
7784 static PyObject *
7785 unicode_getnewargs(PyUnicodeObject *v)
7786 {
7787     return Py_BuildValue("(u#)", v->str, v->length);
7788 }
7789
7790
7791 static PyMethodDef unicode_methods[] = {
7792
7793     /* Order is according to common usage: often used methods should
7794        appear first, since lookup is done sequentially. */
7795
7796     {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
7797     {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
7798     {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
7799     {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
7800     {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
7801     {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
7802     {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
7803     {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
7804     {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
7805     {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
7806     {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
7807     {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
7808     {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
7809     {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
7810     {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
7811     {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
7812     {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
7813 /*  {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
7814     {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
7815     {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
7816     {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
7817     {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
7818     {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
7819     {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
7820     {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
7821     {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
7822     {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
7823     {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
7824     {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
7825     {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
7826     {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
7827     {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
7828     {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
7829     {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
7830     {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
7831     {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
7832     {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
7833     {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
7834     {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
7835     {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
7836     {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
7837     {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
7838     {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
7839     {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
7840     {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
7841 #if 0
7842     {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
7843 #endif
7844
7845 #if 0
7846     /* This one is just used for debugging the implementation. */
7847     {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
7848 #endif
7849
7850     {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
7851     {NULL, NULL}
7852 };
7853
7854 static PyObject *
7855 unicode_mod(PyObject *v, PyObject *w)
7856 {
7857     if (!PyUnicode_Check(v)) {
7858         Py_INCREF(Py_NotImplemented);
7859         return Py_NotImplemented;
7860     }
7861     return PyUnicode_Format(v, w);
7862 }
7863
7864 static PyNumberMethods unicode_as_number = {
7865     0,              /*nb_add*/
7866     0,              /*nb_subtract*/
7867     0,              /*nb_multiply*/
7868     0,              /*nb_divide*/
7869     unicode_mod,            /*nb_remainder*/
7870 };
7871
7872 static PySequenceMethods unicode_as_sequence = {
7873     (lenfunc) unicode_length,       /* sq_length */
7874     PyUnicode_Concat,           /* sq_concat */
7875     (ssizeargfunc) unicode_repeat,  /* sq_repeat */
7876     (ssizeargfunc) unicode_getitem,     /* sq_item */
7877     (ssizessizeargfunc) unicode_slice,  /* sq_slice */
7878     0,                  /* sq_ass_item */
7879     0,                  /* sq_ass_slice */
7880     PyUnicode_Contains,         /* sq_contains */
7881 };
7882
7883 static PyObject*
7884 unicode_subscript(PyUnicodeObject* self, PyObject* item)
7885 {
7886     if (PyIndex_Check(item)) {
7887         Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
7888         if (i == -1 && PyErr_Occurred())
7889             return NULL;
7890         if (i < 0)
7891             i += PyUnicode_GET_SIZE(self);
7892         return unicode_getitem(self, i);
7893     } else if (PySlice_Check(item)) {
7894         Py_ssize_t start, stop, step, slicelength, cur, i;
7895         Py_UNICODE* source_buf;
7896         Py_UNICODE* result_buf;
7897         PyObject* result;
7898
7899         if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
7900                                  &start, &stop, &step, &slicelength) < 0) {
7901             return NULL;
7902         }
7903
7904         if (slicelength <= 0) {
7905             return PyUnicode_FromUnicode(NULL, 0);
7906         } else if (start == 0 && step == 1 && slicelength == self->length &&
7907                    PyUnicode_CheckExact(self)) {
7908             Py_INCREF(self);
7909             return (PyObject *)self;
7910         } else if (step == 1) {
7911             return PyUnicode_FromUnicode(self->str + start, slicelength);
7912         } else {
7913             source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
7914             result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
7915                                                        sizeof(Py_UNICODE));
7916
7917             if (result_buf == NULL)
7918                 return PyErr_NoMemory();
7919
7920             for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7921                 result_buf[i] = source_buf[cur];
7922             }
7923
7924             result = PyUnicode_FromUnicode(result_buf, slicelength);
7925             PyObject_FREE(result_buf);
7926             return result;
7927         }
7928     } else {
7929         PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7930         return NULL;
7931     }
7932 }
7933
7934 static PyMappingMethods unicode_as_mapping = {
7935     (lenfunc)unicode_length,        /* mp_length */
7936     (binaryfunc)unicode_subscript,  /* mp_subscript */
7937     (objobjargproc)0,           /* mp_ass_subscript */
7938 };
7939
7940 static Py_ssize_t
7941 unicode_buffer_getreadbuf(PyUnicodeObject *self,
7942                           Py_ssize_t index,
7943                           const void **ptr)
7944 {
7945     if (index != 0) {
7946         PyErr_SetString(PyExc_SystemError,
7947                         "accessing non-existent unicode segment");
7948         return -1;
7949     }
7950     *ptr = (void *) self->str;
7951     return PyUnicode_GET_DATA_SIZE(self);
7952 }
7953
7954 static Py_ssize_t
7955 unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
7956                            const void **ptr)
7957 {
7958     PyErr_SetString(PyExc_TypeError,
7959                     "cannot use unicode as modifiable buffer");
7960     return -1;
7961 }
7962
7963 static int
7964 unicode_buffer_getsegcount(PyUnicodeObject *self,
7965                            Py_ssize_t *lenp)
7966 {
7967     if (lenp)
7968         *lenp = PyUnicode_GET_DATA_SIZE(self);
7969     return 1;
7970 }
7971
7972 static Py_ssize_t
7973 unicode_buffer_getcharbuf(PyUnicodeObject *self,
7974                           Py_ssize_t index,
7975                           const void **ptr)
7976 {
7977     PyObject *str;
7978
7979     if (index != 0) {
7980         PyErr_SetString(PyExc_SystemError,
7981                         "accessing non-existent unicode segment");
7982         return -1;
7983     }
7984     str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
7985     if (str == NULL)
7986         return -1;
7987     *ptr = (void *) PyString_AS_STRING(str);
7988     return PyString_GET_SIZE(str);
7989 }
7990
7991 /* Helpers for PyUnicode_Format() */
7992
7993 static PyObject *
7994 getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
7995 {
7996     Py_ssize_t argidx = *p_argidx;
7997     if (argidx < arglen) {
7998         (*p_argidx)++;
7999         if (arglen < 0)
8000             return args;
8001         else
8002             return PyTuple_GetItem(args, argidx);
8003     }
8004     PyErr_SetString(PyExc_TypeError,
8005                     "not enough arguments for format string");
8006     return NULL;
8007 }
8008
/* Bit flags carried in the `flags` argument of the format helpers.
   F_ALT selects the alternate ('#') form in formatfloat() and
   formatint(); the other names suggest the '-', '+', ' ' and '0'
   conversion flags -- confirm against PyUnicode_Format(). */
#define F_LJUST (1 << 0)
#define F_SIGN  (1 << 1)
#define F_BLANK (1 << 2)
#define F_ALT   (1 << 3)
#define F_ZERO  (1 << 4)
8014
8015 static Py_ssize_t
8016 strtounicode(Py_UNICODE *buffer, const char *charbuffer)
8017 {
8018     register Py_ssize_t i;
8019     Py_ssize_t len = strlen(charbuffer);
8020     for (i = len - 1; i >= 0; i--)
8021         buffer[i] = (Py_UNICODE) charbuffer[i];
8022
8023     return len;
8024 }
8025
8026 static int
8027 longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8028 {
8029     Py_ssize_t result;
8030
8031     PyOS_snprintf((char *)buffer, len, format, x);
8032     result = strtounicode(buffer, (char *)buffer);
8033     return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
8034 }
8035
8036 /* XXX To save some code duplication, formatfloat/long/int could have been
8037    shared with stringobject.c, converting from 8-bit to Unicode after the
8038    formatting is done. */
8039
8040 /* Returns a new reference to a PyUnicode object, or NULL on failure. */
8041
8042 static PyObject *
8043 formatfloat(PyObject *v, int flags, int prec, int type)
8044 {
8045     char *p;
8046     PyObject *result;
8047     double x;
8048
8049     x = PyFloat_AsDouble(v);
8050     if (x == -1.0 && PyErr_Occurred())
8051         return NULL;
8052
8053     if (prec < 0)
8054         prec = 6;
8055
8056     p = PyOS_double_to_string(x, type, prec,
8057                               (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
8058     if (p == NULL)
8059         return NULL;
8060     result = PyUnicode_FromStringAndSize(p, strlen(p));
8061     PyMem_Free(p);
8062     return result;
8063 }
8064
8065 static PyObject*
8066 formatlong(PyObject *val, int flags, int prec, int type)
8067 {
8068     char *buf;
8069     int i, len;
8070     PyObject *str; /* temporary string object. */
8071     PyUnicodeObject *result;
8072
8073     str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8074     if (!str)
8075         return NULL;
8076     result = _PyUnicode_New(len);
8077     if (!result) {
8078         Py_DECREF(str);
8079         return NULL;
8080     }
8081     for (i = 0; i < len; i++)
8082         result->str[i] = buf[i];
8083     result->str[len] = 0;
8084     Py_DECREF(str);
8085     return (PyObject*)result;
8086 }
8087
8088 static int
8089 formatint(Py_UNICODE *buf,
8090           size_t buflen,
8091           int flags,
8092           int prec,
8093           int type,
8094           PyObject *v)
8095 {
8096     /* fmt = '%#.' + `prec` + 'l' + `type`
8097      * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8098      *                     + 1 + 1
8099      *                   = 24
8100      */
8101     char fmt[64]; /* plenty big enough! */
8102     char *sign;
8103     long x;
8104
8105     x = PyInt_AsLong(v);
8106     if (x == -1 && PyErr_Occurred())
8107         return -1;
8108     if (x < 0 && type == 'u') {
8109         type = 'd';
8110     }
8111     if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8112         sign = "-";
8113     else
8114         sign = "";
8115     if (prec < 0)
8116         prec = 1;
8117
8118     /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8119      * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
8120      */
8121     if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
8122         PyErr_SetString(PyExc_OverflowError,
8123                         "formatted integer is too long (precision too large?)");
8124         return -1;
8125     }
8126
8127     if ((flags & F_ALT) &&
8128         (type == 'x' || type == 'X')) {
8129         /* When converting under %#x or %#X, there are a number
8130          * of issues that cause pain:
8131          * - when 0 is being converted, the C standard leaves off
8132          *   the '0x' or '0X', which is inconsistent with other
8133          *   %#x/%#X conversions and inconsistent with Python's
8134          *   hex() function
8135          * - there are platforms that violate the standard and
8136          *   convert 0 with the '0x' or '0X'
8137          *   (Metrowerks, Compaq Tru64)
8138          * - there are platforms that give '0x' when converting
8139          *   under %#X, but convert 0 in accordance with the
8140          *   standard (OS/2 EMX)
8141          *
8142          * We can achieve the desired consistency by inserting our
8143          * own '0x' or '0X' prefix, and substituting %x/%X in place
8144          * of %#x/%#X.
8145          *
8146          * Note that this is the same approach as used in
8147          * formatint() in stringobject.c
8148          */
8149         PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8150                       sign, type, prec, type);
8151     }
8152     else {
8153         PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8154                       sign, (flags&F_ALT) ? "#" : "",
8155                       prec, type);
8156     }
8157     if (sign[0])
8158         return longtounicode(buf, buflen, fmt, -x);
8159     else
8160         return longtounicode(buf, buflen, fmt, x);
8161 }
8162
8163 static int
8164 formatchar(Py_UNICODE *buf,
8165            size_t buflen,
8166            PyObject *v)
8167 {
8168     PyObject *unistr;
8169     char *str;
8170     /* presume that the buffer is at least 2 characters long */
8171     if (PyUnicode_Check(v)) {
8172         if (PyUnicode_GET_SIZE(v) != 1)
8173             goto onError;
8174         buf[0] = PyUnicode_AS_UNICODE(v)[0];
8175     }
8176
8177     else if (PyString_Check(v)) {
8178         if (PyString_GET_SIZE(v) != 1)
8179             goto onError;
8180         /* #7649: "u'%c' % char" should behave like "u'%s' % char" and fail
8181            with a UnicodeDecodeError if 'char' is not decodable with the
8182            default encoding (usually ASCII, but it might be something else) */
8183         str = PyString_AS_STRING(v);
8184         if ((unsigned char)str[0] > 0x7F) {
8185             /* the char is not ASCII; try to decode the string using the
8186                default encoding and return -1 to let the UnicodeDecodeError
8187                be raised if the string can't be decoded */
8188             unistr = PyUnicode_Decode(str, 1, NULL, "strict");
8189             if (unistr == NULL)
8190                 return -1;
8191             buf[0] = PyUnicode_AS_UNICODE(unistr)[0];
8192             Py_DECREF(unistr);
8193         }
8194         else
8195             buf[0] = (Py_UNICODE)str[0];
8196     }
8197
8198     else {
8199         /* Integer input truncated to a character */
8200         long x;
8201         x = PyInt_AsLong(v);
8202         if (x == -1 && PyErr_Occurred())
8203             goto onError;
8204 #ifdef Py_UNICODE_WIDE
8205         if (x < 0 || x > 0x10ffff) {
8206             PyErr_SetString(PyExc_OverflowError,
8207                             "%c arg not in range(0x110000) "
8208                             "(wide Python build)");
8209             return -1;
8210         }
8211 #else
8212         if (x < 0 || x > 0xffff) {
8213             PyErr_SetString(PyExc_OverflowError,
8214                             "%c arg not in range(0x10000) "
8215                             "(narrow Python build)");
8216             return -1;
8217         }
8218 #endif
8219         buf[0] = (Py_UNICODE) x;
8220     }
8221     buf[1] = '\0';
8222     return 1;
8223
8224   onError:
8225     PyErr_SetString(PyExc_TypeError,
8226                     "%c requires int or char");
8227     return -1;
8228 }
8229
8230 /* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8231
8232    FORMATBUFLEN is the length of the buffer in which the ints &
8233    chars are formatted. XXX This is a magic number. Each formatting
8234    routine does bounds checking to ensure no overflow, but a better
8235    solution may be to malloc a buffer of appropriate size for each
8236    format. For now, the current solution is sufficient.
8237 */
8238 #define FORMATBUFLEN (size_t)120
8239
8240 PyObject *PyUnicode_Format(PyObject *format,
8241                            PyObject *args)
8242 {
8243     Py_UNICODE *fmt, *res;
8244     Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
8245     int args_owned = 0;
8246     PyUnicodeObject *result = NULL;
8247     PyObject *dict = NULL;
8248     PyObject *uformat;
8249
8250     if (format == NULL || args == NULL) {
8251         PyErr_BadInternalCall();
8252         return NULL;
8253     }
8254     uformat = PyUnicode_FromObject(format);
8255     if (uformat == NULL)
8256         return NULL;
8257     fmt = PyUnicode_AS_UNICODE(uformat);
8258     fmtcnt = PyUnicode_GET_SIZE(uformat);
8259
8260     reslen = rescnt = fmtcnt + 100;
8261     result = _PyUnicode_New(reslen);
8262     if (result == NULL)
8263         goto onError;
8264     res = PyUnicode_AS_UNICODE(result);
8265
8266     if (PyTuple_Check(args)) {
8267         arglen = PyTuple_Size(args);
8268         argidx = 0;
8269     }
8270     else {
8271         arglen = -1;
8272         argidx = -2;
8273     }
8274     if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
8275         !PyObject_TypeCheck(args, &PyBaseString_Type))
8276         dict = args;
8277
8278     while (--fmtcnt >= 0) {
8279         if (*fmt != '%') {
8280             if (--rescnt < 0) {
8281                 rescnt = fmtcnt + 100;
8282                 reslen += rescnt;
8283                 if (_PyUnicode_Resize(&result, reslen) < 0)
8284                     goto onError;
8285                 res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8286                 --rescnt;
8287             }
8288             *res++ = *fmt++;
8289         }
8290         else {
8291             /* Got a format specifier */
8292             int flags = 0;
8293             Py_ssize_t width = -1;
8294             int prec = -1;
8295             Py_UNICODE c = '\0';
8296             Py_UNICODE fill;
8297             int isnumok;
8298             PyObject *v       = NULL;
8299             PyObject *temp    = NULL;
8300             Py_UNICODE *pbuf  = NULL;
8301             Py_UNICODE sign;
8302             Py_ssize_t len;
8303             Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */
8304
8305             fmt++;
8306             if (*fmt == '(') {
8307                 Py_UNICODE *keystart;
8308                 Py_ssize_t keylen;
8309                 PyObject *key;
8310                 int pcount = 1;
8311
8312                 if (dict == NULL) {
8313                     PyErr_SetString(PyExc_TypeError,
8314                                     "format requires a mapping");
8315                     goto onError;
8316                 }
8317                 ++fmt;
8318                 --fmtcnt;
8319                 keystart = fmt;
8320                 /* Skip over balanced parentheses */
8321                 while (pcount > 0 && --fmtcnt >= 0) {
8322                     if (*fmt == ')')
8323                         --pcount;
8324                     else if (*fmt == '(')
8325                         ++pcount;
8326                     fmt++;
8327                 }
8328                 keylen = fmt - keystart - 1;
8329                 if (fmtcnt < 0 || pcount > 0) {
8330                     PyErr_SetString(PyExc_ValueError,
8331                                     "incomplete format key");
8332                     goto onError;
8333                 }
8334 #if 0
8335                 /* keys are converted to strings using UTF-8 and
8336                    then looked up since Python uses strings to hold
8337                    variables names etc. in its namespaces and we
8338                    wouldn't want to break common idioms. */
8339                 key = PyUnicode_EncodeUTF8(keystart,
8340                                            keylen,
8341                                            NULL);
8342 #else
8343                 key = PyUnicode_FromUnicode(keystart, keylen);
8344 #endif
8345                 if (key == NULL)
8346                     goto onError;
8347                 if (args_owned) {
8348                     Py_DECREF(args);
8349                     args_owned = 0;
8350                 }
8351                 args = PyObject_GetItem(dict, key);
8352                 Py_DECREF(key);
8353                 if (args == NULL) {
8354                     goto onError;
8355                 }
8356                 args_owned = 1;
8357                 arglen = -1;
8358                 argidx = -2;
8359             }
8360             while (--fmtcnt >= 0) {
8361                 switch (c = *fmt++) {
8362                 case '-': flags |= F_LJUST; continue;
8363                 case '+': flags |= F_SIGN; continue;
8364                 case ' ': flags |= F_BLANK; continue;
8365                 case '#': flags |= F_ALT; continue;
8366                 case '0': flags |= F_ZERO; continue;
8367                 }
8368                 break;
8369             }
8370             if (c == '*') {
8371                 v = getnextarg(args, arglen, &argidx);
8372                 if (v == NULL)
8373                     goto onError;
8374                 if (!PyInt_Check(v)) {
8375                     PyErr_SetString(PyExc_TypeError,
8376                                     "* wants int");
8377                     goto onError;
8378                 }
8379                 width = PyInt_AsLong(v);
8380                 if (width < 0) {
8381                     flags |= F_LJUST;
8382                     width = -width;
8383                 }
8384                 if (--fmtcnt >= 0)
8385                     c = *fmt++;
8386             }
8387             else if (c >= '0' && c <= '9') {
8388                 width = c - '0';
8389                 while (--fmtcnt >= 0) {
8390                     c = *fmt++;
8391                     if (c < '0' || c > '9')
8392                         break;
8393                     if ((width*10) / 10 != width) {
8394                         PyErr_SetString(PyExc_ValueError,
8395                                         "width too big");
8396                         goto onError;
8397                     }
8398                     width = width*10 + (c - '0');
8399                 }
8400             }
8401             if (c == '.') {
8402                 prec = 0;
8403                 if (--fmtcnt >= 0)
8404                     c = *fmt++;
8405                 if (c == '*') {
8406                     v = getnextarg(args, arglen, &argidx);
8407                     if (v == NULL)
8408                         goto onError;
8409                     if (!PyInt_Check(v)) {
8410                         PyErr_SetString(PyExc_TypeError,
8411                                         "* wants int");
8412                         goto onError;
8413                     }
8414                     prec = PyInt_AsLong(v);
8415                     if (prec < 0)
8416                         prec = 0;
8417                     if (--fmtcnt >= 0)
8418                         c = *fmt++;
8419                 }
8420                 else if (c >= '0' && c <= '9') {
8421                     prec = c - '0';
8422                     while (--fmtcnt >= 0) {
8423                         c = *fmt++;
8424                         if (c < '0' || c > '9')
8425                             break;
8426                         if ((prec*10) / 10 != prec) {
8427                             PyErr_SetString(PyExc_ValueError,
8428                                             "prec too big");
8429                             goto onError;
8430                         }
8431                         prec = prec*10 + (c - '0');
8432                     }
8433                 }
8434             } /* prec */
8435             if (fmtcnt >= 0) {
8436                 if (c == 'h' || c == 'l' || c == 'L') {
8437                     if (--fmtcnt >= 0)
8438                         c = *fmt++;
8439                 }
8440             }
8441             if (fmtcnt < 0) {
8442                 PyErr_SetString(PyExc_ValueError,
8443                                 "incomplete format");
8444                 goto onError;
8445             }
8446             if (c != '%') {
8447                 v = getnextarg(args, arglen, &argidx);
8448                 if (v == NULL)
8449                     goto onError;
8450             }
8451             sign = 0;
8452             fill = ' ';
8453             switch (c) {
8454
8455             case '%':
8456                 pbuf = formatbuf;
8457                 /* presume that buffer length is at least 1 */
8458                 pbuf[0] = '%';
8459                 len = 1;
8460                 break;
8461
8462             case 's':
8463             case 'r':
8464                 if (PyUnicode_CheckExact(v) && c == 's') {
8465                     temp = v;
8466                     Py_INCREF(temp);
8467                 }
8468                 else {
8469                     PyObject *unicode;
8470                     if (c == 's')
8471                         temp = PyObject_Unicode(v);
8472                     else
8473                         temp = PyObject_Repr(v);
8474                     if (temp == NULL)
8475                         goto onError;
8476                     if (PyUnicode_Check(temp))
8477                         /* nothing to do */;
8478                     else if (PyString_Check(temp)) {
8479                         /* convert to string to Unicode */
8480                         unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8481                                                    PyString_GET_SIZE(temp),
8482                                                    NULL,
8483                                                    "strict");
8484                         Py_DECREF(temp);
8485                         temp = unicode;
8486                         if (temp == NULL)
8487                             goto onError;
8488                     }
8489                     else {
8490                         Py_DECREF(temp);
8491                         PyErr_SetString(PyExc_TypeError,
8492                                         "%s argument has non-string str()");
8493                         goto onError;
8494                     }
8495                 }
8496                 pbuf = PyUnicode_AS_UNICODE(temp);
8497                 len = PyUnicode_GET_SIZE(temp);
8498                 if (prec >= 0 && len > prec)
8499                     len = prec;
8500                 break;
8501
8502             case 'i':
8503             case 'd':
8504             case 'u':
8505             case 'o':
8506             case 'x':
8507             case 'X':
8508                 if (c == 'i')
8509                     c = 'd';
8510                 isnumok = 0;
8511                 if (PyNumber_Check(v)) {
8512                     PyObject *iobj=NULL;
8513
8514                     if (PyInt_Check(v) || (PyLong_Check(v))) {
8515                         iobj = v;
8516                         Py_INCREF(iobj);
8517                     }
8518                     else {
8519                         iobj = PyNumber_Int(v);
8520                         if (iobj==NULL) iobj = PyNumber_Long(v);
8521                     }
8522                     if (iobj!=NULL) {
8523                         if (PyInt_Check(iobj)) {
8524                             isnumok = 1;
8525                             pbuf = formatbuf;
8526                             len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8527                                             flags, prec, c, iobj);
8528                             Py_DECREF(iobj);
8529                             if (len < 0)
8530                                 goto onError;
8531                             sign = 1;
8532                         }
8533                         else if (PyLong_Check(iobj)) {
8534                             isnumok = 1;
8535                             temp = formatlong(iobj, flags, prec, c);
8536                             Py_DECREF(iobj);
8537                             if (!temp)
8538                                 goto onError;
8539                             pbuf = PyUnicode_AS_UNICODE(temp);
8540                             len = PyUnicode_GET_SIZE(temp);
8541                             sign = 1;
8542                         }
8543                         else {
8544                             Py_DECREF(iobj);
8545                         }
8546                     }
8547                 }
8548                 if (!isnumok) {
8549                     PyErr_Format(PyExc_TypeError,
8550                                  "%%%c format: a number is required, "
8551                                  "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8552                     goto onError;
8553                 }
8554                 if (flags & F_ZERO)
8555                     fill = '0';
8556                 break;
8557
8558             case 'e':
8559             case 'E':
8560             case 'f':
8561             case 'F':
8562             case 'g':
8563             case 'G':
8564                 temp = formatfloat(v, flags, prec, c);
8565                 if (temp == NULL)
8566                     goto onError;
8567                 pbuf = PyUnicode_AS_UNICODE(temp);
8568                 len = PyUnicode_GET_SIZE(temp);
8569                 sign = 1;
8570                 if (flags & F_ZERO)
8571                     fill = '0';
8572                 break;
8573
8574             case 'c':
8575                 pbuf = formatbuf;
8576                 len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8577                 if (len < 0)
8578                     goto onError;
8579                 break;
8580
8581             default:
8582                 PyErr_Format(PyExc_ValueError,
8583                              "unsupported format character '%c' (0x%x) "
8584                              "at index %zd",
8585                              (31<=c && c<=126) ? (char)c : '?',
8586                              (int)c,
8587                              (Py_ssize_t)(fmt - 1 -
8588                                           PyUnicode_AS_UNICODE(uformat)));
8589                 goto onError;
8590             }
8591             if (sign) {
8592                 if (*pbuf == '-' || *pbuf == '+') {
8593                     sign = *pbuf++;
8594                     len--;
8595                 }
8596                 else if (flags & F_SIGN)
8597                     sign = '+';
8598                 else if (flags & F_BLANK)
8599                     sign = ' ';
8600                 else
8601                     sign = 0;
8602             }
8603             if (width < len)
8604                 width = len;
8605             if (rescnt - (sign != 0) < width) {
8606                 reslen -= rescnt;
8607                 rescnt = width + fmtcnt + 100;
8608                 reslen += rescnt;
8609                 if (reslen < 0) {
8610                     Py_XDECREF(temp);
8611                     PyErr_NoMemory();
8612                     goto onError;
8613                 }
8614                 if (_PyUnicode_Resize(&result, reslen) < 0) {
8615                     Py_XDECREF(temp);
8616                     goto onError;
8617                 }
8618                 res = PyUnicode_AS_UNICODE(result)
8619                     + reslen - rescnt;
8620             }
8621             if (sign) {
8622                 if (fill != ' ')
8623                     *res++ = sign;
8624                 rescnt--;
8625                 if (width > len)
8626                     width--;
8627             }
8628             if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8629                 assert(pbuf[0] == '0');
8630                 assert(pbuf[1] == c);
8631                 if (fill != ' ') {
8632                     *res++ = *pbuf++;
8633                     *res++ = *pbuf++;
8634                 }
8635                 rescnt -= 2;
8636                 width -= 2;
8637                 if (width < 0)
8638                     width = 0;
8639                 len -= 2;
8640             }
8641             if (width > len && !(flags & F_LJUST)) {
8642                 do {
8643                     --rescnt;
8644                     *res++ = fill;
8645                 } while (--width > len);
8646             }
8647             if (fill == ' ') {
8648                 if (sign)
8649                     *res++ = sign;
8650                 if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8651                     assert(pbuf[0] == '0');
8652                     assert(pbuf[1] == c);
8653                     *res++ = *pbuf++;
8654                     *res++ = *pbuf++;
8655                 }
8656             }
8657             Py_UNICODE_COPY(res, pbuf, len);
8658             res += len;
8659             rescnt -= len;
8660             while (--width >= len) {
8661                 --rescnt;
8662                 *res++ = ' ';
8663             }
8664             if (dict && (argidx < arglen) && c != '%') {
8665                 PyErr_SetString(PyExc_TypeError,
8666                                 "not all arguments converted during string formatting");
8667                 Py_XDECREF(temp);
8668                 goto onError;
8669             }
8670             Py_XDECREF(temp);
8671         } /* '%' */
8672     } /* until end */
8673     if (argidx < arglen && !dict) {
8674         PyErr_SetString(PyExc_TypeError,
8675                         "not all arguments converted during string formatting");
8676         goto onError;
8677     }
8678
8679     if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8680         goto onError;
8681     if (args_owned) {
8682         Py_DECREF(args);
8683     }
8684     Py_DECREF(uformat);
8685     return (PyObject *)result;
8686
8687   onError:
8688     Py_XDECREF(result);
8689     Py_DECREF(uformat);
8690     if (args_owned) {
8691         Py_DECREF(args);
8692     }
8693     return NULL;
8694 }
8695
/* Buffer interface slot table for unicode objects; referenced as the
   tp_as_buffer slot of PyUnicode_Type.  The entries are, in order, the
   read-buffer, write-buffer, segment-count and char-buffer handlers
   (all defined elsewhere in this file). */
static PyBufferProcs unicode_as_buffer = {
    (readbufferproc) unicode_buffer_getreadbuf,
    (writebufferproc) unicode_buffer_getwritebuf,
    (segcountproc) unicode_buffer_getsegcount,
    (charbufferproc) unicode_buffer_getcharbuf,
};
8702
8703 static PyObject *
8704 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8705
8706 static PyObject *
8707 unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8708 {
8709     PyObject *x = NULL;
8710     static char *kwlist[] = {"string", "encoding", "errors", 0};
8711     char *encoding = NULL;
8712     char *errors = NULL;
8713
8714     if (type != &PyUnicode_Type)
8715         return unicode_subtype_new(type, args, kwds);
8716     if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8717                                      kwlist, &x, &encoding, &errors))
8718         return NULL;
8719     if (x == NULL)
8720         return (PyObject *)_PyUnicode_New(0);
8721     if (encoding == NULL && errors == NULL)
8722         return PyObject_Unicode(x);
8723     else
8724         return PyUnicode_FromEncodedObject(x, encoding, errors);
8725 }
8726
8727 static PyObject *
8728 unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8729 {
8730     PyUnicodeObject *tmp, *pnew;
8731     Py_ssize_t n;
8732
8733     assert(PyType_IsSubtype(type, &PyUnicode_Type));
8734     tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8735     if (tmp == NULL)
8736         return NULL;
8737     assert(PyUnicode_Check(tmp));
8738     pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8739     if (pnew == NULL) {
8740         Py_DECREF(tmp);
8741         return NULL;
8742     }
8743     pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8744     if (pnew->str == NULL) {
8745         _Py_ForgetReference((PyObject *)pnew);
8746         PyObject_Del(pnew);
8747         Py_DECREF(tmp);
8748         return PyErr_NoMemory();
8749     }
8750     Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8751     pnew->length = n;
8752     pnew->hash = tmp->hash;
8753     Py_DECREF(tmp);
8754     return (PyObject *)pnew;
8755 }
8756
/* Docstring of the unicode type; referenced as the tp_doc slot of
   PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
             "unicode(string [, encoding[, errors]]) -> object\n\
\n\
Create a new Unicode object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
8763
/* Type object for the built-in unicode type.  It derives from
   PyBaseString_Type, may itself be subclassed (Py_TPFLAGS_BASETYPE) and
   creates instances through unicode_new(). */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "unicode",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_compare */
    unicode_repr,           /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash*/
    0,                  /* tp_call*/
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    &unicode_as_buffer,         /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
    Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    0,                  /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseString_Type,         /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,           /* tp_free */
};
8807
8808 /* Initialize the Unicode implementation */
8809
8810 void _PyUnicode_Init(void)
8811 {
8812     int i;
8813
8814     /* XXX - move this array to unicodectype.c ? */
8815     Py_UNICODE linebreak[] = {
8816         0x000A, /* LINE FEED */
8817         0x000D, /* CARRIAGE RETURN */
8818         0x001C, /* FILE SEPARATOR */
8819         0x001D, /* GROUP SEPARATOR */
8820         0x001E, /* RECORD SEPARATOR */
8821         0x0085, /* NEXT LINE */
8822         0x2028, /* LINE SEPARATOR */
8823         0x2029, /* PARAGRAPH SEPARATOR */
8824     };
8825
8826     /* Init the implementation */
8827     free_list = NULL;
8828     numfree = 0;
8829     unicode_empty = _PyUnicode_New(0);
8830     if (!unicode_empty)
8831         return;
8832
8833     strcpy(unicode_default_encoding, "ascii");
8834     for (i = 0; i < 256; i++)
8835         unicode_latin1[i] = NULL;
8836     if (PyType_Ready(&PyUnicode_Type) < 0)
8837         Py_FatalError("Can't initialize 'unicode'");
8838
8839     /* initialize the linebreak bloom filter */
8840     bloom_linebreak = make_bloom_mask(
8841         linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8842         );
8843
8844     PyType_Ready(&EncodingMapType);
8845 }
8846
8847 /* Finalize the Unicode implementation */
8848
8849 int
8850 PyUnicode_ClearFreeList(void)
8851 {
8852     int freelist_size = numfree;
8853     PyUnicodeObject *u;
8854
8855     for (u = free_list; u != NULL;) {
8856         PyUnicodeObject *v = u;
8857         u = *(PyUnicodeObject **)u;
8858         if (v->str)
8859             PyObject_DEL(v->str);
8860         Py_XDECREF(v->defenc);
8861         PyObject_Del(v);
8862         numfree--;
8863     }
8864     free_list = NULL;
8865     assert(numfree == 0);
8866     return freelist_size;
8867 }
8868
8869 void
8870 _PyUnicode_Fini(void)
8871 {
8872     int i;
8873
8874     Py_XDECREF(unicode_empty);
8875     unicode_empty = NULL;
8876
8877     for (i = 0; i < 256; i++) {
8878         if (unicode_latin1[i]) {
8879             Py_DECREF(unicode_latin1[i]);
8880             unicode_latin1[i] = NULL;
8881         }
8882     }
8883     (void)PyUnicode_ClearFreeList();
8884 }
8885
8886 #ifdef __cplusplus
8887 }
8888 #endif