+++ /dev/null
-/* stringlib: split implementation */\r
-\r
-#ifndef STRINGLIB_SPLIT_H\r
-#define STRINGLIB_SPLIT_H\r
-\r
-#ifndef STRINGLIB_FASTSEARCH_H\r
-#error must include "stringlib/fastsearch.h" before including this module\r
-#endif\r
-\r
-/* Overallocate the initial list to reduce the number of reallocs for small\r
-   split sizes.  E.g., "A A A A A A A A A A".split() (10 elements) has three\r
-   resizes, to sizes 4, 8, then 16.  Most observed string splits are for human\r
-   text (roughly 11 words per line) and field-delimited data (usually 1-10\r
-   fields).  For large strings the split algorithms are bandwidth limited,\r
-   so increasing the preallocation likely will not improve things. */\r
-\r
-#define MAX_PREALLOC 12\r
-\r
-/* Initial list length for a split limited to `maxsplit` separators: N splits\r
-   give N+1 elements (5 splits gives 6 elements), capped at MAX_PREALLOC.\r
-   The argument is parenthesized so that arguments such as `n & 3` or\r
-   `a ? b : c` expand with the intended precedence.  It may be evaluated\r
-   twice, so it must be free of side effects. */\r
-#define PREALLOC_SIZE(maxsplit) \\r
-    ((maxsplit) >= MAX_PREALLOC ? MAX_PREALLOC : (maxsplit) + 1)\r
-\r
-#define SPLIT_APPEND(data, left, right) \\r
- sub = STRINGLIB_NEW((data) + (left), \\r
- (right) - (left)); \\r
- if (sub == NULL) \\r
- goto onError; \\r
- if (PyList_Append(list, sub)) { \\r
- Py_DECREF(sub); \\r
- goto onError; \\r
- } \\r
- else \\r
- Py_DECREF(sub);\r
-\r
-#define SPLIT_ADD(data, left, right) { \\r
- sub = STRINGLIB_NEW((data) + (left), \\r
- (right) - (left)); \\r
- if (sub == NULL) \\r
- goto onError; \\r
- if (count < MAX_PREALLOC) { \\r
- PyList_SET_ITEM(list, count, sub); \\r
- } else { \\r
- if (PyList_Append(list, sub)) { \\r
- Py_DECREF(sub); \\r
- goto onError; \\r
- } \\r
- else \\r
- Py_DECREF(sub); \\r
- } \\r
- count++; }\r
-\r
-\r
-/* Always force the list to the expected size: Py_SIZE(list) is set to the\r
-   local `count` of the expanding function.  The expansion is parenthesized\r
-   so that it forms a single, self-contained expression. */\r
-#define FIX_PREALLOC_SIZE(list) (Py_SIZE(list) = count)\r
-\r
-/* Split str[0:str_len] on runs of whitespace (as judged by STRINGLIB_ISSPACE),\r
-   cutting off at most `maxcount` words from the left, and return the pieces\r
-   as a new list.  Whitespace runs never yield empty pieces.  If text remains\r
-   after `maxcount` words, its leading whitespace is skipped and the rest, if\r
-   non-empty, becomes the last piece.  Returns NULL when the list or a\r
-   substring cannot be created.\r
-\r
-   Unless STRINGLIB_MUTABLE is defined, a string that is one single word with\r
-   no whitespace at all is not copied: if STRINGLIB_CHECK_EXACT(str_obj)\r
-   holds, str_obj itself (with a new reference) becomes the only item. */\r
-Py_LOCAL_INLINE(PyObject *)\r
-stringlib_split_whitespace(PyObject* str_obj,\r
-                           const STRINGLIB_CHAR* str, Py_ssize_t str_len,\r
-                           Py_ssize_t maxcount)\r
-{\r
-    /* `list`, `sub` and `count` keep these exact names because SPLIT_ADD and\r
-       FIX_PREALLOC_SIZE refer to them directly. */\r
-    Py_ssize_t count = 0;\r
-    Py_ssize_t pos = 0;         /* current scan position */\r
-    Py_ssize_t word_start = 0;  /* index of the first char of the current word */\r
-    PyObject *sub;\r
-    PyObject *list;\r
-\r
-    list = PyList_New(PREALLOC_SIZE(maxcount));\r
-    if (list == NULL) {\r
-        return NULL;\r
-    }\r
-\r
-    while (maxcount-- > 0) {\r
-        /* Step over the whitespace in front of the next word. */\r
-        for (; pos < str_len; pos++) {\r
-            if (!STRINGLIB_ISSPACE(str[pos]))\r
-                break;\r
-        }\r
-        if (pos == str_len) {\r
-            break;\r
-        }\r
-\r
-        /* str[pos] is not whitespace: advance to one past the word's end. */\r
-        word_start = pos;\r
-        for (pos++; pos < str_len; pos++) {\r
-            if (STRINGLIB_ISSPACE(str[pos]))\r
-                break;\r
-        }\r
-#ifndef STRINGLIB_MUTABLE\r
-        if (word_start == 0 && pos == str_len && STRINGLIB_CHECK_EXACT(str_obj)) {\r
-            /* The word spans the whole string: reuse str_obj as list[0]. */\r
-            Py_INCREF(str_obj);\r
-            PyList_SET_ITEM(list, 0, (PyObject *)str_obj);\r
-            count++;\r
-            break;\r
-        }\r
-#endif\r
-        SPLIT_ADD(str, word_start, pos);\r
-    }\r
-\r
-    if (pos < str_len) {\r
-        /* Input is left over, which only happens once maxcount has been\r
-           used up: drop its leading whitespace and keep the remainder as\r
-           one piece. */\r
-        while (pos < str_len && STRINGLIB_ISSPACE(str[pos])) {\r
-            pos++;\r
-        }\r
-        if (pos != str_len) {\r
-            SPLIT_ADD(str, pos, str_len);\r
-        }\r
-    }\r
-    FIX_PREALLOC_SIZE(list);\r
-    return list;\r
-\r
-  onError:\r
-    Py_DECREF(list);\r
-    return NULL;\r
-}\r
-\r
-/* Split str[0:str_len] at occurrences of the single character `ch`, scanning\r
-   from the left and splitting at most `maxcount` times.  Returns a new list;\r
-   the text after the last separator used (possibly empty) is the final\r
-   piece.  Returns NULL when the list or a substring cannot be created.\r
-\r
-   Unless STRINGLIB_MUTABLE is defined, when no split was made and\r
-   STRINGLIB_CHECK_EXACT(str_obj) holds, str_obj itself (with a new\r
-   reference) becomes the only item instead of a copy. */\r
-Py_LOCAL_INLINE(PyObject *)\r
-stringlib_split_char(PyObject* str_obj,\r
-                     const STRINGLIB_CHAR* str, Py_ssize_t str_len,\r
-                     const STRINGLIB_CHAR ch,\r
-                     Py_ssize_t maxcount)\r
-{\r
-    /* `list`, `sub` and `count` are referenced by name from SPLIT_ADD and\r
-       FIX_PREALLOC_SIZE. */\r
-    Py_ssize_t count = 0;\r
-    Py_ssize_t piece_start = 0;  /* start of the piece being collected */\r
-    Py_ssize_t pos = 0;          /* current scan position */\r
-    PyObject *sub;\r
-    PyObject *list;\r
-\r
-    list = PyList_New(PREALLOC_SIZE(maxcount));\r
-    if (list == NULL) {\r
-        return NULL;\r
-    }\r
-\r
-    /* maxcount is only consumed while there is still input to scan. */\r
-    while (pos < str_len && maxcount-- > 0) {\r
-        /* Plain linear scan for the next separator. */\r
-        while (pos < str_len && str[pos] != ch) {\r
-            pos++;\r
-        }\r
-        if (pos < str_len) {\r
-            SPLIT_ADD(str, piece_start, pos);\r
-            pos++;\r
-            piece_start = pos;\r
-        }\r
-    }\r
-#ifndef STRINGLIB_MUTABLE\r
-    if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {\r
-        /* Nothing was split off, so str_obj itself serves as list[0]. */\r
-        Py_INCREF(str_obj);\r
-        PyList_SET_ITEM(list, 0, (PyObject *)str_obj);\r
-        count++;\r
-    }\r
-    else\r
-#endif\r
-    if (piece_start <= str_len) {\r
-        SPLIT_ADD(str, piece_start, str_len);\r
-    }\r
-    FIX_PREALLOC_SIZE(list);\r
-    return list;\r
-\r
-  onError:\r
-    Py_DECREF(list);\r
-    return NULL;\r
-}\r
-\r
-/* Split str[0:str_len] at occurrences of the separator sep[0:sep_len],\r
-   searching from the left with fastsearch() and splitting at most `maxcount`\r
-   times.  Returns a new list whose final item is the text after the last\r
-   separator used (possibly empty).\r
-\r
-   An empty separator sets ValueError("empty separator") and returns NULL; a\r
-   one-character separator is delegated to stringlib_split_char().  NULL is\r
-   also returned when the list or a substring cannot be created.\r
-\r
-   Unless STRINGLIB_MUTABLE is defined, when no split was made and\r
-   STRINGLIB_CHECK_EXACT(str_obj) holds, str_obj itself (with a new\r
-   reference) becomes the only item instead of a copy. */\r
-Py_LOCAL_INLINE(PyObject *)\r
-stringlib_split(PyObject* str_obj,\r
-                const STRINGLIB_CHAR* str, Py_ssize_t str_len,\r
-                const STRINGLIB_CHAR* sep, Py_ssize_t sep_len,\r
-                Py_ssize_t maxcount)\r
-{\r
-    /* `list`, `sub` and `count` are referenced by name from SPLIT_ADD and\r
-       FIX_PREALLOC_SIZE. */\r
-    Py_ssize_t count = 0;\r
-    Py_ssize_t piece_start = 0;  /* start of the not yet split remainder */\r
-    Py_ssize_t offset;           /* match position relative to piece_start */\r
-    PyObject *list;\r
-    PyObject *sub;\r
-\r
-    if (sep_len == 0) {\r
-        PyErr_SetString(PyExc_ValueError, "empty separator");\r
-        return NULL;\r
-    }\r
-    if (sep_len == 1) {\r
-        return stringlib_split_char(str_obj, str, str_len, sep[0], maxcount);\r
-    }\r
-\r
-    list = PyList_New(PREALLOC_SIZE(maxcount));\r
-    if (list == NULL) {\r
-        return NULL;\r
-    }\r
-\r
-    while (maxcount-- > 0) {\r
-        offset = fastsearch(str + piece_start, str_len - piece_start,\r
-                            sep, sep_len, -1, FAST_SEARCH);\r
-        if (offset < 0) {\r
-            break;\r
-        }\r
-        SPLIT_ADD(str, piece_start, piece_start + offset);\r
-        /* Continue right behind the separator that was just found. */\r
-        piece_start += offset + sep_len;\r
-    }\r
-#ifndef STRINGLIB_MUTABLE\r
-    if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {\r
-        /* Separator never matched, so str_obj itself serves as list[0]. */\r
-        Py_INCREF(str_obj);\r
-        PyList_SET_ITEM(list, 0, (PyObject *)str_obj);\r
-        count++;\r
-    }\r
-    else\r
-#endif\r
-    {\r
-        SPLIT_ADD(str, piece_start, str_len);\r
-    }\r
-    FIX_PREALLOC_SIZE(list);\r
-    return list;\r
-\r
-  onError:\r
-    Py_DECREF(list);\r
-    return NULL;\r
-}\r
-\r
-/* Split str[0:str_len] on runs of whitespace (as judged by STRINGLIB_ISSPACE),\r
-   cutting off at most `maxcount` words starting from the right end.  Pieces\r
-   are collected right-to-left and the list is reversed before it is\r
-   returned.  If text remains after `maxcount` words, its trailing whitespace\r
-   is skipped and the rest, if non-empty, becomes one more piece.  Returns\r
-   NULL when the list or a substring cannot be created or the reversal fails.\r
-\r
-   Unless STRINGLIB_MUTABLE is defined, a string that is one single word with\r
-   no whitespace at all is not copied: if STRINGLIB_CHECK_EXACT(str_obj)\r
-   holds, str_obj itself (with a new reference) becomes the only item. */\r
-Py_LOCAL_INLINE(PyObject *)\r
-stringlib_rsplit_whitespace(PyObject* str_obj,\r
-                            const STRINGLIB_CHAR* str, Py_ssize_t str_len,\r
-                            Py_ssize_t maxcount)\r
-{\r
-    /* `list`, `sub` and `count` are referenced by name from SPLIT_ADD and\r
-       FIX_PREALLOC_SIZE. */\r
-    Py_ssize_t count = 0;\r
-    Py_ssize_t pos = str_len - 1;       /* current scan position */\r
-    Py_ssize_t word_end = str_len - 1;  /* index of the current word's last char */\r
-    PyObject *sub;\r
-    PyObject *list;\r
-\r
-    list = PyList_New(PREALLOC_SIZE(maxcount));\r
-    if (list == NULL) {\r
-        return NULL;\r
-    }\r
-\r
-    while (maxcount-- > 0) {\r
-        /* Step backwards over the whitespace behind the next word. */\r
-        for (; pos >= 0; pos--) {\r
-            if (!STRINGLIB_ISSPACE(str[pos]))\r
-                break;\r
-        }\r
-        if (pos < 0) {\r
-            break;\r
-        }\r
-\r
-        /* str[pos] is not whitespace: walk back to just before the word. */\r
-        word_end = pos;\r
-        for (pos--; pos >= 0; pos--) {\r
-            if (STRINGLIB_ISSPACE(str[pos]))\r
-                break;\r
-        }\r
-#ifndef STRINGLIB_MUTABLE\r
-        if (word_end == str_len - 1 && pos < 0 && STRINGLIB_CHECK_EXACT(str_obj)) {\r
-            /* The word spans the whole string: reuse str_obj as list[0]. */\r
-            Py_INCREF(str_obj);\r
-            PyList_SET_ITEM(list, 0, (PyObject *)str_obj);\r
-            count++;\r
-            break;\r
-        }\r
-#endif\r
-        SPLIT_ADD(str, pos + 1, word_end + 1);\r
-    }\r
-\r
-    if (pos >= 0) {\r
-        /* Input is left over, which only happens once maxcount has been\r
-           used up: drop its trailing whitespace and keep everything from the\r
-           beginning of the string as one piece. */\r
-        while (pos >= 0 && STRINGLIB_ISSPACE(str[pos])) {\r
-            pos--;\r
-        }\r
-        if (pos >= 0) {\r
-            SPLIT_ADD(str, 0, pos + 1);\r
-        }\r
-    }\r
-    FIX_PREALLOC_SIZE(list);\r
-    /* Items were gathered from the right; put them in left-to-right order. */\r
-    if (PyList_Reverse(list) < 0) {\r
-        goto onError;\r
-    }\r
-    return list;\r
-\r
-  onError:\r
-    Py_DECREF(list);\r
-    return NULL;\r
-}\r
-\r
-/* Split str[0:str_len] at occurrences of the single character `ch`, scanning\r
-   from the right and splitting at most `maxcount` times.  Pieces are\r
-   collected right-to-left, the text in front of the last separator used\r
-   (possibly empty) is added last, and the list is reversed before it is\r
-   returned.  Returns NULL when the list or a substring cannot be created or\r
-   the reversal fails.\r
-\r
-   Unless STRINGLIB_MUTABLE is defined, when no split was made and\r
-   STRINGLIB_CHECK_EXACT(str_obj) holds, str_obj itself (with a new\r
-   reference) becomes the only item instead of a copy. */\r
-Py_LOCAL_INLINE(PyObject *)\r
-stringlib_rsplit_char(PyObject* str_obj,\r
-                      const STRINGLIB_CHAR* str, Py_ssize_t str_len,\r
-                      const STRINGLIB_CHAR ch,\r
-                      Py_ssize_t maxcount)\r
-{\r
-    /* `list`, `sub` and `count` are referenced by name from SPLIT_ADD and\r
-       FIX_PREALLOC_SIZE. */\r
-    Py_ssize_t count = 0;\r
-    Py_ssize_t pos = str_len - 1;        /* current scan position */\r
-    Py_ssize_t piece_last = str_len - 1; /* last index of the piece being collected */\r
-    PyObject *sub;\r
-    PyObject *list;\r
-\r
-    list = PyList_New(PREALLOC_SIZE(maxcount));\r
-    if (list == NULL) {\r
-        return NULL;\r
-    }\r
-\r
-    /* maxcount is only consumed while there is still input to scan. */\r
-    while (pos >= 0 && maxcount-- > 0) {\r
-        while (pos >= 0 && str[pos] != ch) {\r
-            pos--;\r
-        }\r
-        if (pos >= 0) {\r
-            SPLIT_ADD(str, pos + 1, piece_last + 1);\r
-            pos--;\r
-            piece_last = pos;\r
-        }\r
-    }\r
-#ifndef STRINGLIB_MUTABLE\r
-    if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {\r
-        /* Nothing was split off, so str_obj itself serves as list[0]. */\r
-        Py_INCREF(str_obj);\r
-        PyList_SET_ITEM(list, 0, (PyObject *)str_obj);\r
-        count++;\r
-    }\r
-    else\r
-#endif\r
-    if (piece_last >= -1) {\r
-        SPLIT_ADD(str, 0, piece_last + 1);\r
-    }\r
-    FIX_PREALLOC_SIZE(list);\r
-    /* Items were gathered from the right; put them in left-to-right order. */\r
-    if (PyList_Reverse(list) < 0) {\r
-        goto onError;\r
-    }\r
-    return list;\r
-\r
-  onError:\r
-    Py_DECREF(list);\r
-    return NULL;\r
-}\r
-\r
-/* Split str[0:str_len] at occurrences of the separator sep[0:sep_len],\r
-   searching from the right with fastsearch() in FAST_RSEARCH mode and\r
-   splitting at most `maxcount` times.  Pieces are collected right-to-left,\r
-   the text in front of the last separator used (possibly empty) is added\r
-   last, and the list is reversed before it is returned.\r
-\r
-   An empty separator sets ValueError("empty separator") and returns NULL; a\r
-   one-character separator is delegated to stringlib_rsplit_char().  NULL is\r
-   also returned when the list or a substring cannot be created or the\r
-   reversal fails.\r
-\r
-   Unless STRINGLIB_MUTABLE is defined, when no split was made and\r
-   STRINGLIB_CHECK_EXACT(str_obj) holds, str_obj itself (with a new\r
-   reference) becomes the only item instead of a copy. */\r
-Py_LOCAL_INLINE(PyObject *)\r
-stringlib_rsplit(PyObject* str_obj,\r
-                 const STRINGLIB_CHAR* str, Py_ssize_t str_len,\r
-                 const STRINGLIB_CHAR* sep, Py_ssize_t sep_len,\r
-                 Py_ssize_t maxcount)\r
-{\r
-    /* `list`, `sub` and `count` are referenced by name from SPLIT_ADD and\r
-       FIX_PREALLOC_SIZE. */\r
-    Py_ssize_t count = 0;\r
-    Py_ssize_t tail_end;  /* exclusive end of the not yet split prefix */\r
-    Py_ssize_t hit;       /* index of the separator match, or < 0 */\r
-    PyObject *list;\r
-    PyObject *sub;\r
-\r
-    if (sep_len == 0) {\r
-        PyErr_SetString(PyExc_ValueError, "empty separator");\r
-        return NULL;\r
-    }\r
-    if (sep_len == 1) {\r
-        return stringlib_rsplit_char(str_obj, str, str_len, sep[0], maxcount);\r
-    }\r
-\r
-    list = PyList_New(PREALLOC_SIZE(maxcount));\r
-    if (list == NULL) {\r
-        return NULL;\r
-    }\r
-\r
-    tail_end = str_len;\r
-    while (maxcount-- > 0) {\r
-        hit = fastsearch(str, tail_end, sep, sep_len, -1, FAST_RSEARCH);\r
-        if (hit < 0) {\r
-            break;\r
-        }\r
-        SPLIT_ADD(str, hit + sep_len, tail_end);\r
-        /* The next search only looks at the text in front of this match. */\r
-        tail_end = hit;\r
-    }\r
-#ifndef STRINGLIB_MUTABLE\r
-    if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {\r
-        /* Separator never matched, so str_obj itself serves as list[0]. */\r
-        Py_INCREF(str_obj);\r
-        PyList_SET_ITEM(list, 0, (PyObject *)str_obj);\r
-        count++;\r
-    }\r
-    else\r
-#endif\r
-    {\r
-        SPLIT_ADD(str, 0, tail_end);\r
-    }\r
-    FIX_PREALLOC_SIZE(list);\r
-    /* Items were gathered from the right; put them in left-to-right order. */\r
-    if (PyList_Reverse(list) < 0) {\r
-        goto onError;\r
-    }\r
-    return list;\r
-\r
-  onError:\r
-    Py_DECREF(list);\r
-    return NULL;\r
-}\r
-\r
-/* Split str[0:str_len] into lines at characters for which\r
-   STRINGLIB_ISLINEBREAK is true and return them as a new list.  A '\r'\r
-   immediately followed by '\n' is consumed as one line break.  When\r
-   `keepends` is non-zero each line keeps its line break.  Returns NULL when\r
-   the list or a substring cannot be created.\r
-\r
-   Unless STRINGLIB_MUTABLE is defined, if the first line already covers the\r
-   entire string and STRINGLIB_CHECK_EXACT(str_obj) holds, str_obj itself is\r
-   appended instead of a copy. */\r
-Py_LOCAL_INLINE(PyObject *)\r
-stringlib_splitlines(PyObject* str_obj,\r
-                     const STRINGLIB_CHAR* str, Py_ssize_t str_len,\r
-                     int keepends)\r
-{\r
-    /* The list starts empty and grows by appending; the preallocation scheme\r
-       of the split functions is not used here.  splitlines is usually run on\r
-       input with hundreds of newlines, and switching between PyList_SET_ITEM\r
-       and append costs about 2-3% in that common case.  Doing all SET_ITEMs\r
-       first and appending only once the preallocated slots are full would\r
-       avoid the per-item check, but is too much work for little gain. */\r
-\r
-    /* `list` and `sub` are referenced by name from SPLIT_APPEND. */\r
-    Py_ssize_t pos = 0;         /* current scan position */\r
-    Py_ssize_t line_start = 0;  /* first index of the line being collected */\r
-    PyObject *sub;\r
-    PyObject *list;\r
-\r
-    list = PyList_New(0);\r
-    if (list == NULL) {\r
-        return NULL;\r
-    }\r
-\r
-    while (pos < str_len) {\r
-        Py_ssize_t eol;\r
-\r
-        /* Advance to the next line break or to the end of the input. */\r
-        while (pos < str_len && !STRINGLIB_ISLINEBREAK(str[pos])) {\r
-            pos++;\r
-        }\r
-\r
-        /* The line ends here unless its line break is to be kept. */\r
-        eol = pos;\r
-        if (pos < str_len) {\r
-            /* Step over the line break; CRLF is taken as a single break. */\r
-            if (str[pos] == '\r' && pos + 1 < str_len && str[pos + 1] == '\n') {\r
-                pos += 2;\r
-            }\r
-            else {\r
-                pos++;\r
-            }\r
-            if (keepends) {\r
-                eol = pos;\r
-            }\r
-        }\r
-#ifndef STRINGLIB_MUTABLE\r
-        if (line_start == 0 && eol == str_len && STRINGLIB_CHECK_EXACT(str_obj)) {\r
-            /* The first line is the whole string: append str_obj itself. */\r
-            if (PyList_Append(list, str_obj)) {\r
-                goto onError;\r
-            }\r
-            break;\r
-        }\r
-#endif\r
-        SPLIT_APPEND(str, line_start, eol);\r
-        line_start = pos;\r
-    }\r
-    return list;\r
-\r
-  onError:\r
-    Py_DECREF(list);\r
-    return NULL;\r
-}\r
-\r
-#endif\r