]>
Commit | Line | Data |
---|---|---|
53b2ba57 DM |
1 | /* stringlib: split implementation */\r |
2 | \r | |
3 | #ifndef STRINGLIB_SPLIT_H\r | |
4 | #define STRINGLIB_SPLIT_H\r | |
5 | \r | |
6 | #ifndef STRINGLIB_FASTSEARCH_H\r | |
7 | #error must include "stringlib/fastsearch.h" before including this module\r | |
8 | #endif\r | |
9 | \r | |
10 | /* Overallocate the initial list to reduce the number of reallocs for small\r | |
11 | split sizes. Eg, "A A A A A A A A A A".split() (10 elements) has three\r | |
12 | resizes, to sizes 4, 8, then 16. Most observed string splits are for human\r | |
13 | text (roughly 11 words per line) and field delimited data (usually 1-10\r | |
14 | fields). For large strings the split algorithms are bandwidth limited\r | |
15 | so increasing the preallocation likely will not improve things.*/\r | |
16 | \r | |
17 | #define MAX_PREALLOC 12\r | |
18 | \r | |
19 | /* 5 splits gives 6 elements */\r | |
20 | #define PREALLOC_SIZE(maxsplit) \\r | |
21 | (maxsplit >= MAX_PREALLOC ? MAX_PREALLOC : maxsplit+1)\r | |
22 | \r | |
23 | #define SPLIT_APPEND(data, left, right) \\r | |
24 | sub = STRINGLIB_NEW((data) + (left), \\r | |
25 | (right) - (left)); \\r | |
26 | if (sub == NULL) \\r | |
27 | goto onError; \\r | |
28 | if (PyList_Append(list, sub)) { \\r | |
29 | Py_DECREF(sub); \\r | |
30 | goto onError; \\r | |
31 | } \\r | |
32 | else \\r | |
33 | Py_DECREF(sub);\r | |
34 | \r | |
35 | #define SPLIT_ADD(data, left, right) { \\r | |
36 | sub = STRINGLIB_NEW((data) + (left), \\r | |
37 | (right) - (left)); \\r | |
38 | if (sub == NULL) \\r | |
39 | goto onError; \\r | |
40 | if (count < MAX_PREALLOC) { \\r | |
41 | PyList_SET_ITEM(list, count, sub); \\r | |
42 | } else { \\r | |
43 | if (PyList_Append(list, sub)) { \\r | |
44 | Py_DECREF(sub); \\r | |
45 | goto onError; \\r | |
46 | } \\r | |
47 | else \\r | |
48 | Py_DECREF(sub); \\r | |
49 | } \\r | |
50 | count++; }\r | |
51 | \r | |
52 | \r | |
53 | /* Always force the list to the expected size. */\r | |
54 | #define FIX_PREALLOC_SIZE(list) Py_SIZE(list) = count\r | |
55 | \r | |
56 | Py_LOCAL_INLINE(PyObject *)\r | |
57 | stringlib_split_whitespace(PyObject* str_obj,\r | |
58 | const STRINGLIB_CHAR* str, Py_ssize_t str_len,\r | |
59 | Py_ssize_t maxcount)\r | |
60 | {\r | |
61 | Py_ssize_t i, j, count=0;\r | |
62 | PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));\r | |
63 | PyObject *sub;\r | |
64 | \r | |
65 | if (list == NULL)\r | |
66 | return NULL;\r | |
67 | \r | |
68 | i = j = 0;\r | |
69 | while (maxcount-- > 0) {\r | |
70 | while (i < str_len && STRINGLIB_ISSPACE(str[i]))\r | |
71 | i++;\r | |
72 | if (i == str_len) break;\r | |
73 | j = i; i++;\r | |
74 | while (i < str_len && !STRINGLIB_ISSPACE(str[i]))\r | |
75 | i++;\r | |
76 | #ifndef STRINGLIB_MUTABLE\r | |
77 | if (j == 0 && i == str_len && STRINGLIB_CHECK_EXACT(str_obj)) {\r | |
78 | /* No whitespace in str_obj, so just use it as list[0] */\r | |
79 | Py_INCREF(str_obj);\r | |
80 | PyList_SET_ITEM(list, 0, (PyObject *)str_obj);\r | |
81 | count++;\r | |
82 | break;\r | |
83 | }\r | |
84 | #endif\r | |
85 | SPLIT_ADD(str, j, i);\r | |
86 | }\r | |
87 | \r | |
88 | if (i < str_len) {\r | |
89 | /* Only occurs when maxcount was reached */\r | |
90 | /* Skip any remaining whitespace and copy to end of string */\r | |
91 | while (i < str_len && STRINGLIB_ISSPACE(str[i]))\r | |
92 | i++;\r | |
93 | if (i != str_len)\r | |
94 | SPLIT_ADD(str, i, str_len);\r | |
95 | }\r | |
96 | FIX_PREALLOC_SIZE(list);\r | |
97 | return list;\r | |
98 | \r | |
99 | onError:\r | |
100 | Py_DECREF(list);\r | |
101 | return NULL;\r | |
102 | }\r | |
103 | \r | |
104 | Py_LOCAL_INLINE(PyObject *)\r | |
105 | stringlib_split_char(PyObject* str_obj,\r | |
106 | const STRINGLIB_CHAR* str, Py_ssize_t str_len,\r | |
107 | const STRINGLIB_CHAR ch,\r | |
108 | Py_ssize_t maxcount)\r | |
109 | {\r | |
110 | Py_ssize_t i, j, count=0;\r | |
111 | PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));\r | |
112 | PyObject *sub;\r | |
113 | \r | |
114 | if (list == NULL)\r | |
115 | return NULL;\r | |
116 | \r | |
117 | i = j = 0;\r | |
118 | while ((j < str_len) && (maxcount-- > 0)) {\r | |
119 | for(; j < str_len; j++) {\r | |
120 | /* I found that using memchr makes no difference */\r | |
121 | if (str[j] == ch) {\r | |
122 | SPLIT_ADD(str, i, j);\r | |
123 | i = j = j + 1;\r | |
124 | break;\r | |
125 | }\r | |
126 | }\r | |
127 | }\r | |
128 | #ifndef STRINGLIB_MUTABLE\r | |
129 | if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {\r | |
130 | /* ch not in str_obj, so just use str_obj as list[0] */\r | |
131 | Py_INCREF(str_obj);\r | |
132 | PyList_SET_ITEM(list, 0, (PyObject *)str_obj);\r | |
133 | count++;\r | |
134 | } else\r | |
135 | #endif\r | |
136 | if (i <= str_len) {\r | |
137 | SPLIT_ADD(str, i, str_len);\r | |
138 | }\r | |
139 | FIX_PREALLOC_SIZE(list);\r | |
140 | return list;\r | |
141 | \r | |
142 | onError:\r | |
143 | Py_DECREF(list);\r | |
144 | return NULL;\r | |
145 | }\r | |
146 | \r | |
147 | Py_LOCAL_INLINE(PyObject *)\r | |
148 | stringlib_split(PyObject* str_obj,\r | |
149 | const STRINGLIB_CHAR* str, Py_ssize_t str_len,\r | |
150 | const STRINGLIB_CHAR* sep, Py_ssize_t sep_len,\r | |
151 | Py_ssize_t maxcount)\r | |
152 | {\r | |
153 | Py_ssize_t i, j, pos, count=0;\r | |
154 | PyObject *list, *sub;\r | |
155 | \r | |
156 | if (sep_len == 0) {\r | |
157 | PyErr_SetString(PyExc_ValueError, "empty separator");\r | |
158 | return NULL;\r | |
159 | }\r | |
160 | else if (sep_len == 1)\r | |
161 | return stringlib_split_char(str_obj, str, str_len, sep[0], maxcount);\r | |
162 | \r | |
163 | list = PyList_New(PREALLOC_SIZE(maxcount));\r | |
164 | if (list == NULL)\r | |
165 | return NULL;\r | |
166 | \r | |
167 | i = j = 0;\r | |
168 | while (maxcount-- > 0) {\r | |
169 | pos = fastsearch(str+i, str_len-i, sep, sep_len, -1, FAST_SEARCH);\r | |
170 | if (pos < 0)\r | |
171 | break;\r | |
172 | j = i + pos;\r | |
173 | SPLIT_ADD(str, i, j);\r | |
174 | i = j + sep_len;\r | |
175 | }\r | |
176 | #ifndef STRINGLIB_MUTABLE\r | |
177 | if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {\r | |
178 | /* No match in str_obj, so just use it as list[0] */\r | |
179 | Py_INCREF(str_obj);\r | |
180 | PyList_SET_ITEM(list, 0, (PyObject *)str_obj);\r | |
181 | count++;\r | |
182 | } else\r | |
183 | #endif\r | |
184 | {\r | |
185 | SPLIT_ADD(str, i, str_len);\r | |
186 | }\r | |
187 | FIX_PREALLOC_SIZE(list);\r | |
188 | return list;\r | |
189 | \r | |
190 | onError:\r | |
191 | Py_DECREF(list);\r | |
192 | return NULL;\r | |
193 | }\r | |
194 | \r | |
195 | Py_LOCAL_INLINE(PyObject *)\r | |
196 | stringlib_rsplit_whitespace(PyObject* str_obj,\r | |
197 | const STRINGLIB_CHAR* str, Py_ssize_t str_len,\r | |
198 | Py_ssize_t maxcount)\r | |
199 | {\r | |
200 | Py_ssize_t i, j, count=0;\r | |
201 | PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));\r | |
202 | PyObject *sub;\r | |
203 | \r | |
204 | if (list == NULL)\r | |
205 | return NULL;\r | |
206 | \r | |
207 | i = j = str_len - 1;\r | |
208 | while (maxcount-- > 0) {\r | |
209 | while (i >= 0 && STRINGLIB_ISSPACE(str[i]))\r | |
210 | i--;\r | |
211 | if (i < 0) break;\r | |
212 | j = i; i--;\r | |
213 | while (i >= 0 && !STRINGLIB_ISSPACE(str[i]))\r | |
214 | i--;\r | |
215 | #ifndef STRINGLIB_MUTABLE\r | |
216 | if (j == str_len - 1 && i < 0 && STRINGLIB_CHECK_EXACT(str_obj)) {\r | |
217 | /* No whitespace in str_obj, so just use it as list[0] */\r | |
218 | Py_INCREF(str_obj);\r | |
219 | PyList_SET_ITEM(list, 0, (PyObject *)str_obj);\r | |
220 | count++;\r | |
221 | break;\r | |
222 | }\r | |
223 | #endif\r | |
224 | SPLIT_ADD(str, i + 1, j + 1);\r | |
225 | }\r | |
226 | \r | |
227 | if (i >= 0) {\r | |
228 | /* Only occurs when maxcount was reached */\r | |
229 | /* Skip any remaining whitespace and copy to beginning of string */\r | |
230 | while (i >= 0 && STRINGLIB_ISSPACE(str[i]))\r | |
231 | i--;\r | |
232 | if (i >= 0)\r | |
233 | SPLIT_ADD(str, 0, i + 1);\r | |
234 | }\r | |
235 | FIX_PREALLOC_SIZE(list);\r | |
236 | if (PyList_Reverse(list) < 0)\r | |
237 | goto onError;\r | |
238 | return list;\r | |
239 | \r | |
240 | onError:\r | |
241 | Py_DECREF(list);\r | |
242 | return NULL;\r | |
243 | }\r | |
244 | \r | |
245 | Py_LOCAL_INLINE(PyObject *)\r | |
246 | stringlib_rsplit_char(PyObject* str_obj,\r | |
247 | const STRINGLIB_CHAR* str, Py_ssize_t str_len,\r | |
248 | const STRINGLIB_CHAR ch,\r | |
249 | Py_ssize_t maxcount)\r | |
250 | {\r | |
251 | Py_ssize_t i, j, count=0;\r | |
252 | PyObject *list = PyList_New(PREALLOC_SIZE(maxcount));\r | |
253 | PyObject *sub;\r | |
254 | \r | |
255 | if (list == NULL)\r | |
256 | return NULL;\r | |
257 | \r | |
258 | i = j = str_len - 1;\r | |
259 | while ((i >= 0) && (maxcount-- > 0)) {\r | |
260 | for(; i >= 0; i--) {\r | |
261 | if (str[i] == ch) {\r | |
262 | SPLIT_ADD(str, i + 1, j + 1);\r | |
263 | j = i = i - 1;\r | |
264 | break;\r | |
265 | }\r | |
266 | }\r | |
267 | }\r | |
268 | #ifndef STRINGLIB_MUTABLE\r | |
269 | if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {\r | |
270 | /* ch not in str_obj, so just use str_obj as list[0] */\r | |
271 | Py_INCREF(str_obj);\r | |
272 | PyList_SET_ITEM(list, 0, (PyObject *)str_obj);\r | |
273 | count++;\r | |
274 | } else\r | |
275 | #endif\r | |
276 | if (j >= -1) {\r | |
277 | SPLIT_ADD(str, 0, j + 1);\r | |
278 | }\r | |
279 | FIX_PREALLOC_SIZE(list);\r | |
280 | if (PyList_Reverse(list) < 0)\r | |
281 | goto onError;\r | |
282 | return list;\r | |
283 | \r | |
284 | onError:\r | |
285 | Py_DECREF(list);\r | |
286 | return NULL;\r | |
287 | }\r | |
288 | \r | |
289 | Py_LOCAL_INLINE(PyObject *)\r | |
290 | stringlib_rsplit(PyObject* str_obj,\r | |
291 | const STRINGLIB_CHAR* str, Py_ssize_t str_len,\r | |
292 | const STRINGLIB_CHAR* sep, Py_ssize_t sep_len,\r | |
293 | Py_ssize_t maxcount)\r | |
294 | {\r | |
295 | Py_ssize_t j, pos, count=0;\r | |
296 | PyObject *list, *sub;\r | |
297 | \r | |
298 | if (sep_len == 0) {\r | |
299 | PyErr_SetString(PyExc_ValueError, "empty separator");\r | |
300 | return NULL;\r | |
301 | }\r | |
302 | else if (sep_len == 1)\r | |
303 | return stringlib_rsplit_char(str_obj, str, str_len, sep[0], maxcount);\r | |
304 | \r | |
305 | list = PyList_New(PREALLOC_SIZE(maxcount));\r | |
306 | if (list == NULL)\r | |
307 | return NULL;\r | |
308 | \r | |
309 | j = str_len;\r | |
310 | while (maxcount-- > 0) {\r | |
311 | pos = fastsearch(str, j, sep, sep_len, -1, FAST_RSEARCH);\r | |
312 | if (pos < 0)\r | |
313 | break;\r | |
314 | SPLIT_ADD(str, pos + sep_len, j);\r | |
315 | j = pos;\r | |
316 | }\r | |
317 | #ifndef STRINGLIB_MUTABLE\r | |
318 | if (count == 0 && STRINGLIB_CHECK_EXACT(str_obj)) {\r | |
319 | /* No match in str_obj, so just use it as list[0] */\r | |
320 | Py_INCREF(str_obj);\r | |
321 | PyList_SET_ITEM(list, 0, (PyObject *)str_obj);\r | |
322 | count++;\r | |
323 | } else\r | |
324 | #endif\r | |
325 | {\r | |
326 | SPLIT_ADD(str, 0, j);\r | |
327 | }\r | |
328 | FIX_PREALLOC_SIZE(list);\r | |
329 | if (PyList_Reverse(list) < 0)\r | |
330 | goto onError;\r | |
331 | return list;\r | |
332 | \r | |
333 | onError:\r | |
334 | Py_DECREF(list);\r | |
335 | return NULL;\r | |
336 | }\r | |
337 | \r | |
338 | Py_LOCAL_INLINE(PyObject *)\r | |
339 | stringlib_splitlines(PyObject* str_obj,\r | |
340 | const STRINGLIB_CHAR* str, Py_ssize_t str_len,\r | |
341 | int keepends)\r | |
342 | {\r | |
343 | /* This does not use the preallocated list because splitlines is\r | |
344 | usually run with hundreds of newlines. The overhead of\r | |
345 | switching between PyList_SET_ITEM and append causes about a\r | |
346 | 2-3% slowdown for that common case. A smarter implementation\r | |
347 | could move the if check out, so the SET_ITEMs are done first\r | |
348 | and the appends only done when the prealloc buffer is full.\r | |
349 | That's too much work for little gain.*/\r | |
350 | \r | |
351 | register Py_ssize_t i;\r | |
352 | register Py_ssize_t j;\r | |
353 | PyObject *list = PyList_New(0);\r | |
354 | PyObject *sub;\r | |
355 | \r | |
356 | if (list == NULL)\r | |
357 | return NULL;\r | |
358 | \r | |
359 | for (i = j = 0; i < str_len; ) {\r | |
360 | Py_ssize_t eol;\r | |
361 | \r | |
362 | /* Find a line and append it */\r | |
363 | while (i < str_len && !STRINGLIB_ISLINEBREAK(str[i]))\r | |
364 | i++;\r | |
365 | \r | |
366 | /* Skip the line break reading CRLF as one line break */\r | |
367 | eol = i;\r | |
368 | if (i < str_len) {\r | |
369 | if (str[i] == '\r' && i + 1 < str_len && str[i+1] == '\n')\r | |
370 | i += 2;\r | |
371 | else\r | |
372 | i++;\r | |
373 | if (keepends)\r | |
374 | eol = i;\r | |
375 | }\r | |
376 | #ifndef STRINGLIB_MUTABLE\r | |
377 | if (j == 0 && eol == str_len && STRINGLIB_CHECK_EXACT(str_obj)) {\r | |
378 | /* No linebreak in str_obj, so just use it as list[0] */\r | |
379 | if (PyList_Append(list, str_obj))\r | |
380 | goto onError;\r | |
381 | break;\r | |
382 | }\r | |
383 | #endif\r | |
384 | SPLIT_APPEND(str, j, eol);\r | |
385 | j = i;\r | |
386 | }\r | |
387 | return list;\r | |
388 | \r | |
389 | onError:\r | |
390 | Py_DECREF(list);\r | |
391 | return NULL;\r | |
392 | }\r | |
393 | \r | |
394 | #endif\r |