]>
Commit | Line | Data |
---|---|---|
c8042e10 DM |
1 | \r |
2 | /* Tokenizer implementation */\r | |
3 | \r | |
4 | #include "Python.h"\r | |
5 | #include "pgenheaders.h"\r | |
6 | \r | |
7 | #include <ctype.h>\r | |
8 | #include <assert.h>\r | |
9 | \r | |
10 | #include "tokenizer.h"\r | |
11 | #include "errcode.h"\r | |
12 | \r | |
13 | #ifndef PGEN\r | |
14 | #include "unicodeobject.h"\r | |
15 | #include "stringobject.h"\r | |
16 | #include "fileobject.h"\r | |
17 | #include "codecs.h"\r | |
18 | #include "abstract.h"\r | |
19 | #include "pydebug.h"\r | |
20 | #endif /* PGEN */\r | |
21 | \r | |
22 | extern char *PyOS_Readline(FILE *, FILE *, char *);\r | |
23 | /* Return malloc'ed string including trailing \n;\r | |
24 | empty malloc'ed string for EOF;\r | |
25 | NULL if interrupted */\r | |
26 | \r | |
27 | /* Don't ever change this -- it would break the portability of Python code */\r | |
28 | #define TABSIZE 8\r | |
29 | \r | |
30 | /* Forward */\r | |
31 | static struct tok_state *tok_new(void);\r | |
32 | static int tok_nextc(struct tok_state *tok);\r | |
33 | static void tok_backup(struct tok_state *tok, int c);\r | |
34 | \r | |
/* Token names, indexed by token number.
   This table must match the #defines in token.h! */

char *_PyParser_TokenNames[] = {
    /* Structural tokens */
    "ENDMARKER", "NAME", "NUMBER", "STRING",
    "NEWLINE", "INDENT", "DEDENT",
    /* Single-character operators and delimiters */
    "LPAR", "RPAR", "LSQB", "RSQB",
    "COLON", "COMMA", "SEMI",
    "PLUS", "MINUS", "STAR", "SLASH",
    "VBAR", "AMPER", "LESS", "GREATER",
    "EQUAL", "DOT", "PERCENT", "BACKQUOTE",
    "LBRACE", "RBRACE",
    /* Two-character operators */
    "EQEQUAL", "NOTEQUAL", "LESSEQUAL", "GREATEREQUAL",
    "TILDE", "CIRCUMFLEX",
    "LEFTSHIFT", "RIGHTSHIFT", "DOUBLESTAR",
    /* Augmented assignment */
    "PLUSEQUAL", "MINEQUAL", "STAREQUAL", "SLASHEQUAL",
    "PERCENTEQUAL", "AMPEREQUAL", "VBAREQUAL", "CIRCUMFLEXEQUAL",
    "LEFTSHIFTEQUAL", "RIGHTSHIFTEQUAL", "DOUBLESTAREQUAL",
    "DOUBLESLASH", "DOUBLESLASHEQUAL",
    "AT",
    /* Pseudo tokens */
    "OP",
    "<ERRORTOKEN>",
    "<N_TOKENS>"
};
94 | \r | |
95 | /* Create and initialize a new tok_state structure */\r | |
96 | \r | |
97 | static struct tok_state *\r | |
98 | tok_new(void)\r | |
99 | {\r | |
100 | struct tok_state *tok = (struct tok_state *)PyMem_MALLOC(\r | |
101 | sizeof(struct tok_state));\r | |
102 | if (tok == NULL)\r | |
103 | return NULL;\r | |
104 | tok->buf = tok->cur = tok->end = tok->inp = tok->start = NULL;\r | |
105 | tok->done = E_OK;\r | |
106 | tok->fp = NULL;\r | |
107 | tok->input = NULL;\r | |
108 | tok->tabsize = TABSIZE;\r | |
109 | tok->indent = 0;\r | |
110 | tok->indstack[0] = 0;\r | |
111 | tok->atbol = 1;\r | |
112 | tok->pendin = 0;\r | |
113 | tok->prompt = tok->nextprompt = NULL;\r | |
114 | tok->lineno = 0;\r | |
115 | tok->level = 0;\r | |
116 | tok->filename = NULL;\r | |
117 | tok->altwarning = 0;\r | |
118 | tok->alterror = 0;\r | |
119 | tok->alttabsize = 1;\r | |
120 | tok->altindstack[0] = 0;\r | |
121 | tok->decoding_state = 0;\r | |
122 | tok->decoding_erred = 0;\r | |
123 | tok->read_coding_spec = 0;\r | |
124 | tok->encoding = NULL;\r | |
125 | tok->cont_line = 0;\r | |
126 | #ifndef PGEN\r | |
127 | tok->decoding_readline = NULL;\r | |
128 | tok->decoding_buffer = NULL;\r | |
129 | #endif\r | |
130 | return tok;\r | |
131 | }\r | |
132 | \r | |
133 | static char *\r | |
134 | new_string(const char *s, Py_ssize_t len)\r | |
135 | {\r | |
136 | char* result = (char *)PyMem_MALLOC(len + 1);\r | |
137 | if (result != NULL) {\r | |
138 | memcpy(result, s, len);\r | |
139 | result[len] = '\0';\r | |
140 | }\r | |
141 | return result;\r | |
142 | }\r | |
143 | \r | |
144 | #ifdef PGEN\r | |
145 | \r | |
146 | static char *\r | |
147 | decoding_fgets(char *s, int size, struct tok_state *tok)\r | |
148 | {\r | |
149 | return fgets(s, size, tok->fp);\r | |
150 | }\r | |
151 | \r | |
152 | static int\r | |
153 | decoding_feof(struct tok_state *tok)\r | |
154 | {\r | |
155 | return feof(tok->fp);\r | |
156 | }\r | |
157 | \r | |
/* PGEN build: no decoding, just hand back an allocated copy of STR. */
static char *
decode_str(const char *str, int exec_input, struct tok_state *tok)
{
    size_t length = strlen(str);

    return new_string(str, length);
}
163 | \r | |
164 | #else /* PGEN */\r | |
165 | \r | |
166 | static char *\r | |
167 | error_ret(struct tok_state *tok) /* XXX */\r | |
168 | {\r | |
169 | tok->decoding_erred = 1;\r | |
170 | if (tok->fp != NULL && tok->buf != NULL) /* see PyTokenizer_Free */\r | |
171 | PyMem_FREE(tok->buf);\r | |
172 | tok->buf = NULL;\r | |
173 | return NULL; /* as if it were EOF */\r | |
174 | }\r | |
175 | \r | |
176 | \r | |
/* Normalize the common spellings of the UTF-8 and Latin-1 codec names.

   Only the first 12 characters of S are examined; they are lower-cased
   and '_' is replaced with '-' before matching.

   Returns the string literal "utf-8" or "iso-8859-1" when S names one
   of those encodings (optionally followed by a "-suffix"); otherwise
   returns S itself, unchanged.  Callers tell the two cases apart by
   comparing the result with S. */
static char *
get_normal_name(char *s)        /* for utf-8 and latin-1 */
{
    char buf[13];
    int i;

    for (i = 0; i < 12; i++) {
        /* Go through unsigned char: handing tolower() a negative value
           (a byte >= 0x80 where plain char is signed) is undefined. */
        unsigned char c = (unsigned char)s[i];

        if (c == '\0')
            break;
        else if (c == '_')
            buf[i] = '-';
        else
            buf[i] = (char)tolower(c);
    }
    buf[i] = '\0';

    if (strcmp(buf, "utf-8") == 0 ||
        strncmp(buf, "utf-8-", 6) == 0)
        return "utf-8";
    else if (strcmp(buf, "latin-1") == 0 ||
             strcmp(buf, "iso-8859-1") == 0 ||
             strcmp(buf, "iso-latin-1") == 0 ||
             strncmp(buf, "latin-1-", 8) == 0 ||
             strncmp(buf, "iso-8859-1-", 11) == 0 ||
             strncmp(buf, "iso-latin-1-", 12) == 0)
        return "iso-8859-1";
    else
        return s;
}
205 | \r | |
206 | /* Return the coding spec in S, or NULL if none is found. */\r | |
207 | \r | |
208 | static char *\r | |
209 | get_coding_spec(const char *s, Py_ssize_t size)\r | |
210 | {\r | |
211 | Py_ssize_t i;\r | |
212 | /* Coding spec must be in a comment, and that comment must be\r | |
213 | * the only statement on the source code line. */\r | |
214 | for (i = 0; i < size - 6; i++) {\r | |
215 | if (s[i] == '#')\r | |
216 | break;\r | |
217 | if (s[i] != ' ' && s[i] != '\t' && s[i] != '\014')\r | |
218 | return NULL;\r | |
219 | }\r | |
220 | for (; i < size - 6; i++) { /* XXX inefficient search */\r | |
221 | const char* t = s + i;\r | |
222 | if (strncmp(t, "coding", 6) == 0) {\r | |
223 | const char* begin = NULL;\r | |
224 | t += 6;\r | |
225 | if (t[0] != ':' && t[0] != '=')\r | |
226 | continue;\r | |
227 | do {\r | |
228 | t++;\r | |
229 | } while (t[0] == '\x20' || t[0] == '\t');\r | |
230 | \r | |
231 | begin = t;\r | |
232 | while (Py_ISALNUM(t[0]) ||\r | |
233 | t[0] == '-' || t[0] == '_' || t[0] == '.')\r | |
234 | t++;\r | |
235 | \r | |
236 | if (begin < t) {\r | |
237 | char* r = new_string(begin, t - begin);\r | |
238 | char* q = get_normal_name(r);\r | |
239 | if (r != q) {\r | |
240 | PyMem_FREE(r);\r | |
241 | r = new_string(q, strlen(q));\r | |
242 | }\r | |
243 | return r;\r | |
244 | }\r | |
245 | }\r | |
246 | }\r | |
247 | return NULL;\r | |
248 | }\r | |
249 | \r | |
250 | /* Check whether the line contains a coding spec. If it does,\r | |
251 | invoke the set_readline function for the new encoding.\r | |
252 | This function receives the tok_state and the new encoding.\r | |
253 | Return 1 on success, 0 on failure. */\r | |
254 | \r | |
255 | static int\r | |
256 | check_coding_spec(const char* line, Py_ssize_t size, struct tok_state *tok,\r | |
257 | int set_readline(struct tok_state *, const char *))\r | |
258 | {\r | |
259 | char * cs;\r | |
260 | int r = 1;\r | |
261 | \r | |
262 | if (tok->cont_line) {\r | |
263 | /* It's a continuation line, so it can't be a coding spec. */\r | |
264 | tok->read_coding_spec = 1;\r | |
265 | return 1;\r | |
266 | }\r | |
267 | cs = get_coding_spec(line, size);\r | |
268 | if (!cs) {\r | |
269 | Py_ssize_t i;\r | |
270 | for (i = 0; i < size; i++) {\r | |
271 | if (line[i] == '#' || line[i] == '\n' || line[i] == '\r')\r | |
272 | break;\r | |
273 | if (line[i] != ' ' && line[i] != '\t' && line[i] != '\014') {\r | |
274 | /* Stop checking coding spec after a line containing\r | |
275 | * anything except a comment. */\r | |
276 | tok->read_coding_spec = 1;\r | |
277 | break;\r | |
278 | }\r | |
279 | }\r | |
280 | } else {\r | |
281 | tok->read_coding_spec = 1;\r | |
282 | if (tok->encoding == NULL) {\r | |
283 | assert(tok->decoding_state == 1); /* raw */\r | |
284 | if (strcmp(cs, "utf-8") == 0 ||\r | |
285 | strcmp(cs, "iso-8859-1") == 0) {\r | |
286 | tok->encoding = cs;\r | |
287 | } else {\r | |
288 | #ifdef Py_USING_UNICODE\r | |
289 | r = set_readline(tok, cs);\r | |
290 | if (r) {\r | |
291 | tok->encoding = cs;\r | |
292 | tok->decoding_state = -1;\r | |
293 | }\r | |
294 | else {\r | |
295 | PyErr_Format(PyExc_SyntaxError,\r | |
296 | "encoding problem: %s", cs);\r | |
297 | PyMem_FREE(cs);\r | |
298 | }\r | |
299 | #else\r | |
300 | /* Without Unicode support, we cannot\r | |
301 | process the coding spec. Since there\r | |
302 | won't be any Unicode literals, that\r | |
303 | won't matter. */\r | |
304 | PyMem_FREE(cs);\r | |
305 | #endif\r | |
306 | }\r | |
307 | } else { /* then, compare cs with BOM */\r | |
308 | r = (strcmp(tok->encoding, cs) == 0);\r | |
309 | if (!r)\r | |
310 | PyErr_Format(PyExc_SyntaxError,\r | |
311 | "encoding problem: %s with BOM", cs);\r | |
312 | PyMem_FREE(cs);\r | |
313 | }\r | |
314 | }\r | |
315 | return r;\r | |
316 | }\r | |
317 | \r | |
318 | /* See whether the file starts with a BOM. If it does,\r | |
319 | invoke the set_readline function with the new encoding.\r | |
320 | Return 1 on success, 0 on failure. */\r | |
321 | \r | |
322 | static int\r | |
323 | check_bom(int get_char(struct tok_state *),\r | |
324 | void unget_char(int, struct tok_state *),\r | |
325 | int set_readline(struct tok_state *, const char *),\r | |
326 | struct tok_state *tok)\r | |
327 | {\r | |
328 | int ch1, ch2, ch3;\r | |
329 | ch1 = get_char(tok);\r | |
330 | tok->decoding_state = 1;\r | |
331 | if (ch1 == EOF) {\r | |
332 | return 1;\r | |
333 | } else if (ch1 == 0xEF) {\r | |
334 | ch2 = get_char(tok);\r | |
335 | if (ch2 != 0xBB) {\r | |
336 | unget_char(ch2, tok);\r | |
337 | unget_char(ch1, tok);\r | |
338 | return 1;\r | |
339 | }\r | |
340 | ch3 = get_char(tok);\r | |
341 | if (ch3 != 0xBF) {\r | |
342 | unget_char(ch3, tok);\r | |
343 | unget_char(ch2, tok);\r | |
344 | unget_char(ch1, tok);\r | |
345 | return 1;\r | |
346 | }\r | |
347 | #if 0\r | |
348 | /* Disable support for UTF-16 BOMs until a decision\r | |
349 | is made whether this needs to be supported. */\r | |
350 | } else if (ch1 == 0xFE) {\r | |
351 | ch2 = get_char(tok);\r | |
352 | if (ch2 != 0xFF) {\r | |
353 | unget_char(ch2, tok);\r | |
354 | unget_char(ch1, tok);\r | |
355 | return 1;\r | |
356 | }\r | |
357 | if (!set_readline(tok, "utf-16-be"))\r | |
358 | return 0;\r | |
359 | tok->decoding_state = -1;\r | |
360 | } else if (ch1 == 0xFF) {\r | |
361 | ch2 = get_char(tok);\r | |
362 | if (ch2 != 0xFE) {\r | |
363 | unget_char(ch2, tok);\r | |
364 | unget_char(ch1, tok);\r | |
365 | return 1;\r | |
366 | }\r | |
367 | if (!set_readline(tok, "utf-16-le"))\r | |
368 | return 0;\r | |
369 | tok->decoding_state = -1;\r | |
370 | #endif\r | |
371 | } else {\r | |
372 | unget_char(ch1, tok);\r | |
373 | return 1;\r | |
374 | }\r | |
375 | if (tok->encoding != NULL)\r | |
376 | PyMem_FREE(tok->encoding);\r | |
377 | tok->encoding = new_string("utf-8", 5); /* resulting is in utf-8 */\r | |
378 | return 1;\r | |
379 | }\r | |
380 | \r | |
/* Read a line of text from TOK into S, using the stream in TOK.
   Return NULL on failure, else S.

   On entry, tok->decoding_buffer will be one of:
   1) NULL: need to call tok->decoding_readline to get a new line
   2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
      stored the result in tok->decoding_buffer
   3) PyStringObject *: previous call to fp_readl did not have enough room
      (in the s buffer) to copy entire contents of the line read
      by tok->decoding_readline.  tok->decoding_buffer has the overflow.
      In this case, fp_readl is called in a loop (with an expanded buffer)
      until the buffer ends with a '\n' (or until the end of the file is
      reached): see tok_nextc and its calls to decoding_fgets.
*/

static char *
fp_readl(char *s, int size, struct tok_state *tok)
{
#ifndef Py_USING_UNICODE
    /* In a non-Unicode build, this should never be called. */
    Py_FatalError("fp_readl should not be called in this build.");
    return NULL; /* Keep compiler happy (not reachable) */
#else
    PyObject *pending = tok->decoding_buffer;
    PyObject *encoded = NULL;
    char *bytes;
    Py_ssize_t nbytes;

    /* Ask for one less byte so we can terminate it */
    assert(size > 0);
    size--;

    if (pending != NULL) {
        /* Cases 2 and 3: take over the stashed object. */
        tok->decoding_buffer = NULL;
        if (PyString_CheckExact(pending))
            encoded = pending;
    }
    else {
        /* Case 1: fetch a fresh line from the codec. */
        pending = PyObject_CallObject(tok->decoding_readline, NULL);
        if (pending == NULL)
            return error_ret(tok);
        if (!PyUnicode_Check(pending)) {
            Py_DECREF(pending);
            PyErr_SetString(PyExc_SyntaxError,
                            "codec did not return a unicode object");
            return error_ret(tok);
        }
    }

    if (encoded == NULL) {
        encoded = PyUnicode_AsUTF8String(pending);
        Py_DECREF(pending);
        if (encoded == NULL)
            return error_ret(tok);
    }

    bytes = PyString_AsString(encoded);
    nbytes = PyString_GET_SIZE(encoded);
    if (nbytes > size) {
        /* Too long for S: stash the tail for the next call. */
        tok->decoding_buffer =
            PyString_FromStringAndSize(bytes + size, nbytes - size);
        if (tok->decoding_buffer == NULL) {
            Py_DECREF(encoded);
            return error_ret(tok);
        }
        nbytes = size;
    }
    memcpy(s, bytes, nbytes);
    s[nbytes] = '\0';
    Py_DECREF(encoded);
    return nbytes == 0 ? NULL /* EOF */ : s;
#endif
}
452 | \r | |
453 | /* Set the readline function for TOK to a StreamReader's\r | |
454 | readline function. The StreamReader is named ENC.\r | |
455 | \r | |
456 | This function is called from check_bom and check_coding_spec.\r | |
457 | \r | |
458 | ENC is usually identical to the future value of tok->encoding,\r | |
459 | except for the (currently unsupported) case of UTF-16.\r | |
460 | \r | |
461 | Return 1 on success, 0 on failure. */\r | |
462 | \r | |
463 | static int\r | |
464 | fp_setreadl(struct tok_state *tok, const char* enc)\r | |
465 | {\r | |
466 | PyObject *reader, *stream, *readline;\r | |
467 | \r | |
468 | /* XXX: constify filename argument. */\r | |
469 | stream = PyFile_FromFile(tok->fp, (char*)tok->filename, "rb", NULL);\r | |
470 | if (stream == NULL)\r | |
471 | return 0;\r | |
472 | \r | |
473 | reader = PyCodec_StreamReader(enc, stream, NULL);\r | |
474 | Py_DECREF(stream);\r | |
475 | if (reader == NULL)\r | |
476 | return 0;\r | |
477 | \r | |
478 | readline = PyObject_GetAttrString(reader, "readline");\r | |
479 | Py_DECREF(reader);\r | |
480 | if (readline == NULL)\r | |
481 | return 0;\r | |
482 | \r | |
483 | tok->decoding_readline = readline;\r | |
484 | return 1;\r | |
485 | }\r | |
486 | \r | |
487 | /* Fetch the next byte from TOK. */\r | |
488 | \r | |
489 | static int fp_getc(struct tok_state *tok) {\r | |
490 | return getc(tok->fp);\r | |
491 | }\r | |
492 | \r | |
493 | /* Unfetch the last byte back into TOK. */\r | |
494 | \r | |
495 | static void fp_ungetc(int c, struct tok_state *tok) {\r | |
496 | ungetc(c, tok->fp);\r | |
497 | }\r | |
498 | \r | |
499 | /* Read a line of input from TOK. Determine encoding\r | |
500 | if necessary. */\r | |
501 | \r | |
502 | static char *\r | |
503 | decoding_fgets(char *s, int size, struct tok_state *tok)\r | |
504 | {\r | |
505 | char *line = NULL;\r | |
506 | int badchar = 0;\r | |
507 | for (;;) {\r | |
508 | if (tok->decoding_state < 0) {\r | |
509 | /* We already have a codec associated with\r | |
510 | this input. */\r | |
511 | line = fp_readl(s, size, tok);\r | |
512 | break;\r | |
513 | } else if (tok->decoding_state > 0) {\r | |
514 | /* We want a 'raw' read. */\r | |
515 | line = Py_UniversalNewlineFgets(s, size,\r | |
516 | tok->fp, NULL);\r | |
517 | break;\r | |
518 | } else {\r | |
519 | /* We have not yet determined the encoding.\r | |
520 | If an encoding is found, use the file-pointer\r | |
521 | reader functions from now on. */\r | |
522 | if (!check_bom(fp_getc, fp_ungetc, fp_setreadl, tok))\r | |
523 | return error_ret(tok);\r | |
524 | assert(tok->decoding_state != 0);\r | |
525 | }\r | |
526 | }\r | |
527 | if (line != NULL && tok->lineno < 2 && !tok->read_coding_spec) {\r | |
528 | if (!check_coding_spec(line, strlen(line), tok, fp_setreadl)) {\r | |
529 | return error_ret(tok);\r | |
530 | }\r | |
531 | }\r | |
532 | #ifndef PGEN\r | |
533 | /* The default encoding is ASCII, so make sure we don't have any\r | |
534 | non-ASCII bytes in it. */\r | |
535 | if (line && !tok->encoding) {\r | |
536 | unsigned char *c;\r | |
537 | for (c = (unsigned char *)line; *c; c++)\r | |
538 | if (*c > 127) {\r | |
539 | badchar = *c;\r | |
540 | break;\r | |
541 | }\r | |
542 | }\r | |
543 | if (badchar) {\r | |
544 | char buf[500];\r | |
545 | /* Need to add 1 to the line number, since this line\r | |
546 | has not been counted, yet. */\r | |
547 | sprintf(buf,\r | |
548 | "Non-ASCII character '\\x%.2x' "\r | |
549 | "in file %.200s on line %i, "\r | |
550 | "but no encoding declared; "\r | |
551 | "see http://python.org/dev/peps/pep-0263/ for details",\r | |
552 | badchar, tok->filename, tok->lineno + 1);\r | |
553 | PyErr_SetString(PyExc_SyntaxError, buf);\r | |
554 | return error_ret(tok);\r | |
555 | }\r | |
556 | #endif\r | |
557 | return line;\r | |
558 | }\r | |
559 | \r | |
560 | static int\r | |
561 | decoding_feof(struct tok_state *tok)\r | |
562 | {\r | |
563 | if (tok->decoding_state >= 0) {\r | |
564 | return feof(tok->fp);\r | |
565 | } else {\r | |
566 | PyObject* buf = tok->decoding_buffer;\r | |
567 | if (buf == NULL) {\r | |
568 | buf = PyObject_CallObject(tok->decoding_readline, NULL);\r | |
569 | if (buf == NULL) {\r | |
570 | error_ret(tok);\r | |
571 | return 1;\r | |
572 | } else {\r | |
573 | tok->decoding_buffer = buf;\r | |
574 | }\r | |
575 | }\r | |
576 | return PyObject_Length(buf) == 0;\r | |
577 | }\r | |
578 | }\r | |
579 | \r | |
580 | /* Fetch a byte from TOK, using the string buffer. */\r | |
581 | \r | |
582 | static int\r | |
583 | buf_getc(struct tok_state *tok) {\r | |
584 | return Py_CHARMASK(*tok->str++);\r | |
585 | }\r | |
586 | \r | |
587 | /* Unfetch a byte from TOK, using the string buffer. */\r | |
588 | \r | |
589 | static void\r | |
590 | buf_ungetc(int c, struct tok_state *tok) {\r | |
591 | tok->str--;\r | |
592 | assert(Py_CHARMASK(*tok->str) == c); /* tok->cur may point to read-only segment */\r | |
593 | }\r | |
594 | \r | |
595 | /* Set the readline function for TOK to ENC. For the string-based\r | |
596 | tokenizer, this means to just record the encoding. */\r | |
597 | \r | |
598 | static int\r | |
599 | buf_setreadl(struct tok_state *tok, const char* enc) {\r | |
600 | tok->enc = enc;\r | |
601 | return 1;\r | |
602 | }\r | |
603 | \r | |
/* Return a UTF-8 encoded Python string object built from the
   C byte string STR, which is encoded with ENC.
   Returns a new reference, or NULL if decoding or encoding fails. */

#ifdef Py_USING_UNICODE
static PyObject *
translate_into_utf8(const char* str, const char* enc)
{
    PyObject *encoded;
    PyObject *decoded;

    decoded = PyUnicode_Decode(str, strlen(str), enc, NULL);
    if (decoded == NULL) {
        return NULL;
    }
    encoded = PyUnicode_AsUTF8String(decoded);
    Py_DECREF(decoded);
    return encoded;
}
#endif
619 | \r | |
620 | \r | |
621 | static char *\r | |
622 | translate_newlines(const char *s, int exec_input, struct tok_state *tok) {\r | |
623 | int skip_next_lf = 0, needed_length = strlen(s) + 2, final_length;\r | |
624 | char *buf, *current;\r | |
625 | char c = '\0';\r | |
626 | buf = PyMem_MALLOC(needed_length);\r | |
627 | if (buf == NULL) {\r | |
628 | tok->done = E_NOMEM;\r | |
629 | return NULL;\r | |
630 | }\r | |
631 | for (current = buf; *s; s++, current++) {\r | |
632 | c = *s;\r | |
633 | if (skip_next_lf) {\r | |
634 | skip_next_lf = 0;\r | |
635 | if (c == '\n') {\r | |
636 | c = *++s;\r | |
637 | if (!c)\r | |
638 | break;\r | |
639 | }\r | |
640 | }\r | |
641 | if (c == '\r') {\r | |
642 | skip_next_lf = 1;\r | |
643 | c = '\n';\r | |
644 | }\r | |
645 | *current = c;\r | |
646 | }\r | |
647 | /* If this is exec input, add a newline to the end of the string if\r | |
648 | there isn't one already. */\r | |
649 | if (exec_input && c != '\n') {\r | |
650 | *current = '\n';\r | |
651 | current++;\r | |
652 | }\r | |
653 | *current = '\0';\r | |
654 | final_length = current - buf + 1;\r | |
655 | if (final_length < needed_length && final_length)\r | |
656 | /* should never fail */\r | |
657 | buf = PyMem_REALLOC(buf, final_length);\r | |
658 | return buf;\r | |
659 | }\r | |
660 | \r | |
661 | /* Decode a byte string STR for use as the buffer of TOK.\r | |
662 | Look for encoding declarations inside STR, and record them\r | |
663 | inside TOK. */\r | |
664 | \r | |
665 | static const char *\r | |
666 | decode_str(const char *input, int single, struct tok_state *tok)\r | |
667 | {\r | |
668 | PyObject* utf8 = NULL;\r | |
669 | const char *str;\r | |
670 | const char *s;\r | |
671 | const char *newl[2] = {NULL, NULL};\r | |
672 | int lineno = 0;\r | |
673 | tok->input = str = translate_newlines(input, single, tok);\r | |
674 | if (str == NULL)\r | |
675 | return NULL;\r | |
676 | tok->enc = NULL;\r | |
677 | tok->str = str;\r | |
678 | if (!check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))\r | |
679 | return error_ret(tok);\r | |
680 | str = tok->str; /* string after BOM if any */\r | |
681 | assert(str);\r | |
682 | #ifdef Py_USING_UNICODE\r | |
683 | if (tok->enc != NULL) {\r | |
684 | utf8 = translate_into_utf8(str, tok->enc);\r | |
685 | if (utf8 == NULL)\r | |
686 | return error_ret(tok);\r | |
687 | str = PyString_AsString(utf8);\r | |
688 | }\r | |
689 | #endif\r | |
690 | for (s = str;; s++) {\r | |
691 | if (*s == '\0') break;\r | |
692 | else if (*s == '\n') {\r | |
693 | assert(lineno < 2);\r | |
694 | newl[lineno] = s;\r | |
695 | lineno++;\r | |
696 | if (lineno == 2) break;\r | |
697 | }\r | |
698 | }\r | |
699 | tok->enc = NULL;\r | |
700 | /* need to check line 1 and 2 separately since check_coding_spec\r | |
701 | assumes a single line as input */\r | |
702 | if (newl[0]) {\r | |
703 | if (!check_coding_spec(str, newl[0] - str, tok, buf_setreadl))\r | |
704 | return error_ret(tok);\r | |
705 | if (tok->enc == NULL && !tok->read_coding_spec && newl[1]) {\r | |
706 | if (!check_coding_spec(newl[0]+1, newl[1] - newl[0],\r | |
707 | tok, buf_setreadl))\r | |
708 | return error_ret(tok);\r | |
709 | }\r | |
710 | }\r | |
711 | #ifdef Py_USING_UNICODE\r | |
712 | if (tok->enc != NULL) {\r | |
713 | assert(utf8 == NULL);\r | |
714 | utf8 = translate_into_utf8(str, tok->enc);\r | |
715 | if (utf8 == NULL)\r | |
716 | return error_ret(tok);\r | |
717 | str = PyString_AsString(utf8);\r | |
718 | }\r | |
719 | #endif\r | |
720 | assert(tok->decoding_buffer == NULL);\r | |
721 | tok->decoding_buffer = utf8; /* CAUTION */\r | |
722 | return str;\r | |
723 | }\r | |
724 | \r | |
725 | #endif /* PGEN */\r | |
726 | \r | |
727 | /* Set up tokenizer for string */\r | |
728 | \r | |
729 | struct tok_state *\r | |
730 | PyTokenizer_FromString(const char *str, int exec_input)\r | |
731 | {\r | |
732 | struct tok_state *tok = tok_new();\r | |
733 | if (tok == NULL)\r | |
734 | return NULL;\r | |
735 | str = (char *)decode_str(str, exec_input, tok);\r | |
736 | if (str == NULL) {\r | |
737 | PyTokenizer_Free(tok);\r | |
738 | return NULL;\r | |
739 | }\r | |
740 | \r | |
741 | /* XXX: constify members. */\r | |
742 | tok->buf = tok->cur = tok->end = tok->inp = (char*)str;\r | |
743 | return tok;\r | |
744 | }\r | |
745 | \r | |
746 | \r | |
747 | /* Set up tokenizer for file */\r | |
748 | \r | |
749 | struct tok_state *\r | |
750 | PyTokenizer_FromFile(FILE *fp, char *ps1, char *ps2)\r | |
751 | {\r | |
752 | struct tok_state *tok = tok_new();\r | |
753 | if (tok == NULL)\r | |
754 | return NULL;\r | |
755 | if ((tok->buf = (char *)PyMem_MALLOC(BUFSIZ)) == NULL) {\r | |
756 | PyTokenizer_Free(tok);\r | |
757 | return NULL;\r | |
758 | }\r | |
759 | tok->cur = tok->inp = tok->buf;\r | |
760 | tok->end = tok->buf + BUFSIZ;\r | |
761 | tok->fp = fp;\r | |
762 | tok->prompt = ps1;\r | |
763 | tok->nextprompt = ps2;\r | |
764 | return tok;\r | |
765 | }\r | |
766 | \r | |
767 | \r | |
768 | /* Free a tok_state structure */\r | |
769 | \r | |
770 | void\r | |
771 | PyTokenizer_Free(struct tok_state *tok)\r | |
772 | {\r | |
773 | if (tok->encoding != NULL)\r | |
774 | PyMem_FREE(tok->encoding);\r | |
775 | #ifndef PGEN\r | |
776 | Py_XDECREF(tok->decoding_readline);\r | |
777 | Py_XDECREF(tok->decoding_buffer);\r | |
778 | #endif\r | |
779 | if (tok->fp != NULL && tok->buf != NULL)\r | |
780 | PyMem_FREE(tok->buf);\r | |
781 | if (tok->input)\r | |
782 | PyMem_FREE((char *)tok->input);\r | |
783 | PyMem_FREE(tok);\r | |
784 | }\r | |
785 | \r | |
#if !defined(PGEN) && defined(Py_USING_UNICODE)
/* Re-encode an interactively read line from sys.stdin's encoding to
   UTF-8.

   *INP is an allocated line.  On successful conversion it is freed and
   replaced by an allocated UTF-8 copy, and tok->encoding is set to the
   name of the stdin encoding.

   Returns 0 when the line may be used (converted, or left untouched
   because stdin has no usable encoding or decoding raised
   UnicodeDecodeError), and -1 on error with tok->done set. */
static int
tok_stdin_decode(struct tok_state *tok, char **inp)
{
    PyObject *enc, *sysstdin, *decoded, *utf8;
    const char *encoding;
    char *converted;

    /* Only applies when sys.stdin is the C-level stdin. */
    if (PySys_GetFile((char *)"stdin", NULL) != stdin)
        return 0;
    sysstdin = PySys_GetObject("stdin");
    if (sysstdin == NULL || !PyFile_Check(sysstdin))
        return 0;

    enc = ((PyFileObject *)sysstdin)->f_encoding;
    if (enc == NULL || !PyString_Check(enc))
        return 0;
    Py_INCREF(enc);

    encoding = PyString_AsString(enc);
    decoded = PyUnicode_Decode(*inp, strlen(*inp), encoding, NULL);
    if (decoded == NULL)
        goto error_clear;

    utf8 = PyUnicode_AsEncodedString(decoded, "utf-8", NULL);
    Py_DECREF(decoded);
    if (utf8 == NULL)
        goto error_clear;

    assert(PyString_Check(utf8));
    converted = new_string(PyString_AS_STRING(utf8),
                           PyString_GET_SIZE(utf8));
    Py_DECREF(utf8);
    if (converted == NULL)
        goto error_nomem;

    /* Swap the caller's line for the converted one. */
    PyMem_FREE(*inp);
    *inp = converted;

    if (tok->encoding != NULL)
        PyMem_FREE(tok->encoding);
    tok->encoding = new_string(encoding, strlen(encoding));
    if (tok->encoding == NULL)
        goto error_nomem;

    Py_DECREF(enc);
    return 0;

error_nomem:
    Py_DECREF(enc);
    tok->done = E_NOMEM;
    return -1;

error_clear:
    Py_DECREF(enc);
    if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError)) {
        tok->done = E_ERROR;
        return -1;
    }
    /* Fallback to iso-8859-1: for backward compatibility */
    PyErr_Clear();
    return 0;
}
#endif
849 | \r | |
850 | /* Get next char, updating state; error code goes into tok->done */\r | |
851 | \r | |
852 | static int\r | |
853 | tok_nextc(register struct tok_state *tok)\r | |
854 | {\r | |
855 | for (;;) {\r | |
856 | if (tok->cur != tok->inp) {\r | |
857 | return Py_CHARMASK(*tok->cur++); /* Fast path */\r | |
858 | }\r | |
859 | if (tok->done != E_OK)\r | |
860 | return EOF;\r | |
861 | if (tok->fp == NULL) {\r | |
862 | char *end = strchr(tok->inp, '\n');\r | |
863 | if (end != NULL)\r | |
864 | end++;\r | |
865 | else {\r | |
866 | end = strchr(tok->inp, '\0');\r | |
867 | if (end == tok->inp) {\r | |
868 | tok->done = E_EOF;\r | |
869 | return EOF;\r | |
870 | }\r | |
871 | }\r | |
872 | if (tok->start == NULL)\r | |
873 | tok->buf = tok->cur;\r | |
874 | tok->line_start = tok->cur;\r | |
875 | tok->lineno++;\r | |
876 | tok->inp = end;\r | |
877 | return Py_CHARMASK(*tok->cur++);\r | |
878 | }\r | |
879 | if (tok->prompt != NULL) {\r | |
880 | char *newtok = PyOS_Readline(stdin, stdout, tok->prompt);\r | |
881 | if (tok->nextprompt != NULL)\r | |
882 | tok->prompt = tok->nextprompt;\r | |
883 | if (newtok == NULL)\r | |
884 | tok->done = E_INTR;\r | |
885 | else if (*newtok == '\0') {\r | |
886 | PyMem_FREE(newtok);\r | |
887 | tok->done = E_EOF;\r | |
888 | }\r | |
889 | #if !defined(PGEN) && defined(Py_USING_UNICODE)\r | |
890 | else if (tok_stdin_decode(tok, &newtok) != 0)\r | |
891 | PyMem_FREE(newtok);\r | |
892 | #endif\r | |
893 | else if (tok->start != NULL) {\r | |
894 | size_t start = tok->start - tok->buf;\r | |
895 | size_t oldlen = tok->cur - tok->buf;\r | |
896 | size_t newlen = oldlen + strlen(newtok);\r | |
897 | char *buf = tok->buf;\r | |
898 | buf = (char *)PyMem_REALLOC(buf, newlen+1);\r | |
899 | tok->lineno++;\r | |
900 | if (buf == NULL) {\r | |
901 | PyMem_FREE(tok->buf);\r | |
902 | tok->buf = NULL;\r | |
903 | PyMem_FREE(newtok);\r | |
904 | tok->done = E_NOMEM;\r | |
905 | return EOF;\r | |
906 | }\r | |
907 | tok->buf = buf;\r | |
908 | tok->cur = tok->buf + oldlen;\r | |
909 | tok->line_start = tok->cur;\r | |
910 | strcpy(tok->buf + oldlen, newtok);\r | |
911 | PyMem_FREE(newtok);\r | |
912 | tok->inp = tok->buf + newlen;\r | |
913 | tok->end = tok->inp + 1;\r | |
914 | tok->start = tok->buf + start;\r | |
915 | }\r | |
916 | else {\r | |
917 | tok->lineno++;\r | |
918 | if (tok->buf != NULL)\r | |
919 | PyMem_FREE(tok->buf);\r | |
920 | tok->buf = newtok;\r | |
921 | tok->line_start = tok->buf;\r | |
922 | tok->cur = tok->buf;\r | |
923 | tok->line_start = tok->buf;\r | |
924 | tok->inp = strchr(tok->buf, '\0');\r | |
925 | tok->end = tok->inp + 1;\r | |
926 | }\r | |
927 | }\r | |
928 | else {\r | |
929 | int done = 0;\r | |
930 | Py_ssize_t cur = 0;\r | |
931 | char *pt;\r | |
932 | if (tok->start == NULL) {\r | |
933 | if (tok->buf == NULL) {\r | |
934 | tok->buf = (char *)\r | |
935 | PyMem_MALLOC(BUFSIZ);\r | |
936 | if (tok->buf == NULL) {\r | |
937 | tok->done = E_NOMEM;\r | |
938 | return EOF;\r | |
939 | }\r | |
940 | tok->end = tok->buf + BUFSIZ;\r | |
941 | }\r | |
942 | if (decoding_fgets(tok->buf, (int)(tok->end - tok->buf),\r | |
943 | tok) == NULL) {\r | |
944 | tok->done = E_EOF;\r | |
945 | done = 1;\r | |
946 | }\r | |
947 | else {\r | |
948 | tok->done = E_OK;\r | |
949 | tok->inp = strchr(tok->buf, '\0');\r | |
950 | done = tok->inp[-1] == '\n';\r | |
951 | }\r | |
952 | }\r | |
953 | else {\r | |
954 | cur = tok->cur - tok->buf;\r | |
955 | if (decoding_feof(tok)) {\r | |
956 | tok->done = E_EOF;\r | |
957 | done = 1;\r | |
958 | }\r | |
959 | else\r | |
960 | tok->done = E_OK;\r | |
961 | }\r | |
962 | tok->lineno++;\r | |
963 | /* Read until '\n' or EOF */\r | |
964 | while (!done) {\r | |
965 | Py_ssize_t curstart = tok->start == NULL ? -1 :\r | |
966 | tok->start - tok->buf;\r | |
967 | Py_ssize_t curvalid = tok->inp - tok->buf;\r | |
968 | Py_ssize_t newsize = curvalid + BUFSIZ;\r | |
969 | char *newbuf = tok->buf;\r | |
970 | newbuf = (char *)PyMem_REALLOC(newbuf,\r | |
971 | newsize);\r | |
972 | if (newbuf == NULL) {\r | |
973 | tok->done = E_NOMEM;\r | |
974 | tok->cur = tok->inp;\r | |
975 | return EOF;\r | |
976 | }\r | |
977 | tok->buf = newbuf;\r | |
978 | tok->inp = tok->buf + curvalid;\r | |
979 | tok->end = tok->buf + newsize;\r | |
980 | tok->start = curstart < 0 ? NULL :\r | |
981 | tok->buf + curstart;\r | |
982 | if (decoding_fgets(tok->inp,\r | |
983 | (int)(tok->end - tok->inp),\r | |
984 | tok) == NULL) {\r | |
985 | /* Break out early on decoding\r | |
986 | errors, as tok->buf will be NULL\r | |
987 | */\r | |
988 | if (tok->decoding_erred)\r | |
989 | return EOF;\r | |
990 | /* Last line does not end in \n,\r | |
991 | fake one */\r | |
992 | strcpy(tok->inp, "\n");\r | |
993 | }\r | |
994 | tok->inp = strchr(tok->inp, '\0');\r | |
995 | done = tok->inp[-1] == '\n';\r | |
996 | }\r | |
997 | if (tok->buf != NULL) {\r | |
998 | tok->cur = tok->buf + cur;\r | |
999 | tok->line_start = tok->cur;\r | |
1000 | /* replace "\r\n" with "\n" */\r | |
1001 | /* For Mac leave the \r, giving a syntax error */\r | |
1002 | pt = tok->inp - 2;\r | |
1003 | if (pt >= tok->buf && *pt == '\r') {\r | |
1004 | *pt++ = '\n';\r | |
1005 | *pt = '\0';\r | |
1006 | tok->inp = pt;\r | |
1007 | }\r | |
1008 | }\r | |
1009 | }\r | |
1010 | if (tok->done != E_OK) {\r | |
1011 | if (tok->prompt != NULL)\r | |
1012 | PySys_WriteStderr("\n");\r | |
1013 | tok->cur = tok->inp;\r | |
1014 | return EOF;\r | |
1015 | }\r | |
1016 | }\r | |
1017 | /*NOTREACHED*/\r | |
1018 | }\r | |
1019 | \r | |
1020 | \r | |
1021 | /* Back-up one character */\r | |
1022 | \r | |
1023 | static void\r | |
1024 | tok_backup(register struct tok_state *tok, register int c)\r | |
1025 | {\r | |
1026 | if (c != EOF) {\r | |
1027 | if (--tok->cur < tok->buf)\r | |
1028 | Py_FatalError("tok_backup: beginning of buffer");\r | |
1029 | if (*tok->cur != c)\r | |
1030 | *tok->cur = c;\r | |
1031 | }\r | |
1032 | }\r | |
1033 | \r | |
1034 | \r | |
1035 | /* Return the token corresponding to a single character */\r | |
1036 | \r | |
1037 | int\r | |
1038 | PyToken_OneChar(int c)\r | |
1039 | {\r | |
1040 | switch (c) {\r | |
1041 | case '(': return LPAR;\r | |
1042 | case ')': return RPAR;\r | |
1043 | case '[': return LSQB;\r | |
1044 | case ']': return RSQB;\r | |
1045 | case ':': return COLON;\r | |
1046 | case ',': return COMMA;\r | |
1047 | case ';': return SEMI;\r | |
1048 | case '+': return PLUS;\r | |
1049 | case '-': return MINUS;\r | |
1050 | case '*': return STAR;\r | |
1051 | case '/': return SLASH;\r | |
1052 | case '|': return VBAR;\r | |
1053 | case '&': return AMPER;\r | |
1054 | case '<': return LESS;\r | |
1055 | case '>': return GREATER;\r | |
1056 | case '=': return EQUAL;\r | |
1057 | case '.': return DOT;\r | |
1058 | case '%': return PERCENT;\r | |
1059 | case '`': return BACKQUOTE;\r | |
1060 | case '{': return LBRACE;\r | |
1061 | case '}': return RBRACE;\r | |
1062 | case '^': return CIRCUMFLEX;\r | |
1063 | case '~': return TILDE;\r | |
1064 | case '@': return AT;\r | |
1065 | default: return OP;\r | |
1066 | }\r | |
1067 | }\r | |
1068 | \r | |
1069 | \r | |
1070 | int\r | |
1071 | PyToken_TwoChars(int c1, int c2)\r | |
1072 | {\r | |
1073 | switch (c1) {\r | |
1074 | case '=':\r | |
1075 | switch (c2) {\r | |
1076 | case '=': return EQEQUAL;\r | |
1077 | }\r | |
1078 | break;\r | |
1079 | case '!':\r | |
1080 | switch (c2) {\r | |
1081 | case '=': return NOTEQUAL;\r | |
1082 | }\r | |
1083 | break;\r | |
1084 | case '<':\r | |
1085 | switch (c2) {\r | |
1086 | case '>': return NOTEQUAL;\r | |
1087 | case '=': return LESSEQUAL;\r | |
1088 | case '<': return LEFTSHIFT;\r | |
1089 | }\r | |
1090 | break;\r | |
1091 | case '>':\r | |
1092 | switch (c2) {\r | |
1093 | case '=': return GREATEREQUAL;\r | |
1094 | case '>': return RIGHTSHIFT;\r | |
1095 | }\r | |
1096 | break;\r | |
1097 | case '+':\r | |
1098 | switch (c2) {\r | |
1099 | case '=': return PLUSEQUAL;\r | |
1100 | }\r | |
1101 | break;\r | |
1102 | case '-':\r | |
1103 | switch (c2) {\r | |
1104 | case '=': return MINEQUAL;\r | |
1105 | }\r | |
1106 | break;\r | |
1107 | case '*':\r | |
1108 | switch (c2) {\r | |
1109 | case '*': return DOUBLESTAR;\r | |
1110 | case '=': return STAREQUAL;\r | |
1111 | }\r | |
1112 | break;\r | |
1113 | case '/':\r | |
1114 | switch (c2) {\r | |
1115 | case '/': return DOUBLESLASH;\r | |
1116 | case '=': return SLASHEQUAL;\r | |
1117 | }\r | |
1118 | break;\r | |
1119 | case '|':\r | |
1120 | switch (c2) {\r | |
1121 | case '=': return VBAREQUAL;\r | |
1122 | }\r | |
1123 | break;\r | |
1124 | case '%':\r | |
1125 | switch (c2) {\r | |
1126 | case '=': return PERCENTEQUAL;\r | |
1127 | }\r | |
1128 | break;\r | |
1129 | case '&':\r | |
1130 | switch (c2) {\r | |
1131 | case '=': return AMPEREQUAL;\r | |
1132 | }\r | |
1133 | break;\r | |
1134 | case '^':\r | |
1135 | switch (c2) {\r | |
1136 | case '=': return CIRCUMFLEXEQUAL;\r | |
1137 | }\r | |
1138 | break;\r | |
1139 | }\r | |
1140 | return OP;\r | |
1141 | }\r | |
1142 | \r | |
1143 | int\r | |
1144 | PyToken_ThreeChars(int c1, int c2, int c3)\r | |
1145 | {\r | |
1146 | switch (c1) {\r | |
1147 | case '<':\r | |
1148 | switch (c2) {\r | |
1149 | case '<':\r | |
1150 | switch (c3) {\r | |
1151 | case '=':\r | |
1152 | return LEFTSHIFTEQUAL;\r | |
1153 | }\r | |
1154 | break;\r | |
1155 | }\r | |
1156 | break;\r | |
1157 | case '>':\r | |
1158 | switch (c2) {\r | |
1159 | case '>':\r | |
1160 | switch (c3) {\r | |
1161 | case '=':\r | |
1162 | return RIGHTSHIFTEQUAL;\r | |
1163 | }\r | |
1164 | break;\r | |
1165 | }\r | |
1166 | break;\r | |
1167 | case '*':\r | |
1168 | switch (c2) {\r | |
1169 | case '*':\r | |
1170 | switch (c3) {\r | |
1171 | case '=':\r | |
1172 | return DOUBLESTAREQUAL;\r | |
1173 | }\r | |
1174 | break;\r | |
1175 | }\r | |
1176 | break;\r | |
1177 | case '/':\r | |
1178 | switch (c2) {\r | |
1179 | case '/':\r | |
1180 | switch (c3) {\r | |
1181 | case '=':\r | |
1182 | return DOUBLESLASHEQUAL;\r | |
1183 | }\r | |
1184 | break;\r | |
1185 | }\r | |
1186 | break;\r | |
1187 | }\r | |
1188 | return OP;\r | |
1189 | }\r | |
1190 | \r | |
1191 | static int\r | |
1192 | indenterror(struct tok_state *tok)\r | |
1193 | {\r | |
1194 | if (tok->alterror) {\r | |
1195 | tok->done = E_TABSPACE;\r | |
1196 | tok->cur = tok->inp;\r | |
1197 | return 1;\r | |
1198 | }\r | |
1199 | if (tok->altwarning) {\r | |
1200 | PySys_WriteStderr("%s: inconsistent use of tabs and spaces "\r | |
1201 | "in indentation\n", tok->filename);\r | |
1202 | tok->altwarning = 0;\r | |
1203 | }\r | |
1204 | return 0;\r | |
1205 | }\r | |
1206 | \r | |
1207 | /* Get next token, after space stripping etc. */\r | |
1208 | \r | |
1209 | static int\r | |
1210 | tok_get(register struct tok_state *tok, char **p_start, char **p_end)\r | |
1211 | {\r | |
1212 | register int c;\r | |
1213 | int blankline;\r | |
1214 | \r | |
1215 | *p_start = *p_end = NULL;\r | |
1216 | nextline:\r | |
1217 | tok->start = NULL;\r | |
1218 | blankline = 0;\r | |
1219 | \r | |
1220 | /* Get indentation level */\r | |
1221 | if (tok->atbol) {\r | |
1222 | register int col = 0;\r | |
1223 | register int altcol = 0;\r | |
1224 | tok->atbol = 0;\r | |
1225 | for (;;) {\r | |
1226 | c = tok_nextc(tok);\r | |
1227 | if (c == ' ')\r | |
1228 | col++, altcol++;\r | |
1229 | else if (c == '\t') {\r | |
1230 | col = (col/tok->tabsize + 1) * tok->tabsize;\r | |
1231 | altcol = (altcol/tok->alttabsize + 1)\r | |
1232 | * tok->alttabsize;\r | |
1233 | }\r | |
1234 | else if (c == '\014') /* Control-L (formfeed) */\r | |
1235 | col = altcol = 0; /* For Emacs users */\r | |
1236 | else\r | |
1237 | break;\r | |
1238 | }\r | |
1239 | tok_backup(tok, c);\r | |
1240 | if (c == '#' || c == '\n') {\r | |
1241 | /* Lines with only whitespace and/or comments\r | |
1242 | shouldn't affect the indentation and are\r | |
1243 | not passed to the parser as NEWLINE tokens,\r | |
1244 | except *totally* empty lines in interactive\r | |
1245 | mode, which signal the end of a command group. */\r | |
1246 | if (col == 0 && c == '\n' && tok->prompt != NULL)\r | |
1247 | blankline = 0; /* Let it through */\r | |
1248 | else\r | |
1249 | blankline = 1; /* Ignore completely */\r | |
1250 | /* We can't jump back right here since we still\r | |
1251 | may need to skip to the end of a comment */\r | |
1252 | }\r | |
1253 | if (!blankline && tok->level == 0) {\r | |
1254 | if (col == tok->indstack[tok->indent]) {\r | |
1255 | /* No change */\r | |
1256 | if (altcol != tok->altindstack[tok->indent]) {\r | |
1257 | if (indenterror(tok))\r | |
1258 | return ERRORTOKEN;\r | |
1259 | }\r | |
1260 | }\r | |
1261 | else if (col > tok->indstack[tok->indent]) {\r | |
1262 | /* Indent -- always one */\r | |
1263 | if (tok->indent+1 >= MAXINDENT) {\r | |
1264 | tok->done = E_TOODEEP;\r | |
1265 | tok->cur = tok->inp;\r | |
1266 | return ERRORTOKEN;\r | |
1267 | }\r | |
1268 | if (altcol <= tok->altindstack[tok->indent]) {\r | |
1269 | if (indenterror(tok))\r | |
1270 | return ERRORTOKEN;\r | |
1271 | }\r | |
1272 | tok->pendin++;\r | |
1273 | tok->indstack[++tok->indent] = col;\r | |
1274 | tok->altindstack[tok->indent] = altcol;\r | |
1275 | }\r | |
1276 | else /* col < tok->indstack[tok->indent] */ {\r | |
1277 | /* Dedent -- any number, must be consistent */\r | |
1278 | while (tok->indent > 0 &&\r | |
1279 | col < tok->indstack[tok->indent]) {\r | |
1280 | tok->pendin--;\r | |
1281 | tok->indent--;\r | |
1282 | }\r | |
1283 | if (col != tok->indstack[tok->indent]) {\r | |
1284 | tok->done = E_DEDENT;\r | |
1285 | tok->cur = tok->inp;\r | |
1286 | return ERRORTOKEN;\r | |
1287 | }\r | |
1288 | if (altcol != tok->altindstack[tok->indent]) {\r | |
1289 | if (indenterror(tok))\r | |
1290 | return ERRORTOKEN;\r | |
1291 | }\r | |
1292 | }\r | |
1293 | }\r | |
1294 | }\r | |
1295 | \r | |
1296 | tok->start = tok->cur;\r | |
1297 | \r | |
1298 | /* Return pending indents/dedents */\r | |
1299 | if (tok->pendin != 0) {\r | |
1300 | if (tok->pendin < 0) {\r | |
1301 | tok->pendin++;\r | |
1302 | return DEDENT;\r | |
1303 | }\r | |
1304 | else {\r | |
1305 | tok->pendin--;\r | |
1306 | return INDENT;\r | |
1307 | }\r | |
1308 | }\r | |
1309 | \r | |
1310 | again:\r | |
1311 | tok->start = NULL;\r | |
1312 | /* Skip spaces */\r | |
1313 | do {\r | |
1314 | c = tok_nextc(tok);\r | |
1315 | } while (c == ' ' || c == '\t' || c == '\014');\r | |
1316 | \r | |
1317 | /* Set start of current token */\r | |
1318 | tok->start = tok->cur - 1;\r | |
1319 | \r | |
1320 | /* Skip comment, while looking for tab-setting magic */\r | |
1321 | if (c == '#') {\r | |
1322 | static char *tabforms[] = {\r | |
1323 | "tab-width:", /* Emacs */\r | |
1324 | ":tabstop=", /* vim, full form */\r | |
1325 | ":ts=", /* vim, abbreviated form */\r | |
1326 | "set tabsize=", /* will vi never die? */\r | |
1327 | /* more templates can be added here to support other editors */\r | |
1328 | };\r | |
1329 | char cbuf[80];\r | |
1330 | char *tp, **cp;\r | |
1331 | tp = cbuf;\r | |
1332 | do {\r | |
1333 | *tp++ = c = tok_nextc(tok);\r | |
1334 | } while (c != EOF && c != '\n' &&\r | |
1335 | (size_t)(tp - cbuf + 1) < sizeof(cbuf));\r | |
1336 | *tp = '\0';\r | |
1337 | for (cp = tabforms;\r | |
1338 | cp < tabforms + sizeof(tabforms)/sizeof(tabforms[0]);\r | |
1339 | cp++) {\r | |
1340 | if ((tp = strstr(cbuf, *cp))) {\r | |
1341 | int newsize = atoi(tp + strlen(*cp));\r | |
1342 | \r | |
1343 | if (newsize >= 1 && newsize <= 40) {\r | |
1344 | tok->tabsize = newsize;\r | |
1345 | if (Py_VerboseFlag)\r | |
1346 | PySys_WriteStderr(\r | |
1347 | "Tab size set to %d\n",\r | |
1348 | newsize);\r | |
1349 | }\r | |
1350 | }\r | |
1351 | }\r | |
1352 | while (c != EOF && c != '\n')\r | |
1353 | c = tok_nextc(tok);\r | |
1354 | }\r | |
1355 | \r | |
1356 | /* Check for EOF and errors now */\r | |
1357 | if (c == EOF) {\r | |
1358 | return tok->done == E_EOF ? ENDMARKER : ERRORTOKEN;\r | |
1359 | }\r | |
1360 | \r | |
1361 | /* Identifier (most frequent token!) */\r | |
1362 | if (Py_ISALPHA(c) || c == '_') {\r | |
1363 | /* Process r"", u"" and ur"" */\r | |
1364 | switch (c) {\r | |
1365 | case 'b':\r | |
1366 | case 'B':\r | |
1367 | c = tok_nextc(tok);\r | |
1368 | if (c == 'r' || c == 'R')\r | |
1369 | c = tok_nextc(tok);\r | |
1370 | if (c == '"' || c == '\'')\r | |
1371 | goto letter_quote;\r | |
1372 | break;\r | |
1373 | case 'r':\r | |
1374 | case 'R':\r | |
1375 | c = tok_nextc(tok);\r | |
1376 | if (c == '"' || c == '\'')\r | |
1377 | goto letter_quote;\r | |
1378 | break;\r | |
1379 | case 'u':\r | |
1380 | case 'U':\r | |
1381 | c = tok_nextc(tok);\r | |
1382 | if (c == 'r' || c == 'R')\r | |
1383 | c = tok_nextc(tok);\r | |
1384 | if (c == '"' || c == '\'')\r | |
1385 | goto letter_quote;\r | |
1386 | break;\r | |
1387 | }\r | |
1388 | while (c != EOF && (Py_ISALNUM(c) || c == '_')) {\r | |
1389 | c = tok_nextc(tok);\r | |
1390 | }\r | |
1391 | tok_backup(tok, c);\r | |
1392 | *p_start = tok->start;\r | |
1393 | *p_end = tok->cur;\r | |
1394 | return NAME;\r | |
1395 | }\r | |
1396 | \r | |
1397 | /* Newline */\r | |
1398 | if (c == '\n') {\r | |
1399 | tok->atbol = 1;\r | |
1400 | if (blankline || tok->level > 0)\r | |
1401 | goto nextline;\r | |
1402 | *p_start = tok->start;\r | |
1403 | *p_end = tok->cur - 1; /* Leave '\n' out of the string */\r | |
1404 | tok->cont_line = 0;\r | |
1405 | return NEWLINE;\r | |
1406 | }\r | |
1407 | \r | |
1408 | /* Period or number starting with period? */\r | |
1409 | if (c == '.') {\r | |
1410 | c = tok_nextc(tok);\r | |
1411 | if (isdigit(c)) {\r | |
1412 | goto fraction;\r | |
1413 | }\r | |
1414 | else {\r | |
1415 | tok_backup(tok, c);\r | |
1416 | *p_start = tok->start;\r | |
1417 | *p_end = tok->cur;\r | |
1418 | return DOT;\r | |
1419 | }\r | |
1420 | }\r | |
1421 | \r | |
1422 | /* Number */\r | |
1423 | if (isdigit(c)) {\r | |
1424 | if (c == '0') {\r | |
1425 | /* Hex, octal or binary -- maybe. */\r | |
1426 | c = tok_nextc(tok);\r | |
1427 | if (c == '.')\r | |
1428 | goto fraction;\r | |
1429 | #ifndef WITHOUT_COMPLEX\r | |
1430 | if (c == 'j' || c == 'J')\r | |
1431 | goto imaginary;\r | |
1432 | #endif\r | |
1433 | if (c == 'x' || c == 'X') {\r | |
1434 | \r | |
1435 | /* Hex */\r | |
1436 | c = tok_nextc(tok);\r | |
1437 | if (!isxdigit(c)) {\r | |
1438 | tok->done = E_TOKEN;\r | |
1439 | tok_backup(tok, c);\r | |
1440 | return ERRORTOKEN;\r | |
1441 | }\r | |
1442 | do {\r | |
1443 | c = tok_nextc(tok);\r | |
1444 | } while (isxdigit(c));\r | |
1445 | }\r | |
1446 | else if (c == 'o' || c == 'O') {\r | |
1447 | /* Octal */\r | |
1448 | c = tok_nextc(tok);\r | |
1449 | if (c < '0' || c >= '8') {\r | |
1450 | tok->done = E_TOKEN;\r | |
1451 | tok_backup(tok, c);\r | |
1452 | return ERRORTOKEN;\r | |
1453 | }\r | |
1454 | do {\r | |
1455 | c = tok_nextc(tok);\r | |
1456 | } while ('0' <= c && c < '8');\r | |
1457 | }\r | |
1458 | else if (c == 'b' || c == 'B') {\r | |
1459 | /* Binary */\r | |
1460 | c = tok_nextc(tok);\r | |
1461 | if (c != '0' && c != '1') {\r | |
1462 | tok->done = E_TOKEN;\r | |
1463 | tok_backup(tok, c);\r | |
1464 | return ERRORTOKEN;\r | |
1465 | }\r | |
1466 | do {\r | |
1467 | c = tok_nextc(tok);\r | |
1468 | } while (c == '0' || c == '1');\r | |
1469 | }\r | |
1470 | else {\r | |
1471 | int found_decimal = 0;\r | |
1472 | /* Octal; c is first char of it */\r | |
1473 | /* There's no 'isoctdigit' macro, sigh */\r | |
1474 | while ('0' <= c && c < '8') {\r | |
1475 | c = tok_nextc(tok);\r | |
1476 | }\r | |
1477 | if (isdigit(c)) {\r | |
1478 | found_decimal = 1;\r | |
1479 | do {\r | |
1480 | c = tok_nextc(tok);\r | |
1481 | } while (isdigit(c));\r | |
1482 | }\r | |
1483 | if (c == '.')\r | |
1484 | goto fraction;\r | |
1485 | else if (c == 'e' || c == 'E')\r | |
1486 | goto exponent;\r | |
1487 | #ifndef WITHOUT_COMPLEX\r | |
1488 | else if (c == 'j' || c == 'J')\r | |
1489 | goto imaginary;\r | |
1490 | #endif\r | |
1491 | else if (found_decimal) {\r | |
1492 | tok->done = E_TOKEN;\r | |
1493 | tok_backup(tok, c);\r | |
1494 | return ERRORTOKEN;\r | |
1495 | }\r | |
1496 | }\r | |
1497 | if (c == 'l' || c == 'L')\r | |
1498 | c = tok_nextc(tok);\r | |
1499 | }\r | |
1500 | else {\r | |
1501 | /* Decimal */\r | |
1502 | do {\r | |
1503 | c = tok_nextc(tok);\r | |
1504 | } while (isdigit(c));\r | |
1505 | if (c == 'l' || c == 'L')\r | |
1506 | c = tok_nextc(tok);\r | |
1507 | else {\r | |
1508 | /* Accept floating point numbers. */\r | |
1509 | if (c == '.') {\r | |
1510 | fraction:\r | |
1511 | /* Fraction */\r | |
1512 | do {\r | |
1513 | c = tok_nextc(tok);\r | |
1514 | } while (isdigit(c));\r | |
1515 | }\r | |
1516 | if (c == 'e' || c == 'E') {\r | |
1517 | int e;\r | |
1518 | exponent:\r | |
1519 | e = c;\r | |
1520 | /* Exponent part */\r | |
1521 | c = tok_nextc(tok);\r | |
1522 | if (c == '+' || c == '-') {\r | |
1523 | c = tok_nextc(tok);\r | |
1524 | if (!isdigit(c)) {\r | |
1525 | tok->done = E_TOKEN;\r | |
1526 | tok_backup(tok, c);\r | |
1527 | return ERRORTOKEN;\r | |
1528 | }\r | |
1529 | } else if (!isdigit(c)) {\r | |
1530 | tok_backup(tok, c);\r | |
1531 | tok_backup(tok, e);\r | |
1532 | *p_start = tok->start;\r | |
1533 | *p_end = tok->cur;\r | |
1534 | return NUMBER;\r | |
1535 | }\r | |
1536 | do {\r | |
1537 | c = tok_nextc(tok);\r | |
1538 | } while (isdigit(c));\r | |
1539 | }\r | |
1540 | #ifndef WITHOUT_COMPLEX\r | |
1541 | if (c == 'j' || c == 'J')\r | |
1542 | /* Imaginary part */\r | |
1543 | imaginary:\r | |
1544 | c = tok_nextc(tok);\r | |
1545 | #endif\r | |
1546 | }\r | |
1547 | }\r | |
1548 | tok_backup(tok, c);\r | |
1549 | *p_start = tok->start;\r | |
1550 | *p_end = tok->cur;\r | |
1551 | return NUMBER;\r | |
1552 | }\r | |
1553 | \r | |
1554 | letter_quote:\r | |
1555 | /* String */\r | |
1556 | if (c == '\'' || c == '"') {\r | |
1557 | Py_ssize_t quote2 = tok->cur - tok->start + 1;\r | |
1558 | int quote = c;\r | |
1559 | int triple = 0;\r | |
1560 | int tripcount = 0;\r | |
1561 | for (;;) {\r | |
1562 | c = tok_nextc(tok);\r | |
1563 | if (c == '\n') {\r | |
1564 | if (!triple) {\r | |
1565 | tok->done = E_EOLS;\r | |
1566 | tok_backup(tok, c);\r | |
1567 | return ERRORTOKEN;\r | |
1568 | }\r | |
1569 | tripcount = 0;\r | |
1570 | tok->cont_line = 1; /* multiline string. */\r | |
1571 | }\r | |
1572 | else if (c == EOF) {\r | |
1573 | if (triple)\r | |
1574 | tok->done = E_EOFS;\r | |
1575 | else\r | |
1576 | tok->done = E_EOLS;\r | |
1577 | tok->cur = tok->inp;\r | |
1578 | return ERRORTOKEN;\r | |
1579 | }\r | |
1580 | else if (c == quote) {\r | |
1581 | tripcount++;\r | |
1582 | if (tok->cur - tok->start == quote2) {\r | |
1583 | c = tok_nextc(tok);\r | |
1584 | if (c == quote) {\r | |
1585 | triple = 1;\r | |
1586 | tripcount = 0;\r | |
1587 | continue;\r | |
1588 | }\r | |
1589 | tok_backup(tok, c);\r | |
1590 | }\r | |
1591 | if (!triple || tripcount == 3)\r | |
1592 | break;\r | |
1593 | }\r | |
1594 | else if (c == '\\') {\r | |
1595 | tripcount = 0;\r | |
1596 | c = tok_nextc(tok);\r | |
1597 | if (c == EOF) {\r | |
1598 | tok->done = E_EOLS;\r | |
1599 | tok->cur = tok->inp;\r | |
1600 | return ERRORTOKEN;\r | |
1601 | }\r | |
1602 | }\r | |
1603 | else\r | |
1604 | tripcount = 0;\r | |
1605 | }\r | |
1606 | *p_start = tok->start;\r | |
1607 | *p_end = tok->cur;\r | |
1608 | return STRING;\r | |
1609 | }\r | |
1610 | \r | |
1611 | /* Line continuation */\r | |
1612 | if (c == '\\') {\r | |
1613 | c = tok_nextc(tok);\r | |
1614 | if (c != '\n') {\r | |
1615 | tok->done = E_LINECONT;\r | |
1616 | tok->cur = tok->inp;\r | |
1617 | return ERRORTOKEN;\r | |
1618 | }\r | |
1619 | tok->cont_line = 1;\r | |
1620 | goto again; /* Read next line */\r | |
1621 | }\r | |
1622 | \r | |
1623 | /* Check for two-character token */\r | |
1624 | {\r | |
1625 | int c2 = tok_nextc(tok);\r | |
1626 | int token = PyToken_TwoChars(c, c2);\r | |
1627 | #ifndef PGEN\r | |
1628 | if (Py_Py3kWarningFlag && token == NOTEQUAL && c == '<') {\r | |
1629 | if (PyErr_WarnExplicit(PyExc_DeprecationWarning,\r | |
1630 | "<> not supported in 3.x; use !=",\r | |
1631 | tok->filename, tok->lineno,\r | |
1632 | NULL, NULL)) {\r | |
1633 | return ERRORTOKEN;\r | |
1634 | }\r | |
1635 | }\r | |
1636 | #endif\r | |
1637 | if (token != OP) {\r | |
1638 | int c3 = tok_nextc(tok);\r | |
1639 | int token3 = PyToken_ThreeChars(c, c2, c3);\r | |
1640 | if (token3 != OP) {\r | |
1641 | token = token3;\r | |
1642 | } else {\r | |
1643 | tok_backup(tok, c3);\r | |
1644 | }\r | |
1645 | *p_start = tok->start;\r | |
1646 | *p_end = tok->cur;\r | |
1647 | return token;\r | |
1648 | }\r | |
1649 | tok_backup(tok, c2);\r | |
1650 | }\r | |
1651 | \r | |
1652 | /* Keep track of parentheses nesting level */\r | |
1653 | switch (c) {\r | |
1654 | case '(':\r | |
1655 | case '[':\r | |
1656 | case '{':\r | |
1657 | tok->level++;\r | |
1658 | break;\r | |
1659 | case ')':\r | |
1660 | case ']':\r | |
1661 | case '}':\r | |
1662 | tok->level--;\r | |
1663 | break;\r | |
1664 | }\r | |
1665 | \r | |
1666 | /* Punctuation character */\r | |
1667 | *p_start = tok->start;\r | |
1668 | *p_end = tok->cur;\r | |
1669 | return PyToken_OneChar(c);\r | |
1670 | }\r | |
1671 | \r | |
1672 | int\r | |
1673 | PyTokenizer_Get(struct tok_state *tok, char **p_start, char **p_end)\r | |
1674 | {\r | |
1675 | int result = tok_get(tok, p_start, p_end);\r | |
1676 | if (tok->decoding_erred) {\r | |
1677 | result = ERRORTOKEN;\r | |
1678 | tok->done = E_DECODE;\r | |
1679 | }\r | |
1680 | return result;\r | |
1681 | }\r | |
1682 | \r | |
1683 | /* This function is only called from parsetok. However, it cannot live\r | |
1684 | there, as it must be empty for PGEN, and we can check for PGEN only\r | |
1685 | in this file. */\r | |
1686 | \r | |
1687 | #if defined(PGEN) || !defined(Py_USING_UNICODE)\r | |
/* Stub used when building PGEN or without unicode support: there is no
   original encoding to restore, so the arguments are ignored and NULL is
   always returned. */
char *
PyTokenizer_RestoreEncoding(struct tok_state *tok, int len, int *offset)
{
    (void)tok;
    (void)len;
    (void)offset;
    return NULL;
}
1693 | #else\r | |
1694 | #ifdef Py_USING_UNICODE\r | |
1695 | static PyObject *\r | |
1696 | dec_utf8(const char *enc, const char *text, size_t len) {\r | |
1697 | PyObject *ret = NULL;\r | |
1698 | PyObject *unicode_text = PyUnicode_DecodeUTF8(text, len, "replace");\r | |
1699 | if (unicode_text) {\r | |
1700 | ret = PyUnicode_AsEncodedString(unicode_text, enc, "replace");\r | |
1701 | Py_DECREF(unicode_text);\r | |
1702 | }\r | |
1703 | if (!ret) {\r | |
1704 | PyErr_Clear();\r | |
1705 | }\r | |
1706 | return ret;\r | |
1707 | }\r | |
1708 | char *\r | |
1709 | PyTokenizer_RestoreEncoding(struct tok_state* tok, int len, int *offset)\r | |
1710 | {\r | |
1711 | char *text = NULL;\r | |
1712 | if (tok->encoding) {\r | |
1713 | /* convert source to original encondig */\r | |
1714 | PyObject *lineobj = dec_utf8(tok->encoding, tok->buf, len);\r | |
1715 | if (lineobj != NULL) {\r | |
1716 | int linelen = PyString_Size(lineobj);\r | |
1717 | const char *line = PyString_AsString(lineobj);\r | |
1718 | text = PyObject_MALLOC(linelen + 1);\r | |
1719 | if (text != NULL && line != NULL) {\r | |
1720 | if (linelen)\r | |
1721 | strncpy(text, line, linelen);\r | |
1722 | text[linelen] = '\0';\r | |
1723 | }\r | |
1724 | Py_DECREF(lineobj);\r | |
1725 | \r | |
1726 | /* adjust error offset */\r | |
1727 | if (*offset > 1) {\r | |
1728 | PyObject *offsetobj = dec_utf8(tok->encoding,\r | |
1729 | tok->buf, *offset-1);\r | |
1730 | if (offsetobj) {\r | |
1731 | *offset = PyString_Size(offsetobj) + 1;\r | |
1732 | Py_DECREF(offsetobj);\r | |
1733 | }\r | |
1734 | }\r | |
1735 | \r | |
1736 | }\r | |
1737 | }\r | |
1738 | return text;\r | |
1739 | \r | |
1740 | }\r | |
1741 | #endif /* defined(Py_USING_UNICODE) */\r | |
1742 | #endif\r | |
1743 | \r | |
1744 | \r | |
1745 | #ifdef Py_DEBUG\r | |
1746 | \r | |
1747 | void\r | |
1748 | tok_dump(int type, char *start, char *end)\r | |
1749 | {\r | |
1750 | printf("%s", _PyParser_TokenNames[type]);\r | |
1751 | if (type == NAME || type == NUMBER || type == STRING || type == OP)\r | |
1752 | printf("(%.*s)", (int)(end - start), start);\r | |
1753 | }\r | |
1754 | \r | |
1755 | #endif\r |