]>
git.proxmox.com Git - mirror_edk2.git/blob - AppPkg/Applications/Python/Python-2.7.10/Parser/tokenizer.c
086dc56741c5821cc57e6725af4f34830d3e70e4
2 /* Tokenizer implementation */
5 #include "pgenheaders.h"
10 #include "tokenizer.h"
14 #include "unicodeobject.h"
15 #include "stringobject.h"
16 #include "fileobject.h"
22 extern char *PyOS_Readline(FILE *, FILE *, char *);
23 /* Return malloc'ed string including trailing \n;
24 empty malloc'ed string for EOF;
25 NULL if interrupted */
27 /* Don't ever change this -- it would break the portability of Python code */
31 static struct tok_state
*tok_new(void);
32 static int tok_nextc(struct tok_state
*tok
);
33 static void tok_backup(struct tok_state
*tok
, int c
);
37 char *_PyParser_TokenNames
[] = {
89 /* This table must match the #defines in token.h! */
95 /* Create and initialize a new tok_state structure */
97 static struct tok_state
*
100 struct tok_state
*tok
= (struct tok_state
*)PyMem_MALLOC(
101 sizeof(struct tok_state
));
104 tok
->buf
= tok
->cur
= tok
->end
= tok
->inp
= tok
->start
= NULL
;
108 tok
->tabsize
= TABSIZE
;
110 tok
->indstack
[0] = 0;
113 tok
->prompt
= tok
->nextprompt
= NULL
;
116 tok
->filename
= NULL
;
120 tok
->altindstack
[0] = 0;
121 tok
->decoding_state
= 0;
122 tok
->decoding_erred
= 0;
123 tok
->read_coding_spec
= 0;
124 tok
->encoding
= NULL
;
127 tok
->decoding_readline
= NULL
;
128 tok
->decoding_buffer
= NULL
;
134 new_string(const char *s
, Py_ssize_t len
)
136 char* result
= (char *)PyMem_MALLOC(len
+ 1);
137 if (result
!= NULL
) {
138 memcpy(result
, s
, len
);
147 decoding_fgets(char *s
, int size
, struct tok_state
*tok
)
149 return fgets(s
, size
, tok
->fp
);
153 decoding_feof(struct tok_state
*tok
)
155 return feof(tok
->fp
);
159 decode_str(const char *str
, int exec_input
, struct tok_state
*tok
)
161 return new_string(str
, strlen(str
));
167 error_ret(struct tok_state
*tok
) /* XXX */
169 tok
->decoding_erred
= 1;
170 if (tok
->fp
!= NULL
&& tok
->buf
!= NULL
) /* see PyTokenizer_Free */
171 PyMem_FREE(tok
->buf
);
173 return NULL
; /* as if it were EOF */
178 get_normal_name(char *s
) /* for utf-8 and latin-1 */
182 for (i
= 0; i
< 12; i
++) {
192 if (strcmp(buf
, "utf-8") == 0 ||
193 strncmp(buf
, "utf-8-", 6) == 0)
195 else if (strcmp(buf
, "latin-1") == 0 ||
196 strcmp(buf
, "iso-8859-1") == 0 ||
197 strcmp(buf
, "iso-latin-1") == 0 ||
198 strncmp(buf
, "latin-1-", 8) == 0 ||
199 strncmp(buf
, "iso-8859-1-", 11) == 0 ||
200 strncmp(buf
, "iso-latin-1-", 12) == 0)
206 /* Return the coding spec in S, or NULL if none is found. */
209 get_coding_spec(const char *s
, Py_ssize_t size
)
212 /* Coding spec must be in a comment, and that comment must be
213 * the only statement on the source code line. */
214 for (i
= 0; i
< size
- 6; i
++) {
217 if (s
[i
] != ' ' && s
[i
] != '\t' && s
[i
] != '\014')
220 for (; i
< size
- 6; i
++) { /* XXX inefficient search */
221 const char* t
= s
+ i
;
222 if (strncmp(t
, "coding", 6) == 0) {
223 const char* begin
= NULL
;
225 if (t
[0] != ':' && t
[0] != '=')
229 } while (t
[0] == '\x20' || t
[0] == '\t');
232 while (Py_ISALNUM(t
[0]) ||
233 t
[0] == '-' || t
[0] == '_' || t
[0] == '.')
237 char* r
= new_string(begin
, t
- begin
);
238 char* q
= get_normal_name(r
);
241 r
= new_string(q
, strlen(q
));
250 /* Check whether the line contains a coding spec. If it does,
251 invoke the set_readline function for the new encoding.
252 This function receives the tok_state and the new encoding.
253 Return 1 on success, 0 on failure. */
256 check_coding_spec(const char* line
, Py_ssize_t size
, struct tok_state
*tok
,
257 int set_readline(struct tok_state
*, const char *))
262 if (tok
->cont_line
) {
263 /* It's a continuation line, so it can't be a coding spec. */
264 tok
->read_coding_spec
= 1;
267 cs
= get_coding_spec(line
, size
);
270 for (i
= 0; i
< size
; i
++) {
271 if (line
[i
] == '#' || line
[i
] == '\n' || line
[i
] == '\r')
273 if (line
[i
] != ' ' && line
[i
] != '\t' && line
[i
] != '\014') {
274 /* Stop checking coding spec after a line containing
275 * anything except a comment. */
276 tok
->read_coding_spec
= 1;
281 tok
->read_coding_spec
= 1;
282 if (tok
->encoding
== NULL
) {
283 assert(tok
->decoding_state
== 1); /* raw */
284 if (strcmp(cs
, "utf-8") == 0 ||
285 strcmp(cs
, "iso-8859-1") == 0) {
288 #ifdef Py_USING_UNICODE
289 r
= set_readline(tok
, cs
);
292 tok
->decoding_state
= -1;
295 PyErr_Format(PyExc_SyntaxError
,
296 "encoding problem: %s", cs
);
300 /* Without Unicode support, we cannot
301 process the coding spec. Since there
302 won't be any Unicode literals, that
307 } else { /* then, compare cs with BOM */
308 r
= (strcmp(tok
->encoding
, cs
) == 0);
310 PyErr_Format(PyExc_SyntaxError
,
311 "encoding problem: %s with BOM", cs
);
318 /* See whether the file starts with a BOM. If it does,
319 invoke the set_readline function with the new encoding.
320 Return 1 on success, 0 on failure. */
323 check_bom(int get_char(struct tok_state
*),
324 void unget_char(int, struct tok_state
*),
325 int set_readline(struct tok_state
*, const char *),
326 struct tok_state
*tok
)
330 tok
->decoding_state
= 1;
333 } else if (ch1
== 0xEF) {
336 unget_char(ch2
, tok
);
337 unget_char(ch1
, tok
);
342 unget_char(ch3
, tok
);
343 unget_char(ch2
, tok
);
344 unget_char(ch1
, tok
);
348 /* Disable support for UTF-16 BOMs until a decision
349 is made whether this needs to be supported. */
350 } else if (ch1
== 0xFE) {
353 unget_char(ch2
, tok
);
354 unget_char(ch1
, tok
);
357 if (!set_readline(tok
, "utf-16-be"))
359 tok
->decoding_state
= -1;
360 } else if (ch1
== 0xFF) {
363 unget_char(ch2
, tok
);
364 unget_char(ch1
, tok
);
367 if (!set_readline(tok
, "utf-16-le"))
369 tok
->decoding_state
= -1;
372 unget_char(ch1
, tok
);
375 if (tok
->encoding
!= NULL
)
376 PyMem_FREE(tok
->encoding
);
377 tok
->encoding
= new_string("utf-8", 5); /* resulting is in utf-8 */
381 /* Read a line of text from TOK into S, using the stream in TOK.
382 Return NULL on failure, else S.
384 On entry, tok->decoding_buffer will be one of:
385 1) NULL: need to call tok->decoding_readline to get a new line
386 2) PyUnicodeObject *: decoding_feof has called tok->decoding_readline and
387 stored the result in tok->decoding_buffer
388 3) PyStringObject *: previous call to fp_readl did not have enough room
389 (in the s buffer) to copy entire contents of the line read
390 by tok->decoding_readline. tok->decoding_buffer has the overflow.
391 In this case, fp_readl is called in a loop (with an expanded buffer)
392 until the buffer ends with a '\n' (or until the end of the file is
393 reached): see tok_nextc and its calls to decoding_fgets.
397 fp_readl(char *s
, int size
, struct tok_state
*tok
)
399 #ifndef Py_USING_UNICODE
400 /* In a non-Unicode built, this should never be called. */
401 Py_FatalError("fp_readl should not be called in this build.");
402 return NULL
; /* Keep compiler happy (not reachable) */
404 PyObject
* utf8
= NULL
;
405 PyObject
* buf
= tok
->decoding_buffer
;
409 /* Ask for one less byte so we can terminate it */
414 buf
= PyObject_CallObject(tok
->decoding_readline
, NULL
);
416 return error_ret(tok
);
417 if (!PyUnicode_Check(buf
)) {
419 PyErr_SetString(PyExc_SyntaxError
,
420 "codec did not return a unicode object");
421 return error_ret(tok
);
424 tok
->decoding_buffer
= NULL
;
425 if (PyString_CheckExact(buf
))
429 utf8
= PyUnicode_AsUTF8String(buf
);
432 return error_ret(tok
);
434 str
= PyString_AsString(utf8
);
435 utf8len
= PyString_GET_SIZE(utf8
);
436 if (utf8len
> size
) {
437 tok
->decoding_buffer
= PyString_FromStringAndSize(str
+size
, utf8len
-size
);
438 if (tok
->decoding_buffer
== NULL
) {
440 return error_ret(tok
);
444 memcpy(s
, str
, utf8len
);
448 return NULL
; /* EOF */
453 /* Set the readline function for TOK to a StreamReader's
454 readline function. The StreamReader is named ENC.
456 This function is called from check_bom and check_coding_spec.
458 ENC is usually identical to the future value of tok->encoding,
459 except for the (currently unsupported) case of UTF-16.
461 Return 1 on success, 0 on failure. */
464 fp_setreadl(struct tok_state
*tok
, const char* enc
)
466 PyObject
*reader
, *stream
, *readline
;
468 /* XXX: constify filename argument. */
469 stream
= PyFile_FromFile(tok
->fp
, (char*)tok
->filename
, "rb", NULL
);
473 reader
= PyCodec_StreamReader(enc
, stream
, NULL
);
478 readline
= PyObject_GetAttrString(reader
, "readline");
480 if (readline
== NULL
)
483 tok
->decoding_readline
= readline
;
487 /* Fetch the next byte from TOK. */
489 static int fp_getc(struct tok_state
*tok
) {
490 return getc(tok
->fp
);
493 /* Unfetch the last byte back into TOK. */
495 static void fp_ungetc(int c
, struct tok_state
*tok
) {
499 /* Read a line of input from TOK. Determine encoding
503 decoding_fgets(char *s
, int size
, struct tok_state
*tok
)
508 if (tok
->decoding_state
< 0) {
509 /* We already have a codec associated with
511 line
= fp_readl(s
, size
, tok
);
513 } else if (tok
->decoding_state
> 0) {
514 /* We want a 'raw' read. */
515 line
= Py_UniversalNewlineFgets(s
, size
,
519 /* We have not yet determined the encoding.
520 If an encoding is found, use the file-pointer
521 reader functions from now on. */
522 if (!check_bom(fp_getc
, fp_ungetc
, fp_setreadl
, tok
))
523 return error_ret(tok
);
524 assert(tok
->decoding_state
!= 0);
527 if (line
!= NULL
&& tok
->lineno
< 2 && !tok
->read_coding_spec
) {
528 if (!check_coding_spec(line
, strlen(line
), tok
, fp_setreadl
)) {
529 return error_ret(tok
);
533 /* The default encoding is ASCII, so make sure we don't have any
534 non-ASCII bytes in it. */
535 if (line
&& !tok
->encoding
) {
537 for (c
= (unsigned char *)line
; *c
; c
++)
545 /* Need to add 1 to the line number, since this line
546 has not been counted, yet. */
548 "Non-ASCII character '\\x%.2x' "
549 "in file %.200s on line %i, "
550 "but no encoding declared; "
551 "see http://python.org/dev/peps/pep-0263/ for details",
552 badchar
, tok
->filename
, tok
->lineno
+ 1);
553 PyErr_SetString(PyExc_SyntaxError
, buf
);
554 return error_ret(tok
);
561 decoding_feof(struct tok_state
*tok
)
563 if (tok
->decoding_state
>= 0) {
564 return feof(tok
->fp
);
566 PyObject
* buf
= tok
->decoding_buffer
;
568 buf
= PyObject_CallObject(tok
->decoding_readline
, NULL
);
573 tok
->decoding_buffer
= buf
;
576 return PyObject_Length(buf
) == 0;
580 /* Fetch a byte from TOK, using the string buffer. */
583 buf_getc(struct tok_state
*tok
) {
584 return Py_CHARMASK(*tok
->str
++);
587 /* Unfetch a byte from TOK, using the string buffer. */
590 buf_ungetc(int c
, struct tok_state
*tok
) {
592 assert(Py_CHARMASK(*tok
->str
) == c
); /* tok->cur may point to read-only segment */
595 /* Set the readline function for TOK to ENC. For the string-based
596 tokenizer, this means to just record the encoding. */
599 buf_setreadl(struct tok_state
*tok
, const char* enc
) {
604 /* Return a UTF-8 encoding Python string object from the
605 C byte string STR, which is encoded with ENC. */
607 #ifdef Py_USING_UNICODE
609 translate_into_utf8(const char* str
, const char* enc
) {
611 PyObject
* buf
= PyUnicode_Decode(str
, strlen(str
), enc
, NULL
);
614 utf8
= PyUnicode_AsUTF8String(buf
);
622 translate_newlines(const char *s
, int exec_input
, struct tok_state
*tok
) {
623 int skip_next_lf
= 0, needed_length
= strlen(s
) + 2, final_length
;
626 buf
= PyMem_MALLOC(needed_length
);
631 for (current
= buf
; *s
; s
++, current
++) {
647 /* If this is exec input, add a newline to the end of the string if
648 there isn't one already. */
649 if (exec_input
&& c
!= '\n') {
654 final_length
= current
- buf
+ 1;
655 if (final_length
< needed_length
&& final_length
)
656 /* should never fail */
657 buf
= PyMem_REALLOC(buf
, final_length
);
661 /* Decode a byte string STR for use as the buffer of TOK.
662 Look for encoding declarations inside STR, and record them
666 decode_str(const char *input
, int single
, struct tok_state
*tok
)
668 PyObject
* utf8
= NULL
;
671 const char *newl
[2] = {NULL
, NULL
};
673 tok
->input
= str
= translate_newlines(input
, single
, tok
);
678 if (!check_bom(buf_getc
, buf_ungetc
, buf_setreadl
, tok
))
679 return error_ret(tok
);
680 str
= tok
->str
; /* string after BOM if any */
682 #ifdef Py_USING_UNICODE
683 if (tok
->enc
!= NULL
) {
684 utf8
= translate_into_utf8(str
, tok
->enc
);
686 return error_ret(tok
);
687 str
= PyString_AsString(utf8
);
690 for (s
= str
;; s
++) {
691 if (*s
== '\0') break;
692 else if (*s
== '\n') {
696 if (lineno
== 2) break;
700 /* need to check line 1 and 2 separately since check_coding_spec
701 assumes a single line as input */
703 if (!check_coding_spec(str
, newl
[0] - str
, tok
, buf_setreadl
))
704 return error_ret(tok
);
705 if (tok
->enc
== NULL
&& !tok
->read_coding_spec
&& newl
[1]) {
706 if (!check_coding_spec(newl
[0]+1, newl
[1] - newl
[0],
708 return error_ret(tok
);
711 #ifdef Py_USING_UNICODE
712 if (tok
->enc
!= NULL
) {
713 assert(utf8
== NULL
);
714 utf8
= translate_into_utf8(str
, tok
->enc
);
716 return error_ret(tok
);
717 str
= PyString_AsString(utf8
);
720 assert(tok
->decoding_buffer
== NULL
);
721 tok
->decoding_buffer
= utf8
; /* CAUTION */
727 /* Set up tokenizer for string */
730 PyTokenizer_FromString(const char *str
, int exec_input
)
732 struct tok_state
*tok
= tok_new();
735 str
= (char *)decode_str(str
, exec_input
, tok
);
737 PyTokenizer_Free(tok
);
741 /* XXX: constify members. */
742 tok
->buf
= tok
->cur
= tok
->end
= tok
->inp
= (char*)str
;
747 /* Set up tokenizer for file */
750 PyTokenizer_FromFile(FILE *fp
, char *ps1
, char *ps2
)
752 struct tok_state
*tok
= tok_new();
755 if ((tok
->buf
= (char *)PyMem_MALLOC(BUFSIZ
)) == NULL
) {
756 PyTokenizer_Free(tok
);
759 tok
->cur
= tok
->inp
= tok
->buf
;
760 tok
->end
= tok
->buf
+ BUFSIZ
;
763 tok
->nextprompt
= ps2
;
768 /* Free a tok_state structure */
771 PyTokenizer_Free(struct tok_state
*tok
)
773 if (tok
->encoding
!= NULL
)
774 PyMem_FREE(tok
->encoding
);
776 Py_XDECREF(tok
->decoding_readline
);
777 Py_XDECREF(tok
->decoding_buffer
);
779 if (tok
->fp
!= NULL
&& tok
->buf
!= NULL
)
780 PyMem_FREE(tok
->buf
);
782 PyMem_FREE((char *)tok
->input
);
786 #if !defined(PGEN) && defined(Py_USING_UNICODE)
788 tok_stdin_decode(struct tok_state
*tok
, char **inp
)
790 PyObject
*enc
, *sysstdin
, *decoded
, *utf8
;
791 const char *encoding
;
794 if (PySys_GetFile((char *)"stdin", NULL
) != stdin
)
796 sysstdin
= PySys_GetObject("stdin");
797 if (sysstdin
== NULL
|| !PyFile_Check(sysstdin
))
800 enc
= ((PyFileObject
*)sysstdin
)->f_encoding
;
801 if (enc
== NULL
|| !PyString_Check(enc
))
805 encoding
= PyString_AsString(enc
);
806 decoded
= PyUnicode_Decode(*inp
, strlen(*inp
), encoding
, NULL
);
810 utf8
= PyUnicode_AsEncodedString(decoded
, "utf-8", NULL
);
815 assert(PyString_Check(utf8
));
816 converted
= new_string(PyString_AS_STRING(utf8
),
817 PyString_GET_SIZE(utf8
));
819 if (converted
== NULL
)
824 if (tok
->encoding
!= NULL
)
825 PyMem_FREE(tok
->encoding
);
826 tok
->encoding
= new_string(encoding
, strlen(encoding
));
827 if (tok
->encoding
== NULL
)
840 if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError
)) {
844 /* Fallback to iso-8859-1: for backward compatibility */
850 /* Get next char, updating state; error code goes into tok->done */
853 tok_nextc(register struct tok_state
*tok
)
856 if (tok
->cur
!= tok
->inp
) {
857 return Py_CHARMASK(*tok
->cur
++); /* Fast path */
859 if (tok
->done
!= E_OK
)
861 if (tok
->fp
== NULL
) {
862 char *end
= strchr(tok
->inp
, '\n');
866 end
= strchr(tok
->inp
, '\0');
867 if (end
== tok
->inp
) {
872 if (tok
->start
== NULL
)
874 tok
->line_start
= tok
->cur
;
877 return Py_CHARMASK(*tok
->cur
++);
879 if (tok
->prompt
!= NULL
) {
880 char *newtok
= PyOS_Readline(stdin
, stdout
, tok
->prompt
);
881 if (tok
->nextprompt
!= NULL
)
882 tok
->prompt
= tok
->nextprompt
;
885 else if (*newtok
== '\0') {
889 #if !defined(PGEN) && defined(Py_USING_UNICODE)
890 else if (tok_stdin_decode(tok
, &newtok
) != 0)
893 else if (tok
->start
!= NULL
) {
894 size_t start
= tok
->start
- tok
->buf
;
895 size_t oldlen
= tok
->cur
- tok
->buf
;
896 size_t newlen
= oldlen
+ strlen(newtok
);
897 char *buf
= tok
->buf
;
898 buf
= (char *)PyMem_REALLOC(buf
, newlen
+1);
901 PyMem_FREE(tok
->buf
);
908 tok
->cur
= tok
->buf
+ oldlen
;
909 tok
->line_start
= tok
->cur
;
910 strcpy(tok
->buf
+ oldlen
, newtok
);
912 tok
->inp
= tok
->buf
+ newlen
;
913 tok
->end
= tok
->inp
+ 1;
914 tok
->start
= tok
->buf
+ start
;
918 if (tok
->buf
!= NULL
)
919 PyMem_FREE(tok
->buf
);
921 tok
->line_start
= tok
->buf
;
923 tok
->line_start
= tok
->buf
;
924 tok
->inp
= strchr(tok
->buf
, '\0');
925 tok
->end
= tok
->inp
+ 1;
932 if (tok
->start
== NULL
) {
933 if (tok
->buf
== NULL
) {
935 PyMem_MALLOC(BUFSIZ
);
936 if (tok
->buf
== NULL
) {
940 tok
->end
= tok
->buf
+ BUFSIZ
;
942 if (decoding_fgets(tok
->buf
, (int)(tok
->end
- tok
->buf
),
949 tok
->inp
= strchr(tok
->buf
, '\0');
950 done
= tok
->inp
[-1] == '\n';
954 cur
= tok
->cur
- tok
->buf
;
955 if (decoding_feof(tok
)) {
963 /* Read until '\n' or EOF */
965 Py_ssize_t curstart
= tok
->start
== NULL
? -1 :
966 tok
->start
- tok
->buf
;
967 Py_ssize_t curvalid
= tok
->inp
- tok
->buf
;
968 Py_ssize_t newsize
= curvalid
+ BUFSIZ
;
969 char *newbuf
= tok
->buf
;
970 newbuf
= (char *)PyMem_REALLOC(newbuf
,
972 if (newbuf
== NULL
) {
978 tok
->inp
= tok
->buf
+ curvalid
;
979 tok
->end
= tok
->buf
+ newsize
;
980 tok
->start
= curstart
< 0 ? NULL
:
982 if (decoding_fgets(tok
->inp
,
983 (int)(tok
->end
- tok
->inp
),
985 /* Break out early on decoding
986 errors, as tok->buf will be NULL
988 if (tok
->decoding_erred
)
990 /* Last line does not end in \n,
992 strcpy(tok
->inp
, "\n");
994 tok
->inp
= strchr(tok
->inp
, '\0');
995 done
= tok
->inp
[-1] == '\n';
997 if (tok
->buf
!= NULL
) {
998 tok
->cur
= tok
->buf
+ cur
;
999 tok
->line_start
= tok
->cur
;
1000 /* replace "\r\n" with "\n" */
1001 /* For Mac leave the \r, giving a syntax error */
1003 if (pt
>= tok
->buf
&& *pt
== '\r') {
1010 if (tok
->done
!= E_OK
) {
1011 if (tok
->prompt
!= NULL
)
1012 PySys_WriteStderr("\n");
1013 tok
->cur
= tok
->inp
;
1021 /* Back-up one character */
1024 tok_backup(register struct tok_state
*tok
, register int c
)
1027 if (--tok
->cur
< tok
->buf
)
1028 Py_FatalError("tok_backup: beginning of buffer");
1035 /* Return the token corresponding to a single character */
1038 PyToken_OneChar(int c
)
1041 case '(': return LPAR
;
1042 case ')': return RPAR
;
1043 case '[': return LSQB
;
1044 case ']': return RSQB
;
1045 case ':': return COLON
;
1046 case ',': return COMMA
;
1047 case ';': return SEMI
;
1048 case '+': return PLUS
;
1049 case '-': return MINUS
;
1050 case '*': return STAR
;
1051 case '/': return SLASH
;
1052 case '|': return VBAR
;
1053 case '&': return AMPER
;
1054 case '<': return LESS
;
1055 case '>': return GREATER
;
1056 case '=': return EQUAL
;
1057 case '.': return DOT
;
1058 case '%': return PERCENT
;
1059 case '`': return BACKQUOTE
;
1060 case '{': return LBRACE
;
1061 case '}': return RBRACE
;
1062 case '^': return CIRCUMFLEX
;
1063 case '~': return TILDE
;
1064 case '@': return AT
;
1071 PyToken_TwoChars(int c1
, int c2
)
1076 case '=': return EQEQUAL
;
1081 case '=': return NOTEQUAL
;
1086 case '>': return NOTEQUAL
;
1087 case '=': return LESSEQUAL
;
1088 case '<': return LEFTSHIFT
;
1093 case '=': return GREATEREQUAL
;
1094 case '>': return RIGHTSHIFT
;
1099 case '=': return PLUSEQUAL
;
1104 case '=': return MINEQUAL
;
1109 case '*': return DOUBLESTAR
;
1110 case '=': return STAREQUAL
;
1115 case '/': return DOUBLESLASH
;
1116 case '=': return SLASHEQUAL
;
1121 case '=': return VBAREQUAL
;
1126 case '=': return PERCENTEQUAL
;
1131 case '=': return AMPEREQUAL
;
1136 case '=': return CIRCUMFLEXEQUAL
;
1144 PyToken_ThreeChars(int c1
, int c2
, int c3
)
1152 return LEFTSHIFTEQUAL
;
1162 return RIGHTSHIFTEQUAL
;
1172 return DOUBLESTAREQUAL
;
1182 return DOUBLESLASHEQUAL
;
1192 indenterror(struct tok_state
*tok
)
1194 if (tok
->alterror
) {
1195 tok
->done
= E_TABSPACE
;
1196 tok
->cur
= tok
->inp
;
1199 if (tok
->altwarning
) {
1200 PySys_WriteStderr("%s: inconsistent use of tabs and spaces "
1201 "in indentation\n", tok
->filename
);
1202 tok
->altwarning
= 0;
1207 /* Get next token, after space stripping etc. */
1210 tok_get(register struct tok_state
*tok
, char **p_start
, char **p_end
)
1215 *p_start
= *p_end
= NULL
;
1220 /* Get indentation level */
1222 register int col
= 0;
1223 register int altcol
= 0;
1229 else if (c
== '\t') {
1230 col
= (col
/tok
->tabsize
+ 1) * tok
->tabsize
;
1231 altcol
= (altcol
/tok
->alttabsize
+ 1)
1234 else if (c
== '\014') /* Control-L (formfeed) */
1235 col
= altcol
= 0; /* For Emacs users */
1240 if (c
== '#' || c
== '\n') {
1241 /* Lines with only whitespace and/or comments
1242 shouldn't affect the indentation and are
1243 not passed to the parser as NEWLINE tokens,
1244 except *totally* empty lines in interactive
1245 mode, which signal the end of a command group. */
1246 if (col
== 0 && c
== '\n' && tok
->prompt
!= NULL
)
1247 blankline
= 0; /* Let it through */
1249 blankline
= 1; /* Ignore completely */
1250 /* We can't jump back right here since we still
1251 may need to skip to the end of a comment */
1253 if (!blankline
&& tok
->level
== 0) {
1254 if (col
== tok
->indstack
[tok
->indent
]) {
1256 if (altcol
!= tok
->altindstack
[tok
->indent
]) {
1257 if (indenterror(tok
))
1261 else if (col
> tok
->indstack
[tok
->indent
]) {
1262 /* Indent -- always one */
1263 if (tok
->indent
+1 >= MAXINDENT
) {
1264 tok
->done
= E_TOODEEP
;
1265 tok
->cur
= tok
->inp
;
1268 if (altcol
<= tok
->altindstack
[tok
->indent
]) {
1269 if (indenterror(tok
))
1273 tok
->indstack
[++tok
->indent
] = col
;
1274 tok
->altindstack
[tok
->indent
] = altcol
;
1276 else /* col < tok->indstack[tok->indent] */ {
1277 /* Dedent -- any number, must be consistent */
1278 while (tok
->indent
> 0 &&
1279 col
< tok
->indstack
[tok
->indent
]) {
1283 if (col
!= tok
->indstack
[tok
->indent
]) {
1284 tok
->done
= E_DEDENT
;
1285 tok
->cur
= tok
->inp
;
1288 if (altcol
!= tok
->altindstack
[tok
->indent
]) {
1289 if (indenterror(tok
))
1296 tok
->start
= tok
->cur
;
1298 /* Return pending indents/dedents */
1299 if (tok
->pendin
!= 0) {
1300 if (tok
->pendin
< 0) {
1315 } while (c
== ' ' || c
== '\t' || c
== '\014');
1317 /* Set start of current token */
1318 tok
->start
= tok
->cur
- 1;
1320 /* Skip comment, while looking for tab-setting magic */
1322 static char *tabforms
[] = {
1323 "tab-width:", /* Emacs */
1324 ":tabstop=", /* vim, full form */
1325 ":ts=", /* vim, abbreviated form */
1326 "set tabsize=", /* will vi never die? */
1327 /* more templates can be added here to support other editors */
1333 *tp
++ = c
= tok_nextc(tok
);
1334 } while (c
!= EOF
&& c
!= '\n' &&
1335 (size_t)(tp
- cbuf
+ 1) < sizeof(cbuf
));
1338 cp
< tabforms
+ sizeof(tabforms
)/sizeof(tabforms
[0]);
1340 if ((tp
= strstr(cbuf
, *cp
))) {
1341 int newsize
= atoi(tp
+ strlen(*cp
));
1343 if (newsize
>= 1 && newsize
<= 40) {
1344 tok
->tabsize
= newsize
;
1347 "Tab size set to %d\n",
1352 while (c
!= EOF
&& c
!= '\n')
1356 /* Check for EOF and errors now */
1358 return tok
->done
== E_EOF
? ENDMARKER
: ERRORTOKEN
;
1361 /* Identifier (most frequent token!) */
1362 if (Py_ISALPHA(c
) || c
== '_') {
1363 /* Process r"", u"" and ur"" */
1368 if (c
== 'r' || c
== 'R')
1370 if (c
== '"' || c
== '\'')
1376 if (c
== '"' || c
== '\'')
1382 if (c
== 'r' || c
== 'R')
1384 if (c
== '"' || c
== '\'')
1388 while (c
!= EOF
&& (Py_ISALNUM(c
) || c
== '_')) {
1392 *p_start
= tok
->start
;
1400 if (blankline
|| tok
->level
> 0)
1402 *p_start
= tok
->start
;
1403 *p_end
= tok
->cur
- 1; /* Leave '\n' out of the string */
1408 /* Period or number starting with period? */
1416 *p_start
= tok
->start
;
1425 /* Hex, octal or binary -- maybe. */
1429 #ifndef WITHOUT_COMPLEX
1430 if (c
== 'j' || c
== 'J')
1433 if (c
== 'x' || c
== 'X') {
1438 tok
->done
= E_TOKEN
;
1444 } while (isxdigit(c
));
1446 else if (c
== 'o' || c
== 'O') {
1449 if (c
< '0' || c
>= '8') {
1450 tok
->done
= E_TOKEN
;
1456 } while ('0' <= c
&& c
< '8');
1458 else if (c
== 'b' || c
== 'B') {
1461 if (c
!= '0' && c
!= '1') {
1462 tok
->done
= E_TOKEN
;
1468 } while (c
== '0' || c
== '1');
1471 int found_decimal
= 0;
1472 /* Octal; c is first char of it */
1473 /* There's no 'isoctdigit' macro, sigh */
1474 while ('0' <= c
&& c
< '8') {
1481 } while (isdigit(c
));
1485 else if (c
== 'e' || c
== 'E')
1487 #ifndef WITHOUT_COMPLEX
1488 else if (c
== 'j' || c
== 'J')
1491 else if (found_decimal
) {
1492 tok
->done
= E_TOKEN
;
1497 if (c
== 'l' || c
== 'L')
1504 } while (isdigit(c
));
1505 if (c
== 'l' || c
== 'L')
1508 /* Accept floating point numbers. */
1514 } while (isdigit(c
));
1516 if (c
== 'e' || c
== 'E') {
1522 if (c
== '+' || c
== '-') {
1525 tok
->done
= E_TOKEN
;
1529 } else if (!isdigit(c
)) {
1532 *p_start
= tok
->start
;
1538 } while (isdigit(c
));
1540 #ifndef WITHOUT_COMPLEX
1541 if (c
== 'j' || c
== 'J')
1542 /* Imaginary part */
1549 *p_start
= tok
->start
;
1556 if (c
== '\'' || c
== '"') {
1557 Py_ssize_t quote2
= tok
->cur
- tok
->start
+ 1;
1570 tok
->cont_line
= 1; /* multiline string. */
1572 else if (c
== EOF
) {
1577 tok
->cur
= tok
->inp
;
1580 else if (c
== quote
) {
1582 if (tok
->cur
- tok
->start
== quote2
) {
1591 if (!triple
|| tripcount
== 3)
1594 else if (c
== '\\') {
1599 tok
->cur
= tok
->inp
;
1606 *p_start
= tok
->start
;
1611 /* Line continuation */
1615 tok
->done
= E_LINECONT
;
1616 tok
->cur
= tok
->inp
;
1620 goto again
; /* Read next line */
1623 /* Check for two-character token */
1625 int c2
= tok_nextc(tok
);
1626 int token
= PyToken_TwoChars(c
, c2
);
1628 if (Py_Py3kWarningFlag
&& token
== NOTEQUAL
&& c
== '<') {
1629 if (PyErr_WarnExplicit(PyExc_DeprecationWarning
,
1630 "<> not supported in 3.x; use !=",
1631 tok
->filename
, tok
->lineno
,
1638 int c3
= tok_nextc(tok
);
1639 int token3
= PyToken_ThreeChars(c
, c2
, c3
);
1643 tok_backup(tok
, c3
);
1645 *p_start
= tok
->start
;
1649 tok_backup(tok
, c2
);
1652 /* Keep track of parentheses nesting level */
1666 /* Punctuation character */
1667 *p_start
= tok
->start
;
1669 return PyToken_OneChar(c
);
1673 PyTokenizer_Get(struct tok_state
*tok
, char **p_start
, char **p_end
)
1675 int result
= tok_get(tok
, p_start
, p_end
);
1676 if (tok
->decoding_erred
) {
1677 result
= ERRORTOKEN
;
1678 tok
->done
= E_DECODE
;
1683 /* This function is only called from parsetok. However, it cannot live
1684 there, as it must be empty for PGEN, and we can check for PGEN only
1687 #if defined(PGEN) || !defined(Py_USING_UNICODE)
1689 PyTokenizer_RestoreEncoding(struct tok_state
* tok
, int len
, int* offset
)
1694 #ifdef Py_USING_UNICODE
1696 dec_utf8(const char *enc
, const char *text
, size_t len
) {
1697 PyObject
*ret
= NULL
;
1698 PyObject
*unicode_text
= PyUnicode_DecodeUTF8(text
, len
, "replace");
1700 ret
= PyUnicode_AsEncodedString(unicode_text
, enc
, "replace");
1701 Py_DECREF(unicode_text
);
1709 PyTokenizer_RestoreEncoding(struct tok_state
* tok
, int len
, int *offset
)
1712 if (tok
->encoding
) {
1713 /* convert source to original encondig */
1714 PyObject
*lineobj
= dec_utf8(tok
->encoding
, tok
->buf
, len
);
1715 if (lineobj
!= NULL
) {
1716 int linelen
= PyString_Size(lineobj
);
1717 const char *line
= PyString_AsString(lineobj
);
1718 text
= PyObject_MALLOC(linelen
+ 1);
1719 if (text
!= NULL
&& line
!= NULL
) {
1721 strncpy(text
, line
, linelen
);
1722 text
[linelen
] = '\0';
1726 /* adjust error offset */
1728 PyObject
*offsetobj
= dec_utf8(tok
->encoding
,
1729 tok
->buf
, *offset
-1);
1731 *offset
= PyString_Size(offsetobj
) + 1;
1732 Py_DECREF(offsetobj
);
1741 #endif /* defined(Py_USING_UNICODE) */
1748 tok_dump(int type
, char *start
, char *end
)
1750 printf("%s", _PyParser_TokenNames
[type
]);
1751 if (type
== NAME
|| type
== NUMBER
|| type
== STRING
|| type
== OP
)
1752 printf("(%.*s)", (int)(end
- start
), start
);