]>
Commit | Line | Data |
---|---|---|
be44585c | 1 | # Copyright (c) 2010, 2011 Nicira Networks |
99155935 BP |
2 | # |
3 | # Licensed under the Apache License, Version 2.0 (the "License"); | |
4 | # you may not use this file except in compliance with the License. | |
5 | # You may obtain a copy of the License at: | |
6 | # | |
7 | # http://www.apache.org/licenses/LICENSE-2.0 | |
8 | # | |
9 | # Unless required by applicable law or agreed to in writing, software | |
10 | # distributed under the License is distributed on an "AS IS" BASIS, | |
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
12 | # See the License for the specific language governing permissions and | |
13 | # limitations under the License. | |
14 | ||
15 | import re | |
16 | import StringIO | |
17 | import sys | |
18 | ||
# Maps a character's code point to the text that represents that character
# inside a JSON string.  Every control character below U+0020 starts out as a
# generic \uXXXX escape; the entries that have a short named escape, plus the
# double quote and the backslash, are then overlaid on top.
escapes = {}
for i in range(32):
    escapes[i] = u"\\u%04x" % i
escapes.update({ord('"'): u"\\\"",
                ord("\\"): u"\\\\",
                ord("\b"): u"\\b",
                ord("\f"): u"\\f",
                ord("\n"): u"\\n",
                ord("\r"): u"\\r",
                ord("\t"): u"\\t"})
29 | ||
def __dump_string(stream, s):
    """Writes 's' to 'stream' as a double-quoted JSON string.

    Each character whose code point has an entry in 'escapes' is replaced by
    that entry; every other character is written unchanged."""
    pieces = []
    for c in s:
        pieces.append(escapes.get(ord(c), c))
    stream.write(u'"%s"' % ''.join(pieces))
99155935 BP |
32 | |
def to_stream(obj, stream, pretty=False, sort_keys=True):
    """Serializes 'obj' as JSON text and writes it to 'stream'.

    'obj' may be None, True, False, an int, long, float, str, unicode, dict,
    list, or tuple (exact types only; subclasses are rejected).  Dicts are
    written as JSON objects, with each key converted by unicode(); lists and
    tuples are written as JSON arrays.

    'stream' only needs a write() method that accepts unicode strings.

    'pretty' is accepted and handed down to nested calls, but nothing here
    changes the output based on it.

    If 'sort_keys' is true, the members of each dict are written in the
    order produced by sorted() on its (key, value) pairs; otherwise they are
    written in the dict's own iteration order.

    Raises Exception if 'obj', or anything nested inside it, has an
    unsupported type."""
    if obj is None:
        stream.write(u"null")
    elif obj is False:
        stream.write(u"false")
    elif obj is True:
        stream.write(u"true")
    elif type(obj) in (int, long):
        stream.write(u"%d" % obj)
    elif type(obj) == float:
        # A unicode format string, so that every write() in this function
        # passes the same string type to the stream.
        stream.write(u"%.15g" % obj)
    elif type(obj) == unicode:
        __dump_string(stream, obj)
    elif type(obj) == str:
        __dump_string(stream, unicode(obj))
    elif type(obj) == dict:
        stream.write(u"{")
        if sort_keys:
            items = sorted(obj.items())
        else:
            items = obj.iteritems()
        for i, (key, value) in enumerate(items):
            if i > 0:
                stream.write(u",")
            __dump_string(stream, unicode(key))
            stream.write(u":")
            to_stream(value, stream, pretty, sort_keys)
        stream.write(u"}")
    elif type(obj) in (list, tuple):
        stream.write(u"[")
        for i, value in enumerate(obj):
            if i > 0:
                stream.write(u",")
            to_stream(value, stream, pretty, sort_keys)
        stream.write(u"]")
    else:
        # Wrap 'obj' in a 1-tuple: an instance of a tuple subclass reaches
        # this branch, and passing it bare to '%' would unpack it as the
        # format arguments and raise a misleading TypeError.
        raise Exception("can't serialize %s as JSON" % (obj,))
99155935 BP |
70 | |
def to_file(obj, name, pretty=False, sort_keys=True):
    """Serializes 'obj' as JSON into the file named 'name'.

    The file is opened in mode "w" and is closed again even if
    serialization raises an exception.  'pretty' and 'sort_keys' are passed
    through to to_stream()."""
    out = open(name, "w")
    try:
        to_stream(obj, out, pretty, sort_keys)
    finally:
        out.close()
77 | ||
def to_string(obj, pretty=False, sort_keys=True):
    """Returns the JSON text for 'obj' as a string.

    The text is produced by to_stream() writing into an in-memory buffer;
    'pretty' and 'sort_keys' are passed through unchanged."""
    buf = StringIO.StringIO()
    to_stream(obj, buf, pretty, sort_keys)
    text = buf.getvalue()
    buf.close()
    return text
84 | ||
def from_stream(stream):
    """Parses JSON read from 'stream' and returns Parser.finish()'s result:
    the parsed value on success, or an error message string on failure.

    Input is read in 4096-byte chunks.  Reading stops at end of input (an
    empty read) or as soon as the parser declines to consume a whole chunk."""
    parser = Parser(check_trailer=True)
    chunk = stream.read(4096)
    while chunk != "" and parser.feed(chunk) == len(chunk):
        chunk = stream.read(4096)
    return parser.finish()
92 | ||
def from_file(name):
    """Parses the JSON in the file named 'name' and returns what
    from_stream() returns for it.

    The file is opened in mode "r" and is closed again even if parsing
    raises an exception."""
    src = open(name, "r")
    try:
        result = from_stream(src)
    finally:
        src.close()
    return result
99 | ||
def from_string(s):
    """Parses the JSON text in 's' and returns Parser.finish()'s result: the
    parsed value on success, or an error message string on failure.

    's' may be a unicode string, which is parsed as is, or a byte string,
    which is first decoded as UTF-8.  If decoding fails, the return value is
    an error message string that lists the offending bytes."""
    # unicode(s, 'utf-8') raises TypeError when 's' is already unicode, so
    # only byte strings are decoded.
    if not isinstance(s, unicode):
        try:
            s = unicode(s, 'utf-8')
        except UnicodeDecodeError:
            # Fetch the exception through sys.exc_info() so that this clause
            # needs neither the "except X, e" nor the "except X as e"
            # spelling, each of which some Python versions reject.
            e = sys.exc_info()[1]
            seq = ' '.join(["0x%2x" % ord(c)
                            for c in e.object[e.start:e.end]
                            if ord(c) >= 0x80])
            return ("not a valid UTF-8 string: invalid UTF-8 sequence %s"
                    % seq)
    p = Parser(check_trailer=True)
    p.feed(s)
    return p.finish()
110 | ||
111 | class Parser(object): | |
112 | ## Maximum height of parsing stack. ## | |
113 | MAX_HEIGHT = 1000 | |
114 | ||
115 | def __init__(self, check_trailer=False): | |
116 | self.check_trailer = check_trailer | |
117 | ||
118 | # Lexical analysis. | |
119 | self.lex_state = Parser.__lex_start | |
120 | self.buffer = "" | |
121 | self.line_number = 0 | |
122 | self.column_number = 0 | |
123 | self.byte_number = 0 | |
124 | ||
125 | # Parsing. | |
126 | self.parse_state = Parser.__parse_start | |
127 | self.stack = [] | |
128 | self.member_name = None | |
129 | ||
130 | # Parse status. | |
131 | self.done = False | |
132 | self.error = None | |
133 | ||
134 | def __lex_start_space(self, c): | |
135 | pass | |
136 | def __lex_start_alpha(self, c): | |
137 | self.buffer = c | |
138 | self.lex_state = Parser.__lex_keyword | |
139 | def __lex_start_token(self, c): | |
140 | self.__parser_input(c) | |
141 | def __lex_start_number(self, c): | |
142 | self.buffer = c | |
143 | self.lex_state = Parser.__lex_number | |
144 | def __lex_start_string(self, c): | |
145 | self.lex_state = Parser.__lex_string | |
146 | def __lex_start_error(self, c): | |
147 | if ord(c) >= 32 and ord(c) < 128: | |
148 | self.__error("invalid character '%s'" % c) | |
149 | else: | |
150 | self.__error("invalid character U+%04x" % ord(c)) | |
151 | ||
152 | __lex_start_actions = {} | |
153 | for c in " \t\n\r": | |
154 | __lex_start_actions[c] = __lex_start_space | |
155 | for c in "abcdefghijklmnopqrstuvwxyz": | |
156 | __lex_start_actions[c] = __lex_start_alpha | |
157 | for c in "[{]}:,": | |
158 | __lex_start_actions[c] = __lex_start_token | |
159 | for c in "-0123456789": | |
160 | __lex_start_actions[c] = __lex_start_number | |
161 | __lex_start_actions['"'] = __lex_start_string | |
162 | def __lex_start(self, c): | |
163 | Parser.__lex_start_actions.get( | |
164 | c, Parser.__lex_start_error)(self, c) | |
165 | return True | |
166 | ||
167 | __lex_alpha = {} | |
168 | for c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ": | |
169 | __lex_alpha[c] = True | |
170 | def __lex_finish_keyword(self): | |
171 | if self.buffer == "false": | |
172 | self.__parser_input(False) | |
173 | elif self.buffer == "true": | |
174 | self.__parser_input(True) | |
175 | elif self.buffer == "null": | |
176 | self.__parser_input(None) | |
177 | else: | |
178 | self.__error("invalid keyword '%s'" % self.buffer) | |
179 | def __lex_keyword(self, c): | |
180 | if c in Parser.__lex_alpha: | |
181 | self.buffer += c | |
182 | return True | |
183 | else: | |
184 | self.__lex_finish_keyword() | |
185 | return False | |
186 | ||
187 | __number_re = re.compile("(-)?(0|[1-9][0-9]*)(?:\.([0-9]+))?(?:[eE]([-+]?[0-9]+))?$") | |
188 | def __lex_finish_number(self): | |
189 | s = self.buffer | |
190 | m = Parser.__number_re.match(s) | |
191 | if m: | |
192 | sign, integer, fraction, exp = m.groups() | |
193 | if (exp is not None and | |
194 | (long(exp) > sys.maxint or long(exp) < -sys.maxint - 1)): | |
195 | self.__error("exponent outside valid range") | |
196 | return | |
197 | ||
198 | if fraction is not None and len(fraction.lstrip('0')) == 0: | |
199 | fraction = None | |
200 | ||
201 | sig_string = integer | |
202 | if fraction is not None: | |
203 | sig_string += fraction | |
204 | significand = int(sig_string) | |
205 | ||
206 | pow10 = 0 | |
207 | if fraction is not None: | |
208 | pow10 -= len(fraction) | |
209 | if exp is not None: | |
210 | pow10 += long(exp) | |
211 | ||
212 | if significand == 0: | |
213 | self.__parser_input(0) | |
214 | return | |
215 | elif significand <= 2**63: | |
216 | while pow10 > 0 and significand <= 2*63: | |
217 | significand *= 10 | |
218 | pow10 -= 1 | |
219 | while pow10 < 0 and significand % 10 == 0: | |
220 | significand /= 10 | |
221 | pow10 += 1 | |
222 | if (pow10 == 0 and | |
223 | ((not sign and significand < 2**63) or | |
224 | (sign and significand <= 2**63))): | |
225 | if sign: | |
226 | self.__parser_input(-significand) | |
227 | else: | |
228 | self.__parser_input(significand) | |
229 | return | |
230 | ||
231 | value = float(s) | |
232 | if value == float("inf") or value == float("-inf"): | |
233 | self.__error("number outside valid range") | |
234 | return | |
235 | if value == 0: | |
236 | # Suppress negative zero. | |
237 | value = 0 | |
238 | self.__parser_input(value) | |
239 | elif re.match("-?0[0-9]", s): | |
240 | self.__error("leading zeros not allowed") | |
241 | elif re.match("-([^0-9]|$)", s): | |
242 | self.__error("'-' must be followed by digit") | |
243 | elif re.match("-?(0|[1-9][0-9]*)\.([^0-9]|$)", s): | |
244 | self.__error("decimal point must be followed by digit") | |
245 | elif re.search("e[-+]?([^0-9]|$)", s): | |
246 | self.__error("exponent must contain at least one digit") | |
247 | else: | |
248 | self.__error("syntax error in number") | |
249 | ||
250 | def __lex_number(self, c): | |
251 | if c in ".0123456789eE-+": | |
252 | self.buffer += c | |
253 | return True | |
254 | else: | |
255 | self.__lex_finish_number() | |
256 | return False | |
257 | ||
258 | __4hex_re = re.compile("[0-9a-fA-F]{4}") | |
259 | def __lex_4hex(self, s): | |
260 | if len(s) < 4: | |
261 | self.__error("quoted string ends within \\u escape") | |
262 | elif not Parser.__4hex_re.match(s): | |
263 | self.__error("malformed \\u escape") | |
264 | elif s == "0000": | |
265 | self.__error("null bytes not supported in quoted strings") | |
266 | else: | |
267 | return int(s, 16) | |
268 | @staticmethod | |
269 | def __is_leading_surrogate(c): | |
270 | """Returns true if 'c' is a Unicode code point for a leading | |
271 | surrogate.""" | |
272 | return c >= 0xd800 and c <= 0xdbff | |
273 | @staticmethod | |
274 | def __is_trailing_surrogate(c): | |
275 | """Returns true if 'c' is a Unicode code point for a trailing | |
276 | surrogate.""" | |
277 | return c >= 0xdc00 and c <= 0xdfff | |
278 | @staticmethod | |
279 | def __utf16_decode_surrogate_pair(leading, trailing): | |
280 | """Returns the unicode code point corresponding to leading surrogate | |
281 | 'leading' and trailing surrogate 'trailing'. The return value will not | |
282 | make any sense if 'leading' or 'trailing' are not in the correct ranges | |
283 | for leading or trailing surrogates.""" | |
284 | # Leading surrogate: 110110wwwwxxxxxx | |
285 | # Trailing surrogate: 110111xxxxxxxxxx | |
286 | # Code point: 000uuuuuxxxxxxxxxxxxxxxx | |
287 | w = (leading >> 6) & 0xf | |
288 | u = w + 1 | |
289 | x0 = leading & 0x3f | |
290 | x1 = trailing & 0x3ff | |
291 | return (u << 16) | (x0 << 10) | x1 | |
292 | __unescape = {'"': u'"', | |
293 | "\\": u"\\", | |
294 | "/": u"/", | |
295 | "b": u"\b", | |
296 | "f": u"\f", | |
297 | "n": u"\n", | |
298 | "r": u"\r", | |
299 | "t": u"\t"} | |
300 | def __lex_finish_string(self): | |
301 | inp = self.buffer | |
302 | out = u"" | |
303 | while len(inp): | |
304 | backslash = inp.find('\\') | |
305 | if backslash == -1: | |
306 | out += inp | |
307 | break | |
308 | out += inp[:backslash] | |
309 | inp = inp[backslash + 1:] | |
310 | if inp == "": | |
311 | self.__error("quoted string may not end with backslash") | |
312 | return | |
313 | ||
314 | replacement = Parser.__unescape.get(inp[0]) | |
315 | if replacement is not None: | |
316 | out += replacement | |
317 | inp = inp[1:] | |
318 | continue | |
319 | elif inp[0] != u'u': | |
320 | self.__error("bad escape \\%s" % inp[0]) | |
321 | return | |
322 | ||
323 | c0 = self.__lex_4hex(inp[1:5]) | |
324 | if c0 is None: | |
325 | return | |
326 | inp = inp[5:] | |
327 | ||
328 | if Parser.__is_leading_surrogate(c0): | |
329 | if inp[:2] != u'\\u': | |
330 | self.__error("malformed escaped surrogate pair") | |
331 | return | |
332 | c1 = self.__lex_4hex(inp[2:6]) | |
333 | if c1 is None: | |
334 | return | |
335 | if not Parser.__is_trailing_surrogate(c1): | |
336 | self.__error("second half of escaped surrogate pair is " | |
337 | "not trailing surrogate") | |
338 | return | |
339 | code_point = Parser.__utf16_decode_surrogate_pair(c0, c1) | |
340 | inp = inp[6:] | |
341 | else: | |
342 | code_point = c0 | |
343 | out += unichr(code_point) | |
344 | self.__parser_input('string', out) | |
345 | ||
346 | def __lex_string_escape(self, c): | |
347 | self.buffer += c | |
348 | self.lex_state = Parser.__lex_string | |
349 | return True | |
350 | def __lex_string(self, c): | |
351 | if c == '\\': | |
352 | self.buffer += c | |
353 | self.lex_state = Parser.__lex_string_escape | |
354 | elif c == '"': | |
355 | self.__lex_finish_string() | |
356 | elif ord(c) >= 0x20: | |
357 | self.buffer += c | |
358 | else: | |
359 | self.__error("U+%04X must be escaped in quoted string" % ord(c)) | |
360 | return True | |
361 | ||
362 | def __lex_input(self, c): | |
363 | self.byte_number += 1 | |
364 | if c == '\n': | |
365 | self.column_number = 0 | |
366 | self.line_number += 1 | |
367 | else: | |
368 | self.column_number += 1 | |
369 | ||
370 | eat = self.lex_state(self, c) | |
371 | assert eat is True or eat is False | |
372 | return eat | |
373 | ||
374 | def __parse_start(self, token, string): | |
375 | if token == '{': | |
376 | self.__push_object() | |
377 | elif token == '[': | |
378 | self.__push_array() | |
379 | else: | |
380 | self.__error("syntax error at beginning of input") | |
381 | def __parse_end(self, token, string): | |
382 | self.__error("trailing garbage at end of input") | |
383 | def __parse_object_init(self, token, string): | |
384 | if token == '}': | |
385 | self.__parser_pop() | |
386 | else: | |
387 | self.__parse_object_name(token, string) | |
388 | def __parse_object_name(self, token, string): | |
389 | if token == 'string': | |
390 | self.member_name = string | |
391 | self.parse_state = Parser.__parse_object_colon | |
392 | else: | |
393 | self.__error("syntax error parsing object expecting string") | |
394 | def __parse_object_colon(self, token, string): | |
395 | if token == ":": | |
396 | self.parse_state = Parser.__parse_object_value | |
397 | else: | |
398 | self.__error("syntax error parsing object expecting ':'") | |
399 | def __parse_object_value(self, token, string): | |
400 | self.__parse_value(token, string, Parser.__parse_object_next) | |
401 | def __parse_object_next(self, token, string): | |
402 | if token == ",": | |
403 | self.parse_state = Parser.__parse_object_name | |
404 | elif token == "}": | |
405 | self.__parser_pop() | |
406 | else: | |
407 | self.__error("syntax error expecting '}' or ','") | |
408 | def __parse_array_init(self, token, string): | |
409 | if token == ']': | |
410 | self.__parser_pop() | |
411 | else: | |
412 | self.__parse_array_value(token, string) | |
413 | def __parse_array_value(self, token, string): | |
414 | self.__parse_value(token, string, Parser.__parse_array_next) | |
415 | def __parse_array_next(self, token, string): | |
416 | if token == ",": | |
417 | self.parse_state = Parser.__parse_array_value | |
418 | elif token == "]": | |
419 | self.__parser_pop() | |
420 | else: | |
421 | self.__error("syntax error expecting ']' or ','") | |
422 | def __parser_input(self, token, string=None): | |
423 | self.lex_state = Parser.__lex_start | |
424 | self.buffer = "" | |
425 | #old_state = self.parse_state | |
426 | self.parse_state(self, token, string) | |
427 | #print ("token=%s string=%s old_state=%s new_state=%s" | |
428 | # % (token, string, old_state, self.parse_state)) | |
429 | ||
430 | def __put_value(self, value): | |
431 | top = self.stack[-1] | |
432 | if type(top) == dict: | |
433 | top[self.member_name] = value | |
434 | else: | |
435 | top.append(value) | |
436 | ||
437 | def __parser_push(self, new_json, next_state): | |
438 | if len(self.stack) < Parser.MAX_HEIGHT: | |
439 | if len(self.stack) > 0: | |
440 | self.__put_value(new_json) | |
441 | self.stack.append(new_json) | |
442 | self.parse_state = next_state | |
443 | else: | |
444 | self.__error("input exceeds maximum nesting depth %d" % | |
445 | Parser.MAX_HEIGHT) | |
446 | def __push_object(self): | |
447 | self.__parser_push({}, Parser.__parse_object_init) | |
448 | def __push_array(self): | |
449 | self.__parser_push([], Parser.__parse_array_init) | |
450 | ||
451 | def __parser_pop(self): | |
452 | if len(self.stack) == 1: | |
453 | self.parse_state = Parser.__parse_end | |
454 | if not self.check_trailer: | |
455 | self.done = True | |
456 | else: | |
457 | self.stack.pop() | |
458 | top = self.stack[-1] | |
459 | if type(top) == list: | |
460 | self.parse_state = Parser.__parse_array_next | |
461 | else: | |
462 | self.parse_state = Parser.__parse_object_next | |
463 | ||
464 | def __parse_value(self, token, string, next_state): | |
465 | if token in [False, None, True] or type(token) in [int, long, float]: | |
466 | self.__put_value(token) | |
467 | elif token == 'string': | |
468 | self.__put_value(string) | |
469 | else: | |
470 | if token == '{': | |
471 | self.__push_object() | |
472 | elif token == '[': | |
473 | self.__push_array() | |
474 | else: | |
475 | self.__error("syntax error expecting value") | |
476 | return | |
477 | self.parse_state = next_state | |
478 | ||
479 | def __error(self, message): | |
480 | if self.error is None: | |
481 | self.error = ("line %d, column %d, byte %d: %s" | |
482 | % (self.line_number, self.column_number, | |
483 | self.byte_number, message)) | |
484 | self.done = True | |
485 | ||
486 | def feed(self, s): | |
487 | i = 0 | |
488 | while True: | |
489 | if self.done or i >= len(s): | |
490 | return i | |
491 | if self.__lex_input(s[i]): | |
492 | i += 1 | |
493 | ||
494 | def is_done(self): | |
495 | return self.done | |
496 | ||
497 | def finish(self): | |
498 | if self.lex_state == Parser.__lex_start: | |
499 | pass | |
500 | elif self.lex_state in (Parser.__lex_string, | |
501 | Parser.__lex_string_escape): | |
502 | self.__error("unexpected end of input in quoted string") | |
503 | else: | |
504 | self.__lex_input(" ") | |
505 | ||
506 | if self.parse_state == Parser.__parse_start: | |
507 | self.__error("empty input stream") | |
508 | elif self.parse_state != Parser.__parse_end: | |
509 | self.__error("unexpected end of input") | |
510 | ||
511 | if self.error == None: | |
512 | assert len(self.stack) == 1 | |
513 | return self.stack.pop() | |
514 | else: | |
515 | return self.error |