python: Make invalid UTF-8 sequence messages consistent across Python versions.
[sliver-openvswitch.git] / python / ovs / json.py
1 # Copyright (c) 2010, 2011 Nicira Networks
2 #
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at:
6 #
7 #     http://www.apache.org/licenses/LICENSE-2.0
8 #
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14
15 import re
16 import StringIO
17 import sys
18
# Maps the code point of every character that must be escaped inside a JSON
# string to its escaped spelling.  All control characters below U+0020 start
# out with a generic \uXXXX escape; the quote, the backslash and the control
# characters that have a short form are then given that form.
escapes = {}
for i in range(32):
    escapes[i] = u"\\u%04x" % i
escapes.update({
    ord("\b"): u"\\b",
    ord("\t"): u"\\t",
    ord("\n"): u"\\n",
    ord("\f"): u"\\f",
    ord("\r"): u"\\r",
    ord('"'): u"\\\"",
    ord("\\"): u"\\\\",
})
29
def __dump_string(stream, s):
    """Writes 's' to 'stream' as a double-quoted JSON string.

    Each character that has an entry in 'escapes' is written as that entry;
    every other character is written unchanged.  'stream' needs only a
    write() method."""
    emit = stream.write
    emit(u"\"")
    for ch in s:
        # Every value in 'escapes' is a non-empty string, so "or" falls back
        # to the character itself exactly when there is no escape for it.
        emit(escapes.get(ord(ch)) or ch)
    emit(u"\"")
40
def to_stream(obj, stream, pretty=False, sort_keys=True):
    """Serializes 'obj' as JSON and writes the text to 'stream'.

    'obj' may be None, True, False, an int, long, float, str, unicode, dict,
    list or tuple.  Lists, tuples and dicts are serialized recursively, and
    dict keys are converted with unicode() before being written as strings.
    Types are matched exactly, so instances of subclasses are rejected.

    'stream' needs only a write() method.  'pretty' is accepted and handed on
    to recursive calls but does not affect the output.  If 'sort_keys' is
    true, the members of each dict are written in the order produced by
    sorting its (key, value) pairs; otherwise they are written in the dict's
    own iteration order.

    Raises TypeError if 'obj', or a value nested inside it, has any other
    type."""
    if obj is None:
        stream.write(u"null")
    elif obj is False:
        stream.write(u"false")
    elif obj is True:
        stream.write(u"true")
    elif type(obj) in (int, long):
        stream.write(u"%d" % obj)
    elif type(obj) == float:
        stream.write("%.15g" % obj)
    elif type(obj) == unicode:
        __dump_string(stream, obj)
    elif type(obj) == str:
        __dump_string(stream, unicode(obj))
    elif type(obj) == dict:
        stream.write(u"{")
        if sort_keys:
            items = sorted(obj.items())
        else:
            items = obj.iteritems()
        first = True
        for key, value in items:
            if not first:
                stream.write(u",")
            first = False
            __dump_string(stream, unicode(key))
            stream.write(u":")
            to_stream(value, stream, pretty, sort_keys)
        stream.write(u"}")
    elif type(obj) in (list, tuple):
        stream.write(u"[")
        first = True
        for value in obj:
            if not first:
                stream.write(u",")
            first = False
            to_stream(value, stream, pretty, sort_keys)
        stream.write(u"]")
    else:
        # Wrap 'obj' in a tuple so that an instance of a tuple subclass is
        # formatted as a single operand instead of being unpacked by "%".
        raise TypeError("can't serialize %s as JSON" % (obj,))
82
def to_file(obj, name, pretty=False, sort_keys=True):
    """Serializes 'obj' as JSON into the file named 'name'.

    The file is opened in mode "w" and is closed again whether or not
    serialization succeeds.  'pretty' and 'sort_keys' are passed through to
    to_stream()."""
    output = open(name, "w")
    try:
        to_stream(obj, output, pretty, sort_keys)
    finally:
        output.close()
89
def to_string(obj, pretty=False, sort_keys=True):
    """Returns 'obj' serialized as a JSON string.

    The text is accumulated in an in-memory StringIO buffer by to_stream(),
    to which 'pretty' and 'sort_keys' are passed through."""
    buf = StringIO.StringIO()
    to_stream(obj, buf, pretty, sort_keys)
    text = buf.getvalue()
    buf.close()
    return text
96
def from_stream(stream):
    """Parses the JSON text read from 'stream' and returns the result.

    Input is read in chunks of up to 4096 through stream.read() and fed to a
    Parser that checks for trailing garbage.  Reading stops when read()
    returns an empty string or when the parser declines to consume a whole
    chunk.  The return value is whatever Parser.finish() returns: the parsed
    value, or a string describing the error."""
    parser = Parser(check_trailer=True)
    chunk = stream.read(4096)
    while chunk != "" and parser.feed(chunk) == len(chunk):
        chunk = stream.read(4096)
    return parser.finish()
104
def from_file(name):
    """Parses the JSON text in the file named 'name'.

    The file is opened in mode "r" and is closed again whether or not
    parsing raises.  Returns what from_stream() returns for the file."""
    source = open(name, "r")
    try:
        result = from_stream(source)
    finally:
        source.close()
    return result
111
112 def from_string(s):
113     try:
114         s = unicode(s, 'utf-8')
115     except UnicodeDecodeError, e:
116         seq = ' '.join(["0x%2x" % ord(c)
117                         for c in e.object[e.start:e.end] if ord(c) >= 0x80])
118         return ("not a valid UTF-8 string: invalid UTF-8 sequence %s" % seq)
119     p = Parser(check_trailer=True)
120     p.feed(s)
121     return p.finish()
122
class Parser(object):
    """Incremental JSON parser.

    Input is supplied a piece at a time with feed().  finish() then returns
    either the parsed value or, if the input was not acceptable, a string
    that describes the first error encountered.  The top-level value must be
    an object or an array.

    Internally there are two state machines: a lexer ('lex_state') that
    groups input characters into tokens, and a parser ('parse_state') that
    consumes those tokens and assembles the result on 'stack'.  Each state is
    one of this class's methods, fetched through the class (for example
    Parser.__lex_start) and therefore called with 'self' passed explicitly."""

    ## Maximum height of parsing stack. ##
    MAX_HEIGHT = 1000

    # 'long', 'unichr' and 'sys.maxint' exist only on Python 2.  They are
    # resolved once here, with fallbacks to the Python 3 spellings, so that
    # the methods below do not refer to interpreter-specific names.
    try:
        __number_types = (int, long, float)
    except NameError:
        __number_types = (int, float)
    try:
        __unichr = unichr
    except NameError:
        __unichr = chr
    __max_int = getattr(sys, "maxint", None)
    if __max_int is None:
        __max_int = sys.maxsize

    def __init__(self, check_trailer=False):
        """Initializes a parser that has seen no input yet.

        If 'check_trailer' is true, the parser keeps accepting input after
        the top-level value is complete, so that anything other than
        whitespace following it is reported as an error.  Otherwise the
        parser is done as soon as the top-level value ends."""
        self.check_trailer = check_trailer

        # Lexical analysis.
        self.lex_state = Parser.__lex_start
        self.buffer = ""            # Text of the token being accumulated.
        # Position of the input consumed so far, used in error messages.
        self.line_number = 0
        self.column_number = 0
        self.byte_number = 0

        # Parsing.
        self.parse_state = Parser.__parse_start
        self.stack = []             # Containers currently being filled.
        self.member_name = None     # Name of the object member being parsed.

        # Parse status.
        self.done = False
        self.error = None

    def __lex_start_space(self, c):
        """Start-state action for whitespace: skips it."""
        pass

    def __lex_start_alpha(self, c):
        """Start-state action for a lowercase letter: begins a keyword."""
        self.buffer = c
        self.lex_state = Parser.__lex_keyword

    def __lex_start_token(self, c):
        """Start-state action for punctuation: 'c' is itself a token."""
        self.__parser_input(c)

    def __lex_start_number(self, c):
        """Start-state action for '-' or a digit: begins a number."""
        self.buffer = c
        self.lex_state = Parser.__lex_number

    def __lex_start_string(self, c):
        """Start-state action for '"': begins a quoted string."""
        self.lex_state = Parser.__lex_string

    def __lex_start_error(self, c):
        """Start-state action for any character that cannot begin a token."""
        if ord(c) >= 32 and ord(c) < 128:
            self.__error("invalid character '%s'" % c)
        else:
            self.__error("invalid character U+%04x" % ord(c))

    # Dispatch table for __lex_start(): maps a character to the action that
    # handles it.  Characters without an entry go to __lex_start_error().
    __lex_start_actions = {}
    for c in " \t\n\r":
        __lex_start_actions[c] = __lex_start_space
    for c in "abcdefghijklmnopqrstuvwxyz":
        __lex_start_actions[c] = __lex_start_alpha
    for c in "[{]}:,":
        __lex_start_actions[c] = __lex_start_token
    for c in "-0123456789":
        __lex_start_actions[c] = __lex_start_number
    __lex_start_actions['"'] = __lex_start_string

    def __lex_start(self, c):
        """Lexer state between tokens.  Always consumes 'c'."""
        Parser.__lex_start_actions.get(
            c, Parser.__lex_start_error)(self, c)
        return True

    # Characters that may continue a keyword (used as a set).
    __lex_alpha = {}
    for c in "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ":
        __lex_alpha[c] = True

    def __lex_finish_keyword(self):
        """Turns the keyword accumulated in 'buffer' into a token, or
        reports an error if it is not "false", "true" or "null"."""
        if self.buffer == "false":
            self.__parser_input(False)
        elif self.buffer == "true":
            self.__parser_input(True)
        elif self.buffer == "null":
            self.__parser_input(None)
        else:
            self.__error("invalid keyword '%s'" % self.buffer)

    def __lex_keyword(self, c):
        """Lexer state inside a keyword.  Consumes letters; any other
        character ends the keyword and is left for the next state."""
        if c in Parser.__lex_alpha:
            self.buffer += c
            return True
        else:
            self.__lex_finish_keyword()
            return False

    # Groups: sign, integer part, fraction digits, exponent.
    __number_re = re.compile(
        r"(-)?(0|[1-9][0-9]*)(?:\.([0-9]+))?(?:[eE]([-+]?[0-9]+))?$")

    def __lex_finish_number(self):
        """Turns the number accumulated in 'buffer' into a token.

        A number whose value is an integer in the range -2**63...2**63-1
        yields an integer token, even if it was written with a fraction or
        an exponent.  Any other valid number yields a float token, unless it
        is too large for a float, which is an error.  Malformed numbers
        produce an error that names the problem when it can be
        identified."""
        s = self.buffer
        m = Parser.__number_re.match(s)
        if m:
            sign, integer, fraction, exp = m.groups()
            exponent = 0
            if exp is not None:
                exponent = int(exp)
                if (exponent > Parser.__max_int or
                    exponent < -Parser.__max_int - 1):
                    self.__error("exponent outside valid range")
                    return

            # A fraction that consists only of zeros contributes nothing.
            if fraction is not None and len(fraction.lstrip('0')) == 0:
                fraction = None

            # Express the number as significand * 10**pow10, where
            # 'significand' is a nonnegative integer.
            sig_string = integer
            if fraction is not None:
                sig_string += fraction
            significand = int(sig_string)

            pow10 = exponent
            if fraction is not None:
                pow10 -= len(fraction)

            if significand == 0:
                self.__parser_input(0)
                return
            elif significand <= 2 ** 63:
                # Try to bring pow10 to zero without leaving the 64-bit
                # range: absorb a positive power of ten into the
                # significand, or strip trailing zeros for a negative one.
                while pow10 > 0 and significand <= 2 ** 63:
                    significand *= 10
                    pow10 -= 1
                while pow10 < 0 and significand % 10 == 0:
                    significand //= 10
                    pow10 += 1
                if (pow10 == 0 and
                    ((not sign and significand < 2 ** 63) or
                     (sign and significand <= 2 ** 63))):
                    if sign:
                        self.__parser_input(-significand)
                    else:
                        self.__parser_input(significand)
                    return

            value = float(s)
            if value == float("inf") or value == float("-inf"):
                self.__error("number outside valid range")
                return
            if value == 0:
                # Suppress negative zero.
                value = 0
            self.__parser_input(value)
        elif re.match("-?0[0-9]", s):
            self.__error("leading zeros not allowed")
        elif re.match("-([^0-9]|$)", s):
            self.__error("'-' must be followed by digit")
        elif re.match(r"-?(0|[1-9][0-9]*)\.([^0-9]|$)", s):
            self.__error("decimal point must be followed by digit")
        elif re.search("[eE][-+]?([^0-9]|$)", s):
            # The lexer accepts both 'e' and 'E', so diagnose both.
            self.__error("exponent must contain at least one digit")
        else:
            self.__error("syntax error in number")

    def __lex_number(self, c):
        """Lexer state inside a number.  Consumes any character that can
        appear in a number; any other character ends the number and is left
        for the next state."""
        if c in ".0123456789eE-+":
            self.buffer += c
            return True
        else:
            self.__lex_finish_number()
            return False

    __4hex_re = re.compile("[0-9a-fA-F]{4}")

    def __lex_4hex(self, s):
        """Returns the value of 's', the (up to) 4 characters that follow
        "\\u" in a string, as an integer.  Reports an error and returns None
        if 's' is too short, is not 4 hex digits, or is "0000"."""
        if len(s) < 4:
            self.__error("quoted string ends within \\u escape")
        elif not Parser.__4hex_re.match(s):
            self.__error("malformed \\u escape")
        elif s == "0000":
            self.__error("null bytes not supported in quoted strings")
        else:
            return int(s, 16)
        return None

    @staticmethod
    def __is_leading_surrogate(c):
        """Returns true if 'c' is a Unicode code point for a leading
        surrogate."""
        return c >= 0xd800 and c <= 0xdbff

    @staticmethod
    def __is_trailing_surrogate(c):
        """Returns true if 'c' is a Unicode code point for a trailing
        surrogate."""
        return c >= 0xdc00 and c <= 0xdfff

    @staticmethod
    def __utf16_decode_surrogate_pair(leading, trailing):
        """Returns the unicode code point corresponding to leading surrogate
        'leading' and trailing surrogate 'trailing'.  The return value will not
        make any sense if 'leading' or 'trailing' are not in the correct ranges
        for leading or trailing surrogates."""
        #  Leading surrogate:         110110wwwwxxxxxx
        # Trailing surrogate:         110111xxxxxxxxxx
        #         Code point: 000uuuuuxxxxxxxxxxxxxxxx
        w = (leading >> 6) & 0xf
        u = w + 1
        x0 = leading & 0x3f
        x1 = trailing & 0x3ff
        return (u << 16) | (x0 << 10) | x1

    # Replacement for each single-character escape that may follow a
    # backslash in a quoted string.
    __unescape = {'"': u'"',
                  "\\": u"\\",
                  "/": u"/",
                  "b": u"\b",
                  "f": u"\f",
                  "n": u"\n",
                  "r": u"\r",
                  "t": u"\t"}

    def __lex_finish_string(self):
        """Turns the raw string contents accumulated in 'buffer' into a
        'string' token, expanding backslash escapes (including \\u escapes
        and escaped surrogate pairs).  Reports an error instead if an escape
        is malformed."""
        inp = self.buffer
        out = u""
        while len(inp):
            backslash = inp.find('\\')
            if backslash == -1:
                out += inp
                break
            out += inp[:backslash]
            inp = inp[backslash + 1:]
            if inp == "":
                self.__error("quoted string may not end with backslash")
                return

            replacement = Parser.__unescape.get(inp[0])
            if replacement is not None:
                out += replacement
                inp = inp[1:]
                continue
            elif inp[0] != u'u':
                self.__error("bad escape \\%s" % inp[0])
                return

            c0 = self.__lex_4hex(inp[1:5])
            if c0 is None:
                return
            inp = inp[5:]

            if Parser.__is_leading_surrogate(c0):
                # A leading surrogate must be followed immediately by an
                # escaped trailing surrogate.
                if inp[:2] != u'\\u':
                    self.__error("malformed escaped surrogate pair")
                    return
                c1 = self.__lex_4hex(inp[2:6])
                if c1 is None:
                    return
                if not Parser.__is_trailing_surrogate(c1):
                    self.__error("second half of escaped surrogate pair is "
                                 "not trailing surrogate")
                    return
                code_point = Parser.__utf16_decode_surrogate_pair(c0, c1)
                inp = inp[6:]
            else:
                code_point = c0
            out += Parser.__unichr(code_point)
        self.__parser_input('string', out)

    def __lex_string_escape(self, c):
        """Lexer state just after a backslash inside a string.  Consumes 'c'
        unconditionally, so that an escaped '"' does not end the string."""
        self.buffer += c
        self.lex_state = Parser.__lex_string
        return True

    def __lex_string(self, c):
        """Lexer state inside a quoted string.  Always consumes 'c'."""
        if c == '\\':
            self.buffer += c
            self.lex_state = Parser.__lex_string_escape
        elif c == '"':
            self.__lex_finish_string()
        elif ord(c) >= 0x20:
            self.buffer += c
        else:
            self.__error("U+%04X must be escaped in quoted string" % ord(c))
        return True

    def __lex_input(self, c):
        """Passes 'c' to the current lexer state.  Returns True if the state
        consumed 'c', or False if 'c' ended a token and must be presented
        again to the state that follows.

        Position tracking is left to feed(), which counts a character only
        once it has been consumed.  Otherwise a character that is presented
        twice, or the synthetic space that finish() supplies, would be
        counted as additional input."""
        eat = self.lex_state(self, c)
        assert eat is True or eat is False
        return eat

    def __parse_start(self, token, string):
        """Parser state at the beginning of input."""
        if token == '{':
            self.__push_object()
        elif token == '[':
            self.__push_array()
        else:
            self.__error("syntax error at beginning of input")

    def __parse_end(self, token, string):
        """Parser state after the top-level value: any token is an error."""
        self.__error("trailing garbage at end of input")

    def __parse_object_init(self, token, string):
        """Parser state just after '{': expects '}' or a member name."""
        if token == '}':
            self.__parser_pop()
        else:
            self.__parse_object_name(token, string)

    def __parse_object_name(self, token, string):
        """Parser state that expects an object member name."""
        if token == 'string':
            self.member_name = string
            self.parse_state = Parser.__parse_object_colon
        else:
            self.__error("syntax error parsing object expecting string")

    def __parse_object_colon(self, token, string):
        """Parser state that expects the ':' after a member name."""
        if token == ":":
            self.parse_state = Parser.__parse_object_value
        else:
            self.__error("syntax error parsing object expecting ':'")

    def __parse_object_value(self, token, string):
        """Parser state that expects the value of an object member."""
        self.__parse_value(token, string, Parser.__parse_object_next)

    def __parse_object_next(self, token, string):
        """Parser state after an object member: expects ',' or '}'."""
        if token == ",":
            self.parse_state = Parser.__parse_object_name
        elif token == "}":
            self.__parser_pop()
        else:
            self.__error("syntax error expecting '}' or ','")

    def __parse_array_init(self, token, string):
        """Parser state just after '[': expects ']' or an element."""
        if token == ']':
            self.__parser_pop()
        else:
            self.__parse_array_value(token, string)

    def __parse_array_value(self, token, string):
        """Parser state that expects an array element."""
        self.__parse_value(token, string, Parser.__parse_array_next)

    def __parse_array_next(self, token, string):
        """Parser state after an array element: expects ',' or ']'."""
        if token == ",":
            self.parse_state = Parser.__parse_array_value
        elif token == "]":
            self.__parser_pop()
        else:
            self.__error("syntax error expecting ']' or ','")

    def __parser_input(self, token, string=None):
        """Hands a completed token to the current parser state and resets
        the lexer for the next token.

        'token' is a punctuation character, the literal 'string' (in which
        case 'string' carries the text), or the value of a keyword or
        number."""
        self.lex_state = Parser.__lex_start
        self.buffer = ""
        self.parse_state(self, token, string)

    def __put_value(self, value):
        """Adds 'value' to the container on top of the stack: under
        'member_name' for an object, at the end for an array."""
        top = self.stack[-1]
        if type(top) == dict:
            top[self.member_name] = value
        else:
            top.append(value)

    def __parser_push(self, new_json, next_state):
        """Begins the new container 'new_json': adds it to the enclosing
        container, if any, pushes it on the stack, and enters 'next_state'.
        Reports an error instead if the stack is already MAX_HEIGHT deep."""
        if len(self.stack) < Parser.MAX_HEIGHT:
            if len(self.stack) > 0:
                self.__put_value(new_json)
            self.stack.append(new_json)
            self.parse_state = next_state
        else:
            self.__error("input exceeds maximum nesting depth %d" %
                         Parser.MAX_HEIGHT)

    def __push_object(self):
        """Begins a new object."""
        self.__parser_push({}, Parser.__parse_object_init)

    def __push_array(self):
        """Begins a new array."""
        self.__parser_push([], Parser.__parse_array_init)

    def __parser_pop(self):
        """Ends the container on top of the stack.

        The top-level container stays on the stack for finish() to return.
        For a nested container, parsing resumes in the "next" state of the
        container that encloses it."""
        if len(self.stack) == 1:
            self.parse_state = Parser.__parse_end
            if not self.check_trailer:
                self.done = True
        else:
            self.stack.pop()
            top = self.stack[-1]
            if type(top) == list:
                self.parse_state = Parser.__parse_array_next
            else:
                self.parse_state = Parser.__parse_object_next

    def __parse_value(self, token, string, next_state):
        """Handles a token in a position where a value is expected.

        A scalar is stored in the current container and parsing continues in
        'next_state'.  '{' or '[' begins a nested container, whose own states
        take over.  Any other token is an error."""
        if (token in [False, None, True] or
            type(token) in Parser.__number_types):
            self.__put_value(token)
        elif token == 'string':
            self.__put_value(string)
        else:
            if token == '{':
                self.__push_object()
            elif token == '[':
                self.__push_array()
            else:
                self.__error("syntax error expecting value")
            return
        self.parse_state = next_state

    def __error(self, message):
        """Records 'message', prefixed with the current input position, as
        the parse error and marks the parser done.  Only the first error is
        kept; later ones are ignored."""
        if self.error is None:
            self.error = ("line %d, column %d, byte %d: %s"
                          % (self.line_number, self.column_number,
                             self.byte_number, message))
            self.done = True

    def feed(self, s):
        """Feeds the characters of 's' to the parser, stopping early if the
        parser becomes done.  Returns the number of leading characters of
        's' that were consumed."""
        i = 0
        while True:
            if self.done or i >= len(s):
                return i

            c = s[i]
            if self.__lex_input(c):
                # Advance the position only for input that was actually
                # consumed; a rejected character is presented again on the
                # next iteration.
                self.byte_number += 1
                if c == '\n':
                    self.column_number = 0
                    self.line_number += 1
                else:
                    self.column_number += 1
                i += 1

    def is_done(self):
        """Returns true if the parser will not accept more input, because
        of an error or, without 'check_trailer', because the top-level value
        is complete."""
        return self.done

    def finish(self):
        """Signals the end of input and returns the outcome: the parsed
        top-level value on success, otherwise a string that describes the
        first error."""
        if self.lex_state == Parser.__lex_start:
            pass
        elif self.lex_state in (Parser.__lex_string,
                                Parser.__lex_string_escape):
            self.__error("unexpected end of input in quoted string")
        else:
            # A keyword or number is still pending; a space terminates it.
            self.__lex_input(" ")

        if self.parse_state == Parser.__parse_start:
            self.__error("empty input stream")
        elif self.parse_state != Parser.__parse_end:
            self.__error("unexpected end of input")

        if self.error is None:
            assert len(self.stack) == 1
            return self.stack.pop()
        else:
            return self.error