-
Notifications
You must be signed in to change notification settings - Fork 2
/
jsonfsm.py
380 lines (308 loc) · 11.6 KB
/
jsonfsm.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
#-*- coding: utf-8 -*-
# Copyright (C) 2011 Guilherme P. Gonçalves
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
# of the Software, and to permit persons to whom the Software is furnished to do
# so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
'''
JSON FSM is a JSON parser build from finite state machines implemended as
Python coroutines. It has been tested on Python 2.7 only. I do plan on
porting to Python 3 eventually, but don't hold me to it :)
Currently only decoding is implemented. There are no plans to support encoding
right now.
As you might expect, JSON objects are decoded into Python dictionaries, JSON
strings into Python unicode strings, JSON arrays into Python lists and JSON
numbers into Python floats. The boolean values also have direct correspondence,
and null is decoded to None.
This parser should be mostly compliant to the high-level specification at
http://json.org (in other words, I tried to implement the railroad diagrams).
JSON FSM is a toy project developed during my spare time, as an exploration of
the coroutines concept. As such, its usage is mainly recommended for educational
purposes. It may contain several show-stopping bugs, but works neatly most of
the time.
JSON FSM implements state machines for parsing each of the basic types of JSON
objects, along with a type-generic coroutine and the familiar loads(), all of
which have a simple interface. For instance, parsing a string should be
something like:
>>> parser = jsonfsm.string_fsm()
>>> for c in json:
... value = parser.send(c) # send each character in the json
>>> print value # the JSON string
Please consult the docstrings of the individual coroutines for more information.
AUTHOR
Guilherme P. Gonçalves, guilherme.p.gonc@gmail.com
'''
NOT_PARSED_YET = object()
class JSONParseError(Exception):
pass
def coroutine(f):
def start(*args, **kwargs):
cr = f(*args, **kwargs)
cr.next()
return cr
return start
@coroutine
def string_fsm():
'''
A coroutine implementing a finite state machine for parsing JSON strings.
Accepts the string one character at a time, and yields NOT_PARSED_YET until
the string has been successfully parsed.
Once the JSON string has been parsed, yields the corresponding Python
string. The coroutine can't be used after that.
May raise JSONParseError on malformed strings.
Expects data with no leading or trailing whitespace (i.e., the first and
last input characters should be ").
'''
value = []
c = (yield NOT_PARSED_YET)
if c != '"':
raise JSONParseError("JSON strings must start with a quote")
while True:
c = (yield NOT_PARSED_YET)
if c == '"':
# end of string
break
elif c == '\\':
c = (yield NOT_PARSED_YET)
if c == 'u':
# unicode 4-digit hexadecimal
hexval = ""
for i in range(4):
hexval += (yield NOT_PARSED_YET)
value.append(unichr(int(hexval, 16)))
elif c == 'b': value.append('\b')
elif c == 'f': value.append('\f')
elif c == 'n': value.append('\n')
elif c == 'r': value.append('\r')
elif c == 't': value.append('\t')
elif c in ('"', '\\', '/'): value.append(c)
else: raise JSONParseError("Invalid escape character")
else:
value.append(c)
yield ''.join(value)
@coroutine
def number_fsm():
'''
A coroutine implementing a finite state machine for parsing JSON numbers.
Accepts the characters of the number string one at a time.
Since numbers don't have delimiters, this coroutine always yields partial
results when they are available. Therefore, when parsing, e.g., 12.45, this
coroutine yields 1.0, 12.0, NOT_PARSED_YET, 12.4 and finally 12.45 before
stopping.
Always returns JSON numbers as Python floats and may raise JSONParseError
for invalid numbers.
Expects input characters with no leading or trailing whitespace.
'''
digits = []
# we actually only validate the number, and let Python's builtin float()
# do the conversion inside this function
def build_number():
number_string = ''.join(digits)
return float(number_string)
c = (yield NOT_PARSED_YET)
if c == '-':
digits.append(c)
c = (yield NOT_PARSED_YET)
# parse integer part
# this must either be a single zero or a non-zero digit followed by a
# sequence of other digits
if not c.isdigit():
raise JSONParseError("Unexpected leading nondigit: %s" % c)
digits.append(c)
if c != '0':
while True:
c = (yield build_number())
if c.isdigit():
digits.append(c)
else:
break
else:
c = (yield 0)
if c.isdigit():
raise JSONParseError("Unexpected digit: %s" % c)
if c == '.':
digits.append(c)
c = (yield NOT_PARSED_YET)
# parse fractional part, if any
while True:
if not c.isdigit():
if c.lower() == 'e':
break
raise JSONParseError("Unexpected character in number: %s" % c)
digits.append(c)
c = (yield build_number())
assert not c.isdigit()
# handle scientific notation
if c.lower() == 'e':
digits.append(c)
c = (yield NOT_PARSED_YET)
if c == '-':
digits.append(c)
c = (yield NOT_PARSED_YET)
if not c.isdigit():
raise JSONParseError("Unexpected character in number: %s" % c)
while c.isdigit():
digits.append(c)
c = (yield build_number())
raise JSONParseError("Unexpected character in number: %s" % c)
else:
raise JSONParseError("Unexpected character in number: %s" % c)
yield build_number()
@coroutine
def array_fsm():
'''
A coroutine implementing a finite state machine for parsing JSON arrays.
Thie coroutine will yield NOT_PARSED_YET until the array has been
successfully parsed, at which point a Python list representing the array
will be yielded.
May raise JSONParseError for invalid objects.
Expects input data with no leading or trailing whitespace, that is, the
first input character must be [ and the last must be ].
'''
array = []
c = (yield NOT_PARSED_YET)
if c != '[':
raise JSONParseError("Arrays must begin with [")
c = (yield NOT_PARSED_YET)
while c.isspace():
c = (yield NOT_PARSED_YET)
while c != ']':
vp = value_fsm()
value = NOT_PARSED_YET
# look out for , and ] since they act as delimiters for numbers.
while c not in (',', ']') or value is NOT_PARSED_YET:
value = vp.send(c)
c = (yield NOT_PARSED_YET)
if value is not NOT_PARSED_YET:
while c.isspace():
c = (yield NOT_PARSED_YET)
# if we stopped at a , advance until the start of the next value
# (and raise exception if the array ends)
if c == ',':
c = (yield NOT_PARSED_YET)
while c.isspace():
c = (yield NOT_PARSED_YET)
if c == ']':
raise JSONParseError("Extra , before ] in array")
array.append(value)
yield array
@coroutine
def object_fsm():
'''
A coroutine implementing a finite state machine for parsing JSON objects.
This coroutine will yield NOT_PARSED_YET until the object has been
successfully parsed, at which point a Python dictionary representing the
object will be yielded.
May raise JSONParseError for invalid objects.
Expects input data without leading or trailing whitespace, that is, the
first input character must be { and the last }.
'''
obj = {}
c = (yield NOT_PARSED_YET)
if c != '{':
raise JSONParseError("Objects must begin with {")
c = (yield NOT_PARSED_YET)
while c.isspace():
c = (yield NOT_PARSED_YET)
while c != '}':
# parse attribute key
key = NOT_PARSED_YET
sp = string_fsm()
while key is NOT_PARSED_YET:
key = sp.send(c)
c = (yield NOT_PARSED_YET)
while c.isspace():
c = (yield NOT_PARSED_YET)
if c != ':':
raise JSONParseError("Missing : between attribute name and value")
c = (yield NOT_PARSED_YET)
while c.isspace():
c = (yield NOT_PARSED_YET)
# parse value
value = NOT_PARSED_YET
vp = value_fsm()
# look out for , and } since they act as delimiters for numbers
while c not in (',', '}') or value is NOT_PARSED_YET:
value = vp.send(c)
c = (yield NOT_PARSED_YET)
if value is not NOT_PARSED_YET:
while c.isspace():
c = (yield NOT_PARSED_YET)
# if we stopped at a , advance until the start of the next key/value
# pair (and raise exception if the object ends)
if c == ',':
c = (yield NOT_PARSED_YET)
while c.isspace():
c = (yield NOT_PARSED_YET)
if c == '}':
raise JSONParseError("Extra , before } in object")
obj[key] = value
yield obj
@coroutine
def value_fsm():
'''
A coroutine implementing a finite state machine for parsing an arbitrary
JSON value.
Expects the value to have no leading or trailing whitespace.
'''
@coroutine
def string_matcher_fsm(match, retval):
'''
A simple finite state machine for string matching.
'''
while match:
if (yield NOT_PARSED_YET) == match[0]:
match = match[1:]
else:
raise JSONParseError
yield retval
# try all available parsers on the input data, only one should
# succeed.
parsers = [
number_fsm(),
object_fsm(),
array_fsm(),
string_fsm(),
string_matcher_fsm("null", None),
string_matcher_fsm("false", False),
string_matcher_fsm("true", True),
]
# find the correct parser (all others should fail in the first character)
c = (yield NOT_PARSED_YET)
for p in parsers:
try:
value = p.send(c)
except JSONParseError:
pass
else:
break
else:
raise JSONParseError("Failed to parse JSON value")
while True:
value = p.send((yield value))
def loads(json, encoding = "utf-8"):
'''
Load a JSON object from a string.
'''
if not isinstance(json, unicode):
json = unicode(json, encoding)
parser = value_fsm()
for c in json.strip():
ret = parser.send(c)
if ret is NOT_PARSED_YET:
raise JSONParseError("Failed to parse JSON value")
return ret