data_lang/j8.py

OILS / data_lang / j8.py View on Github | oilshell.org

1053 lines, 516 significant

1	#!/usr/bin/env python2
2	"""
3	j8.py: J8 Notation, a superset of JSON
4
5	TODO:
6
7	- Many more tests
8	- Run JSONTestSuite
9
10	Later:
11
12	- PrettyPrinter uses hnode.asdl?
13	- color
14	- line wrapping -- do this later
15	- would like CONTRIBUTORS here
16
17	- Unify with ASDL pretty printing - NIL8
18	- {} [] are identical
19	- () is for statically typed ASDL data
20	(command.Simple blame_tok:(...) words:[ ])
21	although we are also using [] for typed ASDL arrays, not just JSON
22	- object IDs
23	- @ x123 can create an ID
24	- ! x123 can reference an ID
25	- <> can be for non-J8 data types? For the = operator
26	- 'hi \(name)' interpolation is useful for code
27
28	- Common between JSON8 and NIL8 - for writing by hand
29	- comments - # line or // line (JSON5 uses // line, following JS)
30	- unquoted identifier names - TYG8 could be more relaxed for (+ 1 (* 3 4))
31	- commas
32	- JSON8 could have trailing commas rule
33	- NIL8 at least has no commas for [1 2 "hi"]
34	"""
35
36	from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
37	from _devbuild.gen.value_asdl import (value, value_e, value_t, value_str)
38	from _devbuild.gen.nil8_asdl import (nvalue, nvalue_t)
39
40	from asdl import format as fmt
41	from core import error
42	from data_lang import pyj8
43	# dependency issue: consts.py pulls in frontend/option_def.py
44	from frontend import consts
45	from frontend import match
46	from mycpp import mops
47	from mycpp import mylib
48	from mycpp.mylib import tagswitch, iteritems, NewDict, log
49
50	import fastfunc
51
52	_ = log
53
54	from typing import cast, Dict, List, Tuple, Optional
55
56
57	# COPIED from ui.ValType() to break dep
def ValType(val):
    # type: (value_t) -> str
    """Return the type name of a value, for displaying type errors in the UI.

    The name is whatever value_str() gives for the value's tag, requested
    with dot=False.
    """
    tag = val.tag()
    return value_str(tag, dot=False)
63
64
if mylib.PYTHON:

    def HeapValueId(val):
        # type: (value_t) -> int
        """Return an integer identifying this particular object.

        This Python definition uses id(), which returns the address and can
        be up to 64 bits.  In C++ we can use the GC ID, which fits within 32
        bits.
        """
        obj_id = id(val)
        return obj_id
75
76
def ValueId(val):
    # type: (value_t) -> int
    """Return an integer ID for an object, or -1 if it has no identity.

    The ID:

    1. Can be used to determine whether 2 objects are the same, e.g. for
       List, Dict, Func, Proc, etc.
    2. Will help detect object cycles

    Primitive types like Int and Float don't have this notion.  They're
    immutable values that are copied and compared by value, so -1 is
    returned for them.
    """
    result = -1  # the answer for value types

    with tagswitch(val) as case:
        if case(value_e.Null, value_e.Bool, value_e.Int, value_e.Float,
                value_e.Str):
            # These will not be on the heap if we switch to tagged pointers.
            # Str is handled conservatively - when we add small string
            # optimization, some strings will be values, so we assume all
            # are.
            pass
        else:
            result = HeapValueId(val)

    return result
98
99
def ValueIdString(val):
    # type: (value_t) -> str
    """Format the object ID as ' 0x<hex>', or '' when ValueId() gives -1.

    Used by pp value (42) and = 42
    """
    heap_id = ValueId(val)
    if heap_id != -1:
        return ' 0x%s' % mylib.hex_lower(heap_id)

    # Values without an identity get no suffix at all
    return ''
108
109
def Utf8Encode(code):
    # type: (int) -> str
    """Return utf-8 encoded bytes from a unicode code point.

    Code points above 0x10FFFF are encoded as the unicode replacement
    character (EF BF BD).

    Based on https://stackoverflow.com/a/23502707
    """
    num_cont_bytes = 0

    # The two cases that need no continuation-byte loop
    if code <= 0x7F:
        return chr(code & 0x7F)  # ASCII
    if code > 0x10FFFF:
        return '\xEF\xBF\xBD'  # unicode replacement character

    if code <= 0x7FF:
        num_cont_bytes = 1
    elif code <= 0xFFFF:
        num_cont_bytes = 2
    else:
        num_cont_bytes = 3

    # Continuation bytes are produced from the low bits first, so the list
    # is built backwards and reversed at the end.
    encoded = []  # type: List[int]
    for _ in xrange(num_cont_bytes):
        low_six = code & 0x3F
        encoded.append(0x80 | low_six)
        code >>= 6

    # Leading byte: length marker in the high bits, remaining payload below.
    marker = 0x1E << (6 - num_cont_bytes)
    payload_mask = 0x3F >> num_cont_bytes
    encoded.append(marker | (code & payload_mask))
    encoded.reverse()

    # mod 256 because Python ints don't wrap around!
    chars = [chr(b & 0xFF) for b in encoded]
    return ''.join(chars)
143
144
# Bit flags for the 'options' argument of _Print() and InstancePrinter.
SHOW_CYCLES = 1 << 1  # show a cycle as [ -->0x123 ] or { -->0x123 }, with the
# object ID, rather than raising error.Encode
SHOW_NON_DATA = 1 << 2  # non-data objects like Eggex can be <Eggex 0xff>
LOSSY_JSON = 1 << 3  # JSON is lossy

# Hack until we fully translate: this flag is duplicated in pyj8, and the two
# definitions must agree.
assert pyj8.LOSSY_JSON == LOSSY_JSON
151
152
def _Print(val, buf, indent, options=0):
    # type: (value_t, mylib.BufWriter, int, int) -> None
    """Write val to buf with a fresh InstancePrinter.

    Args:
      indent: number of spaces to indent, or -1 for everything on one line
      options: bitwise OR of SHOW_CYCLES, SHOW_NON_DATA, LOSSY_JSON
    """
    printer = InstancePrinter(buf, indent, options)
    printer.Print(val)
161
162
def PrintMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Print val with no option flags set.

    For json8 write (x) and toJson8()

    Caller must handle error.Encode
    """
    _Print(val, buf, indent, options=0)
170
171
def PrintJsonMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Print val with the LOSSY_JSON option.

    For json write (x) and toJson()

    Caller must handle error.Encode()
    Doesn't decay to b'' strings - will use Unicode replacement char.
    """
    json_options = LOSSY_JSON
    _Print(val, buf, indent, options=json_options)
180
181
def PrintLine(val, f):
    # type: (value_t, mylib.Writer) -> None
    """Write val on a single line, followed by a newline.

    For pp line (x)
    """
    # error.Encode should be impossible - we show cycles and non-data
    flags = SHOW_CYCLES | SHOW_NON_DATA

    line_buf = mylib.BufWriter()
    _Print(val, line_buf, -1, options=flags)

    f.write(line_buf.getvalue())
    f.write('\n')
191
192
def EncodeString(s, buf, unquoted_ok=False):
    # type: (str, mylib.BufWriter, bool) -> None
    """Write s to buf, unquoted if allowed and possible, else as a J8 string.

    For pp proc, etc.

    Args:
      unquoted_ok: when true, and fastfunc.CanOmitQuotes(s) agrees, s is
                   written verbatim
    """
    if unquoted_ok and fastfunc.CanOmitQuotes(s):
        buf.write(s)
    else:
        _Print(value.Str(s), buf, -1)
202
203
def MaybeEncodeString(s):
    # type: (str) -> str
    """Return s encoded as a one-line J8 string.

    For write --json8 $s and compexport
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    str_val = value.Str(s)
    _Print(str_val, out, -1)
    return out.getvalue()
214
215
def MaybeEncodeJsonString(s):
    # type: (str) -> str
    """Return s encoded as a one-line string, with the LOSSY_JSON option.

    For write --json
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    str_val = value.Str(s)
    _Print(str_val, out, -1, options=LOSSY_JSON)
    return out.getvalue()
225
226
# DFS traversal state, stored per container in InstancePrinter.visited
UNSEEN = 0  # default for containers with no entry yet
EXPLORING = 1  # set while the container's children are being printed
FINISHED = 2  # set once the container has been printed completely
231
232
class InstancePrinter(object):
    """Print a value tree as J8/JSON.

    Output goes to a BufWriter.  Cycles among List and Dict values are
    detected with a DFS over object IDs; see Print().
    """

    def __init__(self, buf, indent, options):
        # type: (mylib.BufWriter, int, int) -> None
        """
        Args:
          buf: where output is written
          indent: number of spaces per nesting level, or -1 for everything
                  on one line
          options: bitwise OR of SHOW_CYCLES, SHOW_NON_DATA, LOSSY_JSON
        """
        self.buf = buf
        self.indent = indent
        self.options = options

        # Key is HeapValueId(val)
        # Value is the DFS state: EXPLORING or FINISHED (UNSEEN is the
        # default when there's no entry)
        # Dict[int, None] doesn't translate -- it would be nice to have a set()
        self.visited = {}  # type: Dict[int, int]

    def _ItemIndent(self, level):
        # type: (int) -> None
        """Indent for an item nested inside a container at 'level'."""

        if self.indent == -1:
            return

        self.buf.write_spaces((level + 1) * self.indent)

    def _BracketIndent(self, level):
        # type: (int) -> None
        """Indent for the closing bracket of a container at 'level'."""

        if self.indent == -1:
            return

        self.buf.write_spaces(level * self.indent)

    def _MaybeNewline(self):
        # type: () -> None
        """Write a newline, unless everything goes on one line."""
        if self.indent == -1:
            return
        self.buf.write('\n')

    def _MaybeSpace(self):
        # type: () -> None
        """Write a space, unless everything goes on one line."""
        if self.indent == -1:
            return
        self.buf.write(' ')

    def _PrintList(self, val, level):
        # type: (value.List, int) -> None
        """Print the brackets and items of a List; no cycle bookkeeping."""

        if len(val.items) == 0:  # Special case like Python/JS
            self.buf.write('[]')
        else:
            self.buf.write('[')
            self._MaybeNewline()
            for i, item in enumerate(val.items):
                # The separator is written BEFORE every item but the first
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level)
                self.Print(item, level + 1)
            self._MaybeNewline()

            self._BracketIndent(level)
            self.buf.write(']')

    def _PrintDict(self, val, level):
        # type: (value.Dict, int) -> None
        """Print the braces and pairs of a Dict; no cycle bookkeeping."""

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()
            i = 0
            for k, v in iteritems(val.d):
                # The separator is written BEFORE every pair but the first
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level)

                pyj8.WriteString(k, self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                self.Print(v, level + 1)

                i += 1

            self._MaybeNewline()
            self._BracketIndent(level)
            self.buf.write('}')

    def Print(self, val, level=0):
        # type: (value_t, int) -> None
        """Write val to self.buf, recursing into containers.

        Args:
          level: nesting depth, used to compute indentation

        Raises:
          error.Encode: for a List or Dict in an object cycle when
            SHOW_CYCLES isn't set, or for a value of any other type when
            SHOW_NON_DATA isn't set
        """

        # self.indent == -1 is a special value that means everything is on
        # one line.  It's like
        #    JSON.stringify(d, null, 0)
        # except we use -1, not 0.  0 can still have newlines.

        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Null):
                self.buf.write('null')

            elif case(value_e.Bool):
                val = cast(value.Bool, UP_val)
                self.buf.write('true' if val.b else 'false')

            elif case(value_e.Int):
                val = cast(value.Int, UP_val)
                # TODO: avoid intermediate allocation with
                # self.buf.WriteBigInt(val.i)
                #
                # Or maybe we need pyj8.WriteBigInt() because truly BigInt may
                # be of arbitrary length, and will need a growth strategy.
                # Although that is not very common, so we could allocate in
                # that case.

                self.buf.write(mops.ToStr(val.i))

            elif case(value_e.Float):
                val = cast(value.Float, UP_val)
                # TODO: avoid intermediate allocation with
                # self.buf.WriteFloat(val.f)
                self.buf.write(str(val.f))

            elif case(value_e.Str):
                val = cast(value.Str, UP_val)

                pyj8.WriteString(val.s, self.options, self.buf)

            elif case(value_e.List):
                val = cast(value.List, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                node_state = self.visited.get(heap_id, UNSEEN)
                if node_state == FINISHED:
                    # Print it AGAIN.  We print a JSON tree, which means we can
                    # visit and print nodes MANY TIMES, as long as they're not
                    # in a cycle.
                    self._PrintList(val, level)
                    return
                if node_state == EXPLORING:
                    # We reached this List again while still printing its
                    # own items, so it's part of a cycle.
                    if self.options & SHOW_CYCLES:
                        self.buf.write('[ -->%s ]' % ValueIdString(val))
                        return
                    else:
                        # node.js prints which index closes the cycle
                        raise error.Encode(
                            "Can't encode List%s in object cycle" %
                            ValueIdString(val))

                self.visited[heap_id] = EXPLORING
                self._PrintList(val, level)
                self.visited[heap_id] = FINISHED

            elif case(value_e.Dict):
                val = cast(value.Dict, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                node_state = self.visited.get(heap_id, UNSEEN)
                if node_state == FINISHED:
                    # Print it AGAIN.  We print a JSON tree, which means we can
                    # visit and print nodes MANY TIMES, as long as they're not
                    # in a cycle.
                    self._PrintDict(val, level)
                    return
                if node_state == EXPLORING:
                    # We reached this Dict again while still printing its
                    # own pairs, so it's part of a cycle.
                    if self.options & SHOW_CYCLES:
                        self.buf.write('{ -->%s }' % ValueIdString(val))
                        return
                    else:
                        # node.js prints which key closes the cycle
                        raise error.Encode(
                            "Can't encode Dict%s in object cycle" %
                            ValueIdString(val))

                self.visited[heap_id] = EXPLORING
                self._PrintDict(val, level)
                self.visited[heap_id] = FINISHED

            # BashArray and BashAssoc should be printed with pp line (x), e.g.
            # for spec tests.
            # - BashAssoc has a clear encoding.
            # - BashArray could eventually be Dict[int, str].  But that's not
            #   encodable in JSON, which has string keys!
            #   So I think we can print it like ["a",null,"b"] and that won't
            #   change.  That's what users expect.
            elif case(value_e.BashArray):
                val = cast(value.BashArray, UP_val)

                # Note: unlike _PrintList(), there's no special case for an
                # empty array here.
                self.buf.write('[')
                self._MaybeNewline()
                for i, s in enumerate(val.strs):
                    if i != 0:
                        self.buf.write(',')
                        self._MaybeNewline()

                    self._ItemIndent(level)
                    if s is None:
                        # An entry that is None is printed as null
                        self.buf.write('null')
                    else:
                        pyj8.WriteString(s, self.options, self.buf)

                self._MaybeNewline()

                self._BracketIndent(level)
                self.buf.write(']')

            elif case(value_e.BashAssoc):
                val = cast(value.BashAssoc, UP_val)

                # Note: unlike _PrintDict(), there's no special case for an
                # empty assoc array here.
                self.buf.write('{')
                self._MaybeNewline()
                i = 0
                for k2, v2 in iteritems(val.d):
                    if i != 0:
                        self.buf.write(',')
                        self._MaybeNewline()

                    self._ItemIndent(level)

                    pyj8.WriteString(k2, self.options, self.buf)

                    self.buf.write(':')
                    self._MaybeSpace()

                    pyj8.WriteString(v2, self.options, self.buf)

                    i += 1

                self._MaybeNewline()
                self._BracketIndent(level)
                self.buf.write('}')

            else:
                pass  # mycpp workaround
                if self.options & SHOW_NON_DATA:
                    # Similar to = operator, ui.DebugPrint()
                    # TODO: that prints value.Range in a special way
                    ysh_type = ValType(val)
                    id_str = ValueIdString(val)
                    self.buf.write('<%s%s>' % (ysh_type, id_str))
                else:
                    raise error.Encode("Can't serialize object of type %s" %
                                       ValType(val))
483
484
class PrettyPrinter(object):
    """Unused right now, but could enhance the = operator.

    Output to polymorphic ColorOutput

    Features like asdl/format.py:
    - line wrapping
    - color
    - sharing detection by passing in a REF COUNT dict
      - print @123 the first time, and then print ... the second time

    and

    - Pretty spaces: {"k": "v", "k2": "v2"} instead of {"k":"v","k2":"v2"}
    - Unquoted: {k: "v", k2: "v2"} instead of {"k": "v", "k2": "v2"}

    - Omitting commas for ASDL?  Maybe we can use two spaces

      (Token id: Id.VSub_DollarName  start: 0  length: 3)
      (Token id:Id.VSub_DollarName start:0 length:3)  - color makes this work
    """

    def __init__(self, max_col):
        # type: (int) -> None
        """
        Args:
          max_col: stored as self.max_col; nothing in this class reads it yet
        """
        # First pass of object ID -> number of times referenced
        #
        # This could be an optimized set, a C++ bit set like
        # mark_sweep_heap.h, rather than a Dict
        #self.unique_objs = mylib.UniqueObjects()
        self.ref_count = {}  # type: Dict[int, int]

        self.max_col = max_col

    def PrettyTree(self, val, f):
        # type: (value_t, fmt.ColorOutput) -> None
        """Placeholder: does nothing yet."""

        # TODO: first convert to hnode.asdl types?

        # Although we might want
        # hnode.AlreadyShown = (str type, int unique_id)
        return

    def Print(self, val, buf):
        # type: (value_t, mylib.BufWriter) -> None
        """Call PrettyTree() with an output detected from stdout.

        The 'buf' argument is not used.
        """

        # Or print to stderr?
        console = fmt.DetectConsoleOutput(mylib.Stdout())
        self.PrettyTree(val, console)

        # Then print those with ASDL
537
538
class LexerDecoder(object):
    """J8 lexer and string decoder.

    Similar interface as SimpleLexer, except we return an optional decoded
    string
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        """
        Args:
          s: the whole input to lex
          is_j8: if False, J8-only syntax (single-quoted strings, comments)
                 is rejected with an error that suggests 'json8 read'
        """
        self.s = s
        self.is_j8 = is_j8
        # Language name for error messages, derived the same way as
        # _Parser.lang_str.  (It used to be hard-coded to "NIL8", which made
        # errors about JSON and J8 strings name the wrong language.)
        self.lang_str = "J8" if is_j8 else "JSON"

        self.pos = 0
        # Reuse this instance to save GC objects.  JSON objects could have
        # thousands of strings.
        self.decoded = mylib.BufWriter()

    def _Error(self, msg, end_pos):
        # type: (str, int) -> error.Decode
        """Create (not raise) an error.Decode spanning self.pos to end_pos."""

        # Use the current position as start pos
        return error.Decode(msg, self.s, self.pos, end_pos)

    def Next(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Returns a token and updates self.pos

        Returns:
          (token ID, end position, decoded string).  The decoded string is
          None for every token except Id.J8_String.

        Raises:
          error.Decode: for J8-only syntax when is_j8 is False, or for an
            invalid string
        """

        tok_id, end_pos = match.MatchJ8Token(self.s, self.pos)

        if not self.is_j8:
            if tok_id in (Id.Left_BSingleQuote, Id.Left_USingleQuote):
                raise self._Error(
                    "Single quotes aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Ignored_Comment:
                raise self._Error(
                    "Comments aren't part of JSON; you may want 'json8 read'",
                    end_pos)

        # An opening quote: lex and decode the rest of the string
        if tok_id in (Id.Left_DoubleQuote, Id.Left_BSingleQuote,
                      Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        # Non-string tokens like { } null etc.
        self.pos = end_pos
        return tok_id, end_pos, None

    def _DecodeString(self, left_id, str_pos):
        # type: (Id_t, int) -> Tuple[Id_t, int, Optional[str]]
        """ Returns a string token and updates self.pos

        Args:
          left_id: ID of the opening quote token, which decides which string
                   lexer is used and whether \\yff escapes are allowed
          str_pos: position right after the opening quote

        Returns:
          (Id.J8_String, position after the closing quote, decoded string)

        Raises:
          error.Decode: on EOF, an unknown token, an ASCII control char,
            invalid UTF-8, a \\u{} escape in the surrogate range, or a \\yff
            escape outside a b'' string
        """

        while True:
            if left_id == Id.Left_DoubleQuote:
                tok_id, str_end = match.MatchJsonStrToken(self.s, str_pos)
            else:
                tok_id, str_end = match.MatchJ8StrToken(self.s, str_pos)

            if tok_id == Id.Eol_Tok:
                # TODO: point to beginning of # quote?
                raise self._Error(
                    'Unexpected EOF while lexing %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Unknown_Tok:
                # e.g. invalid backslash
                raise self._Error(
                    'Unknown token while lexing %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Char_AsciiControl:
                raise self._Error(
                    "ASCII control chars are illegal in %s strings" %
                    self.lang_str, str_end)

            if tok_id in (Id.Right_SingleQuote, Id.Right_DoubleQuote):

                self.pos = str_end

                s = self.decoded.getvalue()
                self.decoded.clear()  # reuse this instance

                #log('decoded %r', self.decoded.getvalue())
                return Id.J8_String, str_end, s

            #
            # Now handle each kind of token
            #

            # TODO: would be nice to avoid allocation in all these cases.
            # But LookupCharC() would have to change.

            if tok_id == Id.Lit_Chars:  # JSON and J8
                part = self.s[str_pos:str_end]
                if not pyj8.PartIsUtf8(self.s, str_pos, str_end):
                    # Syntax error because JSON must be valid UTF-8
                    # Limit context to 20 chars arbitrarily
                    snippet = self.s[str_pos:str_pos + 20]
                    raise self._Error(
                        'Invalid UTF-8 in %s string literal: %r' %
                        (self.lang_str, snippet), str_end)

            elif tok_id == Id.Char_OneChar:  # JSON and J8
                # Skip 1 char (the backslash) to get the escaped char
                ch = self.s[str_pos + 1]
                part = consts.LookupCharC(ch)

            elif tok_id == Id.Char_UBraced:  # J8 only
                # Skip the 3 leading chars of \u{ and the trailing }
                h = self.s[str_pos + 3:str_end - 1]
                i = int(h, 16)

                # Same check in osh/word_parse.py
                if 0xD800 <= i and i < 0xE000:
                    raise self._Error(
                        r"\u{%s} escape is illegal because it's in the surrogate range"
                        % h, str_end)

                part = Utf8Encode(i)

            elif tok_id == Id.Char_YHex:  # J8 only
                # Skip the 2 leading chars of \y
                h = self.s[str_pos + 2:str_end]

                # Same check in osh/word_parse.py
                if left_id != Id.Left_BSingleQuote:
                    assert left_id != Id.Left_BTSingleQuote, "Not handled here"
                    raise self._Error(
                        r"\y%s escapes not allowed in u'' strings" % h,
                        str_end)

                i = int(h, 16)
                part = chr(i)

            elif tok_id == Id.Char_SurrogatePair:
                # Two groups of 4 hex digits, each after a 2-char prefix
                h1 = self.s[str_pos + 2:str_pos + 6]
                h2 = self.s[str_pos + 8:str_pos + 12]

                # https://www.oilshell.org/blog/2023/06/surrogate-pair.html
                i1 = int(h1, 16) - 0xD800  # high surrogate
                i2 = int(h2, 16) - 0xDC00  # low surrogate
                code_point = 0x10000 + (i1 << 10) + i2

                part = Utf8Encode(code_point)

            elif tok_id == Id.Char_Unicode4:  # JSON only, unpaired
                # Skip the 2-char prefix before the hex digits
                h = self.s[str_pos + 2:str_end]
                i = int(h, 16)
                part = Utf8Encode(i)

            else:
                # Should never happen
                raise AssertionError(Id_str(tok_id))

            #log('%s part %r', Id_str(tok_id), part)
            self.decoded.write(part)
            str_pos = str_end
691
692
class _Parser(object):
    """State and helpers shared by Parser and Nil8Parser.

    It holds the current token (ID, start/end position, decoded string) and
    knows how to advance past ignored tokens.
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        self.s = s
        self.is_j8 = is_j8
        if is_j8:
            self.lang_str = "J8"
        else:
            self.lang_str = "JSON"

        self.lexer = LexerDecoder(s, is_j8)

        # The current token.  Nothing has been read until _Next() is called.
        self.tok_id = Id.Undefined_Tok
        self.start_pos = 0
        self.end_pos = 0
        self.decoded = ''

    def _Next(self):
        # type: () -> None
        """Advance to the next token that isn't a space or comment."""

        while True:
            # The end of the token before is the start of this one.  This
            # also holds for ignored tokens, because it's updated on every
            # iteration.
            self.start_pos = self.end_pos

            tok_id, end_pos, decoded = self.lexer.Next()
            self.tok_id = tok_id
            self.end_pos = end_pos
            self.decoded = decoded

            if (tok_id != Id.Ignored_Space and
                    tok_id != Id.Ignored_Comment):
                break
            # TODO: add Ignored_Newline to count lines, and show line numbers
            # in errors messages.  The position of the last newline and a token
            # can be used to calculate a column number.

        #log('NEXT %s %s %s %s', Id_str(self.tok_id), self.start_pos, self.end_pos, self.decoded or '-')

    def _Eat(self, tok_id):
        # type: (Id_t) -> None
        """Advance if the current token has the given ID; otherwise raise."""

        if self.tok_id == tok_id:
            self._Next()
            return

        # TODO: Need location info
        #log('position %r %d-%d %r', self.s, self.start_pos,
        #    self.end_pos, self.s[self.start_pos:self.end_pos])
        expected = Id_str(tok_id)
        actual = Id_str(self.tok_id)
        raise self._Error("Expected %s, got %s" % (expected, actual))

    def _Error(self, msg):
        # type: (str) -> error.Decode
        """Create (not raise) an error.Decode that spans the current token."""
        return error.Decode(msg, self.s, self.start_pos, self.end_pos)
736
737
class Parser(_Parser):
    """JSON and JSON8 Parser."""

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    def _ParsePair(self):
        # type: () -> Tuple[str, value_t]
        """Parse one  string ':' value  pair of a Dict."""

        # Grab the decoded string BEFORE _Eat() advances past the token
        key = self.decoded
        self._Eat(Id.J8_String)  # Check that it's a string
        assert key is not None

        self._Eat(Id.J8_Colon)

        val = self._ParseValue()
        return key, val

    def _ParseDict(self):
        # type: () -> value_t
        """
        pair = string ':' value
        Dict = '{' '}'
             | '{' pair (',' pair)* '}'
        """
        # precondition
        assert self.tok_id == Id.J8_LBrace, Id_str(self.tok_id)

        #log('> Dict')

        d = NewDict()  # type: Dict[str, value_t]

        self._Next()
        if self.tok_id != Id.J8_RBrace:
            # At least one pair, then more while we see commas
            while True:
                k, v = self._ParsePair()
                d[k] = v
                #log('  k %s v %s Id %s', k, v, Id_str(self.tok_id))

                if self.tok_id != Id.J8_Comma:
                    break
                self._Next()

        self._Eat(Id.J8_RBrace)

        #log('< Dict')

        return value.Dict(d)

    def _ParseList(self):
        # type: () -> value_t
        """
        List = '[' ']'
             | '[' value (',' value)* ']'
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[value_t]

        self._Next()
        if self.tok_id != Id.J8_RBracket:
            # At least one value, then more while we see commas
            while True:
                items.append(self._ParseValue())

                if self.tok_id != Id.J8_Comma:
                    break
                self._Next()

        self._Eat(Id.J8_RBracket)

        return value.List(items)

    def _ParseValue(self):
        # type: () -> value_t
        """Parse the value that starts at the current token.

        Raises:
          error.Decode: on EOF, or a token that can't start a value
        """
        tok_id = self.tok_id

        if tok_id == Id.J8_LBrace:
            return self._ParseDict()

        if tok_id == Id.J8_LBracket:
            return self._ParseList()

        if tok_id == Id.J8_Null:
            self._Next()
            return value.Null

        if tok_id == Id.J8_Bool:
            #log('%r %d', self.s[self.start_pos], self.start_pos)
            # Look at the first char of the token, before advancing
            b = value.Bool(self.s[self.start_pos] == 't')
            self._Next()
            return b

        if tok_id == Id.J8_Int:
            int_str = self.s[self.start_pos:self.end_pos]
            self._Next()
            return value.Int(mops.FromStr(int_str))

        if tok_id == Id.J8_Float:
            float_str = self.s[self.start_pos:self.end_pos]
            self._Next()
            return value.Float(float(float_str))

        # UString, BString too
        if tok_id == Id.J8_String:
            str_val = value.Str(self.decoded)
            #log('d %r', self.decoded)
            self._Next()
            return str_val

        if tok_id == Id.Eol_Tok:
            raise self._Error('Unexpected EOF while parsing %s' %
                              self.lang_str)

        # Id.Unknown_Tok, Id.J8_{LParen,RParen}
        raise self._Error('Invalid token while parsing %s: %s' %
                          (self.lang_str, Id_str(tok_id)))

    def ParseValue(self):
        # type: () -> value_t
        """Parse the whole input as a single value.

        Raises error.Decode.
        """
        self._Next()
        result = self._ParseValue()
        if self.tok_id == Id.Eol_Tok:
            return result
        raise self._Error('Unexpected trailing input')
868
869
class Nil8Parser(_Parser):
    """
    Tokens not in JSON8:
      LParen RParen Symbol

    Tokens not in JSON, but in JSON8 and NIL8:
      Identifier (unquoted keys)
      Ignored_Comment
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    # Disabled: the method under 'if 0' is never defined
    if 0:

        def _LookAhead(self):
            # type: () -> Id_t
            """
            Don't need this right now
            """
            end_pos = self.end_pos  # look ahead from last token
            while True:
                tok_id, end_pos = match.MatchJ8Token(self.s, end_pos)
                if tok_id not in (Id.Ignored_Space, Id.Ignored_Comment):
                    break
            return tok_id

    def _ParseRecord(self):
        # type: () -> nvalue_t
        """
        Yaks
          (self->Next)  =>  (-> self Next)
          (self->Next obj.field)  =>  ((-> self Next) (. obj field))

        Similar to
          ((identity identity) 42)  =>  42 in Clojure

        ASDL
          (Node left:(. x4beef2))
          (Node left !x4beef2)

        # Ambiguous because value can be identifier.
        # We have to look ahead to see if there's a colon :
        field =
            Identifier ':' value
          | value

        record = '(' head field* ')'

        - Identifier | Symbol are treated the same, it's a side effect of
          the lexing style
        - do positional args come before named args
        - () is invalid?  Use [] for empty list
          (NOTE: the code below currently accepts () and returns an empty
          nvalue.List)
        """
        assert self.tok_id == Id.J8_LParen, Id_str(self.tok_id)

        items = []  # type: List[nvalue_t]

        self._Next()
        if self.tok_id == Id.J8_RParen:
            self._Next()
            return nvalue.List(items)

        # Items aren't separated by commas.  On EOF, _ParseNil8() raises, so
        # this loop terminates.
        #log('TOK %s', Id_str(self.tok_id))
        while self.tok_id != Id.J8_RParen:
            items.append(self._ParseNil8())
            #log('TOK 2 %s', Id_str(self.tok_id))

        self._Eat(Id.J8_RParen)

        return nvalue.List(items)

    def _ParseList8(self):
        # type: () -> nvalue_t
        """
        List8 = '[' value* ']'

        No commas, not even optional ones for now.
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[nvalue_t]

        self._Next()
        if self.tok_id == Id.J8_RBracket:
            self._Next()
            return nvalue.List(items)

        # On EOF, _ParseNil8() raises, so this loop terminates.
        #log('TOK %s', Id_str(self.tok_id))
        while self.tok_id != Id.J8_RBracket:
            items.append(self._ParseNil8())
            #log('TOK 2 %s', Id_str(self.tok_id))

        self._Eat(Id.J8_RBracket)

        return nvalue.List(items)

    def _ParseNil8(self):
        # type: () -> nvalue_t
        """Parse one NIL8 value, plus an optional infix operator after it.

        If the token after the value is an operator, colon, or comma, then
        the right operand is parsed recursively and the result is the list
        [op, value, operand2].

        Raises:
          error.Decode: on EOF, or a token that can't start a value
        """
        if self.tok_id == Id.J8_LParen:
            obj = self._ParseRecord()  # type: nvalue_t
            #return obj

        elif self.tok_id == Id.J8_LBracket:
            obj = self._ParseList8()
            #return obj

        # Primitives are copied from J8 above.
        # TODO: We also want hex literals.
        elif self.tok_id == Id.J8_Null:
            self._Next()
            obj = nvalue.Null

        elif self.tok_id == Id.J8_Bool:
            # Look at the first char of the token, before advancing
            b = nvalue.Bool(self.s[self.start_pos] == 't')
            self._Next()
            obj = b

        elif self.tok_id == Id.J8_Int:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Int(int(part))

        elif self.tok_id == Id.J8_Float:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Float(float(part))

        elif self.tok_id == Id.J8_String:
            str_val = nvalue.Str(self.decoded)
            self._Next()
            obj = str_val

        # <- etc.
        elif self.tok_id in (Id.J8_Identifier, Id.J8_Operator, Id.J8_Colon,
                             Id.J8_Comma):
            # unquoted "word" treated like a string
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Symbol(part)

        elif self.tok_id == Id.Eol_Tok:
            raise self._Error('Unexpected EOF while parsing %s' %
                              self.lang_str)

        else:  # Any other token, e.g. Id.Unknown_Tok or Id.J8_RParen
            raise self._Error('Invalid token while parsing %s: %s' %
                              (self.lang_str, Id_str(self.tok_id)))

        # The value may be the left operand of an infix operator
        #log('YO %s', Id_str(self.tok_id))
        if self.tok_id in (Id.J8_Operator, Id.J8_Colon, Id.J8_Comma):
            #log('AT %s', Id_str(self.tok_id))

            # key: "value" -> (: key "value")
            part = self.s[self.start_pos:self.end_pos]
            op = nvalue.Symbol(part)

            self._Next()
            operand2 = self._ParseNil8()
            infix = nvalue.List([op, obj, operand2])  # type: nvalue_t
            #print("--> INFIX %d %s" % (id(infix), infix))
            return infix

        #next_id = self._LookAhead()
        #print('NEXT %s' % Id_str(next_id))

        #raise AssertionError()
        #print("--> OBJ %d %s" % (id(obj), obj))
        return obj

    def ParseNil8(self):
        # type: () -> nvalue_t
        """Parse the whole input as a single NIL8 value.

        Raises error.Decode.
        """
        self._Next()
        #print('yo')
        obj = self._ParseNil8()
        #print("==> %d %s" % (id(obj), obj))
        if self.tok_id != Id.Eol_Tok:
            raise self._Error('Unexpected trailing input')
        return obj
1051
1052
1053	# vim: sw=4