data_lang/j8.py

OILS / data_lang / j8.py View on Github | oilshell.org

1230 lines, 589 significant

1	#!/usr/bin/env python2
2	"""
3	j8.py: J8 Notation, a superset of JSON
4
5	TODO:
6
7	- Many more tests
8	- Run JSONTestSuite
9
10	Later:
11
12	- PrettyPrinter uses hnode.asdl?
13	- color
14	- line wrapping -- do this later
15	- would like CONTRIBUTORS here
16
17	- Unify with ASDL pretty printing - NIL8
18	- {} [] are identical
19	- () is for statically typed ASDL data
20	(command.Simple blame_tok:(...) words:[ ])
21	although we are also using [] for typed ASDL arrays, not just JSON
22	- object IDs
23	- @ x123 can create an ID
24	- ! x123 can reference an ID
25	- <> can be for non-J8 data types? For the = operator
26	- 'hi \(name)' interpolation is useful for code
27
28	- Common between JSON8 and NIL8 - for writing by hand
29	- comments - # line or // line (JSON5 uses // line, following JS)
30	- unquoted identifier names - TYG8 could be more relaxed for (+ 1 (* 3 4))
31	- commas
32	- JSON8 could have trailing commas rule
33	- NIL8 at least has no commas for [1 2 "hi"]
34	"""
35
36	from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
37	from _devbuild.gen.value_asdl import (value, value_e, value_t, value_str)
38	from _devbuild.gen.nil8_asdl import (nvalue, nvalue_t)
39
40	from asdl import format as fmt
41	from core import error
42	from data_lang import pyj8
43	# dependency issue: consts.py pulls in frontend/option_def.py
44	from frontend import consts
45	from frontend import match
46	from mycpp import mops
47	from mycpp import mylib
48	from mycpp.mylib import tagswitch, iteritems, NewDict, log
49
50	import fastfunc
51
52	_ = log
53
54	from typing import cast, Dict, List, Tuple, Optional
55
56
57	# COPIED from ui.ValType() to break dep
def ValType(val):
    # type: (value_t) -> str
    """Return the type name of a value, for type errors shown in the UI."""
    type_tag = val.tag()
    return value_str(type_tag, dot=False)
63
64
if mylib.PYTHON:

    def HeapValueId(val):
        # type: (value_t) -> int
        """Return an integer identity for a value object.

        Under Python this is id(), i.e. the object's address, which can take
        up to 64 bits.

        In C++ the GC ID can be used instead; it fits within 32 bits.
        """
        obj_id = id(val)
        return obj_id
75
76
def ValueId(val):
    # type: (value_t) -> int
    """Return an integer ID for a value, or -1 if it has no identity.

    The ID serves two purposes:

    1. Deciding whether 2 objects are the same object, e.g. for List, Dict,
       Func, Proc, etc.
    2. Detecting object cycles.

    Primitive types like Int and Float have no such notion: they're immutable
    values that are copied and compared by value, so they get -1.
    """
    result = -1
    with tagswitch(val) as case:
        if case(value_e.Null, value_e.Bool, value_e.Int, value_e.Float,
                value_e.Str):
            # Keep -1.  These will not be on the heap if we switch to tagged
            # pointers.  Str is handled conservatively: once there is a small
            # string optimization, some strings will be values, so we assume
            # all of them are.
            pass
        else:
            result = HeapValueId(val)
    return result
98
99
def ValueIdString(val):
    # type: (value_t) -> str
    """Format a value's ID as ' 0x...', or '' if it has none.

    Used by pp value (42) and = 42
    """
    ident = ValueId(val)  # -1 means "no identity"
    if ident != -1:
        return ' 0x%s' % mylib.hex_lower(ident)
    return ''
108
109
def Utf8Encode(code):
    # type: (int) -> str
    """Return utf-8 encoded bytes from a unicode code point.

    Based on https://stackoverflow.com/a/23502707

    Args:
      code: the code point to encode

    Returns:
      A string of 1 to 4 bytes.  A code point above 0x10FFFF yields the
      encoding of U+FFFD, the unicode replacement character.

    No surrogate-range check is done here.
    """
    num_cont_bytes = 0

    if code <= 0x7F:
        return chr(code & 0x7F)  # ASCII

    elif code <= 0x7FF:
        num_cont_bytes = 1
    elif code <= 0xFFFF:
        num_cont_bytes = 2
    elif code <= 0x10FFFF:
        num_cont_bytes = 3

    else:
        return '\xEF\xBF\xBD'  # unicode replacement character

    # Continuation bytes are produced from the LOW bits first, so the list is
    # built backward and reversed at the end.
    bytes_ = []  # type: List[int]
    remaining = num_cont_bytes
    while remaining > 0:
        bytes_.append(0x80 | (code & 0x3F))  # 10xxxxxx
        code >>= 6
        remaining -= 1

    # Leading byte: 110xxxxx, 1110xxxx or 11110xxx.  The shift leaves a stray
    # bit above bit 7, which is masked off below.
    lead = (0x1E << (6 - num_cont_bytes)) | (code & (0x3F >> num_cont_bytes))
    bytes_.append(lead)
    bytes_.reverse()

    # mod 256 because Python ints don't wrap around!
    tmp = [chr(b & 0xFF) for b in bytes_]
    return ''.join(tmp)
143
144
# Option flags for InstancePrinter; combine with bitwise OR.
SHOW_CYCLES = 1 << 1  # show a cycle as [ -->0x.. ] or { -->0x.. } with object ID, instead of raising
SHOW_NON_DATA = 1 << 2  # non-data objects like Eggex can be <Eggex 0xff>
LOSSY_JSON = 1 << 3  # JSON is lossy

# Hack until we fully translate
assert pyj8.LOSSY_JSON == LOSSY_JSON
151
152
def _Print(val, buf, indent, options=0):
    # type: (value_t, mylib.BufWriter, int, int) -> None
    """Encode val into buf with a fresh InstancePrinter.

    Args:
      indent: number of spaces to indent, or -1 for everything on one line
    """
    InstancePrinter(buf, indent, options).Print(val)
161
162
def PrintMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Write val to buf as J8.  For json8 write (x) and toJson8()

    Caller must handle error.Encode
    """
    _Print(val, buf, indent, options=0)
170
171
def PrintJsonMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Write val to buf as JSON.  For json write (x) and toJson()

    Caller must handle error.Encode()
    Doesn't decay to b'' strings - will use Unicode replacement char.
    """
    json_options = LOSSY_JSON
    _Print(val, buf, indent, options=json_options)
180
181
def PrintLine(val, f):
    # type: (value_t, mylib.Writer) -> None
    """Write val to f on a single line, followed by a newline.

    For pp line (x)
    """

    # error.Encode should be impossible - we show cycles and non-data
    buf = mylib.BufWriter()
    _Print(val, buf, -1, options=SHOW_CYCLES | SHOW_NON_DATA)
    f.write(buf.getvalue())
    f.write('\n')
191
192
def EncodeString(s, buf, unquoted_ok=False):
    # type: (str, mylib.BufWriter, bool) -> None
    """Write s to buf as a J8 string on one line.  For pp proc, etc.

    With unquoted_ok, a string that fastfunc.CanOmitQuotes() accepts is
    written as-is, without quotes.
    """
    if not (unquoted_ok and fastfunc.CanOmitQuotes(s)):
        _Print(value.Str(s), buf, -1)
        return

    buf.write(s)
202
203
def MaybeEncodeString(s):
    # type: (str) -> str
    """Return s encoded as a J8 string.  For write --json8 $s and compexport"""

    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    str_val = value.Str(s)
    _Print(str_val, out, -1)
    return out.getvalue()
214
215
def MaybeEncodeJsonString(s):
    # type: (str) -> str
    """Return s encoded as a JSON string.  For write --json"""

    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b
    out = mylib.BufWriter()
    str_val = value.Str(s)
    _Print(str_val, out, -1, options=LOSSY_JSON)
    return out.getvalue()
225
226
227	# DFS traversal state
UNSEEN = 0  # container has not been visited; default for IDs missing from the visited dict
EXPLORING = 1  # container is being printed right now; seeing it again means a cycle
FINISHED = 2  # container was fully printed; it may be printed again
231
232
class InstancePrinter(object):
    """Print a value tree as J8/JSON.

    Text is written to the BufWriter given to the constructor.  List and Dict
    values are tracked by heap ID while they are being printed, so that an
    object cycle is either shown specially (SHOW_CYCLES) or reported with
    error.Encode.
    """

    def __init__(self, buf, indent, options):
        # type: (mylib.BufWriter, int, int) -> None
        """
        Args:
          buf: where the encoded text is written
          indent: number of spaces per nesting level, or -1 for everything on
                  one line
          options: bitwise OR of SHOW_CYCLES, SHOW_NON_DATA, LOSSY_JSON
        """
        self.buf = buf
        self.indent = indent
        self.options = options

        # Key is HeapValueId(val)
        # Value is the DFS state, EXPLORING or FINISHED.  A missing key means
        # UNSEEN.
        # Dict[int, None] doesn't translate -- it would be nice to have a set()
        self.visited = {}  # type: Dict[int, int]

    def _ItemIndent(self, level):
        # type: (int) -> None
        """Indent for an item inside a container at nesting 'level'."""

        if self.indent == -1:
            return

        self.buf.write_spaces((level + 1) * self.indent)

    def _BracketIndent(self, level):
        # type: (int) -> None
        """Indent for the closing bracket of a container at nesting 'level'."""

        if self.indent == -1:
            return

        self.buf.write_spaces(level * self.indent)

    def _MaybeNewline(self):
        # type: () -> None
        """Write a newline, unless everything goes on one line."""
        if self.indent == -1:
            return
        self.buf.write('\n')

    def _MaybeSpace(self):
        # type: () -> None
        """Write a space, unless everything goes on one line."""
        if self.indent == -1:
            return
        self.buf.write(' ')

    def _PrintList(self, val, level):
        # type: (value.List, int) -> None
        """Print a List and its items.  Cycle checks are done by Print()."""

        if len(val.items) == 0:  # Special case like Python/JS
            self.buf.write('[]')
        else:
            self.buf.write('[')
            self._MaybeNewline()
            for i, item in enumerate(val.items):
                # The separator goes BEFORE every item except the first
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level)
                self.Print(item, level + 1)
            self._MaybeNewline()  # after the last item

            self._BracketIndent(level)
            self.buf.write(']')

    def _PrintDict(self, val, level):
        # type: (value.Dict, int) -> None
        """Print a Dict and its entries.  Cycle checks are done by Print()."""

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()
            i = 0
            for k, v in iteritems(val.d):
                # The separator goes BEFORE every entry except the first
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level)

                pyj8.WriteString(k, self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                self.Print(v, level + 1)

                i += 1

            self._MaybeNewline()  # after the last entry
            self._BracketIndent(level)
            self.buf.write('}')

    def Print(self, val, level=0):
        # type: (value_t, int) -> None
        """Write the encoding of val to self.buf.

        Args:
          val: the value to encode
          level: nesting depth, used for indentation

        Raises:
          error.Encode: for a List or Dict in an object cycle when SHOW_CYCLES
            is not set, or for a value of any other unhandled type when
            SHOW_NON_DATA is not set
        """

        # self.indent == -1 is a special value that means everything is on one
        # line.  It's like
        #    JSON.stringify(d, null, 0)
        # except we use -1, not 0.  0 can still have newlines.

        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Null):
                self.buf.write('null')

            elif case(value_e.Bool):
                val = cast(value.Bool, UP_val)
                self.buf.write('true' if val.b else 'false')

            elif case(value_e.Int):
                val = cast(value.Int, UP_val)
                # TODO: avoid intermediate allocation with
                # self.buf.WriteBigInt(val.i)
                #
                # Or maybe we need pyj8.WriteBigInt() because truly BigInt may
                # be of arbitrary length, and will need a growth strategy.
                # Although that is not very common, so we could allocate in
                # that case.

                self.buf.write(mops.ToStr(val.i))

            elif case(value_e.Float):
                val = cast(value.Float, UP_val)
                # TODO: avoid intermediate allocation with
                # self.buf.WriteFloat(val.f)
                self.buf.write(str(val.f))

            elif case(value_e.Str):
                val = cast(value.Str, UP_val)

                pyj8.WriteString(val.s, self.options, self.buf)

            elif case(value_e.List):
                val = cast(value.List, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                node_state = self.visited.get(heap_id, UNSEEN)
                if node_state == FINISHED:
                    # Print it AGAIN.  We print a JSON tree, which means we can
                    # visit and print nodes MANY TIMES, as long as they're not
                    # in a cycle.
                    self._PrintList(val, level)
                    return
                if node_state == EXPLORING:
                    if self.options & SHOW_CYCLES:
                        self.buf.write('[ -->%s ]' % ValueIdString(val))
                        return
                    else:
                        # node.js prints which index closes the cycle
                        raise error.Encode(
                            "Can't encode List%s in object cycle" %
                            ValueIdString(val))

                self.visited[heap_id] = EXPLORING
                self._PrintList(val, level)
                self.visited[heap_id] = FINISHED

            elif case(value_e.Dict):
                val = cast(value.Dict, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                node_state = self.visited.get(heap_id, UNSEEN)
                if node_state == FINISHED:
                    # Print it AGAIN.  We print a JSON tree, which means we can
                    # visit and print nodes MANY TIMES, as long as they're not
                    # in a cycle.
                    self._PrintDict(val, level)
                    return
                if node_state == EXPLORING:
                    if self.options & SHOW_CYCLES:
                        self.buf.write('{ -->%s }' % ValueIdString(val))
                        return
                    else:
                        # node.js prints which key closes the cycle
                        raise error.Encode(
                            "Can't encode Dict%s in object cycle" %
                            ValueIdString(val))

                self.visited[heap_id] = EXPLORING
                self._PrintDict(val, level)
                self.visited[heap_id] = FINISHED

            # BashArray and BashAssoc should be printed with pp line (x), e.g.
            # for spec tests.
            # - BashAssoc has a clear encoding.
            # - BashArray could eventually be Dict[int, str].  But that's not
            #   encodable in JSON, which has string keys!
            #   So I think we can print it like ["a",null,'b"] and that won't
            #   change.  That's what users expect.
            elif case(value_e.BashArray):
                val = cast(value.BashArray, UP_val)

                # Unlike _PrintList(), there is no special case for an empty
                # array here.
                self.buf.write('[')
                self._MaybeNewline()
                for i, s in enumerate(val.strs):
                    if i != 0:
                        self.buf.write(',')
                        self._MaybeNewline()

                    self._ItemIndent(level)
                    if s is None:
                        self.buf.write('null')
                    else:
                        pyj8.WriteString(s, self.options, self.buf)

                self._MaybeNewline()

                self._BracketIndent(level)
                self.buf.write(']')

            elif case(value_e.BashAssoc):
                val = cast(value.BashAssoc, UP_val)

                # Unlike _PrintDict(), there is no special case for an empty
                # mapping here.
                self.buf.write('{')
                self._MaybeNewline()
                i = 0
                for k2, v2 in iteritems(val.d):
                    if i != 0:
                        self.buf.write(',')
                        self._MaybeNewline()

                    self._ItemIndent(level)

                    pyj8.WriteString(k2, self.options, self.buf)

                    self.buf.write(':')
                    self._MaybeSpace()

                    pyj8.WriteString(v2, self.options, self.buf)

                    i += 1

                self._MaybeNewline()
                self._BracketIndent(level)
                self.buf.write('}')

            else:
                pass  # mycpp workaround
                if self.options & SHOW_NON_DATA:
                    # Similar to = operator, ui.DebugPrint()
                    # TODO: that prints value.Range in a special way
                    ysh_type = ValType(val)
                    id_str = ValueIdString(val)
                    self.buf.write('<%s%s>' % (ysh_type, id_str))
                else:
                    raise error.Encode("Can't serialize object of type %s" %
                                       ValType(val))
483
484
class PrettyPrinter(object):
    """Pretty printer for values.  Unused right now, but could enhance the =
    operator.

    Output goes to a polymorphic ColorOutput.

    Features like asdl/format.py:
    - line wrapping
    - color
    - sharing detection by passing in a REF COUNT dict
      - print @123 the first time, and then print ... the second time

    and

    - Pretty spaces: {"k": "v", "k2": "v2"} instead of {"k":"v","k2":"v2"}
    - Unquoted: {k: "v", k2: "v2"} instead of {"k": "v", "k2": "v2"}

    - Omitting commas for ASDL?  Maybe we can use two spaces

    (Token id: Id.VSub_DollarName  start: 0  length: 3)
    (Token id:Id.VSub_DollarName start:0 length:3)  - color makes this work
    """

    def __init__(self, max_col):
        # type: (int) -> None
        self.max_col = max_col

        # This could be an optimized set an C++ bit set like
        # mark_sweep_heap.h, rather than a Dict
        #self.unique_objs = mylib.UniqueObjects()

        # Filled by a first pass: object ID -> number of times referenced
        self.ref_count = {}  # type: Dict[int, int]

    def PrettyTree(self, val, f):
        # type: (value_t, fmt.ColorOutput) -> None
        """Not implemented yet."""

        # TODO: first convert to hnode.asdl types?

        # Although we might want
        # hnode.AlreadyShown = (str type, int unique_id)
        pass

    def Print(self, val, buf):
        # type: (value_t, mylib.BufWriter) -> None
        """Pretty print val to a console output wrapping stdout."""

        # Or print to stderr?
        out = fmt.DetectConsoleOutput(mylib.Stdout())
        self.PrettyTree(val, out)

        # Then print those with ASDL
537
538
class LexerDecoder(object):
    """J8 lexer and string decoder.

    Similar interface as SimpleLexer, except we return an optional decoded
    string
    """

    def __init__(self, s, is_j8, lang_str):
        # type: (str, bool, str) -> None
        """
        Args:
          s: the text to lex
          is_j8: if False, J8-only tokens (b'' and u'' strings, comments) are
                 rejected by Next()
          lang_str: language name interpolated into error messages
        """
        self.s = s
        self.is_j8 = is_j8
        self.lang_str = lang_str

        # offset in self.s where the next token starts
        self.pos = 0

        # current line being lexed -- for error messages
        self.cur_line_num = 1

        # Reuse this instance to save GC objects.  JSON objects could have
        # thousands of strings.
        self.decoded = mylib.BufWriter()

    def _Error(self, msg, end_pos):
        # type: (str, int) -> error.Decode
        """Create (but don't raise) an error.Decode ending at end_pos."""

        # Use the current position as start pos
        return error.Decode(msg, self.s, self.pos, end_pos, self.cur_line_num)

    def Next(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Returns a token and updates self.pos

        Returns:
          (token ID, end position, decoded string).  The decoded string is
          None unless the token is Id.J8_String.

        Raises:
          error.Decode: for J8-only syntax when lexing JSON, or for an error
            inside a string
        """

        tok_id, end_pos = match.MatchJ8Token(self.s, self.pos)

        if not self.is_j8:
            if tok_id in (Id.Left_BSingleQuote, Id.Left_USingleQuote):
                raise self._Error(
                    "Single quotes aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Ignored_Comment:
                raise self._Error(
                    "Comments aren't part of JSON; you may want 'json8 read'",
                    end_pos)

        # An opening quote: lex and decode the rest of the string
        if tok_id in (Id.Left_DoubleQuote, Id.Left_BSingleQuote,
                      Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Ignored_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def NextForLines(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Like Next(), but for J8 Lines

        Raises:
          error.Decode: for invalid UTF-8 or an ASCII control char in unquoted
            text, or for an error inside a string
        """

        tok_id, end_pos = match.MatchJ8LinesToken(self.s, self.pos)

        if tok_id in (Id.Left_DoubleQuote, Id.Left_BSingleQuote,
                      Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        # Check that UNQUOTED lines are valid UTF-8.  (_DecodeString() does
        # this for quoted strings.)
        if (tok_id == Id.Lit_Chars and
                not pyj8.PartIsUtf8(self.s, self.pos, end_pos)):
            raise self._Error(
                'Invalid UTF-8 in %s string literal' % self.lang_str, end_pos)
        if tok_id == Id.Char_AsciiControl:
            raise self._Error(
                "J8 Lines can't have unescaped ASCII control chars", end_pos)

        if tok_id == Id.J8_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def _DecodeString(self, left_id, str_pos):
        # type: (Id_t, int) -> Tuple[Id_t, int, Optional[str]]
        """ Returns a string token and updates self.pos

        Args:
          left_id: ID of the opening quote token, which selects the string
                   lexer and decides whether \\yff escapes are allowed
          str_pos: position right after the opening quote

        Returns:
          (Id.J8_String, position after the closing quote, decoded string)

        Raises:
          error.Decode: for EOF inside the string, a bad backslash escape, an
            unescaped ASCII control char, invalid UTF-8, or a disallowed
            escape
        """

        while True:
            if left_id == Id.Left_DoubleQuote:
                tok_id, str_end = match.MatchJsonStrToken(self.s, str_pos)
            else:
                tok_id, str_end = match.MatchJ8StrToken(self.s, str_pos)

            #log('String tok %s', Id_str(tok_id))

            if tok_id == Id.Eol_Tok:
                # TODO: point to beginning of # quote?
                raise self._Error(
                    'Unexpected EOF while lexing %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Unknown_Backslash:
                raise self._Error(
                    'Bad backslash escape in %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Char_AsciiControl:
                raise self._Error(
                    "%s strings can't have unescaped ASCII control chars" %
                    self.lang_str, str_end)

            if tok_id in (Id.Right_SingleQuote, Id.Right_DoubleQuote):

                self.pos = str_end

                s = self.decoded.getvalue()
                self.decoded.clear()  # reuse this instance

                #log('decoded %r', self.decoded.getvalue())
                return Id.J8_String, str_end, s

            #
            # Now handle each kind of token
            #

            if tok_id == Id.Lit_Chars:  # JSON and J8
                part = self.s[str_pos:str_end]
                if not pyj8.PartIsUtf8(self.s, str_pos, str_end):
                    raise self._Error(
                        'Invalid UTF-8 in %s string literal' % self.lang_str,
                        str_end)

            # TODO: would be nice to avoid allocation in all these cases.
            # But LookupCharC() would have to change.

            elif tok_id == Id.Char_OneChar:  # JSON and J8
                # The character right after the backslash
                ch = self.s[str_pos + 1]
                part = consts.LookupCharC(ch)

            elif tok_id == Id.Char_UBraced:  # J8 only
                # Skip 3 leading chars and 1 trailing char, leaving the hex
                # digits -- presumably \u{ and }
                h = self.s[str_pos + 3:str_end - 1]
                i = int(h, 16)

                # Same check in osh/word_parse.py
                if 0xD800 <= i and i < 0xE000:
                    raise self._Error(
                        r"\u{%s} escape is illegal because it's in the surrogate range"
                        % h, str_end)

                part = Utf8Encode(i)

            elif tok_id == Id.Char_YHex:  # J8 only
                # Skip 2 leading chars, leaving the hex digits -- presumably \y
                h = self.s[str_pos + 2:str_end]

                # Same check in osh/word_parse.py
                if left_id != Id.Left_BSingleQuote:
                    assert left_id != Id.Left_BTSingleQuote, "Not handled here"
                    raise self._Error(
                        r"\y%s escapes not allowed in u'' strings" % h,
                        str_end)

                i = int(h, 16)
                part = chr(i)

            elif tok_id == Id.Char_SurrogatePair:
                # Two groups of 4 hex digits, each after 2 skipped chars
                h1 = self.s[str_pos + 2:str_pos + 6]
                h2 = self.s[str_pos + 8:str_pos + 12]

                # https://www.oilshell.org/blog/2023/06/surrogate-pair.html
                i1 = int(h1, 16) - 0xD800  # high surrogate
                i2 = int(h2, 16) - 0xDC00  # low surrogate
                code_point = 0x10000 + (i1 << 10) + i2

                part = Utf8Encode(code_point)

            elif tok_id == Id.Char_Unicode4:  # JSON only, unpaired
                h = self.s[str_pos + 2:str_end]
                i = int(h, 16)
                part = Utf8Encode(i)

            else:
                # Should never happen
                raise AssertionError(Id_str(tok_id))

            #log('%s part %r', Id_str(tok_id), part)
            self.decoded.write(part)
            str_pos = str_end
723
724
class _Parser(object):
    """Token state and helpers shared by the parsers in this file."""

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        self.s = s
        self.is_j8 = is_j8
        if is_j8:
            self.lang_str = "J8"
        else:
            self.lang_str = "JSON"

        self.lexer = LexerDecoder(s, is_j8, self.lang_str)

        # The current token: its ID, its span in self.s, and for J8_String
        # tokens the decoded J8 string
        self.tok_id = Id.Undefined_Tok
        self.start_pos = 0
        self.end_pos = 0
        self.decoded = ''

    def _Next(self):
        # type: () -> None
        """Advance to the next token that isn't whitespace or a comment."""

        while True:
            # The new token starts where the previous one ended
            self.start_pos = self.end_pos
            self.tok_id, self.end_pos, self.decoded = self.lexer.Next()
            if self.tok_id in (Id.Ignored_Space, Id.Ignored_Newline,
                               Id.Ignored_Comment):
                continue
            return
            # TODO: use the Ignored_Newline tokens to show line numbers in
            # error messages.  The position of the last newline and a token
            # can be used to calculate a column number.

    def _Eat(self, tok_id):
        # type: (Id_t) -> None
        """Check that the current token is tok_id, then advance past it.

        Raises:
          error.Decode: if the current token is something else
        """
        if self.tok_id == tok_id:
            self._Next()
            return

        raise self._ParseError("Expected %s, got %s" %
                               (Id_str(tok_id), Id_str(self.tok_id)))

    def _NextForLines(self):
        # type: () -> None
        """Like _Next, but use the J8 Lines lexer.  Doesn't skip any tokens."""
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos, self.decoded = self.lexer.NextForLines()

    def _ParseError(self, msg):
        # type: (str) -> error.Decode
        """Create (but don't raise) an error.Decode for the current token."""
        line_num = self.lexer.cur_line_num
        return error.Decode(msg, self.s, self.start_pos, self.end_pos,
                            line_num)
775
776
class Parser(_Parser):
    """Recursive descent parser for JSON and JSON8, producing value_t."""

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    def _ParsePair(self):
        # type: () -> Tuple[str, value_t]
        """pair = string ':' value"""

        # Grab the decoded text first; _Eat() then checks that the token really
        # is a string, and advances past it
        key = self.decoded
        self._Eat(Id.J8_String)
        assert key is not None

        self._Eat(Id.J8_Colon)

        return key, self._ParseValue()

    def _ParseDict(self):
        # type: () -> value_t
        """
        pair = string ':' value
        Dict = '{' '}'
             | '{' pair (',' pair)* '}'
        """
        # precondition
        assert self.tok_id == Id.J8_LBrace, Id_str(self.tok_id)

        entries = NewDict()  # type: Dict[str, value_t]

        self._Next()  # past '{'
        if self.tok_id != Id.J8_RBrace:
            while True:
                key, val = self._ParsePair()
                entries[key] = val
                if self.tok_id != Id.J8_Comma:
                    break
                self._Next()  # past ','

        # For an empty Dict this check trivially passes
        self._Eat(Id.J8_RBrace)

        return value.Dict(entries)

    def _ParseList(self):
        # type: () -> value_t
        """
        List = '[' ']'
             | '[' value (',' value)* ']'
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        elements = []  # type: List[value_t]

        self._Next()  # past '['
        if self.tok_id != Id.J8_RBracket:
            while True:
                elements.append(self._ParseValue())
                if self.tok_id != Id.J8_Comma:
                    break
                self._Next()  # past ','

        # For an empty List this check trivially passes
        self._Eat(Id.J8_RBracket)

        return value.List(elements)

    def _ParseValue(self):
        # type: () -> value_t
        """Parse one value, dispatching on the current token.

        Raises:
          error.Decode: on EOF, or on a token that can't start a value
        """
        tok = self.tok_id

        if tok == Id.J8_LBrace:
            return self._ParseDict()

        if tok == Id.J8_LBracket:
            return self._ParseList()

        if tok == Id.J8_Null:
            self._Next()
            return value.Null

        if tok == Id.J8_Bool:
            # The first char tells 'true' from 'false'
            is_true = self.s[self.start_pos] == 't'
            self._Next()
            return value.Bool(is_true)

        if tok == Id.J8_Int:
            digits = self.s[self.start_pos:self.end_pos]
            self._Next()
            return value.Int(mops.FromStr(digits))

        if tok == Id.J8_Float:
            digits = self.s[self.start_pos:self.end_pos]
            self._Next()
            return value.Float(float(digits))

        # UString, BString too
        if tok == Id.J8_String:
            text = self.decoded
            self._Next()
            return value.Str(text)

        if tok == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        # Id.Unknown_Tok, Id.J8_{LParen,RParen}
        raise self._ParseError('Invalid token while parsing %s: %s' %
                               (self.lang_str, Id_str(tok)))

    def ParseValue(self):
        # type: () -> value_t
        """Parse the whole input as a single value.

        Raises error.Decode.
        """
        self._Next()
        result = self._ParseValue()
        if self.tok_id == Id.Eol_Tok:
            return result
        raise self._ParseError('Unexpected trailing input')
907
908
class Nil8Parser(_Parser):
    """Parser for NIL8, producing nvalue_t.

    Tokens not in JSON8:
      LParen RParen Symbol

    Tokens not in JSON, but in JSON8 and NIL8:
      Identifier (unquoted keys)
      Ignored_Comment
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    if 0:

        def _LookAhead(self):
            # type: () -> Id_t
            """
            Don't need this right now
            """
            end_pos = self.end_pos  # look ahead from last token
            while True:
                tok_id, end_pos = match.MatchJ8Token(self.s, end_pos)
                if tok_id not in (Id.Ignored_Space, Id.Ignored_Newline,
                                  Id.Ignored_Comment):
                    break
            return tok_id

    def _ParseRecord(self):
        # type: () -> nvalue_t
        """
        Yaks
          (self->Next)           =>  (-> self Next)
          (self->Next obj.field) =>  ((-> self Next) (. obj field))

          Similar to
          ((identity identity) 42) => 42 in Clojure

        ASDL
          (Node left:(. x4beef2))
          (Node left !x4beef2)

        # Ambiguous because value can be identifier.
        # We have to look ahead to and see if there's a colon :
        field =
          Identifier ':' value
        | value

        record = '(' head field* ')'

        - Identifier | Symbol are treated the same, it's a side effect of
          the lexing style
        - do positional args come before named args
        - () is invalid?  Use [] for empty list
        """
        assert self.tok_id == Id.J8_LParen, Id_str(self.tok_id)

        fields = []  # type: List[nvalue_t]

        self._Next()  # past '('

        # If ')' comes right away, the loop is skipped and _Eat() consumes it
        while self.tok_id != Id.J8_RParen:
            fields.append(self._ParseNil8())

        self._Eat(Id.J8_RParen)

        return nvalue.List(fields)

    def _ParseList8(self):
        # type: () -> nvalue_t
        """
        List8 = '[' value* ']'

        No commas, not even optional ones for now.
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        elements = []  # type: List[nvalue_t]

        self._Next()  # past '['

        # If ']' comes right away, the loop is skipped and _Eat() consumes it
        while self.tok_id != Id.J8_RBracket:
            elements.append(self._ParseNil8())

        self._Eat(Id.J8_RBracket)

        return nvalue.List(elements)

    def _ParseNil8(self):
        # type: () -> nvalue_t
        """Parse one NIL8 value, and an infix operator plus operand after it.

        Raises:
          error.Decode: on EOF, or on a token that can't start a value
        """
        tok = self.tok_id

        if tok == Id.J8_LParen:
            obj = self._ParseRecord()  # type: nvalue_t

        elif tok == Id.J8_LBracket:
            obj = self._ParseList8()

        # Primitives are copied from J8 above.
        # TODO: We also want hex literals.
        elif tok == Id.J8_Null:
            self._Next()
            obj = nvalue.Null

        elif tok == Id.J8_Bool:
            # The first char tells 'true' from 'false'
            is_true = self.s[self.start_pos] == 't'
            self._Next()
            obj = nvalue.Bool(is_true)

        elif tok == Id.J8_Int:
            text = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Int(int(text))

        elif tok == Id.J8_Float:
            text = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Float(float(text))

        elif tok == Id.J8_String:
            decoded = self.decoded
            self._Next()
            obj = nvalue.Str(decoded)

        # <- etc.
        elif tok in (Id.J8_Identifier, Id.J8_Operator, Id.J8_Colon,
                     Id.J8_Comma):
            # unquoted "word" treated like a string
            text = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Symbol(text)

        elif tok == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # Id.Unknown_Tok, Id.J8_{LParen,RParen}
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(tok)))

        # No infix operator follows, so the value stands alone
        if self.tok_id not in (Id.J8_Operator, Id.J8_Colon, Id.J8_Comma):
            return obj

        # key: "value" -> (: key "value")
        op = nvalue.Symbol(self.s[self.start_pos:self.end_pos])

        self._Next()
        rhs = self._ParseNil8()
        infix = nvalue.List([op, obj, rhs])  # type: nvalue_t
        return infix

    def ParseNil8(self):
        # type: () -> nvalue_t
        """Parse the whole input as a single NIL8 value.

        Raises error.Decode.
        """
        self._Next()
        result = self._ParseNil8()
        if self.tok_id == Id.Eol_Tok:
            return result
        raise self._ParseError('Unexpected trailing input')
1091
1092
class J8LinesParser(_Parser):
    """Decode lines from a string with newlines.

    We specify this with a grammar, to preserve location info and to reduce
    allocations.  (But note that unquoted_line is more like a LOOP than it is
    grammatical.)

    Grammar:

      end           = J8_Newline | Eol_Tok

      empty_line    = WS_Space? end

      # special case: read until end token, but REMOVE trailing WS_Space
      unquoted_line = WS_Space? Lit_Chars ANY* WS_Space? end

      j8_line       = WS_Space? J8_String WS_Space? end

      lines         = (empty_line | unquoted_line | j8_line)*

    where Lit_Chars is valid UTF-8

    Notes:

    (1) We disallow multiple strings on a line, like:

      "json" "json2"
      "json" unquoted

    (2) Internal quotes are allowed on unquoted lines.  Consider this line:

      foo "" u''

    The "" and u'' are not a decoded string, because the line started with
    Id.Lit_Chars literals.

    (3) This is related to TSV8?  Similar rules.  Does TSV8 have empty cells?
    Does it have - for empty cell?
    """

    def __init__(self, s):
        # type: (str) -> None
        _Parser.__init__(self, s, True)

    def _Show(self, s):
        # type: (str) -> None
        """Log the current token, prefixed with a label.  For debugging."""
        log('%s tok_id %s %d-%d', s, Id_str(self.tok_id), self.start_pos,
            self.end_pos)

    def _ParseLine(self, out):
        # type: (List[str]) -> None
        """ May append a line to 'out'

        Consumes tokens through the end of the current line.

        Raises:
          error.Decode: if there is text after a quoted string on the line
        """
        #self._Show('1')
        if self.tok_id == Id.WS_Space:  # leading whitespace
            self._NextForLines()

        # Empty line - return without doing anything
        if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
            self._NextForLines()
            return

        # Quoted string on line
        if self.tok_id == Id.J8_String:
            out.append(self.decoded)
            self._NextForLines()

            if self.tok_id == Id.WS_Space:  # trailing whitespace
                self._NextForLines()

            if self.tok_id not in (Id.J8_Newline, Id.Eol_Tok):
                raise self._ParseError('Unexpected text after J8 Line (%s)' %
                                       Id_str(self.tok_id))

            self._NextForLines()
            return

        # Unquoted line
        if self.tok_id == Id.Lit_Chars:
            # ' unquoted "" text on line '  # read every token until end
            string_start = self.start_pos
            while True:
                # for stripping whitespace
                prev_id = self.tok_id
                prev_start = self.start_pos

                self._NextForLines()

                # It would be nicer if "middle" Id.WS_Space tokens didn't have
                # \r, but we're sticking with the JSON spec definition of
                # whitespace.  (As another data point, CPython on Unix allows
                # \r in the middle of expressions, treating it as whitespace.)
                if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
                    break

            if prev_id == Id.WS_Space:
                string_end = prev_start  # remove trailing whitespace
            else:
                string_end = self.start_pos  # where the end token starts

            out.append(self.s[string_start:string_end])

            self._NextForLines()  # past newline
            return

        raise AssertionError(Id_str(self.tok_id))

    def Parse(self):
        # type: () -> List[str]
        """Decode all the lines in the input.

        Returns:
          The decoded lines.  Empty lines produce no entry.

        Raises error.Decode.
        """
        self._NextForLines()

        lines = []  # type: List[str]

        # The loop exits only at Eol_Tok, so there can't be trailing input
        # left to check for afterward.
        while self.tok_id != Id.Eol_Tok:
            self._ParseLine(lines)

        return lines
1212
1213
def SplitJ8Lines(s):
    # type: (str) -> List[str]
    """Decode a string of J8 Lines.  Used by @(echo split command sub)

    Raises:
      error.Decode

    3 Errors:
    - J8 string syntax error inside quotes
    - Extra input on line
    - unquoted line isn't utf-8
    """
    return J8LinesParser(s).Parse()
1228
1229
1230	# vim: sw=4