data_lang/j8.py

OILS / data_lang / j8.py View on Github | oilshell.org

1369 lines, 685 significant

1	#!/usr/bin/env python2
2	"""
3	j8.py: J8 Notation, a superset of JSON
4
5	Later:
6
7	- PrettyPrinter uses hnode.asdl?
8	- color
9	- line wrapping -- do this later
10	- would like CONTRIBUTORS here
11
12	- Unify with ASDL pretty printing - NIL8
13	- {} [] are identical
14	- () is for statically typed ASDL data
15	(command.Simple blame_tok:(...) words:[ ])
16	although we are also using [] for typed ASDL arrays, not just JSON
17	- object IDs
18	- @ x123 can create an ID
19	- ! x123 can reference an ID
20	- <> can be for non-J8 data types? For the = operator
21	- 'hi \(name)' interpolation is useful for code
22
23	- Common between JSON8 and NIL8 - for writing by hand
24	- comments - # line or // line (JSON5 uses // line, following JS)
25	- unquoted identifier names - NIL8 could be more relaxed for (+ 1 (* 3 4))
26	- commas
27	- JSON8 could have trailing commas rule
28	- NIL8 at least has no commas for [1 2 "hi"]
29	"""
30
31	import math
32
33	from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
34	from _devbuild.gen.value_asdl import (value, value_e, value_t, value_str,
35	Dict_)
36	from _devbuild.gen.nil8_asdl import (nvalue, nvalue_t)
37
38	from asdl import format as fmt
39	from core import error
40	from data_lang import pyj8
41	# dependency issue: consts.py pulls in frontend/option_def.py
42	from frontend import consts
43	from frontend import match
44	from mycpp import mops
45	from mycpp import mylib
46	from mycpp.mylib import tagswitch, iteritems, NewDict, log
47
48	import fastfunc
49
50	_ = log
51
52	from typing import cast, Dict, List, Tuple, Optional
53
54
55	# COPIED from ui.ValType() to break dep
def ValType(val):
    # type: (value_t) -> str
    """Return the type name of a value, for type errors shown in the UI.

    The name is value_str() applied to the value's tag, with dot=False.
    """
    tag = val.tag()
    return value_str(tag, dot=False)
61
62
if mylib.PYTHON:

    def HeapValueId(val):
        # type: (value_t) -> int
        """Return an integer identifying the heap object `val`.

        This Python-only definition uses id(), which returns the address and
        may need up to 64 bits.  In C++ we can use the GC ID, which fits
        within 32 bits.
        """
        obj_id = id(val)
        return obj_id
73
74
def ValueId(val):
    # type: (value_t) -> int
    """Return an integer ID for a value, or -1 if it has no identity.

    The ID:

    1. Can be used to determine whether 2 objects are the same, e.g. for
       List, Dict, Func, Proc, etc.
    2. Will help detect object cycles

    Primitive types like Int and Float don't have this notion.  They're
    immutable values that are copied and compared by value, so they get -1.
    """
    result = -1
    with tagswitch(val) as case:
        if case(value_e.Null, value_e.Bool, value_e.Int, value_e.Float,
                value_e.Str):
            # These will not be on the heap if we switch to tagged pointers.
            # Str is handled conservatively - when we add small string
            # optimization, some strings will be values, so we assume all
            # are.
            pass
        else:
            result = HeapValueId(val)
    return result
96
97
def ValueIdString(val):
    # type: (value_t) -> str
    """Format the value's ID as ' 0x<hex>', or '' when it has none.

    Used by pp value (42) and = 42
    """
    obj_id = ValueId(val)  # -1 means the value has no heap identity
    if obj_id != -1:
        return ' 0x%s' % mylib.hex_lower(obj_id)
    return ''
106
107
def Utf8Encode(code):
    # type: (int) -> str
    """Encode a single unicode code point as a utf-8 byte string.

    Based on https://stackoverflow.com/a/23502707
    """
    if code <= 0x7F:
        return chr(code & 0x7F)  # ASCII is a single byte

    # Number of continuation bytes that follow the lead byte.
    #
    # What about the check code <= 0x10FFFF ?
    # - it happens in statically parsed $'' u''
    # - but not dynamically parsed echo -e / printf, following bash/zsh
    n_cont = 3
    if code <= 0x7FF:
        n_cont = 1
    elif code <= 0xFFFF:
        n_cont = 2

    # Collect bytes from least significant to most significant; each
    # continuation byte is 10xxxxxx and carries 6 bits.
    encoded = []  # type: List[int]
    remaining = code
    for _ in xrange(n_cont):
        encoded.append(0x80 | (remaining & 0x3F))
        remaining >>= 6

    # Lead byte: length marker in the high bits, leftover payload in the low
    # bits.  The marker overflows 8 bits here and is truncated below.
    lead_marker = 0x1E << (6 - n_cont)
    payload_mask = 0x3F >> n_cont
    encoded.append(lead_marker | (remaining & payload_mask))
    encoded.reverse()

    # mod 256 because Python ints don't wrap around!
    chars = [chr(byte & 0xFF) for byte in encoded]
    return ''.join(chars)
141
142
# Bit flags that are OR'd together into the `options` argument of _Print()
# and InstancePrinter.

# Print a List/Dict that closes a cycle as [ -->ID ] or { -->ID } instead of
# raising error.Encode
SHOW_CYCLES = 1 << 1
SHOW_NON_DATA = 1 << 2  # non-data objects like Eggex can be <Eggex 0xff>
LOSSY_JSON = 1 << 3  # JSON is lossy
INF_NAN_ARE_NULL = 1 << 4  # for JSON: infinity and NaN are printed as null

# Hack until we fully translate: this flag is also defined in pyj8, and the
# two definitions must agree
assert pyj8.LOSSY_JSON == LOSSY_JSON
150
151
def _Print(val, buf, indent, options=0):
    # type: (value_t, mylib.BufWriter, int, int) -> None
    """Write `val` to `buf` with a fresh InstancePrinter.

    Args:
      indent: number of spaces to indent, or -1 for everything on one line
      options: bit flags forwarded to InstancePrinter
    """
    printer = InstancePrinter(buf, indent, options)
    printer.Print(val)
160
161
def PrintMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Print a value with no option flags set.

    For json8 write (x) and toJson8()

    Caller must handle error.Encode
    """
    _Print(val, buf, indent, options=0)
169
170
def PrintJsonMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Print a value with the LOSSY_JSON and INF_NAN_ARE_NULL flags set.

    For json write (x) and toJson()

    Caller must handle error.Encode()
    Doesn't decay to b'' strings - will use Unicode replacement char.
    """
    json_options = LOSSY_JSON | INF_NAN_ARE_NULL
    _Print(val, buf, indent, options=json_options)
179
180
def PrintLine(val, f):
    # type: (value_t, mylib.Writer) -> None
    """Write `val` on a single line followed by a newline.  For pp line (x)"""

    # error.Encode should be impossible - we show cycles and non-data
    line_buf = mylib.BufWriter()
    _Print(val, line_buf, -1, options=SHOW_CYCLES | SHOW_NON_DATA)

    line = line_buf.getvalue()
    f.write(line)
    f.write('\n')
192
193
if 0:

    def Repr(val):
        # type: (value_t) -> str
        """Return the one-line representation of a value, as a string.

        Unused.  This is like Python's repr
        """
        # error.Encode should be impossible - we show cycles and non-data
        out = mylib.BufWriter()
        _Print(val, out, -1, options=SHOW_CYCLES | SHOW_NON_DATA)
        result = out.getvalue()
        return result
205
206
def EncodeString(s, buf, unquoted_ok=False):
    # type: (str, mylib.BufWriter, bool) -> None
    """Write a string to `buf`, quoted unless the quotes can be omitted.

    For pp proc, etc.

    The string is written as-is only when unquoted_ok is set and
    fastfunc.CanOmitQuotes(s) agrees; otherwise it's printed as a value.Str
    on one line.
    """
    if unquoted_ok and fastfunc.CanOmitQuotes(s):
        buf.write(s)
    else:
        _Print(value.Str(s), buf, -1)
216
217
def MaybeEncodeString(s):
    # type: (str) -> str
    """Return the string printed as a value.Str on one line, no option flags.

    For write --json8 $s and compexport
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    str_val = value.Str(s)
    _Print(str_val, out, -1)
    return out.getvalue()
228
229
def MaybeEncodeJsonString(s):
    # type: (str) -> str
    """Return the string printed as a value.Str on one line, with LOSSY_JSON.

    For write --json
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    str_val = value.Str(s)
    _Print(str_val, out, -1, options=LOSSY_JSON)
    return out.getvalue()
239
240
# DFS traversal state of a List or Dict, stored in InstancePrinter.visited
UNSEEN = 0  # default for a container that has no entry yet
EXPLORING = 1  # currently being printed; seeing it again means a cycle
FINISHED = 2  # completely printed; may be printed again
245
246
class InstancePrinter(object):
    """Print a value tree as J8/JSON.

    Output is written to `buf`.  `indent` is the number of spaces per nesting
    level, or -1 to print everything on one line.  `options` is a bitwise OR
    of the SHOW_CYCLES, SHOW_NON_DATA, LOSSY_JSON and INF_NAN_ARE_NULL flags.
    """

    def __init__(self, buf, indent, options):
        # type: (mylib.BufWriter, int, int) -> None
        self.buf = buf
        self.indent = indent
        self.options = options

        # Key is HeapValueId(val) of a List or Dict
        # Value is its DFS traversal state: EXPLORING or FINISHED.  A missing
        # key means UNSEEN.
        self.visited = {}  # type: Dict[int, int]

    def _ItemIndent(self, level):
        # type: (int) -> None
        """Write the indentation for an item nested inside `level`."""

        if self.indent == -1:
            return

        self.buf.write_spaces((level + 1) * self.indent)

    def _BracketIndent(self, level):
        # type: (int) -> None
        """Write the indentation for a closing bracket at `level`."""

        if self.indent == -1:
            return

        self.buf.write_spaces(level * self.indent)

    def _MaybeNewline(self):
        # type: () -> None
        """Write a newline, unless everything goes on one line."""
        if self.indent == -1:
            return
        self.buf.write('\n')

    def _MaybeSpace(self):
        # type: () -> None
        """Write a space, unless everything goes on one line."""
        if self.indent == -1:
            return
        self.buf.write(' ')

    def _PrintList(self, val, level):
        # type: (value.List, int) -> None
        """Print a List as [item, ...], recursing into each item."""

        if len(val.items) == 0:  # Special case like Python/JS
            self.buf.write('[]')
        else:
            self.buf.write('[')
            self._MaybeNewline()
            for i, item in enumerate(val.items):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level)
                self.Print(item, level + 1)
            self._MaybeNewline()

            self._BracketIndent(level)
            self.buf.write(']')

    def _PrintDict(self, val, level):
        # type: (Dict_, int) -> None
        """Print a Dict as {"key": value, ...}, recursing into each value."""

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()
            i = 0
            for k, v in iteritems(val.d):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level)

                pyj8.WriteString(k, self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                self.Print(v, level + 1)

                i += 1

            self._MaybeNewline()
            self._BracketIndent(level)
            self.buf.write('}')

    def _PrintBashPrefix(self, type_str, level):
        # type: (str, int) -> None
        """Write the opening of the {"type": ..., "data": ...} wrapper.

        type_str is written verbatim, so the caller passes it already quoted
        and with the trailing comma, e.g. '"BashArray",'.
        """

        self.buf.write('{')
        self._MaybeNewline()
        self._ItemIndent(level)
        self.buf.write('"type":')
        self._MaybeSpace()
        self.buf.write(type_str)  # "BashArray",  or "BashAssoc",

        self._MaybeNewline()

        self._ItemIndent(level)
        self.buf.write('"data":')
        self._MaybeSpace()

    def _PrintBashSuffix(self, level):
        # type: (int) -> None
        """Write the closing brace of the wrapper from _PrintBashPrefix()."""
        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write('}')

    def _PrintSparseArray(self, val, level):
        # type: (value.SparseArray, int) -> None
        """Print a SparseArray as a wrapped object of index -> string."""

        self._PrintBashPrefix('"SparseArray",', level)

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            i = 0
            for k, v in iteritems(val.d):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level + 1)
                # Keys are converted with mops.ToStr() and written as strings
                pyj8.WriteString(mops.ToStr(k), self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(v, self.options, self.buf)

                i += 1

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def _PrintBashArray(self, val, level):
        # type: (value.BashArray, int) -> None
        """Print a BashArray as a wrapped object of index -> string.

        Entries that are None are skipped.
        """

        self._PrintBashPrefix('"BashArray",', level)

        if len(val.strs) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            # Can't use the index to decide about the comma, because None
            # entries are skipped
            first = True
            for i, s in enumerate(val.strs):
                if s is None:
                    continue

                if not first:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level + 1)
                pyj8.WriteString(str(i), self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(s, self.options, self.buf)

                first = False

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def _PrintBashAssoc(self, val, level):
        # type: (value.BashAssoc, int) -> None
        """Print a BashAssoc as a wrapped object of string -> string."""

        self._PrintBashPrefix('"BashAssoc",', level)

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            i = 0
            for k2, v2 in iteritems(val.d):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level + 1)
                pyj8.WriteString(k2, self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(v2, self.options, self.buf)

                i += 1

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def Print(self, val, level=0):
        # type: (value_t, int) -> None
        """Write `val` to self.buf, recursing into containers.

        Args:
          level: nesting depth of val, used for indentation

        Raises:
          error.Encode: if a List or Dict is part of an object cycle and
            SHOW_CYCLES isn't set, or if val is of a type not handled here
            and SHOW_NON_DATA isn't set.
        """

        # self.indent == -1 is a special value that means everything is on
        # one line.  It's like
        #    JSON.stringify(d, null, 0)
        # except we use -1, not 0.  0 can still have newlines.

        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Null):
                self.buf.write('null')

            elif case(value_e.Bool):
                val = cast(value.Bool, UP_val)
                self.buf.write('true' if val.b else 'false')

            elif case(value_e.Int):
                val = cast(value.Int, UP_val)
                # TODO: avoid intermediate allocation with
                # self.buf.WriteBigInt(val.i)
                #
                # Or maybe we need pyj8.WriteBigInt() because truly BigInt may
                # be of arbitrary length, and will need a growth strategy.
                # Although that is not very common, so we could allocate in
                # that case.

                self.buf.write(mops.ToStr(val.i))

            elif case(value_e.Float):
                val = cast(value.Float, UP_val)

                fl = val.f
                if math.isinf(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        s = 'null'  # negative infinity is null too
                    else:
                        s = 'INFINITY'
                        if fl < 0:
                            s = '-' + s
                elif math.isnan(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        # JavaScript JSON lib behavior: Inf and NaN are null
                        # Python has a bug in the encoder by default, and then
                        # allow_nan=False raises an error
                        s = 'null'
                    else:
                        s = 'NAN'
                else:
                    # TODO: can we avoid intermediate allocation?
                    # self.buf.WriteFloat(val.f)
                    s = str(fl)

                self.buf.write(s)

            elif case(value_e.Str):
                val = cast(value.Str, UP_val)

                pyj8.WriteString(val.s, self.options, self.buf)

            elif case(value_e.List):
                val = cast(value.List, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                node_state = self.visited.get(heap_id, UNSEEN)
                if node_state == FINISHED:
                    # Print it AGAIN.  We print a JSON tree, which means we can
                    # visit and print nodes MANY TIMES, as long as they're not
                    # in a cycle.
                    self._PrintList(val, level)
                    return
                if node_state == EXPLORING:
                    if self.options & SHOW_CYCLES:
                        self.buf.write('[ -->%s ]' % ValueIdString(val))
                        return
                    else:
                        # node.js prints which index closes the cycle
                        raise error.Encode(
                            "Can't encode List%s in object cycle" %
                            ValueIdString(val))

                self.visited[heap_id] = EXPLORING
                self._PrintList(val, level)
                self.visited[heap_id] = FINISHED

            elif case(value_e.Dict):
                val = cast(Dict_, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                node_state = self.visited.get(heap_id, UNSEEN)
                if node_state == FINISHED:
                    # Print it AGAIN.  We print a JSON tree, which means we can
                    # visit and print nodes MANY TIMES, as long as they're not
                    # in a cycle.
                    self._PrintDict(val, level)
                    return
                if node_state == EXPLORING:
                    if self.options & SHOW_CYCLES:
                        self.buf.write('{ -->%s }' % ValueIdString(val))
                        return
                    else:
                        # node.js prints which key closes the cycle
                        raise error.Encode(
                            "Can't encode Dict%s in object cycle" %
                            ValueIdString(val))

                self.visited[heap_id] = EXPLORING
                self._PrintDict(val, level)
                self.visited[heap_id] = FINISHED

            elif case(value_e.SparseArray):
                val = cast(value.SparseArray, UP_val)
                self._PrintSparseArray(val, level)

            elif case(value_e.BashArray):
                val = cast(value.BashArray, UP_val)
                self._PrintBashArray(val, level)

            elif case(value_e.BashAssoc):
                val = cast(value.BashAssoc, UP_val)
                self._PrintBashAssoc(val, level)

            else:
                pass  # mycpp workaround
                if self.options & SHOW_NON_DATA:
                    # Similar to = operator, ui.DebugPrint()
                    # TODO: that prints value.Range in a special way
                    ysh_type = ValType(val)
                    id_str = ValueIdString(val)
                    self.buf.write('<%s%s>' % (ysh_type, id_str))
                else:
                    raise error.Encode("Can't serialize object of type %s" %
                                       ValType(val))
604
class PrettyPrinter(object):
    """Placeholder for a pretty printer that could enhance the = operator.

    Unused right now.  Output would go to a polymorphic ColorOutput.

    Features like asdl/format.py:
    - line wrapping
    - color
    - sharing detection by passing in a REF COUNT dict
      - print @123 the first time, and then print ... the second time

    and

    - Pretty spaces: {"k": "v", "k2": "v2"} instead of {"k":"v","k2":"v2"}
    - Unquoted: {k: "v", k2: "v2"} instead of {"k": "v", "k2": "v2"}

    - Omitting commas for ASDL?  Maybe we can use two spaces

    (Token id: Id.VSub_DollarName  start: 0  length: 3)
    (Token id:Id.VSub_DollarName start:0 length:3)  - color makes this work
    """

    def __init__(self, max_col):
        # type: (int) -> None
        self.max_col = max_col

        # This could be an optimized set, e.g. a C++ bit set like
        # mark_sweep_heap.h, rather than a Dict
        #self.unique_objs = mylib.UniqueObjects()

        # Meant for a first pass: object ID -> number of times referenced
        self.ref_count = {}  # type: Dict[int, int]

    def PrettyTree(self, val, f):
        # type: (value_t, fmt.ColorOutput) -> None
        """Not implemented yet; does nothing."""

        # TODO: first convert to hnode.asdl types?

        # Although we might want
        # hnode.AlreadyShown = (str type, int unique_id)
        pass

    def Print(self, val, buf):
        # type: (value_t, mylib.BufWriter) -> None
        """Call PrettyTree() with console output on stdout; `buf` is unused."""

        # Or print to stderr?
        console = fmt.DetectConsoleOutput(mylib.Stdout())
        self.PrettyTree(val, console)

        # Then print those with ASDL
657
658
class LexerDecoder(object):
    """J8 lexer and string decoder.

    Similar interface as SimpleLexer, except we return an optional decoded
    string

    Next() and NextForLines() return a tuple (token ID, end position,
    decoded string).  The decoded string is None except for string tokens,
    which are returned as Id.J8_String with their escapes already decoded.
    """

    def __init__(self, s, is_j8, lang_str):
        # type: (str, bool, str) -> None
        """
        Args:
          s: the input to lex
          is_j8: if False, Next() rejects b'' u'' j"" strings and comments
          lang_str: language name to interpolate into error messages
        """
        self.s = s
        self.is_j8 = is_j8
        self.lang_str = lang_str

        # Offset in self.s where the next token starts
        self.pos = 0

        # current line being lexed -- for error messages
        self.cur_line_num = 1

        # Reuse this instance to save GC objects.  JSON objects could have
        # thousands of strings.
        self.decoded = mylib.BufWriter()

    def _Error(self, msg, end_pos):
        # type: (str, int) -> error.Decode
        """Create (but don't raise) an error.Decode for self.pos .. end_pos."""

        # Use the current position as start pos
        return error.Decode(msg, self.s, self.pos, end_pos, self.cur_line_num)

    def Next(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Returns a token and updates self.pos

        Raises error.Decode for tokens that aren't valid in plain JSON when
        is_j8 is False, and for errors inside string literals.
        """

        tok_id, end_pos = match.MatchJ8Token(self.s, self.pos)

        if not self.is_j8:
            if tok_id in (Id.Left_BSingleQuote, Id.Left_USingleQuote):
                raise self._Error(
                    "Single quotes aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Ignored_Comment:
                raise self._Error(
                    "Comments aren't part of JSON; you may want 'json8 read'",
                    end_pos)

        # An opening quote: decode the whole string and return it as one token
        if tok_id in (Id.Left_DoubleQuote, Id.Left_BSingleQuote,
                      Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Left_JDoubleQuote:
            if self.is_j8:
                return self._DecodeString(tok_id, end_pos)
            else:
                raise self._Error('Pure JSON does not accept j"" prefix',
                                  end_pos)

        if tok_id == Id.Ignored_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def NextForLines(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Like Next(), but for J8 Lines

        Unlike Next(), all four kinds of opening quote are accepted
        regardless of is_j8.
        """

        tok_id, end_pos = match.MatchJ8LinesToken(self.s, self.pos)

        if tok_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote,
                      Id.Left_BSingleQuote, Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        # Check that UNQUOTED lines are valid UTF-8.  (_DecodeString() does
        # this for quoted strings.)
        if (tok_id == Id.Lit_Chars and
                not pyj8.PartIsUtf8(self.s, self.pos, end_pos)):
            raise self._Error(
                'Invalid UTF-8 in %s string literal' % self.lang_str, end_pos)
        if tok_id == Id.Char_AsciiControl:
            raise self._Error(
                "J8 Lines can't have unescaped ASCII control chars", end_pos)

        if tok_id == Id.J8_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def _DecodeString(self, left_id, str_pos):
        # type: (Id_t, int) -> Tuple[Id_t, int, Optional[str]]
        """ Returns a string token and updates self.pos

        Args:
          left_id: token ID of the opening quote, which selects the lexer
            mode and whether \\y escapes are allowed
          str_pos: position just after the opening quote

        Returns:
          (Id.J8_String, position after the closing quote, decoded string)

        Raises error.Decode on EOF, a bad backslash escape, an unescaped
        ASCII control char, invalid UTF-8, or an out-of-range code point.
        """

        while True:
            if left_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote):
                tok_id, str_end = match.MatchJsonStrToken(self.s, str_pos)
            else:
                tok_id, str_end = match.MatchJ8StrToken(self.s, str_pos)

            #log('String tok %s %r', Id_str(tok_id), self.s[str_pos:str_end])

            if tok_id == Id.Eol_Tok:
                # TODO: point to beginning of # quote?
                raise self._Error(
                    'Unexpected EOF while lexing %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Unknown_Backslash:
                raise self._Error(
                    'Bad backslash escape in %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Char_AsciiControl:
                raise self._Error(
                    "%s strings can't have unescaped ASCII control chars" %
                    self.lang_str, str_end)

            # Closing quote: hand back everything accumulated so far
            if tok_id in (Id.Right_SingleQuote, Id.Right_DoubleQuote):

                self.pos = str_end

                s = self.decoded.getvalue()
                self.decoded.clear()  # reuse this instance

                #log('decoded %r', self.decoded.getvalue())
                return Id.J8_String, str_end, s

            #
            # Now handle each kind of token
            #

            if tok_id == Id.Lit_Chars:  # JSON and J8
                part = self.s[str_pos:str_end]
                if not pyj8.PartIsUtf8(self.s, str_pos, str_end):
                    raise self._Error(
                        'Invalid UTF-8 in %s string literal' % self.lang_str,
                        str_end)

            # TODO: would be nice to avoid allocation in all these cases.
            # But LookupCharC() would have to change.

            elif tok_id == Id.Char_OneChar:  # JSON and J8
                # The char at offset 1 selects the escape, presumably because
                # offset 0 is the backslash
                ch = self.s[str_pos + 1]
                part = consts.LookupCharC(ch)

            elif tok_id == Id.Char_UBraced:  # J8 only
                # Hex digits between the 3 leading chars of \u{ and the
                # trailing }
                h = self.s[str_pos + 3:str_end - 1]
                i = int(h, 16)

                # Same checks in osh/word_compile.py
                if i > 0x10ffff:
                    raise self._Error(
                        "Code point can't be greater than U+10ffff", str_end)
                if 0xD800 <= i and i < 0xE000:
                    raise self._Error(
                        r"\u{%s} escape is illegal because it's in the surrogate range"
                        % h, str_end)

                part = Utf8Encode(i)

            elif tok_id == Id.Char_YHex:  # J8 only
                # Hex digits after the 2 leading chars of \y
                h = self.s[str_pos + 2:str_end]

                # Same check in osh/word_parse.py
                if left_id != Id.Left_BSingleQuote:
                    assert left_id != Id.Left_BTSingleQuote, "Not handled here"
                    raise self._Error(
                        r"\y%s escapes not allowed in u'' strings" % h,
                        str_end)

                # A \y escape is a single byte
                i = int(h, 16)
                part = chr(i)

            elif tok_id == Id.Char_SurrogatePair:
                # Two groups of 4 hex digits, each after a 2 char prefix
                h1 = self.s[str_pos + 2:str_pos + 6]
                h2 = self.s[str_pos + 8:str_pos + 12]

                # https://www.oilshell.org/blog/2023/06/surrogate-pair.html
                i1 = int(h1, 16) - 0xD800  # high surrogate
                i2 = int(h2, 16) - 0xDC00  # low surrogate
                code_point = 0x10000 + (i1 << 10) + i2

                part = Utf8Encode(code_point)

            elif tok_id == Id.Char_Unicode4:  # JSON only, unpaired
                # NOTE(review): no surrogate range check here, unlike
                # Char_UBraced above -- confirm that's intended
                h = self.s[str_pos + 2:str_end]
                i = int(h, 16)
                part = Utf8Encode(i)

            else:
                # Should never happen
                raise AssertionError(Id_str(tok_id))

            #log('%s part %r', Id_str(tok_id), part)
            self.decoded.write(part)
            str_pos = str_end
853
854
class _Parser(object):
    """Token state and helpers shared by the parsers in this module.

    Holds the current token (tok_id), its span (start_pos, end_pos) and, for
    string tokens, the decoded string.
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        self.s = s
        self.is_j8 = is_j8
        if is_j8:
            self.lang_str = "J8"
        else:
            self.lang_str = "JSON"

        self.lexer = LexerDecoder(s, is_j8, self.lang_str)
        self.tok_id = Id.Undefined_Tok
        self.start_pos = 0
        self.end_pos = 0
        self.decoded = ''  # decoded J8 string

    def _Next(self):
        # type: () -> None
        """Advance to the next token that isn't a space, newline or comment."""

        while True:
            # The start of this token is the END of the token before it
            self.start_pos = self.end_pos
            self.tok_id, self.end_pos, self.decoded = self.lexer.Next()
            if self.tok_id in (Id.Ignored_Space, Id.Ignored_Newline,
                               Id.Ignored_Comment):
                # TODO: add Ignored_Newline to count lines, and show line
                # numbers in errors messages.  The position of the last
                # newline and a token can be used to calculate a column
                # number.
                continue
            return

    def _Eat(self, tok_id):
        # type: (Id_t) -> None
        """Advance past the current token, which must be of type tok_id.

        Raises error.Decode if the current token is something else.
        """
        if self.tok_id == tok_id:
            self._Next()
            return

        raise self._ParseError("Expected %s, got %s" %
                               (Id_str(tok_id), Id_str(self.tok_id)))

    def _NextForLines(self):
        # type: () -> None
        """Like _Next, but use the J8 Lines lexer.

        Doesn't skip any tokens.
        """
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos, self.decoded = self.lexer.NextForLines()

    def _ParseError(self, msg):
        # type: (str) -> error.Decode
        """Create (but don't raise) an error.Decode at the current token."""
        line_num = self.lexer.cur_line_num
        return error.Decode(msg, self.s, self.start_pos, self.end_pos,
                            line_num)
905
906
class Parser(_Parser):
    """JSON and JSON8 Parser.

    A recursive descent parser.  Each _Parse* method expects self.tok_id to
    be the first token of its construct, and leaves it on the first token
    after the construct.
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    def _ParsePair(self):
        # type: () -> Tuple[str, value_t]
        """
        pair = string ':' value
        """

        k = self.decoded  # Save the potential string value
        self._Eat(Id.J8_String)  # Check that it's a string
        assert k is not None

        self._Eat(Id.J8_Colon)

        v = self._ParseValue()
        return k, v

    def _ParseDict(self):
        # type: () -> value_t
        """
        pair = string ':' value
        Dict = '{' '}'
             | '{' pair (',' pair)* '}'

        If a key is repeated, the last value wins.
        """
        # precondition
        assert self.tok_id == Id.J8_LBrace, Id_str(self.tok_id)

        #log('> Dict')

        d = NewDict()  # type: Dict[str, value_t]

        self._Next()
        if self.tok_id == Id.J8_RBrace:
            self._Next()
            return Dict_(d, None)

        k, v = self._ParsePair()
        d[k] = v
        #log('  [1] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        while self.tok_id == Id.J8_Comma:
            self._Next()
            k, v = self._ParsePair()
            d[k] = v
            #log('  [2] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        self._Eat(Id.J8_RBrace)

        #log('< Dict')

        return Dict_(d, None)

    def _ParseList(self):
        # type: () -> value_t
        """
        List = '[' ']'
             | '[' value (',' value)* ']'
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[value_t]

        self._Next()
        if self.tok_id == Id.J8_RBracket:
            self._Next()
            return value.List(items)

        items.append(self._ParseValue())

        while self.tok_id == Id.J8_Comma:
            self._Next()
            items.append(self._ParseValue())

        self._Eat(Id.J8_RBracket)

        return value.List(items)

    def _ParseValue(self):
        # type: () -> value_t
        """Parse any value, dispatching on the current token.

        Raises error.Decode on EOF, on a token that can't start a value, and
        on an integer that's too big.
        """
        if self.tok_id == Id.J8_LBrace:
            return self._ParseDict()

        elif self.tok_id == Id.J8_LBracket:
            return self._ParseList()

        elif self.tok_id == Id.J8_Null:
            self._Next()
            return value.Null

        elif self.tok_id == Id.J8_Bool:
            #log('%r %d', self.s[self.start_pos], self.start_pos)
            # The token is either true or false, so the first char decides
            b = value.Bool(self.s[self.start_pos] == 't')
            self._Next()
            return b

        elif self.tok_id == Id.J8_Int:
            part = self.s[self.start_pos:self.end_pos]
            # Convert BEFORE advancing, so that the error location is the
            # integer itself.  _ParseError() uses self.start_pos and
            # self.end_pos, which _Next() moves to the following token.
            try:
                big = mops.FromStr(part)
            except ValueError:
                raise self._ParseError('Integer is too big')
            self._Next()
            return value.Int(big)

        elif self.tok_id == Id.J8_Float:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            return value.Float(float(part))

        # UString, BString too
        elif self.tok_id == Id.J8_String:
            # Must read self.decoded before _Next() overwrites it
            str_val = value.Str(self.decoded)
            #log('d %r', self.decoded)
            self._Next()
            return str_val

        elif self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # Id.Unknown_Tok, Id.J8_{LParen,RParen}
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

    def ParseValue(self):
        # type: () -> value_t
        """Parse the whole input as a single value.

        Raises error.Decode, including when there's input left over after
        the value.
        """
        self._Next()
        obj = self._ParseValue()

        # The current token starts at the end of the input only if nothing
        # but ignored tokens followed the value
        n = len(self.s)
        if self.start_pos != n:
            extra = n - self.start_pos
            #log('n %d pos %d', n, self.start_pos)
            raise self._ParseError(
                'Got %d bytes of unexpected trailing input' % extra)
        return obj
1046
1047
1048	class Nil8Parser(_Parser):
1049	"""
1050	Tokens not in JSON8:
1051	LParen RParen Symbol
1052
1053	Tokens not in JSON, but in JSON8 and NIL8:
1054	Identifier (unquoted keys)
1055	Ignored_Comment
1056	"""
1057
1058	def __init__(self, s, is_j8):
1059	# type: (str, bool) -> None
1060	_Parser.__init__(self, s, is_j8)
1061
1062	if 0:
1063
1064	def _LookAhead(self):
1065	# type: () -> Id_t
1066	"""
1067	Don't need this right now
1068	"""
1069	end_pos = self.end_pos # look ahead from last token
1070	while True:
1071	tok_id, end_pos = match.MatchJ8Token(self.s, end_pos)
1072	if tok_id not in (Id.Ignored_Space, Id.Ignored_Newline,
1073	Id.Ignored_Comment):
1074	break
1075	return tok_id
1076
1077	def _ParseRecord(self):
1078	# type: () -> nvalue_t
1079	"""
1080	Yaks
1081	(self->Next) => (-> self Next)
1082	(self->Next obj.field) => ((-> self Next) (. obj field))
1083
1084	Similar to
1085	((identity identity) 42) => 42 in Clojure
1086
1087	ASDL
1088	(Node left:(. x4beef2))
1089	(Node left !x4beef2)
1090
1091	# Ambiguous because value can be identifier.
1092	# We have to look ahead to and see if there's a colon :
1093	field =
1094	Identifier ':' value
1095	\| value
1096
1097	record = '(' head field* ')'
1098
1099	- Identifier \| Symbol are treated the same, it's a side effect of
1100	the lexing style
1101	- do positional args come before named args
1102	- () is invalid? Use [] for empty list
1103	"""
1104	assert self.tok_id == Id.J8_LParen, Id_str(self.tok_id)
1105
1106	items = [] # type: List[nvalue_t]
1107
1108	self._Next()
1109	if self.tok_id == Id.J8_RParen:
1110	self._Next()
1111	return nvalue.List(items)
1112
1113	#log('TOK %s', Id_str(self.tok_id))
1114	while self.tok_id != Id.J8_RParen:
1115	items.append(self._ParseNil8())
1116	#log('TOK 2 %s', Id_str(self.tok_id))
1117
1118	self._Eat(Id.J8_RParen)
1119
1120	return nvalue.List(items)
1121
1122	def _ParseList8(self):
1123	# type: () -> nvalue_t
1124	"""
1125	List8 = '[' value* ']'
1126
1127	No commas, not even optional ones for now.
1128	"""
1129	assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)
1130
1131	items = [] # type: List[nvalue_t]
1132
1133	self._Next()
1134	if self.tok_id == Id.J8_RBracket:
1135	self._Next()
1136	return nvalue.List(items)
1137
1138	#log('TOK %s', Id_str(self.tok_id))
1139	while self.tok_id != Id.J8_RBracket:
1140	items.append(self._ParseNil8())
1141	#log('TOK 2 %s', Id_str(self.tok_id))
1142
1143	self._Eat(Id.J8_RBracket)
1144
1145	return nvalue.List(items)
1146
def _ParseNil8(self):
    # type: () -> nvalue_t
    """Parse one NIL8 value, starting at the current token.

    The value is a record, a list, a primitive (null, bool, int, float,
    string), or an unquoted word, which becomes an nvalue.Symbol.

    If an operator, colon or comma token comes right after that value, it
    is taken as an infix operator.  The result is then a 3-element
    nvalue.List with the operator first:

      key: "value"  ->  (: key "value")

    The right-hand operand is parsed by a recursive call to this method.

    Raises the error built by self._ParseError() at end of input, or on a
    token that can't start a value.
    """
    if self.tok_id == Id.J8_LParen:
        obj = self._ParseRecord()  # type: nvalue_t

    elif self.tok_id == Id.J8_LBracket:
        obj = self._ParseList8()

    # Primitives are copied from J8 above.
    # TODO: We also want hex literals.
    elif self.tok_id == Id.J8_Null:
        self._Next()
        obj = nvalue.Null

    elif self.tok_id == Id.J8_Bool:
        # Only the first character of the token is inspected
        obj = nvalue.Bool(self.s[self.start_pos] == 't')
        self._Next()

    elif self.tok_id == Id.J8_Int:
        text = self.s[self.start_pos:self.end_pos]
        self._Next()
        obj = nvalue.Int(int(text))

    elif self.tok_id == Id.J8_Float:
        text = self.s[self.start_pos:self.end_pos]
        self._Next()
        obj = nvalue.Float(float(text))

    elif self.tok_id == Id.J8_String:
        # self.decoded is read before advancing
        obj = nvalue.Str(self.decoded)
        self._Next()

    elif self.tok_id in (Id.J8_Identifier, Id.J8_Operator, Id.J8_Colon,
                         Id.J8_Comma):
        # An unquoted "word" like <- becomes a Symbol holding its text
        text = self.s[self.start_pos:self.end_pos]
        self._Next()
        obj = nvalue.Symbol(text)

    elif self.tok_id == Id.Eol_Tok:
        raise self._ParseError('Unexpected EOF while parsing %s' %
                               self.lang_str)

    else:  # anything else, e.g. Id.Unknown_Tok or a closing delimiter
        raise self._ParseError('Invalid token while parsing %s: %s' %
                               (self.lang_str, Id_str(self.tok_id)))

    # No infix operator after the value: it stands on its own
    if self.tok_id not in (Id.J8_Operator, Id.J8_Colon, Id.J8_Comma):
        return obj

    # key: "value" -> (: key "value")
    op_text = self.s[self.start_pos:self.end_pos]
    op_sym = nvalue.Symbol(op_text)

    self._Next()
    rhs = self._ParseNil8()
    return nvalue.List([op_sym, obj, rhs])
1219
def ParseNil8(self):
    # type: () -> nvalue_t
    """Parse the input as a single NIL8 value.

    Anything left over after that value is an error.

    Raises error.Decode.
    """
    self._Next()  # advance to the first token
    result = self._ParseNil8()

    if self.tok_id == Id.Eol_Tok:
        return result

    raise self._ParseError('Unexpected trailing input')
1230
1231
class J8LinesParser(_Parser):
    """Decode the lines of a string that contains newlines.

    This is specified with a grammar, to preserve location info and to
    reduce allocations.  (But note that unquoted_line is more like a LOOP
    than it is grammatical.)

    Grammar:

      end           = J8_Newline | Eol_Tok

      empty_line    = WS_Space? end

      # special case: read until end token, but REMOVE trailing WS_Space
      unquoted_line = WS_Space? Lit_Chars ANY* WS_Space? end

      j8_line       = WS_Space? J8_String WS_Space? end

      lines         = (empty_line | unquoted_line | j8_line)*

    where Lit_Chars is valid UTF-8

    Notes:

    (1) Multiple strings on a line are disallowed, like:

      "json" "json2"
      "json" unquoted

    (2) Internal quotes are allowed on unquoted lines.  Consider this line:

      foo "" u''

    The "" and u'' are not a decoded string, because the line started with
    Id.Lit_Chars literals.

    (3) This is related to TSV8?  Similar rules.  Does TSV8 have empty
    cells?  Does it have - for empty cell?
    """

    def __init__(self, s):
        # type: (str) -> None
        # NOTE(review): the meaning of the True flag is defined by
        # _Parser.__init__ -- confirm there.
        _Parser.__init__(self, s, True)

    def _Show(self, s):
        # type: (str) -> None
        """Debugging aid: log the current token, labeled with 's'."""
        tok_name = Id_str(self.tok_id)
        log('%s tok_id %s %d-%d', s, tok_name, self.start_pos, self.end_pos)

    def _ParseLine(self, out):
        # type: (List[str]) -> None
        """Parse one line, appending its content to 'out' unless it's empty.

        The three line forms of the grammar are tried in order: empty_line,
        j8_line, unquoted_line.  Each one ends by stepping past its end
        token.
        """
        # Optional leading whitespace
        if self.tok_id == Id.WS_Space:
            self._NextForLines()

        if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
            # empty_line: nothing to append
            pass

        elif self.tok_id == Id.J8_String:
            # j8_line: a quoted string, which must be alone on its line
            out.append(self.decoded)
            self._NextForLines()

            if self.tok_id == Id.WS_Space:  # trailing whitespace
                self._NextForLines()

            if self.tok_id not in (Id.J8_Newline, Id.Eol_Tok):
                raise self._ParseError('Unexpected text after J8 Line (%s)' %
                                       Id_str(self.tok_id))

        elif self.tok_id == Id.Lit_Chars:
            # unquoted_line, e.g. ' unquoted "" text on line '
            # Every token is read until the end token.
            begin = self.start_pos
            while True:
                # Remember the token before the end, to strip whitespace
                last_id = self.tok_id
                last_start = self.start_pos

                self._NextForLines()

                # It would be nicer if "middle" Id.WS_Space tokens didn't
                # have \r, but we're sticking with the JSON spec definition
                # of whitespace.  (As another data point, CPython on Unix
                # allows \r in the middle of expressions, treating it as
                # whitespace.)
                if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
                    break

            # A final WS_Space token is not part of the line
            end = last_start if last_id == Id.WS_Space else self.start_pos
            out.append(self.s[begin:end])

        else:
            raise AssertionError(Id_str(self.tok_id))

        self._NextForLines()  # past the end token

    def Parse(self):
        # type: () -> List[str]
        """Decode all lines of the input.

        Raises error.Decode.
        """
        self._NextForLines()

        decoded_lines = []  # type: List[str]
        while self.tok_id != Id.Eol_Tok:
            self._ParseLine(decoded_lines)

        # Defensive check: the loop above only exits on Id.Eol_Tok
        if self.tok_id != Id.Eol_Tok:
            raise self._ParseError('Unexpected trailing input in J8 Lines')

        return decoded_lines
1351
1352
def SplitJ8Lines(s):
    # type: (str) -> List[str]
    """Split 's' into lines by running J8LinesParser over it.

    Used by @(echo split command sub)

    Raises:
      error.Decode, in 3 cases:
      - J8 string syntax error inside quotes
      - Extra input on line
      - unquoted line isn't utf-8
    """
    parser = J8LinesParser(s)
    return parser.Parse()
1367
1368
1369	# vim: sw=4