data_lang/j8.py

OILS / data_lang / j8.py View on Github | oilshell.org

1416 lines, 710 significant

1	#!/usr/bin/env python2
2	"""
3	j8.py: J8 Notation, a superset of JSON
4
5	Later:
6
7	- PrettyPrinter uses hnode.asdl?
8	- color
9	- line wrapping -- do this later
10	- would like CONTRIBUTORS here
11
12	- Unify with ASDL pretty printing - NIL8
13	- {} [] are identical
14	- () is for statically typed ASDL data
15	(command.Simple blame_tok:(...) words:[ ])
16	although we are also using [] for typed ASDL arrays, not just JSON
17	- object IDs
18	- @ x123 can create an ID
19	- ! x123 can reference an ID
20	- <> can be for non-J8 data types? For the = operator
21	- 'hi \(name)' interpolation is useful for code
22
23	- Common between JSON8 and NIL8 - for writing by hand
24	- comments - # line or // line (JSON5 uses // line, following JS)
25	- unquoted identifier names - NIL8 could be more relaxed for (+ 1 (* 3 4))
26	- commas
27	- JSON8 could have trailing commas rule
28	- NIL8 at least has no commas for [1 2 "hi"]
29	"""
30
31	import math
32
33	from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
34	from _devbuild.gen.value_asdl import (value, value_e, value_t, value_str, Obj)
35	from _devbuild.gen.nil8_asdl import (nvalue, nvalue_t)
36
37	from asdl import format as fmt
38	from core import error
39	from data_lang import pyj8
40	# dependency issue: consts.py pulls in frontend/option_def.py
41	from frontend import consts
42	from frontend import match
43	from mycpp import mops
44	from mycpp import mylib
45	from mycpp.mylib import tagswitch, iteritems, NewDict, log
46
47	import fastfunc
48
49	_ = log
50
51	from typing import cast, Dict, List, Tuple, Optional
52
53
54	# COPIED from ui.ValType() to break dep
def ValType(val):
    # type: (value_t) -> str
    """Return the type name of a value, for displaying type errors in the UI.

    The name comes from value_str() applied to the value's tag, with
    dot=False.
    """
    type_tag = val.tag()
    return value_str(type_tag, dot=False)
60
61
if mylib.PYTHON:

    def HeapValueId(val):
        # type: (value_t) -> int
        """Return an integer that identifies the heap object behind val.

        This Python definition uses the builtin id(), which returns the
        address and can be up to 64 bits wide.

        In C++ we can use the GC ID, which fits within 32 bits.
        """
        address = id(val)
        return address
72
73
def ValueId(val):
    # type: (value_t) -> int
    """
    Return an integer ID for object that:

    1. Can be used to determine whether 2 objects are the same, e.g. for
       List, Dict, Func, Proc, etc.
    2. Will help detect object cycles

    Primitives types like Int and Float don't have this notion.  They're
    immutable values that are copied and compared by value.

    Returns:
      -1 for Null, Bool, Int, Float and Str values; otherwise the result of
      HeapValueId(val).
    """
    with tagswitch(val) as case:
        if case(value_e.Null, value_e.Bool, value_e.Int, value_e.Float,
                value_e.Str):
            # These will not be on the heap if we switch to tagged pointers
            # Str is handled conservatively - when we add small string
            # optimization, some strings will be values, so we assume all are.
            return -1
        else:
            # Every other value type is identified by its heap object
            return HeapValueId(val)
95
96
def ValueIdString(val):
    # type: (value_t) -> str
    """Format the ID of a value for display; used by pp value (42) and = 42

    Returns:
      ' 0x<hex>' (with a leading space) when ValueId() yields an ID, or the
      empty string when ValueId() returns -1.
    """
    heap_id = ValueId(val)  # -1 means "no identity"
    if heap_id != -1:
        return ' 0x%s' % mylib.hex_lower(heap_id)
    return ''
105
106
def Utf8Encode(code):
    # type: (int) -> str
    """Encode a single Unicode code point as a string of UTF-8 bytes.

    Based on https://stackoverflow.com/a/23502707

    Args:
      code: the code point.  There is no upper-bound check here: anything
            above 0xFFFF is encoded with a 4-byte sequence.

    Returns:
      A string with one character per encoded byte.
    """
    num_cont_bytes = 0

    # One-byte (ASCII) sequences have no lead/continuation structure
    if code <= 0x7F:
        return chr(code & 0x7F)

    if code <= 0x7FF:
        num_cont_bytes = 1
    elif code <= 0xFFFF:
        num_cont_bytes = 2
    else:
        # What about the check code <= 0x10FFFF ?
        # - it happens in statically parsed $'' u''
        # - but not dynamically parsed echo -e / printf, following bash/zsh
        num_cont_bytes = 3

    # Continuation bytes are produced last-to-first: each one carries the low
    # 6 bits of what remains, tagged with the 10xxxxxx marker.
    encoded = []  # type: List[int]
    for _ in xrange(num_cont_bytes):
        encoded.append(0x80 | (code & 0x3F))
        code >>= 6

    # Lead byte: length marker in the high bits, remaining payload bits below.
    lead_marker = 0x1E << (6 - num_cont_bytes)
    lead_payload = code & (0x3F >> num_cont_bytes)
    encoded.append(lead_marker | lead_payload)
    encoded.reverse()

    # Mask to 8 bits because Python ints don't wrap around!
    chars = [chr(b & 0xFF) for b in encoded]
    return ''.join(chars)
140
141
# Option bit flags accepted by _Print() and InstancePrinter.  They are
# combined with | and tested with &.
SHOW_CYCLES = 1 << 1  # print a '[ -->ID ]' / '{ -->ID }' marker for a cycle
# instead of raising error.Encode
SHOW_NON_DATA = 1 << 2  # non-data objects like Eggex can be <Eggex 0xff>
LOSSY_JSON = 1 << 3  # JSON is lossy
INF_NAN_ARE_NULL = 1 << 4  # for JSON

# Hack until we fully translate
# (pyj8 declares its own LOSSY_JSON; the two values must agree because
# self.options is passed straight through to pyj8.WriteString)
assert pyj8.LOSSY_JSON == LOSSY_JSON
149
150
def _Print(val, buf, indent, options=0):
    # type: (value_t, mylib.BufWriter, int, int) -> None
    """Serialize val into buf using a fresh InstancePrinter.

    Args:
      val: the value to print
      buf: destination buffer
      indent: number of spaces to indent, or -1 for everything on one line
      options: bitwise OR of the option flags (SHOW_CYCLES, LOSSY_JSON, ...)
    """
    printer = InstancePrinter(buf, indent, options)
    printer.Print(val)
159
160
def PrintMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Print val as a J8 message; for json8 write (x) and toJson8()

    No option flags are set.

    Caller must handle error.Encode
    """
    _Print(val, buf, indent, options=0)
168
169
def PrintJsonMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Print val as a JSON message; for json write (x) and toJson()

    Sets both LOSSY_JSON and INF_NAN_ARE_NULL.

    Caller must handle error.Encode()
    Doesn't decay to b'' strings - will use Unicode replacement char.
    """
    json_options = LOSSY_JSON | INF_NAN_ARE_NULL
    _Print(val, buf, indent, options=json_options)
178
179
def PrintLine(val, f):
    # type: (value_t, mylib.Writer) -> None
    """Write val to f on a single line followed by a newline; for pp line (x)"""

    # With cycles and non-data values both shown rather than rejected,
    # error.Encode should be impossible here.
    display_options = SHOW_CYCLES | SHOW_NON_DATA

    line_buf = mylib.BufWriter()
    _Print(val, line_buf, -1, options=display_options)

    f.write(line_buf.getvalue())
    f.write('\n')
191
192
if 0:

    def Repr(val):
        # type: (value_t) -> str
        """Return a one-line string for val, like Python's repr.  Unused.

        The definition is disabled by the enclosing 'if 0:'.
        """
        # Cycles and non-data values are shown rather than rejected, so
        # error.Encode should be impossible.
        display_options = SHOW_CYCLES | SHOW_NON_DATA

        out = mylib.BufWriter()
        _Print(val, out, -1, options=display_options)
        return out.getvalue()
204
205
def EncodeString(s, buf, unquoted_ok=False):
    # type: (str, mylib.BufWriter, bool) -> None
    """Write s to buf as a J8 string; for pp proc, etc.

    Args:
      s: the string to encode
      buf: destination buffer
      unquoted_ok: when True, s is written verbatim if
                   fastfunc.CanOmitQuotes(s) says the quotes can be omitted
    """
    if not unquoted_ok or not fastfunc.CanOmitQuotes(s):
        # Quoted form, on one line, with no option flags
        _Print(value.Str(s), buf, -1)
        return

    buf.write(s)
215
216
def MaybeEncodeString(s):
    # type: (str) -> str
    """Return s encoded as a J8 string; for write --json8 $s and compexport"""

    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    str_val = value.Str(s)
    _Print(str_val, out, -1)
    return out.getvalue()
227
228
def MaybeEncodeJsonString(s):
    # type: (str) -> str
    """Return s encoded as a JSON string (LOSSY_JSON set); for write --json"""

    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    str_val = value.Str(s)
    _Print(str_val, out, -1, options=LOSSY_JSON)
    return out.getvalue()
238
239
240	# DFS traversal state
# States stored in InstancePrinter.visited, keyed by heap ID
UNSEEN = 0  # default for a container with no entry yet
EXPLORING = 1  # container is being printed; meeting it again means a cycle
FINISHED = 2  # container was fully printed; it may be printed again
244
245
class InstancePrinter(object):
    """Print a value tree as J8/JSON.

    Output is appended to a BufWriter.  When indent == -1, everything is
    written on one line: no newlines, no indentation, and no space after ':'.
    """

    def __init__(self, buf, indent, options):
        # type: (mylib.BufWriter, int, int) -> None
        """
        Args:
          buf: destination buffer
          indent: number of spaces per nesting level, or -1 for one line
          options: bitwise OR of SHOW_CYCLES, SHOW_NON_DATA, LOSSY_JSON,
                   INF_NAN_ARE_NULL
        """
        self.buf = buf
        self.indent = indent
        self.options = options

        # Key is HeapValueId(val)
        # Value is the DFS state: EXPLORING or FINISHED (a missing key means
        # UNSEEN)
        # Dict[int, None] doesn't translate -- it would be nice to have a set()
        self.visited = {}  # type: Dict[int, int]

    def _ItemIndent(self, level):
        # type: (int) -> None
        """Write the indentation for an item nested inside a container."""

        if self.indent == -1:
            return

        self.buf.write_spaces((level + 1) * self.indent)

    def _BracketIndent(self, level):
        # type: (int) -> None
        """Write the indentation for a closing bracket at this level."""

        if self.indent == -1:
            return

        self.buf.write_spaces(level * self.indent)

    def _MaybeNewline(self):
        # type: () -> None
        """Write a newline, unless everything goes on one line."""
        if self.indent == -1:
            return
        self.buf.write('\n')

    def _MaybeSpace(self):
        # type: () -> None
        """Write a space, unless everything goes on one line."""
        if self.indent == -1:
            return
        self.buf.write(' ')

    def _PrintList(self, val, level):
        # type: (value.List, int) -> None
        """Print a List as [ item, ... ], recursing through Print()."""

        if len(val.items) == 0:  # Special case like Python/JS
            self.buf.write('[]')
        else:
            self.buf.write('[')
            self._MaybeNewline()
            for i, item in enumerate(val.items):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level)
                self.Print(item, level + 1)
            self._MaybeNewline()

            self._BracketIndent(level)
            self.buf.write(']')

    def _PrintMapping(self, d, level):
        # type: (Dict[str, value_t], int) -> None
        """Print a str -> value mapping as { "key": value, ... }."""
        if len(d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()
            i = 0
            for k, v in iteritems(d):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level)

                pyj8.WriteString(k, self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                self.Print(v, level + 1)

                i += 1

            self._MaybeNewline()
            self._BracketIndent(level)
            self.buf.write('}')

    def _PrintDict(self, val, level):
        # type: (value.Dict, int) -> None
        """Print the entries of a Dict."""
        self._PrintMapping(val.d, level)

    def _PrintObj(self, val, level):
        # type: (Obj, int) -> None
        """Print an Obj's own properties, then ' ==> ' and its prototype.

        The prototype chain is followed by direct recursion, without going
        through the cycle detection in Print().
        """

        self._PrintMapping(val.d, level)

        if val.prototype:
            self.buf.write(' ==> ')
            self._PrintObj(val.prototype, level)

    def _PrintBashPrefix(self, type_str, level):
        # type: (str, int) -> None
        """Write the opening of the {"type": ..., "data": ...} wrapper.

        Args:
          type_str: already-quoted type name INCLUDING the trailing comma,
                    e.g. '"BashArray",'
          level: nesting level of the wrapper
        """

        self.buf.write('{')
        self._MaybeNewline()
        self._ItemIndent(level)
        self.buf.write('"type":')
        self._MaybeSpace()
        self.buf.write(type_str)  # "BashArray",  or "BashAssoc",

        self._MaybeNewline()

        self._ItemIndent(level)
        self.buf.write('"data":')
        self._MaybeSpace()

    def _PrintBashSuffix(self, level):
        # type: (int) -> None
        """Write the closing brace of the wrapper opened by _PrintBashPrefix."""
        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write('}')

    def _PrintSparseArray(self, val, level):
        # type: (value.SparseArray, int) -> None
        """Print a SparseArray as {"type": "SparseArray", "data": {...}}.

        Each key is converted with mops.ToStr() and written as a string.
        """

        self._PrintBashPrefix('"SparseArray",', level)

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            i = 0
            for k, v in iteritems(val.d):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level + 1)
                pyj8.WriteString(mops.ToStr(k), self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(v, self.options, self.buf)

                i += 1

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def _PrintBashArray(self, val, level):
        # type: (value.BashArray, int) -> None
        """Print a BashArray as {"type": "BashArray", "data": {...}}.

        The data is keyed by the stringified index.  Entries that are None
        are skipped, which is why a 'first' flag is used for the commas
        rather than the index.
        """

        self._PrintBashPrefix('"BashArray",', level)

        if len(val.strs) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            first = True
            for i, s in enumerate(val.strs):
                if s is None:
                    continue

                if not first:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level + 1)
                pyj8.WriteString(str(i), self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(s, self.options, self.buf)

                first = False

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def _PrintBashAssoc(self, val, level):
        # type: (value.BashAssoc, int) -> None
        """Print a BashAssoc as {"type": "BashAssoc", "data": {...}}."""

        self._PrintBashPrefix('"BashAssoc",', level)

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            i = 0
            for k2, v2 in iteritems(val.d):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level + 1)
                pyj8.WriteString(k2, self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                pyj8.WriteString(v2, self.options, self.buf)

                i += 1

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def Print(self, val, level=0):
        # type: (value_t, int) -> None
        """Print val into self.buf, dispatching on its type.

        Args:
          val: the value to print
          level: current nesting depth, used for indentation

        Raises:
          error.Encode: for a List/Dict/Obj that is part of a cycle when
            SHOW_CYCLES is not set, and for an Obj or any other value type
            not handled here when SHOW_NON_DATA is not set.
        """

        # special value that means everything is on one line
        # It's like
        #    JSON.stringify(d, null, 0)
        # except we use -1, not 0.  0 can still have newlines.

        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Null):
                self.buf.write('null')

            elif case(value_e.Bool):
                val = cast(value.Bool, UP_val)
                self.buf.write('true' if val.b else 'false')

            elif case(value_e.Int):
                val = cast(value.Int, UP_val)
                # TODO: avoid intermediate allocation with
                # self.buf.WriteBigInt(val.i)
                #
                # Or maybe we need pyj8.WriteBigInt() because truly BigInt may
                # be of arbitrary length, and will need a growth strategy.
                # Although that is not very common, so we could allocate in
                # that case.

                self.buf.write(mops.ToStr(val.i))

            elif case(value_e.Float):
                val = cast(value.Float, UP_val)

                fl = val.f
                if math.isinf(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        s = 'null'  # negative infinity is null too
                    else:
                        s = 'INFINITY'
                        if fl < 0:
                            s = '-' + s
                elif math.isnan(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        # JavaScript JSON lib behavior: Inf and NaN are null
                        # Python has a bug in the encoder by default, and then
                        # allow_nan=False raises an error
                        s = 'null'
                    else:
                        s = 'NAN'
                else:
                    # TODO: can we avoid intermediate allocation?
                    # self.buf.WriteFloat(val.f)
                    s = str(fl)

                self.buf.write(s)

            elif case(value_e.Str):
                val = cast(value.Str, UP_val)

                pyj8.WriteString(val.s, self.options, self.buf)

            elif case(value_e.List):
                val = cast(value.List, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                node_state = self.visited.get(heap_id, UNSEEN)
                if node_state == FINISHED:
                    # Print it AGAIN.  We print a JSON tree, which means we can
                    # visit and print nodes MANY TIMES, as long as they're not
                    # in a cycle.
                    self._PrintList(val, level)
                    return
                if node_state == EXPLORING:
                    if self.options & SHOW_CYCLES:
                        self.buf.write('[ -->%s ]' % ValueIdString(val))
                        return
                    else:
                        # node.js prints which index closes the cycle
                        raise error.Encode(
                            "Can't encode List%s in object cycle" %
                            ValueIdString(val))

                self.visited[heap_id] = EXPLORING
                self._PrintList(val, level)
                self.visited[heap_id] = FINISHED

            elif case(value_e.Dict):
                val = cast(value.Dict, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                node_state = self.visited.get(heap_id, UNSEEN)
                if node_state == FINISHED:
                    # Print it AGAIN.  We print a JSON tree, which means we can
                    # visit and print nodes MANY TIMES, as long as they're not
                    # in a cycle.
                    self._PrintDict(val, level)
                    return
                if node_state == EXPLORING:
                    if self.options & SHOW_CYCLES:
                        self.buf.write('{ -->%s }' % ValueIdString(val))
                        return
                    else:
                        # node.js prints which key closes the cycle
                        raise error.Encode(
                            "Can't encode Dict%s in object cycle" %
                            ValueIdString(val))

                self.visited[heap_id] = EXPLORING
                self._PrintDict(val, level)
                self.visited[heap_id] = FINISHED

            elif case(value_e.Obj):
                val = cast(Obj, UP_val)

                if not (self.options & SHOW_NON_DATA):
                    raise error.Encode("Can't encode value of type Obj")

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                node_state = self.visited.get(heap_id, UNSEEN)
                if node_state == FINISHED:
                    # Print it AGAIN.  We print a JSON tree, which means we can
                    # visit and print nodes MANY TIMES, as long as they're not
                    # in a cycle.
                    self._PrintObj(val, level)
                    return
                if node_state == EXPLORING:
                    if self.options & SHOW_CYCLES:
                        self.buf.write('{ -->%s }' % ValueIdString(val))
                        return
                    else:
                        # node.js prints which key closes the cycle
                        raise error.Encode(
                            "Can't encode Obj%s in object cycle" %
                            ValueIdString(val))

                # TODO: cycle detection is a bit wrong, I think because the
                # properties are a Dict[str, value_t], not something with an
                # identity
                #
                # This is only used for pp test_, because SHOW_NON_DATA.
                self.visited[heap_id] = EXPLORING
                self._PrintObj(val, level)
                self.visited[heap_id] = FINISHED

            elif case(value_e.SparseArray):
                val = cast(value.SparseArray, UP_val)
                self._PrintSparseArray(val, level)

            elif case(value_e.BashArray):
                val = cast(value.BashArray, UP_val)
                self._PrintBashArray(val, level)

            elif case(value_e.BashAssoc):
                val = cast(value.BashAssoc, UP_val)
                self._PrintBashAssoc(val, level)

            else:
                pass  # mycpp workaround
                if self.options & SHOW_NON_DATA:
                    # Similar to = operator, ui.DebugPrint()
                    # TODO: that prints value.Range in a special way
                    ysh_type = ValType(val)
                    # Don't show ID in 'pp test_'
                    #id_str = ValueIdString(val)
                    self.buf.write('<%s>' % ysh_type)
                else:
                    raise error.Encode("Can't serialize object of type %s" %
                                       ValType(val))
650
651
class PrettyPrinter(object):
    """ Unused right now, but could enhance the = operator.

    Output to polymorphic ColorOutput

    Features like asdl/format.py:
    - line wrapping
    - color
    - sharing detection by passing in a REF COUNT dict
      - print @123 the first time, and then print ... the second time

    and

    - Pretty spaces: {"k": "v", "k2": "v2"} instead of {"k":"v","k2":"v2"}
    - Unquoted: {k: "v", k2: "v2"} instead of {"k": "v", "k2": "v2"}

    - Omitting commas for ASDL?  Maybe we can use two spaces

    (Token id: Id.VSub_DollarName  start: 0  length: 3)
    (Token id:Id.VSub_DollarName start:0 length:3)  - color makes this work
    """

    def __init__(self, max_col):
        # type: (int) -> None
        """
        Args:
          max_col: stored as self.max_col; not read by any method yet
        """
        # First pass: object ID -> number of times it is referenced.
        #
        # This could be an optimized set in C++, a bit set like
        # mark_sweep_heap.h, rather than a Dict
        #   self.unique_objs = mylib.UniqueObjects()
        self.ref_count = {}  # type: Dict[int, int]

        self.max_col = max_col

    def PrettyTree(self, val, f):
        # type: (value_t, fmt.ColorOutput) -> None
        """Placeholder: does nothing yet."""

        # TODO: first convert to hnode.asdl types?

        # Although we might want
        # hnode.AlreadyShown = (str type, int unique_id)
        pass

    def Print(self, val, buf):
        # type: (value_t, mylib.BufWriter) -> None
        """Run PrettyTree() against a console output detected from stdout.

        The buf argument is currently not used.
        """

        # Or print to stderr?
        stdout_ = mylib.Stdout()
        color_out = fmt.DetectConsoleOutput(stdout_)
        self.PrettyTree(val, color_out)

        # Then print those with ASDL
704
705
class LexerDecoder(object):
    """J8 lexer and string decoder.

    Similar interface as SimpleLexer, except we return an optional decoded
    string

    Next() and NextForLines() each return (token ID, end position, decoded
    string); the third element is None unless the token is a J8_String.
    """

    def __init__(self, s, is_j8, lang_str):
        # type: (str, bool, str) -> None
        """
        Args:
          s: the input to lex
          is_j8: when False, Next() rejects single-quoted strings, comments
                 and the j"" prefix with an error
          lang_str: language name interpolated into error messages
        """
        self.s = s
        self.is_j8 = is_j8
        self.lang_str = lang_str

        # Position in self.s where the next token starts
        self.pos = 0

        # current line being lexed -- for error messages
        self.cur_line_num = 1

        # Reuse this instance to save GC objects.  JSON objects could have
        # thousands of strings.
        self.decoded = mylib.BufWriter()

    def _Error(self, msg, end_pos):
        # type: (str, int) -> error.Decode
        """Build (not raise) an error.Decode spanning self.pos to end_pos."""

        # Use the current position as start pos
        return error.Decode(msg, self.s, self.pos, end_pos, self.cur_line_num)

    def Next(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Returns a token and updates self.pos

        Raises:
          error.Decode: for J8-only syntax when is_j8 is False, or for a
            malformed string (see _DecodeString)
        """

        tok_id, end_pos = match.MatchJ8Token(self.s, self.pos)

        # Pure JSON: reject J8-only tokens with a hint about 'json8 read'
        if not self.is_j8:
            if tok_id in (Id.Left_BSingleQuote, Id.Left_USingleQuote):
                raise self._Error(
                    "Single quotes aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Ignored_Comment:
                raise self._Error(
                    "Comments aren't part of JSON; you may want 'json8 read'",
                    end_pos)

        # An opening quote: decode the whole string and return it as a
        # single token
        if tok_id in (Id.Left_DoubleQuote, Id.Left_BSingleQuote,
                      Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Left_JDoubleQuote:
            if self.is_j8:
                return self._DecodeString(tok_id, end_pos)
            else:
                raise self._Error('Pure JSON does not accept j"" prefix',
                                  end_pos)

        if tok_id == Id.Ignored_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def NextForLines(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Like Next(), but for J8 Lines

        Raises:
          error.Decode: for invalid UTF-8 in an unquoted line, an unescaped
            ASCII control char, or a malformed string (see _DecodeString)
        """

        tok_id, end_pos = match.MatchJ8LinesToken(self.s, self.pos)

        if tok_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote,
                      Id.Left_BSingleQuote, Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        # Check that UNQUOTED lines are valid UTF-8.  (_DecodeString() does
        # this for quoted strings.)
        if (tok_id == Id.Lit_Chars and
                not pyj8.PartIsUtf8(self.s, self.pos, end_pos)):
            raise self._Error(
                'Invalid UTF-8 in %s string literal' % self.lang_str, end_pos)
        if tok_id == Id.Char_AsciiControl:
            raise self._Error(
                "J8 Lines can't have unescaped ASCII control chars", end_pos)

        if tok_id == Id.J8_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def _DecodeString(self, left_id, str_pos):
        # type: (Id_t, int) -> Tuple[Id_t, int, Optional[str]]
        """ Returns a string token and updates self.pos

        Args:
          left_id: ID of the opening-quote token; it selects the string lexer
                   and whether \\y escapes are allowed
          str_pos: position just after the opening quote

        Returns:
          (Id.J8_String, position after the closing quote, decoded string)

        Raises:
          error.Decode: on EOF inside the string, a bad backslash escape, an
            unescaped ASCII control char, invalid UTF-8, a \\u{} escape that
            is too large or in the surrogate range, or a \\y escape in a
            string that is not b''
        """

        # Each iteration lexes one part of the string and appends its decoded
        # form to self.decoded, until the closing quote is reached.
        while True:
            if left_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote):
                tok_id, str_end = match.MatchJsonStrToken(self.s, str_pos)
            else:
                tok_id, str_end = match.MatchJ8StrToken(self.s, str_pos)

            #log('String tok %s %r', Id_str(tok_id), self.s[str_pos:str_end])

            if tok_id == Id.Eol_Tok:
                # TODO: point to beginning of # quote?
                raise self._Error(
                    'Unexpected EOF while lexing %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Unknown_Backslash:
                raise self._Error(
                    'Bad backslash escape in %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Char_AsciiControl:
                raise self._Error(
                    "%s strings can't have unescaped ASCII control chars" %
                    self.lang_str, str_end)

            if tok_id in (Id.Right_SingleQuote, Id.Right_DoubleQuote):

                self.pos = str_end

                s = self.decoded.getvalue()
                self.decoded.clear()  # reuse this instance

                #log('decoded %r', self.decoded.getvalue())
                return Id.J8_String, str_end, s

            #
            # Now handle each kind of token
            #

            if tok_id == Id.Lit_Chars:  # JSON and J8
                part = self.s[str_pos:str_end]
                if not pyj8.PartIsUtf8(self.s, str_pos, str_end):
                    raise self._Error(
                        'Invalid UTF-8 in %s string literal' % self.lang_str,
                        str_end)

            # TODO: would be nice to avoid allocation in all these cases.
            # But LookupCharC() would have to change.

            elif tok_id == Id.Char_OneChar:  # JSON and J8
                # The character right after the backslash
                ch = self.s[str_pos + 1]
                part = consts.LookupCharC(ch)

            elif tok_id == Id.Char_UBraced:  # J8 only
                # Skip the 3-char \u{ prefix and the closing }
                h = self.s[str_pos + 3:str_end - 1]
                i = int(h, 16)

                # Same checks in osh/word_compile.py
                if i > 0x10ffff:
                    raise self._Error(
                        "Code point can't be greater than U+10ffff", str_end)
                if 0xD800 <= i and i < 0xE000:
                    raise self._Error(
                        r"\u{%s} escape is illegal because it's in the surrogate range"
                        % h, str_end)

                part = Utf8Encode(i)

            elif tok_id == Id.Char_YHex:  # J8 only
                # Skip the 2-char \y prefix
                h = self.s[str_pos + 2:str_end]

                # Same check in osh/word_parse.py
                if left_id != Id.Left_BSingleQuote:
                    assert left_id != Id.Left_BTSingleQuote, "Not handled here"
                    raise self._Error(
                        r"\y%s escapes not allowed in u'' strings" % h,
                        str_end)

                i = int(h, 16)
                part = chr(i)

            elif tok_id == Id.Char_SurrogatePair:
                # Two escapes of the form \uXXXX: hex digits are at offsets
                # 2..6 and 8..12
                h1 = self.s[str_pos + 2:str_pos + 6]
                h2 = self.s[str_pos + 8:str_pos + 12]

                # https://www.oilshell.org/blog/2023/06/surrogate-pair.html
                i1 = int(h1, 16) - 0xD800  # high surrogate
                i2 = int(h2, 16) - 0xDC00  # low surrogate
                code_point = 0x10000 + (i1 << 10) + i2

                part = Utf8Encode(code_point)

            elif tok_id == Id.Char_Unicode4:  # JSON only, unpaired
                h = self.s[str_pos + 2:str_end]
                i = int(h, 16)
                part = Utf8Encode(i)

            else:
                # Should never happen
                raise AssertionError(Id_str(tok_id))

            #log('%s part %r', Id_str(tok_id), part)
            self.decoded.write(part)
            str_pos = str_end
900
901
class _Parser(object):
    """Token-stream state and helpers shared by the parsers in this file.

    Attributes:
      tok_id: ID of the current token
      start_pos, end_pos: span of the current token in self.s
      decoded: the decoded string that the lexer returned with the current
               token
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        self.s = s
        self.is_j8 = is_j8

        # Language name used in error messages
        if is_j8:
            self.lang_str = "J8"
        else:
            self.lang_str = "JSON"

        self.lexer = LexerDecoder(s, is_j8, self.lang_str)

        self.tok_id = Id.Undefined_Tok
        self.start_pos = 0
        self.end_pos = 0
        self.decoded = ''  # decoded J8 string

    def _Next(self):
        # type: () -> None
        """Advance to the next token that isn't a space, newline or comment."""

        while True:
            # The previous token's end is the new token's start
            self.start_pos = self.end_pos
            self.tok_id, self.end_pos, self.decoded = self.lexer.Next()

            if self.tok_id in (Id.Ignored_Space, Id.Ignored_Newline,
                               Id.Ignored_Comment):
                continue
            break

        #log('NEXT %s %s %s %s', Id_str(self.tok_id), self.start_pos, self.end_pos, self.decoded or '-')

    def _Eat(self, tok_id):
        # type: (Id_t) -> None
        """Check that the current token is tok_id, then advance past it.

        Raises:
          error.Decode: when the current token has a different ID
        """
        if self.tok_id == tok_id:
            self._Next()
            return

        #log('position %r %d-%d %r', self.s, self.start_pos,
        #    self.end_pos, self.s[self.start_pos:self.end_pos])
        raise self._ParseError("Expected %s, got %s" %
                               (Id_str(tok_id), Id_str(self.tok_id)))

    def _NextForLines(self):
        # type: () -> None
        """Like _Next, but use the J8 Lines lexer.

        Unlike _Next, this takes exactly one token and skips nothing.
        """
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos, self.decoded = self.lexer.NextForLines()

    def _ParseError(self, msg):
        # type: (str) -> error.Decode
        """Build (not raise) an error.Decode located at the current token."""
        line_num = self.lexer.cur_line_num
        return error.Decode(msg, self.s, self.start_pos, self.end_pos,
                            line_num)
952
953
class Parser(_Parser):
    """JSON and JSON8 Parser.

    A recursive descent parser over the token stream managed by _Parser.
    The entry point is ParseValue().
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        """
        Args:
          s: the input to parse
          is_j8: passed through to _Parser.__init__
        """
        _Parser.__init__(self, s, is_j8)

    def _ParsePair(self):
        # type: () -> Tuple[str, value_t]
        """Parse one dict entry:  string ':' value

        Raises:
          error.Decode: when the key isn't a string, or the colon is missing
        """

        k = self.decoded  # Save the potential string value
        self._Eat(Id.J8_String)  # Check that it's a string
        assert k is not None

        self._Eat(Id.J8_Colon)

        v = self._ParseValue()
        return k, v

    def _ParseDict(self):
        # type: () -> value_t
        """
        pair = string ':' value
        Dict      = '{' '}'
                  | '{' pair (',' pair)* '}'

        When a key is repeated, the last value is the one kept.
        """
        # precondition
        assert self.tok_id == Id.J8_LBrace, Id_str(self.tok_id)

        #log('> Dict')

        d = NewDict()  # type: Dict[str, value_t]

        self._Next()
        if self.tok_id == Id.J8_RBrace:
            self._Next()
            return value.Dict(d)

        k, v = self._ParsePair()
        d[k] = v
        #log('  [1] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        while self.tok_id == Id.J8_Comma:
            self._Next()
            k, v = self._ParsePair()
            d[k] = v
            #log('  [2] k %s v %s Id %s', k, v, Id_str(self.tok_id))

        self._Eat(Id.J8_RBrace)

        #log('< Dict')

        return value.Dict(d)

    def _ParseList(self):
        # type: () -> value_t
        """
        List = '[' ']'
             | '[' value (',' value)* ']'
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[value_t]

        self._Next()
        if self.tok_id == Id.J8_RBracket:
            self._Next()
            return value.List(items)

        items.append(self._ParseValue())

        while self.tok_id == Id.J8_Comma:
            self._Next()
            items.append(self._ParseValue())

        self._Eat(Id.J8_RBracket)

        return value.List(items)

    def _ParseValue(self):
        # type: () -> value_t
        """Parse one value starting at the current token, and advance past it.

        Raises:
          error.Decode: on EOF, an unexpected token, or an integer that
            mops.FromStr() rejects
        """
        if self.tok_id == Id.J8_LBrace:
            return self._ParseDict()

        elif self.tok_id == Id.J8_LBracket:
            return self._ParseList()

        elif self.tok_id == Id.J8_Null:
            self._Next()
            return value.Null

        elif self.tok_id == Id.J8_Bool:
            #log('%r %d', self.s[self.start_pos], self.start_pos)
            # The first character distinguishes the two Bool spellings
            b = value.Bool(self.s[self.start_pos] == 't')
            self._Next()
            return b

        elif self.tok_id == Id.J8_Int:
            part = self.s[self.start_pos:self.end_pos]
            # Convert BEFORE advancing: _ParseError() reads self.start_pos
            # and self.end_pos, so the error must be created while they still
            # describe the integer, not the token that follows it.
            try:
                big = mops.FromStr(part)
            except ValueError:
                raise self._ParseError('Integer is too big')
            self._Next()
            return value.Int(big)

        elif self.tok_id == Id.J8_Float:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            return value.Float(float(part))

        # UString, BString too
        elif self.tok_id == Id.J8_String:
            str_val = value.Str(self.decoded)
            #log('d %r', self.decoded)
            self._Next()
            return str_val

        elif self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # Id.Unknown_Tok, Id.J8_{LParen,RParen}
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

    def ParseValue(self):
        # type: () -> value_t
        """Parse the whole input as a single value.

        Raises:
          error.Decode: on any lexing or parsing error, including input left
            over after the value
        """
        self._Next()
        obj = self._ParseValue()

        # After the value, the current token must start at the end of input
        n = len(self.s)
        if self.start_pos != n:
            extra = n - self.start_pos
            #log('n %d pos %d', n, self.start_pos)
            raise self._ParseError(
                'Got %d bytes of unexpected trailing input' % extra)
        return obj
1093
1094
1095	class Nil8Parser(_Parser):
1096	"""
1097	Tokens not in JSON8:
1098	LParen RParen Symbol
1099
1100	Tokens not in JSON, but in JSON8 and NIL8:
1101	Identifier (unquoted keys)
1102	Ignored_Comment
1103	"""
1104
def __init__(self, s, is_j8):
    # type: (str, bool) -> None
    """Initialize by delegating to _Parser.__init__ with the same arguments."""
    _Parser.__init__(self, s, is_j8)
1108
if 0:

    def _LookAhead(self):
        # type: () -> Id_t
        """Return the ID of the next significant token without consuming it.

        Don't need this right now (disabled by the enclosing 'if 0:').
        """
        scan_pos = self.end_pos  # look ahead from last token
        while True:
            next_id, scan_pos = match.MatchJ8Token(self.s, scan_pos)
            if next_id in (Id.Ignored_Space, Id.Ignored_Newline,
                           Id.Ignored_Comment):
                continue
            return next_id
1123
def _ParseRecord(self):
    # type: () -> nvalue_t
    """Parse a parenthesized record into an nvalue.List of its items.

    Yaks
      (self->Next) => (-> self Next)
      (self->Next obj.field) => ((-> self Next) (. obj field))

    Similar to
      ((identity identity) 42) => 42 in Clojure

    ASDL
      (Node left:(. x4beef2))
      (Node left !x4beef2)

    # Ambiguous because value can be identifier.
    # We have to look ahead to and see if there's a colon :
    field =
      Identifier ':' value
    | value

    record = '(' head field* ')'

    - Identifier | Symbol are treated the same, it's a side effect of
      the lexing style
    - do positional args come before named args
    - () is invalid?  Use [] for empty list
      (as written, () is accepted and gives an empty list)
    """
    assert self.tok_id == Id.J8_LParen, Id_str(self.tok_id)

    fields = []  # type: List[nvalue_t]

    self._Next()  # move past '('

    # An immediate ')' skips the loop and is consumed by _Eat below
    #log('TOK %s', Id_str(self.tok_id))
    while self.tok_id != Id.J8_RParen:
        fields.append(self._ParseNil8())
        #log('TOK 2 %s', Id_str(self.tok_id))

    self._Eat(Id.J8_RParen)
    return nvalue.List(fields)
1168
def _ParseList8(self):
    # type: () -> nvalue_t
    """Parse a bracketed list into an nvalue.List.

    List8 = '[' value* ']'

    No commas, not even optional ones for now.
    """
    assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

    elements = []  # type: List[nvalue_t]

    self._Next()  # move past '['

    # An immediate ']' skips the loop and is consumed by _Eat below
    #log('TOK %s', Id_str(self.tok_id))
    while self.tok_id != Id.J8_RBracket:
        elements.append(self._ParseNil8())
        #log('TOK 2 %s', Id_str(self.tok_id))

    self._Eat(Id.J8_RBracket)
    return nvalue.List(elements)
1193
def _ParseNil8(self):
    # type: () -> nvalue_t
    """Parse one NIL8 value starting at the current token.

    A value is a '(' form, a '[' list, a primitive (null, bool, int,
    float, string), or an unquoted word, which becomes an nvalue.Symbol.
    An operator, colon or comma token in value position is also read as
    a Symbol.

    Infix rule: if the token right after the value is an operator, colon
    or comma, the result is a prefix list [op, value, rhs], where rhs is
    parsed by a recursive call.  For example

        key: "value"   ->   (: key "value")

    Because rhs is parsed recursively, chains nest to the right.

    Raises whatever self._ParseError() builds, on EOF or on a token that
    cannot start a value.
    """
    tok_id = self.tok_id

    if tok_id == Id.J8_LParen:
        left = self._ParseRecord()  # type: nvalue_t

    elif tok_id == Id.J8_LBracket:
        left = self._ParseList8()

    # Primitives are copied from J8 above.
    # TODO: We also want hex literals.
    elif tok_id == Id.J8_Null:
        self._Next()
        left = nvalue.Null

    elif tok_id == Id.J8_Bool:
        # 'true' vs 'false': must look at the text BEFORE advancing
        is_true = self.s[self.start_pos] == 't'
        self._Next()
        left = nvalue.Bool(is_true)

    elif tok_id == Id.J8_Int:
        int_text = self.s[self.start_pos:self.end_pos]
        self._Next()
        left = nvalue.Int(int(int_text))

    elif tok_id == Id.J8_Float:
        float_text = self.s[self.start_pos:self.end_pos]
        self._Next()
        left = nvalue.Float(float(float_text))

    elif tok_id == Id.J8_String:
        # self.decoded must be read BEFORE advancing
        decoded = self.decoded
        self._Next()
        left = nvalue.Str(decoded)

    # <- etc.
    elif tok_id in (Id.J8_Identifier, Id.J8_Operator, Id.J8_Colon,
                    Id.J8_Comma):
        # unquoted "word" treated like a string
        word = self.s[self.start_pos:self.end_pos]
        self._Next()
        left = nvalue.Symbol(word)

    elif tok_id == Id.Eol_Tok:
        raise self._ParseError('Unexpected EOF while parsing %s' %
                               self.lang_str)

    else:  # e.g. Id.Unknown_Tok, or a stray closing delimiter
        raise self._ParseError('Invalid token while parsing %s: %s' %
                               (self.lang_str, Id_str(self.tok_id)))

    # No infix operator follows: the value stands alone.
    if self.tok_id not in (Id.J8_Operator, Id.J8_Colon, Id.J8_Comma):
        return left

    # key: "value" -> (: key "value")
    op = nvalue.Symbol(self.s[self.start_pos:self.end_pos])

    self._Next()
    right = self._ParseNil8()
    infix = nvalue.List([op, left, right])  # type: nvalue_t
    return infix
1266
def ParseNil8(self):
    # type: () -> nvalue_t
    """Parse the entire input as exactly one NIL8 value.

    Anything left over after that value is an error.

    Raises error.Decode.
    """
    self._Next()  # load the first token
    result = self._ParseNil8()
    if self.tok_id == Id.Eol_Tok:
        return result
    raise self._ParseError('Unexpected trailing input')
1277
1278
class J8LinesParser(_Parser):
    """Decode a string with newlines into a list of lines.

    Each input line is either empty (produces nothing), unquoted text
    (leading and trailing whitespace removed), or a single J8 string
    (decoded).

    We specify this with a grammar, to preserve location info and to reduce
    allocations.  (But note that unquoted_line is more like a LOOP than it
    is grammatical.)

    Grammar:

      end           = J8_Newline | Eol_Tok

      empty_line    = WS_Space? end

      # special case: read until end token, but REMOVE trailing WS_Space
      unquoted_line = WS_Space? Lit_Chars ANY* WS_Space? end

      j8_line       = WS_Space? J8_String WS_Space? end

      lines         = (empty_line | unquoted_line | j8_line)*

    where Lit_Chars is valid UTF-8

    Notes:

    (1) We disallow multiple strings on a line, like:

        "json" "json2"
        "json" unquoted

    (2) Internal quotes are allowed on unquoted lines.  Consider this line:

        foo "" u''

    The "" and u'' are not a decoded string, because the line started with
    Id.Lit_Chars literals.

    (3) This is related to TSV8?  Similar rules.  Does TSV8 have empty cells?
    Does it have - for empty cell?
    """

    def __init__(self, s):
        # type: (str) -> None
        # Third argument is the base parser's is_j8 flag.
        _Parser.__init__(self, s, True)

    def _Show(self, s):
        # type: (str) -> None
        """Debug helper: log the current token and its span, tagged with s."""
        log('%s tok_id %s %d-%d', s, Id_str(self.tok_id), self.start_pos,
            self.end_pos)

    def _ParseLine(self, out):
        # type: (List[str]) -> None
        """Consume one line, including its terminator.

        May append a line to 'out': nothing for an empty line, the decoded
        value for a J8 string line, or the stripped text for an unquoted
        line.
        """
        # Optional leading whitespace
        if self.tok_id == Id.WS_Space:
            self._NextForLines()

        # Empty line - consume the terminator and produce nothing
        if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
            self._NextForLines()
            return

        # Quoted string on line
        if self.tok_id == Id.J8_String:
            out.append(self.decoded)
            self._NextForLines()

            if self.tok_id == Id.WS_Space:  # trailing whitespace
                self._NextForLines()

            # Only the line terminator may follow the string
            if self.tok_id not in (Id.J8_Newline, Id.Eol_Tok):
                raise self._ParseError('Unexpected text after J8 Line (%s)' %
                                       Id_str(self.tok_id))

            self._NextForLines()
            return

        # Anything other than literal characters is unexpected here
        if self.tok_id != Id.Lit_Chars:
            raise AssertionError(Id_str(self.tok_id))

        # Unquoted line, e.g.
        #   ' unquoted "" text on line '
        # Read every token until the end of the line, remembering the token
        # just before the terminator so trailing whitespace can be dropped.
        text_begin = self.start_pos

        last_id = self.tok_id
        last_start = self.start_pos
        self._NextForLines()

        # It would be nicer if "middle" Id.WS_Space tokens didn't have
        # \r, but we're sticking with the JSON spec definition of
        # whitespace.  (As another data point, CPython on Unix allows
        # \r in the middle of expressions, treating it as whitespace.)
        while self.tok_id not in (Id.J8_Newline, Id.Eol_Tok):
            last_id = self.tok_id
            last_start = self.start_pos
            self._NextForLines()

        text_end = self.start_pos
        if last_id == Id.WS_Space:
            text_end = last_start  # remove trailing whitespace

        out.append(self.s[text_begin:text_end])

        self._NextForLines()  # past newline

    def Parse(self):
        # type: () -> List[str]
        """Parse the whole input and return the list of decoded lines.

        Raises error.Decode.
        """
        self._NextForLines()  # load the first token

        result = []  # type: List[str]
        while self.tok_id != Id.Eol_Tok:
            self._ParseLine(result)

        # Defensive check: the loop above only exits at Id.Eol_Tok.
        if self.tok_id != Id.Eol_Tok:
            raise self._ParseError('Unexpected trailing input in J8 Lines')

        return result
1398
1399
def SplitJ8Lines(s):
    # type: (str) -> List[str]
    """Split a string into J8 Lines.

    Used by @(echo split command sub)

    Raises:
      error.Decode

    3 Errors:
    - J8 string syntax error inside quotes
    - Extra input on line
    - unquoted line isn't utf-8
    """
    return J8LinesParser(s).Parse()
1414
1415
1416	# vim: sw=4