OILS / data_lang / j8.py View on Github | oilshell.org

1369 lines, 685 significant
1#!/usr/bin/env python2
2"""
3j8.py: J8 Notation, a superset of JSON
4
5Later:
6
7- PrettyPrinter uses hnode.asdl?
8 - color
9 - line wrapping -- do this later
10 - would like CONTRIBUTORS here
11
12- Unify with ASDL pretty printing - NIL8
13 - {} [] are identical
14 - () is for statically typed ASDL data
15 (command.Simple blame_tok:(...) words:[ ])
16 although we are also using [] for typed ASDL arrays, not just JSON
17 - object IDs
18 - @ x123 can create an ID
19 - ! x123 can reference an ID
20 - <> can be for non-J8 data types? For the = operator
21 - 'hi \(name)' interpolation is useful for code
22
23- Common between JSON8 and NIL8 - for writing by hand
24 - comments - # line or // line (JSON5 uses // line, following JS)
25 - unquoted identifier names - NIL8 could be more relaxed for (+ 1 (* 3 4))
26 - commas
27 - JSON8 could have trailing commas rule
28 - NIL8 at least has no commas for [1 2 "hi"]
29"""
30
31import math
32
33from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
34from _devbuild.gen.value_asdl import (value, value_e, value_t, value_str,
35 Dict_)
36from _devbuild.gen.nil8_asdl import (nvalue, nvalue_t)
37
38from asdl import format as fmt
39from core import error
40from data_lang import pyj8
41# dependency issue: consts.py pulls in frontend/option_def.py
42from frontend import consts
43from frontend import match
44from mycpp import mops
45from mycpp import mylib
46from mycpp.mylib import tagswitch, iteritems, NewDict, log
47
48import fastfunc
49
50_ = log
51
52from typing import cast, Dict, List, Tuple, Optional
53
54
55# COPIED from ui.ValType() to break dep
def ValType(val):
    # type: (value_t) -> str
    """Name of a value's type, for displaying type errors in the UI.

    The name comes from value_str() applied to the value's tag, with
    dot=False.
    """
    tag = val.tag()
    return value_str(tag, dot=False)
61
62
if mylib.PYTHON:

    def HeapValueId(val):
        # type: (value_t) -> int
        """Integer identity of a heap object (Python-only definition).

        Python's id() returns the address, which is up to 64 bits.

        In C++ we can use the GC ID, which fits within 32 bits.
        """
        heap_id = id(val)
        return heap_id
73
74
def ValueId(val):
    # type: (value_t) -> int
    """Integer ID for a value, or -1 if the value has no identity.

    The ID:

    1. Can be used to determine whether 2 objects are the same, e.g. for
       List, Dict, Func, Proc, etc.
    2. Will help detect object cycles

    Primitive types like Int and Float don't have this notion.  They're
    immutable values that are copied and compared by value, so they get -1.
    """
    result = -1  # sentinel: "not a heap object"
    with tagswitch(val) as case:
        if case(value_e.Null, value_e.Bool, value_e.Int, value_e.Float,
                value_e.Str):
            # These will not be on the heap if we switch to tagged pointers.
            # Str is handled conservatively - when we add small string
            # optimization, some strings will be values, so we assume all
            # are.
            pass
        else:
            result = HeapValueId(val)
    return result
96
97
def ValueIdString(val):
    # type: (value_t) -> str
    """Render the heap ID of val as ' 0x<hex>', or '' if it has none.

    Used by pp value (42) and = 42
    """
    heap_id = ValueId(val)
    if heap_id != -1:
        return ' 0x%s' % mylib.hex_lower(heap_id)

    # ValueId() returned its "no identity" sentinel
    return ''
106
107
def Utf8Encode(code):
    # type: (int) -> str
    """Return the UTF-8 encoded bytes for one unicode code point.

    Based on https://stackoverflow.com/a/23502707

    Args:
      code: the code point.  Values above 0xFFFF always produce 4 bytes; the
        upper bound 0x10FFFF is NOT checked here.
    """
    num_cont_bytes = 0

    if code <= 0x7F:
        return chr(code & 0x7F)  # ASCII is a single byte

    # Number of continuation bytes (10xxxxxx) that follow the lead byte
    if code <= 0x7FF:
        num_cont_bytes = 1
    elif code <= 0xFFFF:
        num_cont_bytes = 2
    else:
        # What about the check code <= 0x10FFFF ?
        # - it happens in statically parsed $'' u''
        # - but not dynamically parsed echo -e / printf, following bash/zsh
        num_cont_bytes = 3

    # Peel off 6 bits at a time, least significant group first
    rev_bytes = []  # type: List[int]
    remaining = num_cont_bytes
    while remaining > 0:
        rev_bytes.append(0x80 | (code & 0x3F))
        code >>= 6
        remaining -= 1

    # Lead byte: length marker bits, then whatever bits of the code are left
    lead = (0x1E << (6 - num_cont_bytes)) | (code & (0x3F >> num_cont_bytes))
    rev_bytes.append(lead)

    # Emit most significant byte first.  Mask with 0xFF (mod 256) because
    # Python ints don't wrap around!
    chars = []  # type: List[str]
    i = len(rev_bytes) - 1
    while i >= 0:
        chars.append(chr(rev_bytes[i] & 0xFF))
        i -= 1
    return ''.join(chars)
141
142
# Bit flags for the `options` argument of the printing functions below.
# They are combined with | and tested with &.

# On a List/Dict cycle, print a '[ -->ID ]' or '{ -->ID }' marker instead of
# raising error.Encode
SHOW_CYCLES = 1 << 1
SHOW_NON_DATA = 1 << 2  # non-data objects like Eggex can be <Eggex 0xff>
LOSSY_JSON = 1 << 3  # JSON is lossy
INF_NAN_ARE_NULL = 1 << 4  # for JSON: infinities and NaN are printed as null

# Hack until we fully translate: pyj8 carries its own copy of this flag, and
# the two values must agree
assert pyj8.LOSSY_JSON == LOSSY_JSON
150
151
def _Print(val, buf, indent, options=0):
    # type: (value_t, mylib.BufWriter, int, int) -> None
    """Serialize val into buf using a fresh InstancePrinter.

    Args:
      val: the value to print
      buf: output buffer
      indent: number of spaces to indent, or -1 for everything on one line
      options: bit flags (SHOW_CYCLES, SHOW_NON_DATA, LOSSY_JSON,
        INF_NAN_ARE_NULL); 0 means none
    """
    printer = InstancePrinter(buf, indent, options)
    printer.Print(val)
160
161
def PrintMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """ For json8 write (x) and toJson8()

    Prints with no option flags set.

    Caller must handle error.Encode
    """
    printer = InstancePrinter(buf, indent, 0)
    printer.Print(val)
169
170
def PrintJsonMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """ For json write (x) and toJson()

    Caller must handle error.Encode()
    Doesn't decay to b'' strings - will use Unicode replacement char.
    """
    json_options = LOSSY_JSON | INF_NAN_ARE_NULL
    printer = InstancePrinter(buf, indent, json_options)
    printer.Print(val)
179
180
def PrintLine(val, f):
    # type: (value_t, mylib.Writer) -> None
    """ For pp line (x)

    Writes val on a single line to f, followed by a newline.
    """
    line_buf = mylib.BufWriter()

    # error.Encode should be impossible - we show cycles and non-data
    display_options = SHOW_CYCLES | SHOW_NON_DATA
    _Print(val, line_buf, -1, options=display_options)

    line = line_buf.getvalue()
    f.write(line)
    f.write('\n')
192
193
if 0:

    def Repr(val):
        # type: (value_t) -> str
        """ Unused

        This is like Python's repr: a one-line rendering of val.
        """
        out = mylib.BufWriter()

        # error.Encode should be impossible - we show cycles and non-data
        display_options = SHOW_CYCLES | SHOW_NON_DATA
        _Print(val, out, -1, options=display_options)
        return out.getvalue()
205
206
def EncodeString(s, buf, unquoted_ok=False):
    # type: (str, mylib.BufWriter, bool) -> None
    """ For pp proc, etc.

    Args:
      s: the string to encode
      buf: output buffer
      unquoted_ok: if True, s is written as-is whenever
        fastfunc.CanOmitQuotes(s) says that is OK
    """
    if unquoted_ok and fastfunc.CanOmitQuotes(s):
        buf.write(s)
    else:
        str_val = value.Str(s)
        _Print(str_val, buf, -1)
216
217
def MaybeEncodeString(s):
    # type: (str) -> str
    """ For write --json8 $s and compexport

    Returns s encoded on one line, with no option flags set.
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    str_val = value.Str(s)
    _Print(str_val, out, -1)
    return out.getvalue()
228
229
def MaybeEncodeJsonString(s):
    # type: (str) -> str
    """ For write --json

    Returns s encoded on one line, with the LOSSY_JSON flag set.
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    str_val = value.Str(s)
    _Print(str_val, out, -1, options=LOSSY_JSON)
    return out.getvalue()
239
240
# DFS traversal state, stored per heap ID in InstancePrinter.visited
UNSEEN = 0  # default for an ID that isn't in the dict yet
EXPLORING = 1  # container is being printed now; seeing it again is a cycle
FINISHED = 2  # container was printed completely; it may be printed again
245
246
class InstancePrinter(object):
    """Print a value tree as J8/JSON."""

    def __init__(self, buf, indent, options):
        # type: (mylib.BufWriter, int, int) -> None
        """
        Args:
          buf: where the output is written
          indent: spaces per nesting level, or -1 for everything on one line
          options: bit flags, tested in Print() and passed through to
            pyj8.WriteString()
        """
        self.buf = buf
        self.indent = indent
        self.options = options

        # Key is HeapValueId(val)
        # Value is the DFS state: EXPLORING or FINISHED (a missing key means
        # UNSEEN)
        # Dict[int, None] doesn't translate -- it would be nice to have a set()
        self.visited = {}  # type: Dict[int, int]

    def _ItemIndent(self, level):
        # type: (int) -> None
        """Indent an item that sits inside the bracket at `level`."""
        if self.indent != -1:
            self.buf.write_spaces((level + 1) * self.indent)

    def _BracketIndent(self, level):
        # type: (int) -> None
        """Indent the closing bracket of the container at `level`."""
        if self.indent != -1:
            self.buf.write_spaces(level * self.indent)

    def _MaybeNewline(self):
        # type: () -> None
        """Write a newline, unless everything goes on one line."""
        if self.indent != -1:
            self.buf.write('\n')

    def _MaybeSpace(self):
        # type: () -> None
        """Write a space, unless everything goes on one line."""
        if self.indent != -1:
            self.buf.write(' ')

    def _ItemSeparator(self):
        # type: () -> None
        """Write the comma (and optional newline) between two items."""
        self.buf.write(',')
        self._MaybeNewline()

    def _WriteStrPair(self, key, s, level):
        # type: (str, str, int) -> None
        """Write one indented key: value entry where both sides are strings."""
        self._ItemIndent(level)
        pyj8.WriteString(key, self.options, self.buf)

        self.buf.write(':')
        self._MaybeSpace()

        pyj8.WriteString(s, self.options, self.buf)

    def _PrintList(self, val, level):
        # type: (value.List, int) -> None
        """Write a List as [item, ...], recursing through Print()."""
        if len(val.items) == 0:  # Special case like Python/JS
            self.buf.write('[]')
            return

        self.buf.write('[')
        self._MaybeNewline()

        is_first = True
        for item in val.items:
            if not is_first:
                self._ItemSeparator()
            is_first = False

            self._ItemIndent(level)
            self.Print(item, level + 1)

        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write(']')

    def _PrintDict(self, val, level):
        # type: (Dict_, int) -> None
        """Write a Dict as {key: value, ...}, recursing through Print()."""
        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
            return

        self.buf.write('{')
        self._MaybeNewline()

        is_first = True
        for key, item in iteritems(val.d):
            if not is_first:
                self._ItemSeparator()
            is_first = False

            self._ItemIndent(level)
            pyj8.WriteString(key, self.options, self.buf)

            self.buf.write(':')
            self._MaybeSpace()

            self.Print(item, level + 1)

        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write('}')

    def _PrintBashPrefix(self, type_str, level):
        # type: (str, int) -> None
        """Open the wrapper object and write everything up to its "data" value.

        Args:
          type_str: text written verbatim after "type": -- callers pass it
            already quoted and with a trailing comma, e.g. '"BashArray",'
          level: nesting level of the wrapper object
        """
        self.buf.write('{')
        self._MaybeNewline()

        # First entry: the type tag
        self._ItemIndent(level)
        self.buf.write('"type":')
        self._MaybeSpace()
        self.buf.write(type_str)  # "BashArray",  or "BashAssoc",
        self._MaybeNewline()

        # Second entry: the key only; the caller writes the value
        self._ItemIndent(level)
        self.buf.write('"data":')
        self._MaybeSpace()

    def _PrintBashSuffix(self, level):
        # type: (int) -> None
        """Close the wrapper object opened by _PrintBashPrefix()."""
        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write('}')

    def _PrintSparseArray(self, val, level):
        # type: (value.SparseArray, int) -> None
        """Write a SparseArray as a type/data wrapper; keys become strings."""
        self._PrintBashPrefix('"SparseArray",', level)

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            is_first = True
            for k, v in iteritems(val.d):
                if not is_first:
                    self._ItemSeparator()
                is_first = False

                self._WriteStrPair(mops.ToStr(k), v, level + 1)

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def _PrintBashArray(self, val, level):
        # type: (value.BashArray, int) -> None
        """Write a BashArray as a type/data wrapper, keyed by index.

        Entries that are None are left out.
        """
        self._PrintBashPrefix('"BashArray",', level)

        if len(val.strs) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            is_first = True
            for i, s in enumerate(val.strs):
                if s is None:
                    continue

                if not is_first:
                    self._ItemSeparator()
                is_first = False

                self._WriteStrPair(str(i), s, level + 1)

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def _PrintBashAssoc(self, val, level):
        # type: (value.BashAssoc, int) -> None
        """Write a BashAssoc as a type/data wrapper."""
        self._PrintBashPrefix('"BashAssoc",', level)

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()

            is_first = True
            for k2, v2 in iteritems(val.d):
                if not is_first:
                    self._ItemSeparator()
                is_first = False

                self._WriteStrPair(k2, v2, level + 1)

            self._MaybeNewline()

            self._BracketIndent(level + 1)
            self.buf.write('}')

        self._PrintBashSuffix(level)

    def Print(self, val, level=0):
        # type: (value_t, int) -> None
        """Write val to self.buf, recursing into containers.

        Args:
          val: the value to print
          level: nesting depth, used for indentation

        Raises:
          error.Encode: for a List or Dict that is part of a cycle (unless
            SHOW_CYCLES is set), or for a value of any other type not handled
            here (unless SHOW_NON_DATA is set)
        """
        # self.indent == -1 is the special value that means everything is on
        # one line.  It's like
        #    JSON.stringify(d, null, 0)
        # except we use -1, not 0.  0 can still have newlines.

        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Null):
                self.buf.write('null')

            elif case(value_e.Bool):
                val = cast(value.Bool, UP_val)
                if val.b:
                    self.buf.write('true')
                else:
                    self.buf.write('false')

            elif case(value_e.Int):
                val = cast(value.Int, UP_val)
                # TODO: avoid intermediate allocation with
                # self.buf.WriteBigInt(val.i)
                #
                # Or maybe we need pyj8.WriteBigInt() because truly BigInt may
                # be of arbitrary length, and will need a growth strategy.
                # Although that is not very common, so we could allocate in
                # that case.

                self.buf.write(mops.ToStr(val.i))

            elif case(value_e.Float):
                val = cast(value.Float, UP_val)

                fl = val.f
                if math.isinf(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        s = 'null'  # negative infinity is null too
                    elif fl < 0:
                        s = '-INFINITY'
                    else:
                        s = 'INFINITY'
                elif math.isnan(fl):
                    if self.options & INF_NAN_ARE_NULL:
                        # JavaScript JSON lib behavior: Inf and NaN are null
                        # Python has a bug in the encoder by default, and then
                        # allow_nan=False raises an error
                        s = 'null'
                    else:
                        s = 'NAN'
                else:
                    # TODO: can we avoid intermediate allocation?
                    # self.buf.WriteFloat(val.f)
                    s = str(fl)

                self.buf.write(s)

            elif case(value_e.Str):
                val = cast(value.Str, UP_val)

                pyj8.WriteString(val.s, self.options, self.buf)

            elif case(value_e.List):
                val = cast(value.List, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)
                node_state = self.visited.get(heap_id, UNSEEN)

                if node_state == EXPLORING:
                    # We are already inside this List, so it contains itself
                    if self.options & SHOW_CYCLES:
                        self.buf.write('[ -->%s ]' % ValueIdString(val))
                        return
                    # node.js prints which index closes the cycle
                    raise error.Encode("Can't encode List%s in object cycle" %
                                       ValueIdString(val))

                # A FINISHED node is printed AGAIN and stays FINISHED.  We
                # print a JSON tree, which means we can visit and print nodes
                # MANY TIMES, as long as they're not in a cycle.
                if node_state == UNSEEN:
                    self.visited[heap_id] = EXPLORING
                self._PrintList(val, level)
                self.visited[heap_id] = FINISHED

            elif case(value_e.Dict):
                val = cast(Dict_, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)
                node_state = self.visited.get(heap_id, UNSEEN)

                if node_state == EXPLORING:
                    # We are already inside this Dict, so it contains itself
                    if self.options & SHOW_CYCLES:
                        self.buf.write('{ -->%s }' % ValueIdString(val))
                        return
                    # node.js prints which key closes the cycle
                    raise error.Encode("Can't encode Dict%s in object cycle" %
                                       ValueIdString(val))

                # A FINISHED node is printed AGAIN and stays FINISHED.  We
                # print a JSON tree, which means we can visit and print nodes
                # MANY TIMES, as long as they're not in a cycle.
                if node_state == UNSEEN:
                    self.visited[heap_id] = EXPLORING
                self._PrintDict(val, level)
                self.visited[heap_id] = FINISHED

            elif case(value_e.SparseArray):
                val = cast(value.SparseArray, UP_val)
                self._PrintSparseArray(val, level)

            elif case(value_e.BashArray):
                val = cast(value.BashArray, UP_val)
                self._PrintBashArray(val, level)

            elif case(value_e.BashAssoc):
                val = cast(value.BashAssoc, UP_val)
                self._PrintBashAssoc(val, level)

            else:
                pass  # mycpp workaround
                if self.options & SHOW_NON_DATA:
                    # Similar to = operator, ui.DebugPrint()
                    # TODO: that prints value.Range in a special way
                    type_name = ValType(val)
                    id_suffix = ValueIdString(val)
                    self.buf.write('<%s%s>' % (type_name, id_suffix))
                else:
                    raise error.Encode("Can't serialize object of type %s" %
                                       ValType(val))
603
604
class PrettyPrinter(object):
    """ Unused right now, but could enhance the = operator.

    Output to polymorphic ColorOutput

    Features like asdl/format.py:
    - line wrapping
    - color
    - sharing detection by passing in a REF COUNT dict
      - print @123 the first time, and then print ... the second time

    and

    - Pretty spaces: {"k": "v", "k2": "v2"} instead of {"k":"v","k2":"v2"}
    - Unquoted: {k: "v", k2: "v2"} instead of {"k": "v", "k2": "v2"}

    - Omitting commas for ASDL?  Maybe we can use two spaces

    (Token id: Id.VSub_DollarName  start: 0  length: 3)
    (Token id:Id.VSub_DollarName start:0 length:3)  - color makes this work
    """

    def __init__(self, max_col):
        # type: (int) -> None
        """
        Args:
          max_col: stored as self.max_col; not read by any method yet
        """
        self.max_col = max_col

        # This could be an optimized set, e.g. a C++ bit set like
        # mark_sweep_heap.h, rather than a Dict
        #self.unique_objs = mylib.UniqueObjects()

        # first pass: object ID -> number of times it is referenced
        self.ref_count = {}  # type: Dict[int, int]

    def PrettyTree(self, val, f):
        # type: (value_t, fmt.ColorOutput) -> None
        """Placeholder: does nothing yet."""

        # TODO: first convert to hnode.asdl types?

        # Although we might want
        # hnode.AlreadyShown = (str type, int unique_id)
        pass

    def Print(self, val, buf):
        # type: (value_t, mylib.BufWriter) -> None
        """Run PrettyTree() against the console; `buf` is not used yet."""

        # Or print to stderr?
        console_out = fmt.DetectConsoleOutput(mylib.Stdout())
        self.PrettyTree(val, console_out)

        # Then print those with ASDL
657
658
class LexerDecoder(object):
    """J8 lexer and string decoder.

    Similar interface as SimpleLexer, except we return an optional decoded
    string

    Next() and NextForLines() return a (token ID, end position, decoded
    string) tuple.  The third item is None, except for quoted strings, which
    come back as Id.J8_String with their escapes already decoded.
    """

    def __init__(self, s, is_j8, lang_str):
        # type: (str, bool, str) -> None
        """
        Args:
          s: the input to lex
          is_j8: if False, Next() rejects single-quoted strings, comments, and
            the j"" prefix
          lang_str: language name interpolated into error messages
        """
        self.s = s
        self.is_j8 = is_j8
        self.lang_str = lang_str

        # Position in self.s where the next token starts
        self.pos = 0

        # current line being lexed -- for error messages
        self.cur_line_num = 1

        # Reuse this instance to save GC objects.  JSON objects could have
        # thousands of strings.
        self.decoded = mylib.BufWriter()

    def _Error(self, msg, end_pos):
        # type: (str, int) -> error.Decode
        """Build (but don't raise) an error.Decode ending at end_pos."""

        # Use the current position as start pos
        return error.Decode(msg, self.s, self.pos, end_pos, self.cur_line_num)

    def Next(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Returns a token and updates self.pos

        Raises:
          error.Decode: for J8-only syntax when is_j8 is False, or for an
            error inside a string (see _DecodeString)
        """

        tok_id, end_pos = match.MatchJ8Token(self.s, self.pos)

        # The same lexer is used for both languages, so J8-only tokens are
        # rejected here when reading plain JSON
        if not self.is_j8:
            if tok_id in (Id.Left_BSingleQuote, Id.Left_USingleQuote):
                raise self._Error(
                    "Single quotes aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Ignored_Comment:
                raise self._Error(
                    "Comments aren't part of JSON; you may want 'json8 read'",
                    end_pos)

        # An opening quote: decode the whole string and return it as one token
        if tok_id in (Id.Left_DoubleQuote, Id.Left_BSingleQuote,
                      Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Left_JDoubleQuote:
            if self.is_j8:
                return self._DecodeString(tok_id, end_pos)
            else:
                raise self._Error('Pure JSON does not accept j"" prefix',
                                  end_pos)

        if tok_id == Id.Ignored_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def NextForLines(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Like Next(), but for J8 Lines

        Raises:
          error.Decode: for invalid UTF-8 or an ASCII control char on an
            unquoted line, or for an error inside a string
        """

        tok_id, end_pos = match.MatchJ8LinesToken(self.s, self.pos)

        # Unlike Next(), all four opening quotes are accepted regardless of
        # is_j8
        if tok_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote,
                      Id.Left_BSingleQuote, Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        # Check that UNQUOTED lines are valid UTF-8.  (_DecodeString() does
        # this for quoted strings.)
        if (tok_id == Id.Lit_Chars and
                not pyj8.PartIsUtf8(self.s, self.pos, end_pos)):
            raise self._Error(
                'Invalid UTF-8 in %s string literal' % self.lang_str, end_pos)
        if tok_id == Id.Char_AsciiControl:
            raise self._Error(
                "J8 Lines can't have unescaped ASCII control chars", end_pos)

        if tok_id == Id.J8_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def _DecodeString(self, left_id, str_pos):
        # type: (Id_t, int) -> Tuple[Id_t, int, Optional[str]]
        """ Returns a string token and updates self.pos

        Args:
          left_id: the opening quote token; it selects the string lexer and
            whether \\y escapes are allowed
          str_pos: position right after the opening quote token

        Returns:
          (Id.J8_String, position after the closing quote, decoded string)

        Raises:
          error.Decode: on EOF inside the string, a bad backslash escape, an
            unescaped ASCII control char, invalid UTF-8, or a disallowed
            escape.  self.pos is not advanced in that case.
        """

        # Each iteration lexes one piece of the string body and appends its
        # decoded form to self.decoded, until the closing quote
        while True:
            if left_id in (Id.Left_DoubleQuote, Id.Left_JDoubleQuote):
                tok_id, str_end = match.MatchJsonStrToken(self.s, str_pos)
            else:
                tok_id, str_end = match.MatchJ8StrToken(self.s, str_pos)

            #log('String tok %s %r', Id_str(tok_id), self.s[str_pos:str_end])

            if tok_id == Id.Eol_Tok:
                # TODO: point to beginning of # quote?
                raise self._Error(
                    'Unexpected EOF while lexing %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Unknown_Backslash:
                raise self._Error(
                    'Bad backslash escape in %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Char_AsciiControl:
                raise self._Error(
                    "%s strings can't have unescaped ASCII control chars" %
                    self.lang_str, str_end)

            if tok_id in (Id.Right_SingleQuote, Id.Right_DoubleQuote):

                self.pos = str_end

                s = self.decoded.getvalue()
                self.decoded.clear()  # reuse this instance

                #log('decoded %r', self.decoded.getvalue())
                return Id.J8_String, str_end, s

            #
            # Now handle each kind of token
            #

            if tok_id == Id.Lit_Chars:  # JSON and J8
                part = self.s[str_pos:str_end]
                if not pyj8.PartIsUtf8(self.s, str_pos, str_end):
                    raise self._Error(
                        'Invalid UTF-8 in %s string literal' % self.lang_str,
                        str_end)

            # TODO: would be nice to avoid allocation in all these cases.
            # But LookupCharC() would have to change.

            elif tok_id == Id.Char_OneChar:  # JSON and J8
                # The char at str_pos + 1 names the escape; presumably
                # str_pos is the backslash itself
                ch = self.s[str_pos + 1]
                part = consts.LookupCharC(ch)

            elif tok_id == Id.Char_UBraced:  # J8 only
                # \u{...}: skip the 3 leading chars and the trailing one,
                # leaving the hex digits
                h = self.s[str_pos + 3:str_end - 1]
                i = int(h, 16)

                # Same checks in osh/word_compile.py
                if i > 0x10ffff:
                    raise self._Error(
                        "Code point can't be greater than U+10ffff", str_end)
                if 0xD800 <= i and i < 0xE000:
                    raise self._Error(
                        r"\u{%s} escape is illegal because it's in the surrogate range"
                        % h, str_end)

                part = Utf8Encode(i)

            elif tok_id == Id.Char_YHex:  # J8 only
                # \yHH: skip the 2 leading chars, leaving the hex digits
                h = self.s[str_pos + 2:str_end]

                # Same check in osh/word_parse.py
                # Only strings opened with Left_BSingleQuote may contain \y
                if left_id != Id.Left_BSingleQuote:
                    assert left_id != Id.Left_BTSingleQuote, "Not handled here"
                    raise self._Error(
                        r"\y%s escapes not allowed in u'' strings" % h,
                        str_end)

                i = int(h, 16)
                part = chr(i)

            elif tok_id == Id.Char_SurrogatePair:
                # Two groups of 4 hex digits, at offsets 2-6 and 8-12
                # (presumably \uXXXX\uXXXX -- confirm against the lexer)
                h1 = self.s[str_pos + 2:str_pos + 6]
                h2 = self.s[str_pos + 8:str_pos + 12]

                # https://www.oilshell.org/blog/2023/06/surrogate-pair.html
                i1 = int(h1, 16) - 0xD800  # high surrogate
                i2 = int(h2, 16) - 0xDC00  # low surrogate
                code_point = 0x10000 + (i1 << 10) + i2

                part = Utf8Encode(code_point)

            elif tok_id == Id.Char_Unicode4:  # JSON only, unpaired
                # Skip the 2 leading chars, leaving the hex digits.  There's
                # no surrogate range check here, unlike Char_UBraced.
                h = self.s[str_pos + 2:str_end]
                i = int(h, 16)
                part = Utf8Encode(i)

            else:
                # Should never happen
                raise AssertionError(Id_str(tok_id))

            #log('%s part %r', Id_str(tok_id), part)
            self.decoded.write(part)
            str_pos = str_end
854
855class _Parser(object):
856
857 def __init__(self, s, is_j8):
858 # type: (str, bool) -> None
859 self.s = s
860 self.is_j8 = is_j8
861 self.lang_str = "J8" if is_j8 else "JSON"
862
863 self.lexer = LexerDecoder(s, is_j8, self.lang_str)
864 self.tok_id = Id.Undefined_Tok
865 self.start_pos = 0
866 self.end_pos = 0
867 self.decoded = '' # decoded J8 string
868
869 def _Next(self):
870 # type: () -> None
871
872 # This isn't the start of a J8_Bool token, it's the END of the token before it
873 while True:
874 self.start_pos = self.end_pos
875 self.tok_id, self.end_pos, self.decoded = self.lexer.Next()
876 if self.tok_id not in (Id.Ignored_Space, Id.Ignored_Newline,
877 Id.Ignored_Comment):
878 break
879 # TODO: add Ignored_Newline to count lines, and show line numbers
880 # in errors messages. The position of the last newline and a token
881 # can be used to calculate a column number.
882
883 #log('NEXT %s %s %s %s', Id_str(self.tok_id), self.start_pos, self.end_pos, self.decoded or '-')
884
885 def _Eat(self, tok_id):
886 # type: (Id_t) -> None
887
888 if self.tok_id != tok_id:
889 #log('position %r %d-%d %r', self.s, self.start_pos,
890 # self.end_pos, self.s[self.start_pos:self.end_pos])
891 raise self._ParseError("Expected %s, got %s" %
892 (Id_str(tok_id), Id_str(self.tok_id)))
893 self._Next()
894
895 def _NextForLines(self):
896 # type: () -> None
897 """Like _Next, but use the J8 Lines lexer."""
898 self.start_pos = self.end_pos
899 self.tok_id, self.end_pos, self.decoded = self.lexer.NextForLines()
900
901 def _ParseError(self, msg):
902 # type: (str) -> error.Decode
903 return error.Decode(msg, self.s, self.start_pos, self.end_pos,
904 self.lexer.cur_line_num)
905
906
class Parser(_Parser):
    """JSON and JSON8 Parser."""

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    def _ParsePair(self):
        # type: () -> Tuple[str, value_t]
        """
        pair = string ':' value
        """
        # Grab the decoded text first, because _Eat() advances past the token
        key = self.decoded
        self._Eat(Id.J8_String)  # Check that it's a string
        assert key is not None

        self._Eat(Id.J8_Colon)

        val = self._ParseValue()
        return key, val

    def _ParseDict(self):
        # type: () -> value_t
        """
        pair = string ':' value
        Dict = '{' '}'
             | '{' pair (',' pair)* '}'

        If a key appears more than once, the last value is kept.
        """
        # precondition
        assert self.tok_id == Id.J8_LBrace, Id_str(self.tok_id)

        #log('> Dict')

        d = NewDict()  # type: Dict[str, value_t]

        self._Next()
        if self.tok_id != Id.J8_RBrace:
            # At least one pair; more follow as long as we see ','
            while True:
                key, val = self._ParsePair()
                d[key] = val
                #log('  k %s v %s Id %s', key, val, Id_str(self.tok_id))
                if self.tok_id != Id.J8_Comma:
                    break
                self._Next()  # skip the ','

        self._Eat(Id.J8_RBrace)

        #log('< Dict')

        return Dict_(d, None)

    def _ParseList(self):
        # type: () -> value_t
        """
        List = '[' ']'
             | '[' value (',' value)* ']'
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[value_t]

        self._Next()
        if self.tok_id != Id.J8_RBracket:
            # At least one value; more follow as long as we see ','
            while True:
                items.append(self._ParseValue())
                if self.tok_id != Id.J8_Comma:
                    break
                self._Next()  # skip the ','

        self._Eat(Id.J8_RBracket)

        return value.List(items)

    def _ParseValue(self):
        # type: () -> value_t
        """Parse one value starting at the current token, and advance past it.

        Raises:
          error.Decode: on EOF, an unexpected token, or an integer that
            mops.FromStr() rejects
        """
        # Every branch below returns or raises, so this snapshot is only
        # compared against the token we started on
        tok_id = self.tok_id

        if tok_id == Id.J8_LBrace:
            return self._ParseDict()

        if tok_id == Id.J8_LBracket:
            return self._ParseList()

        if tok_id == Id.J8_Null:
            self._Next()
            return value.Null

        if tok_id == Id.J8_Bool:
            #log('%r %d', self.s[self.start_pos], self.start_pos)
            # Look at the first char of the token before moving on
            bool_val = value.Bool(self.s[self.start_pos] == 't')
            self._Next()
            return bool_val

        if tok_id == Id.J8_Int:
            int_str = self.s[self.start_pos:self.end_pos]
            self._Next()
            try:
                big = mops.FromStr(int_str)
            except ValueError:
                raise self._ParseError('Integer is too big')
            return value.Int(big)

        if tok_id == Id.J8_Float:
            float_str = self.s[self.start_pos:self.end_pos]
            self._Next()
            return value.Float(float(float_str))

        # UString, BString too
        if tok_id == Id.J8_String:
            str_val = value.Str(self.decoded)
            #log('d %r', self.decoded)
            self._Next()
            return str_val

        if tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        # Id.Unknown_Tok, Id.J8_{LParen,RParen}
        raise self._ParseError('Invalid token while parsing %s: %s' %
                               (self.lang_str, Id_str(self.tok_id)))

    def ParseValue(self):
        # type: () -> value_t
        """Parse the whole input as a single value.

        Raises error.Decode, including when input remains after the value.
        """
        self._Next()
        obj = self._ParseValue()

        # start_pos is where the current (unconsumed) token begins
        num_extra = len(self.s) - self.start_pos
        if num_extra != 0:
            #log('n %d pos %d', len(self.s), self.start_pos)
            raise self._ParseError(
                'Got %d bytes of unexpected trailing input' % num_extra)
        return obj
1046
1047
1048class Nil8Parser(_Parser):
1049 """
1050 Tokens not in JSON8:
1051 LParen RParen Symbol
1052
1053 Tokens not in JSON, but in JSON8 and NIL8:
1054 Identifier (unquoted keys)
1055 Ignored_Comment
1056 """
1057
1058 def __init__(self, s, is_j8):
1059 # type: (str, bool) -> None
1060 _Parser.__init__(self, s, is_j8)
1061
1062 if 0:
1063
1064 def _LookAhead(self):
1065 # type: () -> Id_t
1066 """
1067 Don't need this right now
1068 """
1069 end_pos = self.end_pos # look ahead from last token
1070 while True:
1071 tok_id, end_pos = match.MatchJ8Token(self.s, end_pos)
1072 if tok_id not in (Id.Ignored_Space, Id.Ignored_Newline,
1073 Id.Ignored_Comment):
1074 break
1075 return tok_id
1076
1077 def _ParseRecord(self):
1078 # type: () -> nvalue_t
1079 """
1080 Yaks
1081 (self->Next) => (-> self Next)
1082 (self->Next obj.field) => ((-> self Next) (. obj field))
1083
1084 Similar to
1085 ((identity identity) 42) => 42 in Clojure
1086
1087 ASDL
1088 (Node left:(. x4beef2))
1089 (Node left !x4beef2)
1090
1091 # Ambiguous because value can be identifier.
1092 # We have to look ahead to and see if there's a colon :
1093 field =
1094 Identifier ':' value
1095 | value
1096
1097 record = '(' head field* ')'
1098
1099 - Identifier | Symbol are treated the same, it's a side effect of
1100 the lexing style
1101 - do positional args come before named args
1102 - () is invalid? Use [] for empty list
1103 """
1104 assert self.tok_id == Id.J8_LParen, Id_str(self.tok_id)
1105
1106 items = [] # type: List[nvalue_t]
1107
1108 self._Next()
1109 if self.tok_id == Id.J8_RParen:
1110 self._Next()
1111 return nvalue.List(items)
1112
1113 #log('TOK %s', Id_str(self.tok_id))
1114 while self.tok_id != Id.J8_RParen:
1115 items.append(self._ParseNil8())
1116 #log('TOK 2 %s', Id_str(self.tok_id))
1117
1118 self._Eat(Id.J8_RParen)
1119
1120 return nvalue.List(items)
1121
1122 def _ParseList8(self):
1123 # type: () -> nvalue_t
1124 """
1125 List8 = '[' value* ']'
1126
1127 No commas, not even optional ones for now.
1128 """
1129 assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)
1130
1131 items = [] # type: List[nvalue_t]
1132
1133 self._Next()
1134 if self.tok_id == Id.J8_RBracket:
1135 self._Next()
1136 return nvalue.List(items)
1137
1138 #log('TOK %s', Id_str(self.tok_id))
1139 while self.tok_id != Id.J8_RBracket:
1140 items.append(self._ParseNil8())
1141 #log('TOK 2 %s', Id_str(self.tok_id))
1142
1143 self._Eat(Id.J8_RBracket)
1144
1145 return nvalue.List(items)
1146
def _ParseNil8(self):
    # type: () -> nvalue_t
    """Parse one NIL8 value, plus an optional infix operator following it.

    The current token selects what is parsed:

    - '(' is delegated to _ParseRecord(), '[' to _ParseList8()
    - null, bool, int, float and string tokens become the matching nvalue
    - identifier, operator, colon and comma tokens become an nvalue.Symbol
      holding the token's source text

    If the token after that value is an operator, colon or comma, the value
    becomes the LEFT operand of an infix expression, which is returned in
    prefix form as a 3-item list:

        key: "value"  ->  (: key "value")

    The right operand comes from a recursive call, so a chain of operators
    nests to the right and there is no precedence.

    Raises:
      The error built by self._ParseError(), on end of input or on a token
      that can't start a value.
    """
    if self.tok_id == Id.J8_LParen:
        obj = self._ParseRecord()  # type: nvalue_t

    elif self.tok_id == Id.J8_LBracket:
        obj = self._ParseList8()

    # The primitive cases mirror the J8 parser.
    # TODO: We also want hex literals.
    elif self.tok_id == Id.J8_Null:
        self._Next()
        obj = nvalue.Null

    elif self.tok_id == Id.J8_Bool:
        # Only the first character is inspected.  Read it before
        # advancing, since _Next() moves start_pos.
        is_true = self.s[self.start_pos] == 't'
        self._Next()
        obj = nvalue.Bool(is_true)

    elif self.tok_id == Id.J8_Int:
        text = self.s[self.start_pos:self.end_pos]
        self._Next()
        obj = nvalue.Int(int(text))

    elif self.tok_id == Id.J8_Float:
        text = self.s[self.start_pos:self.end_pos]
        self._Next()
        obj = nvalue.Float(float(text))

    elif self.tok_id == Id.J8_String:
        # Grab the decoded payload of this token before moving on.
        decoded = self.decoded
        self._Next()
        obj = nvalue.Str(decoded)

    # <- etc.
    elif self.tok_id in (Id.J8_Identifier, Id.J8_Operator, Id.J8_Colon,
                         Id.J8_Comma):
        # An unquoted "word" is kept as its raw source text.
        text = self.s[self.start_pos:self.end_pos]
        self._Next()
        obj = nvalue.Symbol(text)

    elif self.tok_id == Id.Eol_Tok:
        raise self._ParseError('Unexpected EOF while parsing %s' %
                               self.lang_str)

    else:  # Id.Unknown_Tok, Id.J8_{LParen,RParen}
        raise self._ParseError('Invalid token while parsing %s: %s' %
                               (self.lang_str, Id_str(self.tok_id)))

    # No infix operator follows: the value stands on its own.
    if self.tok_id not in (Id.J8_Operator, Id.J8_Colon, Id.J8_Comma):
        return obj

    # Infix form.  The operator's text must be captured before advancing.
    op = nvalue.Symbol(self.s[self.start_pos:self.end_pos])
    self._Next()
    rhs = self._ParseNil8()

    infix = nvalue.List([op, obj, rhs])  # type: nvalue_t
    return infix
1219
def ParseNil8(self):
    # type: () -> nvalue_t
    """Parse the whole input as exactly one NIL8 value.

    Returns:
      The parsed value.

    Raises:
      error.Decode, on a syntax error, or if any token other than end of
      input follows the value.
    """
    self._Next()  # load the first token
    result = self._ParseNil8()

    if self.tok_id == Id.Eol_Tok:
        return result

    raise self._ParseError('Unexpected trailing input')
1230
1231
class J8LinesParser(_Parser):
    """Split a string containing newlines into lines, decoding J8 strings.

    This is specified as a grammar, which preserves location info and
    reduces allocations.  (Note that unquoted_line behaves more like a LOOP
    than a grammatical rule.)

    Grammar:

        end           = J8_Newline | Eol_Tok

        empty_line    = WS_Space? end

        # special case: read until end token, but REMOVE trailing WS_Space
        unquoted_line = WS_Space? Lit_Chars ANY* WS_Space? end

        j8_line       = WS_Space? J8_String WS_Space? end

        lines         = (empty_line | unquoted_line | j8_line)*

    where Lit_Chars is valid UTF-8

    Notes:

    (1) Multiple strings on one line are disallowed, e.g.

            "json" "json2"
            "json" unquoted

    (2) Quotes may appear inside an unquoted line.  Consider this line:

            foo "" u''

        The "" and u'' are not decoded as strings, because the line started
        with Id.Lit_Chars literals.

    (3) Open question: this is related to TSV8, which has similar rules.
        Does TSV8 have empty cells?  Does it use - for an empty cell?
    """

    def __init__(self, s):
        # type: (str) -> None
        # NOTE(review): the second argument is presumably the base parser's
        # J8-mode flag -- confirm against _Parser.__init__
        _Parser.__init__(self, s, True)

    def _Show(self, s):
        # type: (str) -> None
        """Debugging aid: log the current token ID and span, labeled 's'."""
        log('%s tok_id %s %d-%d', s, Id_str(self.tok_id), self.start_pos,
            self.end_pos)

    def _ParseLine(self, out):
        # type: (List[str]) -> None
        """Consume one line, through its end token.

        A quoted or unquoted line appends one string to 'out'; an empty
        line appends nothing.
        """
        # Optional leading whitespace, common to all three kinds of line
        if self.tok_id == Id.WS_Space:
            self._NextForLines()

        if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
            # empty_line: nothing to record, just move past the end token
            self._NextForLines()

        elif self.tok_id == Id.J8_String:
            # j8_line: the decoded string is the entire line
            out.append(self.decoded)
            self._NextForLines()

            if self.tok_id == Id.WS_Space:  # trailing whitespace
                self._NextForLines()

            # Only the end of the line may come after the string
            if self.tok_id not in (Id.J8_Newline, Id.Eol_Tok):
                raise self._ParseError('Unexpected text after J8 Line (%s)' %
                                       Id_str(self.tok_id))

            self._NextForLines()

        elif self.tok_id == Id.Lit_Chars:
            # unquoted_line, e.g. ' unquoted "" text on line '
            # Every token up to the end token belongs to the line.
            begin = self.start_pos
            while True:
                # Remember the token we're leaving, so that trailing
                # whitespace can be stripped below.
                last_id = self.tok_id
                last_start = self.start_pos

                self._NextForLines()

                # It would be nicer if "middle" Id.WS_Space tokens didn't
                # have \r, but we're sticking with the JSON spec definition
                # of whitespace.  (As another data point, CPython on Unix
                # allows \r in the middle of expressions, treating it as
                # whitespace.)
                if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
                    break

            if last_id != Id.WS_Space:
                # The line runs right up to the end token
                end = self.start_pos
            else:
                # The last token was whitespace: cut the line before it
                end = last_start

            out.append(self.s[begin:end])

            self._NextForLines()  # past newline

        else:
            raise AssertionError(Id_str(self.tok_id))

    def Parse(self):
        # type: () -> List[str]
        """Parse the entire input and return its lines.

        Raises error.Decode.
        """
        self._NextForLines()  # load the first token

        result = []  # type: List[str]
        while self.tok_id != Id.Eol_Tok:
            self._ParseLine(result)

        # Defensive: the loop above only exits once the current token is
        # Eol_Tok, so this check is not expected to fire.
        if self.tok_id != Id.Eol_Tok:
            raise self._ParseError('Unexpected trailing input in J8 Lines')

        return result
1351
1352
def SplitJ8Lines(s):
    # type: (str) -> List[str]
    """Split 's' into J8 Lines.  Used by @(echo split command sub)

    Raises:
      error.Decode

    There are 3 kinds of errors:
      - J8 string syntax error inside quotes
      - Extra input on line
      - unquoted line isn't utf-8
    """
    return J8LinesParser(s).Parse()
1367
1368
1369# vim: sw=4