OILS / data_lang / j8.py View on Github | oilshell.org

1230 lines, 589 significant
1#!/usr/bin/env python2
2"""
3j8.py: J8 Notation, a superset of JSON
4
5TODO:
6
7- Many more tests
8 - Run JSONTestSuite
9
10Later:
11
12- PrettyPrinter uses hnode.asdl?
13 - color
14 - line wrapping -- do this later
15 - would like CONTRIBUTORS here
16
17- Unify with ASDL pretty printing - NIL8
18 - {} [] are identical
19 - () is for statically typed ASDL data
20 (command.Simple blame_tok:(...) words:[ ])
21 although we are also using [] for typed ASDL arrays, not just JSON
22 - object IDs
23 - @ x123 can create an ID
24 - ! x123 can reference an ID
25 - <> can be for non-J8 data types? For the = operator
26 - 'hi \(name)' interpolation is useful for code
27
28- Common between JSON8 and NIL8 - for writing by hand
29 - comments - # line or // line (JSON5 uses // line, following JS)
30 - unquoted identifier names - TYG8 could be more relaxed for (+ 1 (* 3 4))
31 - commas
32 - JSON8 could have trailing commas rule
33 - NIL8 at least has no commas for [1 2 "hi"]
34"""
35
36from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
37from _devbuild.gen.value_asdl import (value, value_e, value_t, value_str)
38from _devbuild.gen.nil8_asdl import (nvalue, nvalue_t)
39
40from asdl import format as fmt
41from core import error
42from data_lang import pyj8
43# dependency issue: consts.py pulls in frontend/option_def.py
44from frontend import consts
45from frontend import match
46from mycpp import mops
47from mycpp import mylib
48from mycpp.mylib import tagswitch, iteritems, NewDict, log
49
50import fastfunc
51
52_ = log
53
54from typing import cast, Dict, List, Tuple, Optional
55
56
57# COPIED from ui.ValType() to break dep
def ValType(val):
    # type: (value_t) -> str
    """Return the type name of a value, for showing type errors in the UI.

    The name is the value's tag rendered by value_str() with dot=False.
    """
    tag = val.tag()
    return value_str(tag, dot=False)
63
64
if mylib.PYTHON:

    def HeapValueId(val):
        # type: (value_t) -> int
        """Return an integer identifying a heap-allocated value.

        This Python-only definition uses id(), which returns the object's
        address and can need up to 64 bits.  In C++ the GC ID can be used
        instead, and it fits within 32 bits.
        """
        address = id(val)
        return address
75
76
def ValueId(val):
    # type: (value_t) -> int
    """Return an integer ID for a value, or -1 if it has no identity.

    The ID serves two purposes:

    1. Deciding whether 2 objects are the same, e.g. for List, Dict, Func,
       Proc, etc.
    2. Detecting object cycles.

    Null, Bool, Int, Float and Str yield -1: they are immutable values that
    are copied and compared by value, so identity is meaningless for them.
    """
    heap_id = -1
    with tagswitch(val) as case:
        if case(value_e.Null, value_e.Bool, value_e.Int, value_e.Float,
                value_e.Str):
            # These will not be on the heap if we switch to tagged pointers.
            # Str is treated conservatively: once there is a small string
            # optimization, some strings will be values, so assume all are.
            pass
        else:
            heap_id = HeapValueId(val)
    return heap_id
98
99
def ValueIdString(val):
    # type: (value_t) -> str
    """Format the ID of a value for display; used by pp value (42) and = 42

    Returns:
      ' 0x<lowercase hex>' with a leading space, or '' when ValueId() reports
      that the value has no identity (-1).
    """
    heap_id = ValueId(val)
    if heap_id != -1:
        return ' 0x%s' % mylib.hex_lower(heap_id)
    return ''
108
109
def Utf8Encode(code):
    # type: (int) -> str
    """Encode one Unicode code point as a string of UTF-8 bytes.

    Based on https://stackoverflow.com/a/23502707

    Args:
      code: the code point to encode

    Returns:
      A 1 to 4 byte sequence.  Code points above 0x10FFFF produce the
      encoding of U+FFFD, the Unicode replacement character.
    """
    num_cont_bytes = 0

    if code <= 0x7F:
        return chr(code & 0x7F)  # ASCII is a single byte

    # Number of continuation bytes (10xxxxxx) following the lead byte
    if code <= 0x7FF:
        num_cont_bytes = 1
    elif code <= 0xFFFF:
        num_cont_bytes = 2
    elif code <= 0x10FFFF:
        num_cont_bytes = 3
    else:
        return '\xEF\xBF\xBD'  # unicode replacement character

    # Emit continuation bytes from least significant to most significant,
    # consuming 6 bits of the code point each time.
    bytes_ = []  # type: List[int]
    remaining = num_cont_bytes
    while remaining > 0:
        bytes_.append(0x80 | (code & 0x3F))
        code >>= 6
        remaining -= 1

    # Lead byte: length marker in the high bits, leftover code bits below.
    lead = (0x1E << (6 - num_cont_bytes)) | (code & (0x3F >> num_cont_bytes))
    bytes_.append(lead)
    bytes_.reverse()

    # Mask to 8 bits because Python ints don't wrap around!
    chars = [chr(b & 0xFF) for b in bytes_]
    return ''.join(chars)
143
144
# Bit flags that can be OR'd together for the 'options' argument of _Print()
# and InstancePrinter.

# A List/Dict cycle is printed as a placeholder with the object ID, instead
# of raising error.Encode
SHOW_CYCLES = 1 << 1  # show as [...] or {...} I think, with object ID
# A value that isn't data is printed as <Type 0xid>, instead of raising
# error.Encode
SHOW_NON_DATA = 1 << 2  # non-data objects like Eggex can be <Eggex 0xff>
# Passed through to pyj8.WriteString() when writing strings
LOSSY_JSON = 1 << 3  # JSON is lossy

# Hack until we fully translate
# The flag is defined both here and in pyj8, so check that they agree.
assert pyj8.LOSSY_JSON == LOSSY_JSON
151
152
def _Print(val, buf, indent, options=0):
    # type: (value_t, mylib.BufWriter, int, int) -> None
    """Serialize val into buf with a fresh InstancePrinter.

    Args:
      val: the value to print
      buf: destination buffer
      indent: number of spaces to indent, or -1 for everything on one line
      options: bitwise OR of SHOW_CYCLES, SHOW_NON_DATA, LOSSY_JSON
    """
    printer = InstancePrinter(buf, indent, options)
    printer.Print(val)
161
162
def PrintMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Write val to buf as JSON8; for json8 write (x) and toJson8()

    No option flags are set.  Caller must handle error.Encode
    """
    _Print(val, buf, indent, options=0)
170
171
def PrintJsonMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Write val to buf as JSON; for json write (x) and toJson()

    Prints with the LOSSY_JSON option: strings don't decay to b'' strings,
    and the Unicode replacement char is used instead.

    Caller must handle error.Encode()
    """
    json_options = LOSSY_JSON
    _Print(val, buf, indent, options=json_options)
180
181
def PrintLine(val, f):
    # type: (value_t, mylib.Writer) -> None
    """Write val on a single line followed by a newline; for pp line (x)

    error.Encode should be impossible here, because both cycles and non-data
    values are shown rather than rejected.
    """
    flags = SHOW_CYCLES | SHOW_NON_DATA

    out = mylib.BufWriter()
    _Print(val, out, -1, options=flags)  # -1 means no newlines or indent

    f.write(out.getvalue())
    f.write('\n')
191
192
def EncodeString(s, buf, unquoted_ok=False):
    # type: (str, mylib.BufWriter, bool) -> None
    """Write the string s to buf, quoted unless that can be avoided.

    For pp proc, etc.

    Args:
      s: string to encode
      buf: destination buffer
      unquoted_ok: if true, s is written verbatim when
        fastfunc.CanOmitQuotes(s) allows it
    """
    if not unquoted_ok or not fastfunc.CanOmitQuotes(s):
        # Quoted form, on one line
        _Print(value.Str(s), buf, -1)
        return

    buf.write(s)
202
203
def MaybeEncodeString(s):
    # type: (str) -> str
    """Return s encoded as a one-line J8 string.

    For write --json8 $s and compexport
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    str_val = value.Str(s)
    _Print(str_val, out, -1)
    return out.getvalue()
214
215
def MaybeEncodeJsonString(s):
    # type: (str) -> str
    """Return s encoded as a one-line JSON string, using LOSSY_JSON.

    For write --json
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    str_val = value.Str(s)
    _Print(str_val, out, -1, options=LOSSY_JSON)
    return out.getvalue()
225
226
227# DFS traversal state
# Stored in InstancePrinter.visited, keyed by HeapValueId() of a List or Dict.
UNSEEN = 0  # default for containers that aren't in the dict yet
EXPLORING = 1  # container is being printed; seeing it again means a cycle
FINISHED = 2  # container was fully printed; it may be printed again
231
232
class InstancePrinter(object):
    """Print a value tree as J8/JSON.

    Output is appended to a BufWriter.  Lists and Dicts are tracked with DFS
    states (UNSEEN / EXPLORING / FINISHED) so that object cycles are detected,
    while shared containers that aren't part of a cycle are printed each
    time they're reached.
    """

    def __init__(self, buf, indent, options):
        # type: (mylib.BufWriter, int, int) -> None
        """
        Args:
          buf: destination buffer
          indent: number of spaces per nesting level, or -1 to print
            everything on one line
          options: bitwise OR of SHOW_CYCLES, SHOW_NON_DATA, LOSSY_JSON
        """
        self.buf = buf
        self.indent = indent
        self.options = options

        # Key is HeapValueId(val) of a List or Dict
        # Value is the DFS state: EXPLORING or FINISHED.  A missing key
        # means UNSEEN.
        self.visited = {}  # type: Dict[int, int]

    def _ItemIndent(self, level):
        # type: (int) -> None
        """Write the indent for an item inside a container at 'level'."""
        if self.indent != -1:
            self.buf.write_spaces((level + 1) * self.indent)

    def _BracketIndent(self, level):
        # type: (int) -> None
        """Write the indent for the closing bracket of a container."""
        if self.indent != -1:
            self.buf.write_spaces(level * self.indent)

    def _MaybeNewline(self):
        # type: () -> None
        """Write a newline, unless everything goes on one line."""
        if self.indent != -1:
            self.buf.write('\n')

    def _MaybeSpace(self):
        # type: () -> None
        """Write a space, unless everything goes on one line."""
        if self.indent != -1:
            self.buf.write(' ')

    def _PrintList(self, val, level):
        # type: (value.List, int) -> None
        """Print the items of a List between [ and ], separated by commas."""
        if len(val.items) == 0:  # Special case like Python/JS
            self.buf.write('[]')
            return

        self.buf.write('[')
        self._MaybeNewline()

        for index, element in enumerate(val.items):
            if index != 0:
                self.buf.write(',')
                self._MaybeNewline()

            self._ItemIndent(level)
            self.Print(element, level + 1)

        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write(']')

    def _PrintDict(self, val, level):
        # type: (value.Dict, int) -> None
        """Print the entries of a Dict between { and }, as key:value pairs."""
        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
            return

        self.buf.write('{')
        self._MaybeNewline()

        num_written = 0
        for key, entry in iteritems(val.d):
            if num_written != 0:
                self.buf.write(',')
                self._MaybeNewline()

            self._ItemIndent(level)

            pyj8.WriteString(key, self.options, self.buf)
            self.buf.write(':')
            self._MaybeSpace()

            self.Print(entry, level + 1)

            num_written += 1

        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write('}')

    def Print(self, val, level=0):
        # type: (value_t, int) -> None
        """Print one value, recursing into containers.

        Args:
          val: the value to print
          level: nesting depth, which is multiplied by self.indent

        Raises:
          error.Encode: for a List or Dict in an object cycle when
            SHOW_CYCLES isn't set, or for a value of any other type when
            SHOW_NON_DATA isn't set
        """
        # self.indent == -1 is the special value that means everything is on
        # one line.  It's like
        #    JSON.stringify(d, null, 0)
        # except we use -1, not 0.  0 can still have newlines.

        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Null):
                self.buf.write('null')

            elif case(value_e.Bool):
                val = cast(value.Bool, UP_val)
                if val.b:
                    self.buf.write('true')
                else:
                    self.buf.write('false')

            elif case(value_e.Int):
                val = cast(value.Int, UP_val)
                # TODO: avoid intermediate allocation with
                # self.buf.WriteBigInt(val.i)
                #
                # Or maybe we need pyj8.WriteBigInt() because truly BigInt may
                # be of arbitrary length, and will need a growth strategy.
                # Although that is not very common, so we could allocate in
                # that case.

                self.buf.write(mops.ToStr(val.i))

            elif case(value_e.Float):
                val = cast(value.Float, UP_val)
                # TODO: avoid intermediate allocation with
                # self.buf.WriteFloat(val.f)
                self.buf.write(str(val.f))

            elif case(value_e.Str):
                val = cast(value.Str, UP_val)

                pyj8.WriteString(val.s, self.options, self.buf)

            elif case(value_e.List):
                val = cast(value.List, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)
                node_state = self.visited.get(heap_id, UNSEEN)

                if node_state == EXPLORING:
                    # We're still inside this List, so this is a cycle
                    if self.options & SHOW_CYCLES:
                        self.buf.write('[ -->%s ]' % ValueIdString(val))
                    else:
                        # node.js prints which index closes the cycle
                        raise error.Encode(
                            "Can't encode List%s in object cycle" %
                            ValueIdString(val))

                elif node_state == FINISHED:
                    # Print it AGAIN.  We print a JSON tree, which means we
                    # can visit and print nodes MANY TIMES, as long as
                    # they're not in a cycle.
                    self._PrintList(val, level)

                else:  # UNSEEN
                    self.visited[heap_id] = EXPLORING
                    self._PrintList(val, level)
                    self.visited[heap_id] = FINISHED

            elif case(value_e.Dict):
                val = cast(value.Dict, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)
                node_state = self.visited.get(heap_id, UNSEEN)

                if node_state == EXPLORING:
                    # We're still inside this Dict, so this is a cycle
                    if self.options & SHOW_CYCLES:
                        self.buf.write('{ -->%s }' % ValueIdString(val))
                    else:
                        # node.js prints which key closes the cycle
                        raise error.Encode(
                            "Can't encode Dict%s in object cycle" %
                            ValueIdString(val))

                elif node_state == FINISHED:
                    # Print it AGAIN, as for List above
                    self._PrintDict(val, level)

                else:  # UNSEEN
                    self.visited[heap_id] = EXPLORING
                    self._PrintDict(val, level)
                    self.visited[heap_id] = FINISHED

            # BashArray and BashAssoc should be printed with pp line (x), e.g.
            # for spec tests.
            # - BashAssoc has a clear encoding.
            # - BashArray could eventually be Dict[int, str].  But that's not
            #   encodable in JSON, which has string keys!
            #   So I think we can print it like ["a",null,"b"] and that won't
            #   change.  That's what users expect.
            elif case(value_e.BashArray):
                val = cast(value.BashArray, UP_val)

                self.buf.write('[')
                self._MaybeNewline()
                for i, s in enumerate(val.strs):
                    if i != 0:
                        self.buf.write(',')
                        self._MaybeNewline()

                    self._ItemIndent(level)
                    if s is not None:
                        pyj8.WriteString(s, self.options, self.buf)
                    else:
                        self.buf.write('null')  # unset array element

                self._MaybeNewline()
                self._BracketIndent(level)
                self.buf.write(']')

            elif case(value_e.BashAssoc):
                val = cast(value.BashAssoc, UP_val)

                self.buf.write('{')
                self._MaybeNewline()
                i = 0
                for k2, v2 in iteritems(val.d):
                    if i != 0:
                        self.buf.write(',')
                        self._MaybeNewline()

                    self._ItemIndent(level)

                    pyj8.WriteString(k2, self.options, self.buf)
                    self.buf.write(':')
                    self._MaybeSpace()
                    pyj8.WriteString(v2, self.options, self.buf)

                    i += 1

                self._MaybeNewline()
                self._BracketIndent(level)
                self.buf.write('}')

            else:
                pass  # mycpp workaround
                if self.options & SHOW_NON_DATA:
                    # Similar to = operator, ui.DebugPrint()
                    # TODO: that prints value.Range in a special way
                    ysh_type = ValType(val)
                    id_str = ValueIdString(val)
                    self.buf.write('<%s%s>' % (ysh_type, id_str))
                else:
                    raise error.Encode("Can't serialize object of type %s" %
                                       ValType(val))
483
484
class PrettyPrinter(object):
    """ Unused right now, but could enhance the = operator.

    Output to polymorphic ColorOutput

    Features like asdl/format.py:
    - line wrapping
    - color
    - sharing detection by passing in a REF COUNT dict
      - print @123 the first time, and then print ... the second time

    and

    - Pretty spaces: {"k": "v", "k2": "v2"} instead of {"k":"v","k2":"v2"}
    - Unquoted: {k: "v", k2: "v2"} instead of {"k": "v", "k2": "v2"}

    - Omitting commas for ASDL?  Maybe we can use two spaces

      (Token id: Id.VSub_DollarName  start: 0  length: 3)
      (Token id:Id.VSub_DollarName start:0 length:3)  - color makes this work
    """

    def __init__(self, max_col):
        # type: (int) -> None
        """
        Args:
          max_col: stored as self.max_col; not used by any method yet
        """
        # This could be an optimized set or a C++ bit set like
        # mark_sweep_heap.h, rather than a Dict
        #self.unique_objs = mylib.UniqueObjects()

        # first pass of object ID -> number of times referenced
        self.ref_count = {}  # type: Dict[int, int]

        self.max_col = max_col

    def PrettyTree(self, val, f):
        # type: (value_t, fmt.ColorOutput) -> None
        """Placeholder: does nothing yet."""

        # TODO: first convert to hnode.asdl types?

        # Although we might want
        # hnode.AlreadyShown = (str type, int unique_id)
        pass

    def Print(self, val, buf):
        # type: (value_t, mylib.BufWriter) -> None
        """Call PrettyTree() with a ColorOutput for stdout.

        Note that the 'buf' argument is currently unused.
        """
        # Or print to stderr?
        color_out = fmt.DetectConsoleOutput(mylib.Stdout())
        self.PrettyTree(val, color_out)

        # Then print those with ASDL
        pass
537
538
class LexerDecoder(object):
    """J8 lexer and string decoder.

    Similar interface as SimpleLexer, except we return an optional decoded
    string
    """

    def __init__(self, s, is_j8, lang_str):
        # type: (str, bool, str) -> None
        """
        Args:
          s: the whole input to lex
          is_j8: if false, tokens that aren't part of JSON (single quoted
            strings, comments) are rejected by Next()
          lang_str: name of the language, interpolated into error messages
        """
        self.s = s
        self.is_j8 = is_j8
        self.lang_str = lang_str

        # Start of the next token
        self.pos = 0

        # current line being lexed -- for error messages
        self.cur_line_num = 1

        # Reuse this instance to save GC objects.  JSON objects could have
        # thousands of strings.
        self.decoded = mylib.BufWriter()

    def _Error(self, msg, end_pos):
        # type: (str, int) -> error.Decode
        """Create (but don't raise) an error spanning self.pos to end_pos."""

        # Use the current position as start pos
        return error.Decode(msg, self.s, self.pos, end_pos, self.cur_line_num)

    def Next(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Returns a token and updates self.pos

        Returns:
          (token ID, end position, decoded string).  The decoded string is
          None unless the token is Id.J8_String.

        Raises:
          error.Decode: for JSON input that uses J8-only syntax, or for an
            invalid string literal
        """
        tok_id, end_pos = match.MatchJ8Token(self.s, self.pos)

        if not self.is_j8:
            if tok_id in (Id.Left_BSingleQuote, Id.Left_USingleQuote):
                raise self._Error(
                    "Single quotes aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Ignored_Comment:
                raise self._Error(
                    "Comments aren't part of JSON; you may want 'json8 read'",
                    end_pos)

        # A left quote starts a string, which has its own lexer modes
        if tok_id in (Id.Left_DoubleQuote, Id.Left_BSingleQuote,
                      Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        if tok_id == Id.Ignored_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def NextForLines(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Like Next(), but for J8 Lines

        Raises:
          error.Decode: for an invalid string literal, an unquoted line that
            isn't valid UTF-8, or an unescaped ASCII control char
        """
        tok_id, end_pos = match.MatchJ8LinesToken(self.s, self.pos)

        if tok_id in (Id.Left_DoubleQuote, Id.Left_BSingleQuote,
                      Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        # Check that UNQUOTED lines are valid UTF-8.  (_DecodeString() does
        # this for quoted strings.)
        if (tok_id == Id.Lit_Chars and
                not pyj8.PartIsUtf8(self.s, self.pos, end_pos)):
            raise self._Error(
                'Invalid UTF-8 in %s string literal' % self.lang_str, end_pos)
        if tok_id == Id.Char_AsciiControl:
            raise self._Error(
                "J8 Lines can't have unescaped ASCII control chars", end_pos)

        if tok_id == Id.J8_Newline:
            #log('LINE %d', self.cur_line_num)
            self.cur_line_num += 1

        self.pos = end_pos
        return tok_id, end_pos, None

    def _DecodeString(self, left_id, str_pos):
        # type: (Id_t, int) -> Tuple[Id_t, int, Optional[str]]
        """ Returns a string token and updates self.pos

        Args:
          left_id: the left quote token, which selects the string lexer
          str_pos: position just after the left quote

        Returns:
          (Id.J8_String, position after the closing quote, decoded string)

        Raises:
          error.Decode: on EOF inside the string, a bad backslash escape, an
            unescaped ASCII control char, invalid UTF-8, or an escape that
            isn't allowed
        """
        while True:
            if left_id == Id.Left_DoubleQuote:
                tok_id, str_end = match.MatchJsonStrToken(self.s, str_pos)
            else:
                tok_id, str_end = match.MatchJ8StrToken(self.s, str_pos)

            #log('String tok %s', Id_str(tok_id))

            if tok_id == Id.Eol_Tok:
                # TODO: point to beginning of # quote?
                raise self._Error(
                    'Unexpected EOF while lexing %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Unknown_Backslash:
                raise self._Error(
                    'Bad backslash escape in %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Char_AsciiControl:
                raise self._Error(
                    "%s strings can't have unescaped ASCII control chars" %
                    self.lang_str, str_end)

            if tok_id in (Id.Right_SingleQuote, Id.Right_DoubleQuote):
                # End of string: hand back everything accumulated so far

                self.pos = str_end

                s = self.decoded.getvalue()
                self.decoded.clear()  # reuse this instance

                #log('decoded %r', self.decoded.getvalue())
                return Id.J8_String, str_end, s

            #
            # Now handle each kind of token
            #

            if tok_id == Id.Lit_Chars:  # JSON and J8
                part = self.s[str_pos:str_end]
                if not pyj8.PartIsUtf8(self.s, str_pos, str_end):
                    raise self._Error(
                        'Invalid UTF-8 in %s string literal' % self.lang_str,
                        str_end)

            # TODO: would be nice to avoid allocation in all these cases.
            # But LookupCharC() would have to change.

            elif tok_id == Id.Char_OneChar:  # JSON and J8
                # Skip the backslash; the next char selects the escape
                ch = self.s[str_pos + 1]
                part = consts.LookupCharC(ch)

            elif tok_id == Id.Char_UBraced:  # J8 only
                # Hex digits between the 3 leading chars and the closing }
                h = self.s[str_pos + 3:str_end - 1]
                i = int(h, 16)

                # Utf8Encode() turns anything above U+10FFFF into the
                # replacement char, which would silently corrupt the data.
                # Reject it, as we do for surrogates below.
                if i > 0x10FFFF:
                    raise self._Error(
                        "Code point can't be greater than U+10ffff", str_end)

                # Same check in osh/word_parse.py
                if 0xD800 <= i and i < 0xE000:
                    raise self._Error(
                        r"\u{%s} escape is illegal because it's in the surrogate range"
                        % h, str_end)

                part = Utf8Encode(i)

            elif tok_id == Id.Char_YHex:  # J8 only
                # Hex digits after the 2 leading chars
                h = self.s[str_pos + 2:str_end]

                # Same check in osh/word_parse.py
                if left_id != Id.Left_BSingleQuote:
                    assert left_id != Id.Left_BTSingleQuote, "Not handled here"
                    raise self._Error(
                        r"\y%s escapes not allowed in u'' strings" % h,
                        str_end)

                i = int(h, 16)
                part = chr(i)

            elif tok_id == Id.Char_SurrogatePair:
                # Two escapes of 6 chars each; take the 4 hex digits of each
                h1 = self.s[str_pos + 2:str_pos + 6]
                h2 = self.s[str_pos + 8:str_pos + 12]

                # https://www.oilshell.org/blog/2023/06/surrogate-pair.html
                i1 = int(h1, 16) - 0xD800  # high surrogate
                i2 = int(h2, 16) - 0xDC00  # low surrogate
                code_point = 0x10000 + (i1 << 10) + i2

                part = Utf8Encode(code_point)

            elif tok_id == Id.Char_Unicode4:  # JSON only, unpaired
                h = self.s[str_pos + 2:str_end]
                i = int(h, 16)
                part = Utf8Encode(i)

            else:
                # Should never happen
                raise AssertionError(Id_str(tok_id))

            #log('%s part %r', Id_str(tok_id), part)
            self.decoded.write(part)
            str_pos = str_end
723
724
class _Parser(object):
    """State and token helpers shared by the parsers in this file.

    The parser holds one token at a time: its ID, its span in the input, and
    its decoded value if it's a string.
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        """
        Args:
          s: the whole input to parse
          is_j8: selects "J8" or "JSON" as the language name, and is passed
            on to the lexer
        """
        self.s = s
        self.is_j8 = is_j8
        if is_j8:
            self.lang_str = "J8"
        else:
            self.lang_str = "JSON"

        self.lexer = LexerDecoder(s, is_j8, self.lang_str)

        # Current token
        self.tok_id = Id.Undefined_Tok
        self.start_pos = 0
        self.end_pos = 0
        self.decoded = ''  # decoded J8 string

    def _Next(self):
        # type: () -> None
        """Advance to the next token that isn't a space, newline, or comment."""

        # Setting start_pos inside the loop means that, when the loop exits,
        # it's the END of the token before the current one -- i.e. where the
        # current token starts, since ignored tokens were skipped.
        while True:
            self.start_pos = self.end_pos
            self.tok_id, self.end_pos, self.decoded = self.lexer.Next()
            if self.tok_id not in (Id.Ignored_Space, Id.Ignored_Newline,
                                   Id.Ignored_Comment):
                break
            # TODO: The position of the last newline and a token can be used
            # to calculate a column number for error messages.

        #log('NEXT %s %s %s %s', Id_str(self.tok_id), self.start_pos, self.end_pos, self.decoded or '-')

    def _Eat(self, tok_id):
        # type: (Id_t) -> None
        """Consume the current token, which must have the given ID.

        Raises:
          error.Decode: if the current token has a different ID
        """
        if self.tok_id == tok_id:
            self._Next()
            return

        #log('position %r %d-%d %r', self.s, self.start_pos,
        #    self.end_pos, self.s[self.start_pos:self.end_pos])
        raise self._ParseError("Expected %s, got %s" %
                               (Id_str(tok_id), Id_str(self.tok_id)))

    def _NextForLines(self):
        # type: () -> None
        """Like _Next, but use the J8 Lines lexer.

        No tokens are skipped.
        """
        self.start_pos = self.end_pos
        self.tok_id, self.end_pos, self.decoded = self.lexer.NextForLines()

    def _ParseError(self, msg):
        # type: (str) -> error.Decode
        """Create (but don't raise) an error located at the current token."""
        line_num = self.lexer.cur_line_num
        return error.Decode(msg, self.s, self.start_pos, self.end_pos,
                            line_num)
775
776
class Parser(_Parser):
    """JSON and JSON8 Parser.

    A recursive descent parser that produces value_t.  The entry point is
    ParseValue().
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    def _ParsePair(self):
        # type: () -> Tuple[str, value_t]
        """
        pair = string ':' value
        """
        # Grab the decoded string before _Eat() advances past the token
        key = self.decoded
        self._Eat(Id.J8_String)  # raises if it wasn't a string
        assert key is not None

        self._Eat(Id.J8_Colon)

        val = self._ParseValue()
        return key, val

    def _ParseDict(self):
        # type: () -> value_t
        """
        pair = string ':' value
        Dict = '{' '}'
             | '{' pair (',' pair)* '}'

        A later pair with the same key replaces an earlier one.
        """
        # precondition
        assert self.tok_id == Id.J8_LBrace, Id_str(self.tok_id)

        #log('> Dict')

        d = NewDict()  # type: Dict[str, value_t]

        self._Next()
        if self.tok_id == Id.J8_RBrace:  # empty dict
            self._Next()
        else:
            while True:
                k, v = self._ParsePair()
                d[k] = v
                #log('  k %s v %s Id %s', k, v, Id_str(self.tok_id))

                if self.tok_id != Id.J8_Comma:
                    break
                self._Next()

            self._Eat(Id.J8_RBrace)

        #log('< Dict')

        return value.Dict(d)

    def _ParseList(self):
        # type: () -> value_t
        """
        List = '[' ']'
             | '[' value (',' value)* ']'
        """
        # precondition
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[value_t]

        self._Next()
        if self.tok_id == Id.J8_RBracket:  # empty list
            self._Next()
        else:
            while True:
                items.append(self._ParseValue())

                if self.tok_id != Id.J8_Comma:
                    break
                self._Next()

            self._Eat(Id.J8_RBracket)

        return value.List(items)

    def _ParseValue(self):
        # type: () -> value_t
        """Parse one value, starting at the current token.

        Raises:
          error.Decode: on EOF, or on a token that can't start a value
        """
        if self.tok_id == Id.J8_LBrace:
            return self._ParseDict()

        if self.tok_id == Id.J8_LBracket:
            return self._ParseList()

        if self.tok_id == Id.J8_Null:
            self._Next()
            return value.Null

        if self.tok_id == Id.J8_Bool:
            #log('%r %d', self.s[self.start_pos], self.start_pos)
            # The first char distinguishes true from false
            b = value.Bool(self.s[self.start_pos] == 't')
            self._Next()
            return b

        if self.tok_id == Id.J8_Int:
            # Take the token text before _Next() moves the positions
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            return value.Int(mops.FromStr(part))

        if self.tok_id == Id.J8_Float:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            return value.Float(float(part))

        # UString, BString too
        if self.tok_id == Id.J8_String:
            str_val = value.Str(self.decoded)
            #log('d %r', self.decoded)
            self._Next()
            return str_val

        if self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        # Id.Unknown_Tok, Id.J8_{LParen,RParen}
        raise self._ParseError('Invalid token while parsing %s: %s' %
                               (self.lang_str, Id_str(self.tok_id)))

    def ParseValue(self):
        # type: () -> value_t
        """ Parse the whole input as a single value.

        Raises error.Decode.
        """
        self._Next()
        result = self._ParseValue()
        if self.tok_id == Id.Eol_Tok:
            return result
        raise self._ParseError('Unexpected trailing input')
907
908
class Nil8Parser(_Parser):
    """Parser for NIL8, producing nvalue_t.  The entry point is ParseNil8().

    Tokens not in JSON8:
      LParen RParen Symbol

    Tokens not in JSON, but in JSON8 and NIL8:
      Identifier (unquoted keys)
      Ignored_Comment
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    if 0:

        def _LookAhead(self):
            # type: () -> Id_t
            """
            Don't need this right now
            """
            end_pos = self.end_pos  # look ahead from last token
            while True:
                tok_id, end_pos = match.MatchJ8Token(self.s, end_pos)
                if tok_id not in (Id.Ignored_Space, Id.Ignored_Newline,
                                  Id.Ignored_Comment):
                    break
            return tok_id

    def _ParseRecord(self):
        # type: () -> nvalue_t
        """Parse '(' ... ')' into nvalue.List

        Yaks
          (self->Next)  =>  (-> self Next)
          (self->Next obj.field)  =>  ((-> self Next) (. obj field))

        Similar to
          ((identity identity) 42) => 42 in Clojure

        ASDL
          (Node left:(. x4beef2))
          (Node left !x4beef2)

        # Ambiguous because value can be identifier.
        # We have to look ahead to and see if there's a colon :
        field =
          Identifier ':' value
        | value

        record = '(' head field* ')'

        - Identifier | Symbol are treated the same, it's a side effect of
          the lexing style
        - do positional args come before named args
        - () is invalid?  Use [] for empty list
        """
        assert self.tok_id == Id.J8_LParen, Id_str(self.tok_id)

        items = []  # type: List[nvalue_t]

        self._Next()

        # Zero iterations for (), which yields an empty List
        #log('TOK %s', Id_str(self.tok_id))
        while self.tok_id != Id.J8_RParen:
            items.append(self._ParseNil8())
            #log('TOK 2 %s', Id_str(self.tok_id))

        self._Eat(Id.J8_RParen)

        return nvalue.List(items)

    def _ParseList8(self):
        # type: () -> nvalue_t
        """
        List8 = '[' value* ']'

        No commas, not even optional ones for now.
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[nvalue_t]

        self._Next()

        # Zero iterations for [], which yields an empty List
        #log('TOK %s', Id_str(self.tok_id))
        while self.tok_id != Id.J8_RBracket:
            items.append(self._ParseNil8())
            #log('TOK 2 %s', Id_str(self.tok_id))

        self._Eat(Id.J8_RBracket)

        return nvalue.List(items)

    def _ParseNil8(self):
        # type: () -> nvalue_t
        """Parse one NIL8 value, then an optional infix operator.

        If the value is followed by an operator, colon, or comma token, the
        result is the 3 item List [operator, value, right operand].

        Raises:
          error.Decode: on EOF, or on a token that can't start a value
        """
        if self.tok_id == Id.J8_LParen:
            obj = self._ParseRecord()  # type: nvalue_t

        elif self.tok_id == Id.J8_LBracket:
            obj = self._ParseList8()

        # Primitives are copied from J8 above.
        # TODO: We also want hex literals.
        elif self.tok_id == Id.J8_Null:
            self._Next()
            obj = nvalue.Null

        elif self.tok_id == Id.J8_Bool:
            # The first char distinguishes true from false
            obj = nvalue.Bool(self.s[self.start_pos] == 't')
            self._Next()

        elif self.tok_id == Id.J8_Int:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Int(int(part))

        elif self.tok_id == Id.J8_Float:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Float(float(part))

        elif self.tok_id == Id.J8_String:
            obj = nvalue.Str(self.decoded)
            self._Next()

        # <- etc.
        elif self.tok_id in (Id.J8_Identifier, Id.J8_Operator, Id.J8_Colon,
                             Id.J8_Comma):
            # unquoted "word" treated like a string
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Symbol(part)

        elif self.tok_id == Id.Eol_Tok:
            raise self._ParseError('Unexpected EOF while parsing %s' %
                                   self.lang_str)

        else:  # Id.Unknown_Tok, Id.J8_{LParen,RParen}
            raise self._ParseError('Invalid token while parsing %s: %s' %
                                   (self.lang_str, Id_str(self.tok_id)))

        #log('YO %s', Id_str(self.tok_id))
        if self.tok_id in (Id.J8_Operator, Id.J8_Colon, Id.J8_Comma):
            #log('AT %s', Id_str(self.tok_id))

            # key: "value" -> (: key "value")
            op_text = self.s[self.start_pos:self.end_pos]
            op = nvalue.Symbol(op_text)

            self._Next()
            operand2 = self._ParseNil8()
            infix = nvalue.List([op, obj, operand2])  # type: nvalue_t
            #print("--> INFIX %d %s" % (id(infix), infix))
            return infix

        #next_id = self._LookAhead()
        #print('NEXT %s' % Id_str(next_id))

        #raise AssertionError()
        #print("--> OBJ %d %s" % (id(obj), obj))
        return obj

    def ParseNil8(self):
        # type: () -> nvalue_t
        """ Parse the whole input as a single NIL8 value.

        Raises error.Decode.
        """
        self._Next()
        #print('yo')
        result = self._ParseNil8()
        #print("==> %d %s" % (id(result), result))
        if self.tok_id == Id.Eol_Tok:
            return result
        raise self._ParseError('Unexpected trailing input')
1091
1092
class J8LinesParser(_Parser):
    """Decode lines from a string with newlines.

    We specify this with a grammar, to preserve location info and to reduce
    allocations.  (But note that unquoted_line is more like a LOOP than it is
    grammatical.)

    Grammar:

      end           = J8_Newline | Eol_Tok

      empty_line    = WS_Space? end

      # special case: read until end token, but REMOVE trailing WS_Space
      unquoted_line = WS_Space? Lit_Chars ANY* WS_Space? end

      j8_line       = WS_Space? J8_String WS_Space? end

      lines         = (empty_line | unquoted_line | j8_line)*

    where Lit_Chars is valid UTF-8

    Notes:

    (1) We disallow multiple strings on a line, like:

    "json" "json2"
    "json" unquoted

    (2) Internal quotes are allowed on unquoted lines.  Consider this line:

      foo "" u''

    The "" and u'' are not a decoded string, because the line started with
    Id.Lit_Chars literals.

    (3) This is related to TSV8?  Similar rules.  Does TSV8 have empty cells?
    Does it have - for empty cell?
    """

    def __init__(self, s):
        # type: (str) -> None
        _Parser.__init__(self, s, True)  # is_j8 is always true here

    def _Show(self, s):
        # type: (str) -> None
        """Debugging aid: log the current token, prefixed with a label."""
        log('%s tok_id %s %d-%d', s, Id_str(self.tok_id), self.start_pos,
            self.end_pos)

    def _ParseLine(self, out):
        # type: (List[str]) -> None
        """ May append a line to 'out'

        Consumes one line, including its end token.  Nothing is appended for
        an empty line.

        Raises:
          error.Decode: if a quoted string is followed by more text
        """
        #self._Show('1')
        if self.tok_id == Id.WS_Space:  # leading whitespace
            self._NextForLines()

        # Empty line - return without doing anything
        if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
            self._NextForLines()
            return

        # Quoted string on line
        if self.tok_id == Id.J8_String:
            out.append(self.decoded)
            self._NextForLines()

            if self.tok_id == Id.WS_Space:  # trailing whitespace
                self._NextForLines()

            if self.tok_id not in (Id.J8_Newline, Id.Eol_Tok):
                raise self._ParseError('Unexpected text after J8 Line (%s)' %
                                       Id_str(self.tok_id))

            self._NextForLines()
            return

        # Anything else must be the start of an unquoted line
        if self.tok_id != Id.Lit_Chars:
            raise AssertionError(Id_str(self.tok_id))

        # ' unquoted "" text on line  '  # read every token until end
        line_start = self.start_pos
        while True:
            # Remember the previous token, for stripping whitespace
            last_id = self.tok_id
            last_start = self.start_pos

            self._NextForLines()

            # It would be nicer if "middle" Id.WS_Space tokens didn't have
            # \r, but we're sticking with the JSON spec definition of
            # whitespace.  (As another data point, CPython on Unix allows
            # \r in the middle of expressions, treating it as whitespace.)
            if self.tok_id in (Id.J8_Newline, Id.Eol_Tok):
                break

        # If the token before the end is whitespace, the line stops where
        # that whitespace starts.  Otherwise it stops at the end token.
        line_end = last_start if last_id == Id.WS_Space else self.start_pos

        out.append(self.s[line_start:line_end])

        self._NextForLines()  # past newline

    def Parse(self):
        # type: () -> List[str]
        """ Decode all the lines in the input.

        Raises error.Decode.
        """
        self._NextForLines()

        lines = []  # type: List[str]
        while self.tok_id != Id.Eol_Tok:
            self._ParseLine(lines)

        # Defensive: the loop above only exits at Eol_Tok
        if self.tok_id != Id.Eol_Tok:
            raise self._ParseError('Unexpected trailing input in J8 Lines')

        return lines
1212
1213
def SplitJ8Lines(s):
    # type: (str) -> List[str]
    """Split a string into J8 Lines; used by @(echo split command sub)

    Raises:
      error.Decode

    3 Errors:
    - J8 string syntax error inside quotes
    - Extra input on line
    - unquoted line isn't utf-8
    """
    return J8LinesParser(s).Parse()


# vim: sw=4
1228
1229
1230# vim: sw=4