OILS / data_lang / j8.py View on Github | oilshell.org

1053 lines, 516 significant
1#!/usr/bin/env python2
2"""
3j8.py: J8 Notation, a superset of JSON
4
5TODO:
6
7- Many more tests
8 - Run JSONTestSuite
9
10Later:
11
12- PrettyPrinter uses hnode.asdl?
13 - color
14 - line wrapping -- do this later
15 - would like CONTRIBUTORS here
16
17- Unify with ASDL pretty printing - NIL8
18 - {} [] are identical
19 - () is for statically typed ASDL data
20 (command.Simple blame_tok:(...) words:[ ])
21 although we are also using [] for typed ASDL arrays, not just JSON
22 - object IDs
23 - @ x123 can create an ID
24 - ! x123 can reference an ID
25 - <> can be for non-J8 data types? For the = operator
26 - 'hi \(name)' interpolation is useful for code
27
28- Common between JSON8 and NIL8 - for writing by hand
29 - comments - # line or // line (JSON5 uses // line, following JS)
30 - unquoted identifier names - TYG8 could be more relaxed for (+ 1 (* 3 4))
31 - commas
32 - JSON8 could have trailing commas rule
33 - NIL8 at least has no commas for [1 2 "hi"]
34"""
35
36from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
37from _devbuild.gen.value_asdl import (value, value_e, value_t, value_str)
38from _devbuild.gen.nil8_asdl import (nvalue, nvalue_t)
39
40from asdl import format as fmt
41from core import error
42from data_lang import pyj8
43# dependency issue: consts.py pulls in frontend/option_def.py
44from frontend import consts
45from frontend import match
46from mycpp import mops
47from mycpp import mylib
48from mycpp.mylib import tagswitch, iteritems, NewDict, log
49
50import fastfunc
51
52_ = log
53
54from typing import cast, Dict, List, Tuple, Optional
55
56
# COPIED from ui.ValType() to break dep
def ValType(val):
    # type: (value_t) -> str
    """Return the type name of a value, for type errors shown in the UI."""
    tag = val.tag()
    return value_str(tag, dot=False)
63
64
if mylib.PYTHON:

    def HeapValueId(val):
        # type: (value_t) -> int
        """Return an integer identifying a value object.

        This Python version uses id(), which returns the address and can be
        up to 64 bits wide.

        In C++ we can use the GC ID, which fits within 32 bits.
        """
        obj_id = id(val)
        return obj_id
75
76
def ValueId(val):
    # type: (value_t) -> int
    """Return an integer ID for a value, or -1 if it has no identity.

    The ID:

    1. Can be used to determine whether 2 objects are the same, e.g. for
       List, Dict, Func, Proc, etc.
    2. Will help detect object cycles

    Primitive types like Int and Float don't have this notion.  They're
    immutable values that are copied and compared by value, so they get -1.
    """
    heap_id = -1
    with tagswitch(val) as case:
        if case(value_e.Null, value_e.Bool, value_e.Int, value_e.Float,
                value_e.Str):
            # These will not be on the heap if we switch to tagged pointers.
            # Str is handled conservatively - when we add small string
            # optimization, some strings will be values, so we assume all
            # are.
            heap_id = -1
        else:
            heap_id = HeapValueId(val)
    return heap_id
98
99
def ValueIdString(val):
    # type: (value_t) -> str
    """Format the ID of a value as ' 0x<hex>', or '' if it has none.

    Used by pp value (42) and = 42
    """
    heap_id = ValueId(val)  # -1 means the value has no identity
    if heap_id != -1:
        return ' 0x%s' % mylib.hex_lower(heap_id)
    return ''
108
109
def Utf8Encode(code):
    # type: (int) -> str
    """Return utf-8 encoded bytes from a unicode code point.

    Code points above 0x10FFFF produce the encoding of the replacement
    character U+FFFD.  Surrogate code points are not rejected here.

    Based on https://stackoverflow.com/a/23502707
    """
    if code <= 0x7F:
        return chr(code & 0x7F)  # ASCII fits in a single byte

    if code > 0x10FFFF:
        return '\xEF\xBF\xBD'  # unicode replacement character

    # How many continuation bytes follow the lead byte
    num_cont_bytes = 0
    if code <= 0x7FF:
        num_cont_bytes = 1
    elif code <= 0xFFFF:
        num_cont_bytes = 2
    else:
        num_cont_bytes = 3

    # Continuation bytes are produced low-order first, and reversed below
    encoded = []  # type: List[int]
    remaining = num_cont_bytes
    while remaining > 0:
        encoded.append(0x80 | (code & 0x3F))
        code >>= 6
        remaining -= 1

    # Lead byte: length marker in the high bits, then the leftover payload
    lead = (0x1E << (6 - num_cont_bytes)) | (code & (0x3F >> num_cont_bytes))
    encoded.append(lead)
    encoded.reverse()

    # mod 256 because Python ints don't wrap around!
    chars = [chr(b & 0xFF) for b in encoded]
    return ''.join(chars)
143
144
# Bit flags that can be OR'd together into the printer's 'options' argument
SHOW_CYCLES = 1 << 1  # print a cycle as [ --> 0x.. ] or { --> 0x.. }, no error
SHOW_NON_DATA = 1 << 2  # non-data objects like Eggex can be <Eggex 0xff>
LOSSY_JSON = 1 << 3  # JSON is lossy

# Hack until we fully translate
assert LOSSY_JSON == pyj8.LOSSY_JSON
151
152
def _Print(val, buf, indent, options=0):
    # type: (value_t, mylib.BufWriter, int, int) -> None
    """Write the encoding of val to buf with a fresh InstancePrinter.

    Args:
      indent: number of spaces to indent, or -1 for everything on one line
      options: bit flags like SHOW_CYCLES, SHOW_NON_DATA, LOSSY_JSON
    """
    printer = InstancePrinter(buf, indent, options)
    printer.Print(val)
161
162
def PrintMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Encode val into buf with no option flags set.

    For json8 write (x) and toJson8()

    Caller must handle error.Encode
    """
    _Print(val, buf, indent)
170
171
def PrintJsonMessage(val, buf, indent):
    # type: (value_t, mylib.BufWriter, int) -> None
    """Encode val into buf with the LOSSY_JSON flag set.

    For json write (x) and toJson()

    Caller must handle error.Encode()
    Doesn't decay to b'' strings - will use Unicode replacement char.
    """
    _Print(val, buf, indent, options=LOSSY_JSON)
180
181
def PrintLine(val, f):
    # type: (value_t, mylib.Writer) -> None
    """Write val to f on a single line, followed by a newline.

    For pp line (x)
    """
    # error.Encode should be impossible - we show cycles and non-data
    flags = SHOW_CYCLES | SHOW_NON_DATA

    line_buf = mylib.BufWriter()
    _Print(val, line_buf, -1, options=flags)

    f.write(line_buf.getvalue())
    f.write('\n')
191
192
def EncodeString(s, buf, unquoted_ok=False):
    # type: (str, mylib.BufWriter, bool) -> None
    """Write the string s to buf, encoded on one line.

    For pp proc, etc.

    Args:
      unquoted_ok: if true, s is written as-is whenever
                   fastfunc.CanOmitQuotes(s) says that's allowed
    """
    if unquoted_ok and fastfunc.CanOmitQuotes(s):
        buf.write(s)
    else:
        _Print(value.Str(s), buf, -1)
202
203
def MaybeEncodeString(s):
    # type: (str) -> str
    """Return the string s encoded on one line, with no option flags.

    For write --json8 $s and compexport
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    _Print(value.Str(s), out, -1)
    encoded = out.getvalue()
    return encoded
214
215
def MaybeEncodeJsonString(s):
    # type: (str) -> str
    """Return the string s encoded on one line, with LOSSY_JSON set.

    For write --json
    """
    # TODO: add unquoted_ok here?
    # /usr/local/foo-bar/x.y/a_b

    out = mylib.BufWriter()
    _Print(value.Str(s), out, -1, options=LOSSY_JSON)
    encoded = out.getvalue()
    return encoded
225
226
# States of a container during the depth-first traversal that prints it
UNSEEN = 0  # not visited yet
EXPLORING = 1  # currently being printed; seeing it again means a cycle
FINISHED = 2  # printed completely; may be printed again
231
232
class InstancePrinter(object):
    """Print a value tree as J8/JSON.

    Constructor args:
      buf: destination for the encoded text
      indent: number of spaces per nesting level, or -1 to put everything on
              one line
      options: bitwise OR of SHOW_CYCLES, SHOW_NON_DATA, LOSSY_JSON
    """

    def __init__(self, buf, indent, options):
        # type: (mylib.BufWriter, int, int) -> None
        self.buf = buf
        self.indent = indent
        self.options = options

        # Key is HeapValueId(val)
        # Value is the DFS state: EXPLORING or FINISHED.  A missing key means
        # UNSEEN.
        self.visited = {}  # type: Dict[int, int]

    def _ItemIndent(self, level):
        # type: (int) -> None
        """Indent for an item nested inside a container at 'level'."""

        if self.indent == -1:
            return

        self.buf.write_spaces((level + 1) * self.indent)

    def _BracketIndent(self, level):
        # type: (int) -> None
        """Indent for the closing bracket of a container at 'level'."""

        if self.indent == -1:
            return

        self.buf.write_spaces(level * self.indent)

    def _MaybeNewline(self):
        # type: () -> None
        """Write a newline, unless everything goes on one line."""
        if self.indent == -1:
            return
        self.buf.write('\n')

    def _MaybeSpace(self):
        # type: () -> None
        """Write a space, unless everything goes on one line."""
        if self.indent == -1:
            return
        self.buf.write(' ')

    def _PrintList(self, val, level):
        # type: (value.List, int) -> None
        """Write a List as [item, ...], recursing into each item."""

        if len(val.items) == 0:  # Special case like Python/JS
            self.buf.write('[]')
        else:
            self.buf.write('[')
            self._MaybeNewline()
            for i, item in enumerate(val.items):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level)
                self.Print(item, level + 1)
            self._MaybeNewline()

            self._BracketIndent(level)
            self.buf.write(']')

    def _PrintDict(self, val, level):
        # type: (value.Dict, int) -> None
        """Write a Dict as {key: value, ...}, recursing into each value."""

        if len(val.d) == 0:  # Special case like Python/JS
            self.buf.write('{}')
        else:
            self.buf.write('{')
            self._MaybeNewline()
            i = 0
            for k, v in iteritems(val.d):
                if i != 0:
                    self.buf.write(',')
                    self._MaybeNewline()

                self._ItemIndent(level)

                pyj8.WriteString(k, self.options, self.buf)

                self.buf.write(':')
                self._MaybeSpace()

                self.Print(v, level + 1)

                i += 1

            self._MaybeNewline()
            self._BracketIndent(level)
            self.buf.write('}')

    def _PrintBashArray(self, val, level):
        # type: (value.BashArray, int) -> None
        """Write a BashArray as a list of strings, with null for None."""

        if len(val.strs) == 0:  # Same special case as _PrintList
            self.buf.write('[]')
            return

        self.buf.write('[')
        self._MaybeNewline()
        for i, s in enumerate(val.strs):
            if i != 0:
                self.buf.write(',')
                self._MaybeNewline()

            self._ItemIndent(level)
            if s is None:
                self.buf.write('null')
            else:
                pyj8.WriteString(s, self.options, self.buf)

        self._MaybeNewline()

        self._BracketIndent(level)
        self.buf.write(']')

    def _PrintBashAssoc(self, val, level):
        # type: (value.BashAssoc, int) -> None
        """Write a BashAssoc as an object with string keys and values."""

        if len(val.d) == 0:  # Same special case as _PrintDict
            self.buf.write('{}')
            return

        self.buf.write('{')
        self._MaybeNewline()
        i = 0
        for k2, v2 in iteritems(val.d):
            if i != 0:
                self.buf.write(',')
                self._MaybeNewline()

            self._ItemIndent(level)

            pyj8.WriteString(k2, self.options, self.buf)

            self.buf.write(':')
            self._MaybeSpace()

            pyj8.WriteString(v2, self.options, self.buf)

            i += 1

        self._MaybeNewline()
        self._BracketIndent(level)
        self.buf.write('}')

    def Print(self, val, level=0):
        # type: (value_t, int) -> None
        """Write the encoding of val to self.buf.

        Args:
          level: nesting depth of val, used to compute indentation

        Raises:
          error.Encode: if a List or Dict is part of a cycle and SHOW_CYCLES
            isn't set, or if val isn't a data type and SHOW_NON_DATA isn't
            set
        """

        # self.indent == -1 is a special value that means everything is on
        # one line.  It's like
        #    JSON.stringify(d, null, 0)
        # except we use -1, not 0.  0 can still have newlines.

        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Null):
                self.buf.write('null')

            elif case(value_e.Bool):
                val = cast(value.Bool, UP_val)
                self.buf.write('true' if val.b else 'false')

            elif case(value_e.Int):
                val = cast(value.Int, UP_val)
                # TODO: avoid intermediate allocation with
                # self.buf.WriteBigInt(val.i)
                #
                # Or maybe we need pyj8.WriteBigInt() because truly BigInt may
                # be of arbitrary length, and will need a growth strategy.
                # Although that is not very common, so we could allocate in
                # that case.

                self.buf.write(mops.ToStr(val.i))

            elif case(value_e.Float):
                val = cast(value.Float, UP_val)
                # TODO: avoid intermediate allocation with
                # self.buf.WriteFloat(val.f)
                self.buf.write(str(val.f))

            elif case(value_e.Str):
                val = cast(value.Str, UP_val)

                pyj8.WriteString(val.s, self.options, self.buf)

            elif case(value_e.List):
                val = cast(value.List, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                node_state = self.visited.get(heap_id, UNSEEN)
                if node_state == FINISHED:
                    # Print it AGAIN.  We print a JSON tree, which means we can
                    # visit and print nodes MANY TIMES, as long as they're not
                    # in a cycle.
                    self._PrintList(val, level)
                    return
                if node_state == EXPLORING:
                    if self.options & SHOW_CYCLES:
                        self.buf.write('[ -->%s ]' % ValueIdString(val))
                        return
                    else:
                        # node.js prints which index closes the cycle
                        raise error.Encode(
                            "Can't encode List%s in object cycle" %
                            ValueIdString(val))

                self.visited[heap_id] = EXPLORING
                self._PrintList(val, level)
                self.visited[heap_id] = FINISHED

            elif case(value_e.Dict):
                val = cast(value.Dict, UP_val)

                # Cycle detection, only for containers that can be in cycles
                heap_id = HeapValueId(val)

                node_state = self.visited.get(heap_id, UNSEEN)
                if node_state == FINISHED:
                    # Print it AGAIN.  We print a JSON tree, which means we can
                    # visit and print nodes MANY TIMES, as long as they're not
                    # in a cycle.
                    self._PrintDict(val, level)
                    return
                if node_state == EXPLORING:
                    if self.options & SHOW_CYCLES:
                        self.buf.write('{ -->%s }' % ValueIdString(val))
                        return
                    else:
                        # node.js prints which key closes the cycle
                        raise error.Encode(
                            "Can't encode Dict%s in object cycle" %
                            ValueIdString(val))

                self.visited[heap_id] = EXPLORING
                self._PrintDict(val, level)
                self.visited[heap_id] = FINISHED

            # BashArray and BashAssoc should be printed with pp line (x), e.g.
            # for spec tests.
            # - BashAssoc has a clear encoding.
            # - BashArray could eventually be Dict[int, str].  But that's not
            #   encodable in JSON, which has string keys!
            #   So I think we can print it like ["a",null,"b"] and that won't
            #   change.  That's what users expect.
            elif case(value_e.BashArray):
                val = cast(value.BashArray, UP_val)
                self._PrintBashArray(val, level)

            elif case(value_e.BashAssoc):
                val = cast(value.BashAssoc, UP_val)
                self._PrintBashAssoc(val, level)

            else:
                pass  # mycpp workaround
                if self.options & SHOW_NON_DATA:
                    # Similar to = operator, ui.DebugPrint()
                    # TODO: that prints value.Range in a special way
                    ysh_type = ValType(val)
                    id_str = ValueIdString(val)
                    self.buf.write('<%s%s>' % (ysh_type, id_str))
                else:
                    raise error.Encode("Can't serialize object of type %s" %
                                       ValType(val))
483
484
class PrettyPrinter(object):
    """Placeholder for a pretty printer; PrettyTree() does nothing yet.

    Unused right now, but could enhance the = operator.

    Output to polymorphic ColorOutput

    Features like asdl/format.py:
    - line wrapping
    - color
    - sharing detection by passing in a REF COUNT dict
      - print @123 the first time, and then print ... the second time

    and

    - Pretty spaces: {"k": "v", "k2": "v2"} instead of {"k":"v","k2":"v2"}
    - Unquoted: {k: "v", k2: "v2"} instead of {"k": "v", "k2": "v2"}

    - Omitting commas for ASDL?  Maybe we can use two spaces

      (Token id: Id.VSub_DollarName  start: 0  length: 3)
      (Token id:Id.VSub_DollarName start:0 length:3)  - color makes this work
    """

    def __init__(self, max_col):
        # type: (int) -> None
        self.max_col = max_col

        # This could be an optimized set, a C++ bit set like
        # mark_sweep_heap.h, rather than a Dict
        #self.unique_objs = mylib.UniqueObjects()

        # For a first pass: object ID -> number of times it's referenced
        self.ref_count = {}  # type: Dict[int, int]

    def PrettyTree(self, val, f):
        # type: (value_t, fmt.ColorOutput) -> None
        """Not implemented yet: currently ignores both arguments."""

        # TODO: first convert to hnode.asdl types?

        # Although we might want
        # hnode.AlreadyShown = (str type, int unique_id)
        pass

    def Print(self, val, buf):
        # type: (value_t, mylib.BufWriter) -> None
        """Pass val to PrettyTree() with a console output on stdout.

        The buf argument is currently unused.
        """

        # Or print to stderr?
        out = fmt.DetectConsoleOutput(mylib.Stdout())
        self.PrettyTree(val, out)

        # Then print those with ASDL
537
538
class LexerDecoder(object):
    """J8 lexer and string decoder.

    Similar interface as SimpleLexer, except we return an optional decoded
    string

    Constructor args:
      s: the text to lex
      is_j8: if false, tokens that aren't part of JSON (single quoted
             strings, comments) are errors
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        self.s = s
        self.is_j8 = is_j8
        # Language name shown in error messages.  Same rule as _Parser, so
        # that errors about JSON text don't name a different language.
        self.lang_str = "J8" if is_j8 else "JSON"

        self.pos = 0
        # Reuse this instance to save GC objects.  JSON objects could have
        # thousands of strings.
        self.decoded = mylib.BufWriter()

    def _Error(self, msg, end_pos):
        # type: (str, int) -> error.Decode
        """Create (but don't raise) an error spanning self.pos to end_pos."""

        # Use the current position as start pos
        return error.Decode(msg, self.s, self.pos, end_pos)

    def Next(self):
        # type: () -> Tuple[Id_t, int, Optional[str]]
        """ Returns a token and updates self.pos

        Returns:
          (token ID, end position, decoded string).  The decoded string is
          None unless the token is a string.

        Raises:
          error.Decode: for JSON-only input containing single quoted strings
            or comments, and for errors inside a string
        """

        tok_id, end_pos = match.MatchJ8Token(self.s, self.pos)

        if not self.is_j8:
            if tok_id in (Id.Left_BSingleQuote, Id.Left_USingleQuote):
                raise self._Error(
                    "Single quotes aren't part of JSON; you may want 'json8 read'",
                    end_pos)
            if tok_id == Id.Ignored_Comment:
                raise self._Error(
                    "Comments aren't part of JSON; you may want 'json8 read'",
                    end_pos)

        # An opening quote: lex and decode the rest of the string
        if tok_id in (Id.Left_DoubleQuote, Id.Left_BSingleQuote,
                      Id.Left_USingleQuote):
            return self._DecodeString(tok_id, end_pos)

        # Non-string tokens like { } null etc.
        self.pos = end_pos
        return tok_id, end_pos, None

    def _DecodeString(self, left_id, str_pos):
        # type: (Id_t, int) -> Tuple[Id_t, int, Optional[str]]
        """ Returns a string token and updates self.pos

        Args:
          left_id: ID of the opening quote token, which selects the sub-lexer
          str_pos: position just after the opening quote

        self.pos isn't updated until the closing quote is found, so errors
        raised here start at the beginning of the string token.

        Raises:
          error.Decode: on EOF, an unknown token, an ASCII control char,
            invalid UTF-8, or a disallowed escape
        """

        while True:
            if left_id == Id.Left_DoubleQuote:
                tok_id, str_end = match.MatchJsonStrToken(self.s, str_pos)
            else:
                tok_id, str_end = match.MatchJ8StrToken(self.s, str_pos)

            if tok_id == Id.Eol_Tok:
                # TODO: point to beginning of # quote?
                raise self._Error(
                    'Unexpected EOF while lexing %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Unknown_Tok:
                # e.g. invalid backslash
                raise self._Error(
                    'Unknown token while lexing %s string' % self.lang_str,
                    str_end)
            if tok_id == Id.Char_AsciiControl:
                raise self._Error(
                    "ASCII control chars are illegal in %s strings" %
                    self.lang_str, str_end)

            if tok_id in (Id.Right_SingleQuote, Id.Right_DoubleQuote):

                self.pos = str_end

                s = self.decoded.getvalue()
                self.decoded.clear()  # reuse this instance

                #log('decoded %r', self.decoded.getvalue())
                return Id.J8_String, str_end, s

            #
            # Now handle each kind of token
            #

            if tok_id == Id.Lit_Chars:  # JSON and J8
                part = self.s[str_pos:str_end]
                if not pyj8.PartIsUtf8(self.s, str_pos, str_end):
                    # Syntax error because JSON must be valid UTF-8
                    # Limit context to 20 chars arbitrarily
                    snippet = self.s[str_pos:str_pos + 20]
                    raise self._Error(
                        'Invalid UTF-8 in %s string literal: %r' %
                        (self.lang_str, snippet), str_end)

            # TODO: would be nice to avoid allocation in all these cases.
            # But LookupCharC() would have to change.

            elif tok_id == Id.Char_OneChar:  # JSON and J8
                ch = self.s[str_pos + 1]  # the char after the backslash
                part = consts.LookupCharC(ch)

            elif tok_id == Id.Char_UBraced:  # J8 only
                # Hex digits between \u{ and }
                h = self.s[str_pos + 3:str_end - 1]
                i = int(h, 16)

                # Same check in osh/word_parse.py
                if 0xD800 <= i and i < 0xE000:
                    raise self._Error(
                        r"\u{%s} escape is illegal because it's in the surrogate range"
                        % h, str_end)

                part = Utf8Encode(i)

            elif tok_id == Id.Char_YHex:  # J8 only
                h = self.s[str_pos + 2:str_end]  # hex digits after \y

                # Same check in osh/word_parse.py
                if left_id != Id.Left_BSingleQuote:
                    assert left_id != Id.Left_BTSingleQuote, "Not handled here"
                    raise self._Error(
                        r"\y%s escapes not allowed in u'' strings" % h,
                        str_end)

                i = int(h, 16)
                part = chr(i)

            elif tok_id == Id.Char_SurrogatePair:
                # Two escapes in a row: 4 hex digits follow each \u
                h1 = self.s[str_pos + 2:str_pos + 6]
                h2 = self.s[str_pos + 8:str_pos + 12]

                # https://www.oilshell.org/blog/2023/06/surrogate-pair.html
                i1 = int(h1, 16) - 0xD800  # high surrogate
                i2 = int(h2, 16) - 0xDC00  # low surrogate
                code_point = 0x10000 + (i1 << 10) + i2

                part = Utf8Encode(code_point)

            elif tok_id == Id.Char_Unicode4:  # JSON only, unpaired
                h = self.s[str_pos + 2:str_end]
                i = int(h, 16)
                part = Utf8Encode(i)

            else:
                # Should never happen
                raise AssertionError(Id_str(tok_id))

            #log('%s part %r', Id_str(tok_id), part)
            self.decoded.write(part)
            str_pos = str_end
691
692
class _Parser(object):
    """Current-token state and helpers shared by Parser and Nil8Parser."""

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        self.s = s
        self.is_j8 = is_j8
        self.lang_str = "J8" if is_j8 else "JSON"  # for error messages

        self.lexer = LexerDecoder(s, is_j8)

        # The current token: its ID, its span in self.s, and its decoded
        # value if it's a string
        self.tok_id = Id.Undefined_Tok
        self.start_pos = 0
        self.end_pos = 0
        self.decoded = ''

    def _Error(self, msg):
        # type: (str) -> error.Decode
        """Create (but don't raise) an error that spans the current token."""
        return error.Decode(msg, self.s, self.start_pos, self.end_pos)

    def _Next(self):
        # type: () -> None
        """Advance to the next token that isn't a space or a comment."""

        while True:
            # The new token starts where the previous one ended
            self.start_pos = self.end_pos
            self.tok_id, self.end_pos, self.decoded = self.lexer.Next()
            if self.tok_id not in (Id.Ignored_Space, Id.Ignored_Comment):
                break
            # TODO: add Ignored_Newline to count lines, and show line numbers
            # in errors messages.  The position of the last newline and a
            # token can be used to calculate a column number.

        #log('NEXT %s %s %s %s', Id_str(self.tok_id), self.start_pos, self.end_pos, self.decoded or '-')

    def _Eat(self, tok_id):
        # type: (Id_t) -> None
        """Advance past the current token, which must have the given ID.

        Raises:
          error.Decode: if the current token has a different ID
        """

        # TODO: Need location info
        if self.tok_id == tok_id:
            self._Next()
            return

        #log('position %r %d-%d %r', self.s, self.start_pos,
        #    self.end_pos, self.s[self.start_pos:self.end_pos])
        raise self._Error("Expected %s, got %s" %
                          (Id_str(tok_id), Id_str(self.tok_id)))
736
737
class Parser(_Parser):
    """JSON and JSON8 Parser."""

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    def _ParsePair(self):
        # type: () -> Tuple[str, value_t]
        """
        pair = string ':' value
        """
        # Grab the potential string value before _Eat() advances past it
        key = self.decoded
        self._Eat(Id.J8_String)  # Check that it's a string
        assert key is not None

        self._Eat(Id.J8_Colon)

        val = self._ParseValue()
        return key, val

    def _ParseDict(self):
        # type: () -> value_t
        """
        pair = string ':' value
        Dict = '{' '}'
             | '{' pair (',' pair)* '}'
        """
        # precondition
        assert self.tok_id == Id.J8_LBrace, Id_str(self.tok_id)

        d = NewDict()  # type: Dict[str, value_t]

        self._Next()
        if self.tok_id == Id.J8_RBrace:  # empty
            self._Next()
            return value.Dict(d)

        # One pair, then another one after every comma
        while True:
            k, v = self._ParsePair()
            d[k] = v
            if self.tok_id != Id.J8_Comma:
                break
            self._Next()

        self._Eat(Id.J8_RBrace)

        return value.Dict(d)

    def _ParseList(self):
        # type: () -> value_t
        """
        List = '[' ']'
             | '[' value (',' value)* ']'
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[value_t]

        self._Next()
        if self.tok_id == Id.J8_RBracket:  # empty
            self._Next()
            return value.List(items)

        # One value, then another one after every comma
        while True:
            items.append(self._ParseValue())
            if self.tok_id != Id.J8_Comma:
                break
            self._Next()

        self._Eat(Id.J8_RBracket)

        return value.List(items)

    def _ParseValue(self):
        # type: () -> value_t
        """Parse the value that starts at the current token.

        Raises:
          error.Decode: on EOF, or on a token that can't start a value
        """
        if self.tok_id == Id.J8_LBrace:
            return self._ParseDict()

        if self.tok_id == Id.J8_LBracket:
            return self._ParseList()

        if self.tok_id == Id.J8_Null:
            self._Next()
            return value.Null

        if self.tok_id == Id.J8_Bool:
            # The token starts with 't' if and only if it's 'true'
            b = value.Bool(self.s[self.start_pos] == 't')
            self._Next()
            return b

        if self.tok_id == Id.J8_Int:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            return value.Int(mops.FromStr(part))

        if self.tok_id == Id.J8_Float:
            part = self.s[self.start_pos:self.end_pos]
            self._Next()
            return value.Float(float(part))

        # UString, BString too
        if self.tok_id == Id.J8_String:
            str_val = value.Str(self.decoded)
            self._Next()
            return str_val

        if self.tok_id == Id.Eol_Tok:
            raise self._Error('Unexpected EOF while parsing %s' %
                              self.lang_str)

        # Id.Unknown_Tok, Id.J8_{LParen,RParen}
        raise self._Error('Invalid token while parsing %s: %s' %
                          (self.lang_str, Id_str(self.tok_id)))

    def ParseValue(self):
        # type: () -> value_t
        """Parse the whole input as a single value.

        Raises error.Decode.
        """
        self._Next()
        obj = self._ParseValue()
        if self.tok_id == Id.Eol_Tok:
            return obj
        raise self._Error('Unexpected trailing input')
868
869
class Nil8Parser(_Parser):
    """Parser for NIL8, which produces nvalue_t trees.

    Tokens not in JSON8:
      LParen RParen Symbol

    Tokens not in JSON, but in JSON8 and NIL8:
      Identifier (unquoted keys)
      Ignored_Comment
    """

    def __init__(self, s, is_j8):
        # type: (str, bool) -> None
        _Parser.__init__(self, s, is_j8)

    if 0:

        def _LookAhead(self):
            # type: () -> Id_t
            """
            Don't need this right now
            """
            end_pos = self.end_pos  # look ahead from last token
            while True:
                tok_id, end_pos = match.MatchJ8Token(self.s, end_pos)
                if tok_id not in (Id.Ignored_Space, Id.Ignored_Comment):
                    break
            return tok_id

    def _ParseRecord(self):
        # type: () -> nvalue_t
        """
        Yaks
          (self->Next)  =>  (-> self Next)
          (self->Next obj.field)  =>  ((-> self Next) (. obj field))

          Similar to
          ((identity identity) 42)  =>  42 in Clojure

        ASDL
          (Node left:(. x4beef2))
          (Node left !x4beef2)

        # Ambiguous because value can be identifier.
        # We have to look ahead to and see if there's a colon :
        field =
          Identifier ':' value
        | value

        record = '(' head field* ')'

        - Identifier | Symbol are treated the same, it's a side effect of
          the lexing style
        - do positional args come before named args
        - () is invalid?  Use [] for empty list
        """
        assert self.tok_id == Id.J8_LParen, Id_str(self.tok_id)

        items = []  # type: List[nvalue_t]

        self._Next()

        # Zero iterations for (), which gives an empty list
        while self.tok_id != Id.J8_RParen:
            items.append(self._ParseNil8())

        self._Eat(Id.J8_RParen)

        return nvalue.List(items)

    def _ParseList8(self):
        # type: () -> nvalue_t
        """
        List8 = '[' value* ']'

        No commas, not even optional ones for now.
        """
        assert self.tok_id == Id.J8_LBracket, Id_str(self.tok_id)

        items = []  # type: List[nvalue_t]

        self._Next()

        # Zero iterations for [], which gives an empty list
        while self.tok_id != Id.J8_RBracket:
            items.append(self._ParseNil8())

        self._Eat(Id.J8_RBracket)

        return nvalue.List(items)

    def _ParseNil8(self):
        # type: () -> nvalue_t
        """Parse one NIL8 value, plus an infix operator and operand if any.

        When the value is followed by an operator, colon or comma token, the
        result is the list (op value operand2).

        Raises:
          error.Decode: on EOF, or on a token that can't start a value
        """
        if self.tok_id == Id.J8_LParen:
            obj = self._ParseRecord()  # type: nvalue_t

        elif self.tok_id == Id.J8_LBracket:
            obj = self._ParseList8()

        # Primitives are copied from J8 above.
        # TODO: We also want hex literals.
        elif self.tok_id == Id.J8_Null:
            self._Next()
            obj = nvalue.Null

        elif self.tok_id == Id.J8_Bool:
            # The token starts with 't' if and only if it's 'true'
            obj = nvalue.Bool(self.s[self.start_pos] == 't')
            self._Next()

        elif self.tok_id == Id.J8_Int:
            lexeme = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Int(int(lexeme))

        elif self.tok_id == Id.J8_Float:
            lexeme = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Float(float(lexeme))

        elif self.tok_id == Id.J8_String:
            obj = nvalue.Str(self.decoded)
            self._Next()

        # <- etc.
        elif self.tok_id in (Id.J8_Identifier, Id.J8_Operator, Id.J8_Colon,
                             Id.J8_Comma):
            # unquoted "word" treated like a string
            lexeme = self.s[self.start_pos:self.end_pos]
            self._Next()
            obj = nvalue.Symbol(lexeme)

        elif self.tok_id == Id.Eol_Tok:
            raise self._Error('Unexpected EOF while parsing %s' %
                              self.lang_str)

        else:  # Id.Unknown_Tok, Id.J8_RParen, etc.
            raise self._Error('Invalid token while parsing %s: %s' %
                              (self.lang_str, Id_str(self.tok_id)))

        if self.tok_id in (Id.J8_Operator, Id.J8_Colon, Id.J8_Comma):
            # key: "value" -> (: key "value")
            lexeme = self.s[self.start_pos:self.end_pos]
            op = nvalue.Symbol(lexeme)

            self._Next()
            operand2 = self._ParseNil8()
            infix = nvalue.List([op, obj, operand2])  # type: nvalue_t
            return infix

        return obj

    def ParseNil8(self):
        # type: () -> nvalue_t
        """Parse the whole input as a single NIL8 value.

        Raises error.Decode.
        """
        self._Next()
        obj = self._ParseNil8()
        if self.tok_id == Id.Eol_Tok:
            return obj
        raise self._Error('Unexpected trailing input')
1051
1052
1053# vim: sw=4