OILS / ysh / expr_to_ast.py View on Github | oilshell.org

1633 lines, 977 significant
1"""expr_to_ast.py."""
2from __future__ import print_function
3
4from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
5from _devbuild.gen.syntax_asdl import (
6 Token,
7 NameTok,
8 loc,
9 loc_t,
10 DoubleQuoted,
11 SingleQuoted,
12 BracedVarSub,
13 CommandSub,
14 ShArrayLiteral,
15 command,
16 expr,
17 expr_e,
18 expr_t,
19 expr_context_e,
20 re,
21 re_t,
22 re_repeat,
23 re_repeat_t,
24 class_literal_term,
25 class_literal_term_t,
26 PosixClass,
27 PerlClass,
28 NameType,
29 y_lhs_t,
30 Comprehension,
31 Subscript,
32 Attribute,
33 proc_sig,
34 proc_sig_t,
35 Param,
36 RestParam,
37 ParamGroup,
38 NamedArg,
39 ArgList,
40 pat,
41 pat_t,
42 TypeExpr,
43 Func,
44 Eggex,
45 EggexFlag,
46)
47from _devbuild.gen.value_asdl import value, value_t
48from _devbuild.gen import grammar_nt
49from core.error import p_die
50from core import num
51from frontend import consts
52from frontend import lexer
53from mycpp import mops
54from mycpp import mylib
55from mycpp.mylib import log, tagswitch
56from ysh import expr_parse
57from ysh import regex_translate
58
59from typing import TYPE_CHECKING, Dict, List, Tuple, Optional, cast
60if TYPE_CHECKING:
61 from pgen2.grammar import Grammar
62 from pgen2.pnode import PNode
63
64_ = log
65
# Perl-like character class names -> single-letter abbreviation
# ('word' is an alias for 'w'); presumably consumed by the eggex/class-literal
# transformation later in this file — confirm against _Regex/_ClassLiteral.
PERL_CLASSES = {
    'd': 'd',
    'w': 'w',
    'word': 'w',
    's': 's',
}
# POSIX bracket-expression class names, e.g. [[:alnum:]].
# https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html
POSIX_CLASSES = [
    'alnum',
    'cntrl',
    'lower',
    'space',
    'alpha',
    'digit',
    'print',
    'upper',
    'blank',
    'graph',
    'punct',
    'xdigit',
]
# NOTE: There are also things like \p{Greek} that we could put in the
# "non-sigil" namespace.

# Shared error message; presumably used by range validation in the regex
# transformation below — verify at the use site.
RANGE_POINT_TOO_LONG = "Range start/end shouldn't have more than one character"

# Copied from pgen2/token.py to avoid dependency.
# Grammar symbol numbers >= NT_OFFSET denote nonterminals; smaller numbers
# are terminal token IDs.
NT_OFFSET = 256
94
if mylib.PYTHON:

    def MakeGrammarNames(ysh_grammar):
        # type: (Grammar) -> Dict[int, str]
        """Build a symbol-number -> name mapping for parse tree printing.

        Combines lexer token IDs (below 256) with the grammar's nonterminal
        symbols (256 and up).
        """
        # TODO: Break this dependency
        from frontend import lexer_def

        names = {}

        # HACK: Cut it off at 256 now!  Expr/Arith/Op doesn't go higher than
        # that.  TODO: Change NT_OFFSET?  That might affect C code though.
        # Best to keep everything fed to pgen under 256.  This only affects
        # pretty printing.
        for id_name, token_int in lexer_def.ID_SPEC.id_str2int.items():
            # Hm some are out of range
            #assert token_int < 256, (token_int, id_name)
            if token_int < 256:
                names[token_int] = id_name

        for sym_int, sym_name in ysh_grammar.number2symbol.items():
            # eval_input == 256.  Remove?
            assert sym_int >= 256, (sym_int, sym_name)
            names[sym_int] = sym_name

        return names
127
128
def ISNONTERMINAL(x):
    # type: (int) -> bool
    """Return True if grammar symbol number x denotes a nonterminal."""
    assert isinstance(x, int), x
    return NT_OFFSET <= x
133
134
135class Transformer(object):
136 """Homogeneous parse tree -> heterogeneous AST ("lossless syntax tree")
137
138 pgen2 (Python's LL parser generator) doesn't have semantic actions like yacc,
139 so this "transformer" is the equivalent.
140
141 Files to refer to when modifying this function:
142
143 ysh/grammar.pgen2 (generates _devbuild/gen/grammar_nt.py)
144 frontend/syntax.asdl (generates _devbuild/gen/syntax_asdl.py)
145
146 Related examples:
147
148 opy/compiler2/transformer.py (Python's parse tree -> AST, ~1500 lines)
149 Python-2.7.13/Python/ast.c (the "real" CPython version, ~3600 lines)
150
151 Other:
152 frontend/parse_lib.py (turn on print_parse_tree)
153
154 Public methods:
155 Expr, VarDecl
156 atom, trailer, etc. are private, named after productions in grammar.pgen2.
157 """
158
159 def __init__(self, gr):
160 # type: (Grammar) -> None
161 self.number2symbol = gr.number2symbol
162 if mylib.PYTHON:
163 names = MakeGrammarNames(gr)
164 # print raw nodes
165 self.p_printer = expr_parse.ParseTreePrinter(names)
166
167 def _LeftAssoc(self, p_node):
168 # type: (PNode) -> expr_t
169 """For an associative binary operation.
170
171 Examples:
172 xor_expr: and_expr ('xor' and_expr)*
173 term: factor (('*'|'/'|'%'|'div') factor)*
174
175 3 - 1 - 2 must be grouped as ((3 - 1) - 2).
176 """
177 # Note: Compare the iteractive com_binary() method in
178 # opy/compiler2/transformer.py.
179
180 # Examples:
181 # - The PNode for '3 - 1' will have 3 children
182 # - The PNode for '3 - 1 - 2' will have 5 children
183
184 #self.p_printer.Print(p_node)
185
186 i = 1 # index of the operator
187 n = p_node.NumChildren()
188
189 left = self.Expr(p_node.GetChild(0))
190 while i < n:
191 op = p_node.GetChild(i)
192 right = self.Expr(p_node.GetChild(i + 1))
193
194 # create a new left node
195 left = expr.Binary(op.tok, left, right)
196 i += 2
197
198 return left
199
200 def _Trailer(self, base, p_trailer):
201 # type: (expr_t, PNode) -> expr_t
202 """
203 Trailer: ( '(' [arglist] ')' | '[' subscriptlist ']'
204 | '.' NAME | '->' NAME | '::' NAME
205 )
206 """
207 op_tok = p_trailer.GetChild(0).tok
208
209 if op_tok.id == Id.Op_LParen:
210 lparen = op_tok
211 rparen = p_trailer.GetChild(-1).tok
212 arglist = ArgList(lparen, [], None, [], rparen)
213 if p_trailer.NumChildren() == 2: # ()
214 return expr.FuncCall(base, arglist)
215
216 p = p_trailer.GetChild(1) # the X in ( X )
217 assert p.typ == grammar_nt.arglist # f(x, y)
218 self._ArgList(p, arglist)
219 return expr.FuncCall(base, arglist)
220
221 if op_tok.id == Id.Op_LBracket:
222 p_args = p_trailer.GetChild(1)
223 assert p_args.typ == grammar_nt.subscriptlist
224 n = p_args.NumChildren()
225 if n > 1:
226 p_die("Only 1 subscript is accepted", p_args.GetChild(1).tok)
227
228 a = p_args.GetChild(0)
229 return Subscript(op_tok, base, self._Subscript(a))
230
231 if op_tok.id in (Id.Expr_Dot, Id.Expr_RArrow, Id.Expr_RDArrow):
232 attr = p_trailer.GetChild(1).tok # will be Id.Expr_Name
233 return Attribute(base, op_tok, attr, lexer.TokenVal(attr),
234 expr_context_e.Store)
235
236 raise AssertionError(Id_str(op_tok.id))
237
238 def _DictPair(self, p_node):
239 # type: (PNode) -> Tuple[expr_t, expr_t]
240 """dict_pair: ( Expr_Name [':' test] |
241
242 '[' testlist ']' ':' test )
243 """
244 assert p_node.typ == grammar_nt.dict_pair
245
246 typ = p_node.GetChild(0).typ
247
248 if ISNONTERMINAL(typ): # for sq_string
249 # Note: Could inline these cases instead of going through self.Expr.
250 if typ == grammar_nt.sq_string:
251 key = self.Expr(p_node.GetChild(0)) # type: expr_t
252 elif typ == grammar_nt.dq_string:
253 key = self.Expr(p_node.GetChild(0))
254
255 val = self.Expr(p_node.GetChild(2))
256 return key, val
257
258 tok0 = p_node.GetChild(0).tok
259 id_ = tok0.id
260
261 if id_ == Id.Expr_Name:
262 key_str = value.Str(lexer.TokenVal(tok0))
263 key = expr.Const(tok0, key_str)
264 if p_node.NumChildren() >= 3:
265 val = self.Expr(p_node.GetChild(2))
266 else:
267 val = expr.Implicit
268
269 if id_ == Id.Op_LBracket: # {[x+y]: 'val'}
270 key = self.Expr(p_node.GetChild(1))
271 val = self.Expr(p_node.GetChild(4))
272 return key, val
273
274 return key, val
275
276 def _Dict(self, parent, p_node):
277 # type: (PNode, PNode) -> expr.Dict
278 """Parse tree to LST
279
280 dict: dict_pair (',' dict_pair)* [',']
281 dict2: dict_pair (comma_newline dict_pair)* [comma_newline]
282 """
283 if not ISNONTERMINAL(p_node.typ):
284 assert p_node.tok.id == Id.Op_RBrace
285 return expr.Dict(parent.tok, [], [])
286
287 #assert p_node.typ == grammar_nt.dict
288
289 keys = [] # type: List[expr_t]
290 values = [] # type: List[expr_t]
291
292 n = p_node.NumChildren()
293 for i in xrange(0, n, 2):
294 key, val = self._DictPair(p_node.GetChild(i))
295 keys.append(key)
296 values.append(val)
297
298 return expr.Dict(parent.tok, keys, values)
299
300 def _Tuple(self, parent):
301 # type: (PNode) -> expr_t
302
303 n = parent.NumChildren()
304
305 # (x) -- not a tuple
306 if n == 1:
307 return self.Expr(parent.GetChild(0))
308
309 # x, and (x,) aren't allowed
310 if n == 2:
311 p_die('Invalid trailing comma', parent.GetChild(1).tok)
312
313 elts = [] # type: List[expr_t]
314 for i in xrange(0, n, 2): # skip commas
315 p_node = parent.GetChild(i)
316 elts.append(self.Expr(p_node))
317
318 return expr.Tuple(parent.tok, elts,
319 expr_context_e.Store) # unused expr_context_e
320
321 def _TestlistComp(self, parent, p_node, id0):
322 # type: (PNode, PNode, Id_t) -> expr_t
323 """Parse tree to LST
324
325 testlist_comp:
326 (test|star_expr) ( comp_for | (',' (test|star_expr))* [','] )
327 """
328 assert p_node.typ == grammar_nt.testlist_comp
329 n = p_node.NumChildren()
330 if n > 1 and p_node.GetChild(1).typ == grammar_nt.comp_for:
331 elt = self.Expr(p_node.GetChild(0))
332 comp = self._CompFor(p_node.GetChild(1))
333 if id0 == Id.Op_LParen:
334 return expr.GeneratorExp(elt, [comp])
335 if id0 == Id.Op_LBracket:
336 return expr.ListComp(parent.tok, elt, [comp])
337 raise AssertionError()
338
339 if id0 == Id.Op_LParen:
340 if n == 1: # parenthesized expression like (x+1) or (x)
341 return self.Expr(p_node.GetChild(0))
342
343 # (1,) (1, 2) etc.
344 if p_node.GetChild(1).tok.id == Id.Arith_Comma:
345 return self._Tuple(p_node)
346
347 raise NotImplementedError('testlist_comp')
348
349 if id0 == Id.Op_LBracket:
350 elts = [] # type: List[expr_t]
351 for i in xrange(0, n, 2): # skip commas
352 elts.append(self.Expr(p_node.GetChild(i)))
353
354 return expr.List(parent.tok, elts,
355 expr_context_e.Store) # unused expr_context_e
356
357 raise AssertionError(Id_str(id0))
358
359 def _Atom(self, parent):
360 # type: (PNode) -> expr_t
361 """Handle alternatives of 'atom' where there's more than one child."""
362
363 tok = parent.GetChild(0).tok
364 id_ = tok.id
365 n = parent.NumChildren()
366
367 if id_ == Id.Op_LParen:
368 # atom: '(' [yield_expr|testlist_comp] ')' | ...
369 if n == 2: # () is a tuple
370 assert (parent.GetChild(1).tok.id == Id.Op_RParen
371 ), parent.GetChild(1)
372 return expr.Tuple(tok, [], expr_context_e.Store)
373
374 return self._TestlistComp(parent, parent.GetChild(1), id_)
375
376 if id_ == Id.Op_LBracket:
377 # atom: ... | '[' [testlist_comp] ']' | ...
378
379 if n == 2: # []
380 assert (parent.GetChild(1).tok.id == Id.Op_RBracket
381 ), parent.GetChild(1)
382 return expr.List(tok, [],
383 expr_context_e.Store) # unused expr_context_e
384
385 return self._TestlistComp(parent, parent.GetChild(1), id_)
386
387 if id_ == Id.Op_LBrace:
388 # atom: ... | '{' [Op_Newline] [dict] '}'
389 i = 1
390 if parent.GetChild(i).tok.id == Id.Op_Newline:
391 i += 1
392 return self._Dict(parent, parent.GetChild(i))
393
394 if id_ == Id.Arith_Amp:
395 n = parent.NumChildren()
396 if n >= 3:
397 p_die("Places in containers not implemented yet",
398 parent.GetChild(2).tok)
399
400 name_tok = parent.GetChild(1).tok
401 return expr.Place(name_tok, lexer.TokenVal(name_tok), [])
402
403 if id_ == Id.Expr_Func:
404 # STUB. This should really be a Func, not Lambda.
405 return expr.Lambda([], expr.Implicit)
406
407 # 100 M
408 # Ignoring the suffix for now
409 if id_ == Id.Expr_DecInt:
410 assert n > 1
411 p_die("Units suffix not implemented", parent.GetChild(1).tok)
412 #return self.Expr(parent.GetChild(0))
413
414 # 100.5 M
415 # Ignoring the suffix for now
416 if id_ == Id.Expr_Float:
417 assert n > 1
418 p_die("unix suffix implemented", parent.GetChild(1).tok)
419 #return self.Expr(parent.GetChild(0))
420
421 raise NotImplementedError(Id_str(id_))
422
423 def _NameType(self, p_node):
424 # type: (PNode) -> NameType
425 """
426 name_type: Expr_Name [':'] [type_expr]
427 """
428 name_tok = p_node.GetChild(0).tok
429 typ = None # type: Optional[TypeExpr]
430
431 n = p_node.NumChildren()
432 if n == 2:
433 typ = self._TypeExpr(p_node.GetChild(1))
434 if n == 3:
435 typ = self._TypeExpr(p_node.GetChild(2))
436
437 return NameType(name_tok, lexer.TokenVal(name_tok), typ)
438
439 def _NameTypeList(self, p_node):
440 # type: (PNode) -> List[NameType]
441 """name_type_list: name_type (',' name_type)*"""
442 assert p_node.typ == grammar_nt.name_type_list
443 results = [] # type: List[NameType]
444
445 n = p_node.NumChildren()
446 for i in xrange(0, n, 2): # was children[::2]
447 results.append(self._NameType(p_node.GetChild(i)))
448 return results
449
450 def _CompFor(self, p_node):
451 # type: (PNode) -> Comprehension
452 """comp_for: 'for' exprlist 'in' or_test ['if' or_test]"""
453 lhs = self._NameTypeList(p_node.GetChild(1))
454 iterable = self.Expr(p_node.GetChild(3))
455
456 if p_node.NumChildren() >= 6:
457 cond = self.Expr(p_node.GetChild(5))
458 else:
459 cond = None
460
461 return Comprehension(lhs, iterable, cond)
462
    def _CompareChain(self, parent):
        # type: (PNode) -> expr_t
        """comparison: expr (comp_op expr)*

        Collects the chain a < b <= c into one Compare node with parallel
        lists of operator tokens and right-hand comparands.
        """
        cmp_ops = []  # type: List[Token]
        comparators = []  # type: List[expr_t]
        left = self.Expr(parent.GetChild(0))

        i = 1
        n = parent.NumChildren()
        while i < n:
            p = parent.GetChild(i)
            op = p.GetChild(0).tok
            # Two-word operators ('not in', 'is not') arrive as 2 children.
            if p.NumChildren() == 2:
                # Blame the first token, and change its type
                # NOTE: this mutates the token's id in place.
                if op.id == Id.Expr_Not:  # not in
                    op.id = Id.Node_NotIn
                elif op.id == Id.Expr_Is:  # is not
                    op.id = Id.Node_IsNot
                else:
                    raise AssertionError()
            else:
                # is, <, ==, etc.
                pass

            cmp_ops.append(op)
            i += 1
            comparators.append(self.Expr(parent.GetChild(i)))
            i += 1
        return expr.Compare(left, cmp_ops, comparators)
492
    def _Subscript(self, parent):
        # type: (PNode) -> expr_t
        """subscript: expr | [expr] ':' [expr]

        Returns either the plain index expression or a Slice node.
        """
        typ0 = parent.GetChild(0).typ

        n = parent.NumChildren()

        if ISNONTERMINAL(typ0):  # starts with an expression
            if n == 3:  # a[1:2]
                lower = self.Expr(parent.GetChild(0))
                upper = self.Expr(parent.GetChild(2))
            elif n == 2:  # a[1:]
                lower = self.Expr(parent.GetChild(0))
                upper = None
            else:  # a[1]
                return self.Expr(parent.GetChild(0))
        else:  # starts with ':'
            assert parent.GetChild(0).tok.id == Id.Arith_Colon
            lower = None
            if n == 1:  # a[:]
                upper = None
            else:  # a[:3]
                upper = self.Expr(parent.GetChild(1))
        # NOTE(review): the Slice blames the first child's token, which is
        # not always the ':' token — confirm this is intended.
        return expr.Slice(lower, parent.GetChild(0).tok, upper)
517
    def Expr(self, pnode):
        # type: (PNode) -> expr_t
        """Transform expressions (as opposed to statements).

        Dispatches on the node's grammar symbol: nonterminals recurse into
        the helpers above; terminals become Var or Const leaf nodes.
        """
        typ = pnode.typ
        tok = pnode.tok

        if ISNONTERMINAL(typ):

            #
            # Oil Entry Points / Additions
            #

            if typ == grammar_nt.ysh_expr:  # for if/while
                # ysh_expr: '(' testlist ')'
                return self.Expr(pnode.GetChild(1))

            if typ == grammar_nt.command_expr:
                # return_expr: testlist end_stmt
                return self.Expr(pnode.GetChild(0))

            #
            # Python-like Expressions / Operators
            #

            if typ == grammar_nt.atom:
                if pnode.NumChildren() == 1:
                    return self.Expr(pnode.GetChild(0))
                return self._Atom(pnode)

            if typ == grammar_nt.testlist:
                # testlist: test (',' test)* [',']
                return self._Tuple(pnode)

            if typ == grammar_nt.test:
                # test: or_test ['if' or_test 'else' test] | lambdef
                if pnode.NumChildren() == 1:
                    return self.Expr(pnode.GetChild(0))

                # TODO: Handle lambdef

                # Ternary: body if test else orelse
                test = self.Expr(pnode.GetChild(2))
                body = self.Expr(pnode.GetChild(0))
                orelse = self.Expr(pnode.GetChild(4))
                return expr.IfExp(test, body, orelse)

            if typ == grammar_nt.lambdef:
                # lambdef: '|' [name_type_list] '|' test

                n = pnode.NumChildren()
                if n == 4:
                    params = self._NameTypeList(pnode.GetChild(1))
                else:
                    params = []

                body = self.Expr(pnode.GetChild(n - 1))
                return expr.Lambda(params, body)

            #
            # Operators with Precedence
            #

            if typ == grammar_nt.or_test:
                # or_test: and_test ('or' and_test)*
                return self._LeftAssoc(pnode)

            if typ == grammar_nt.and_test:
                # and_test: not_test ('and' not_test)*
                return self._LeftAssoc(pnode)

            if typ == grammar_nt.not_test:
                # not_test: 'not' not_test | comparison
                if pnode.NumChildren() == 1:
                    return self.Expr(pnode.GetChild(0))

                op_tok = pnode.GetChild(0).tok  # not
                return expr.Unary(op_tok, self.Expr(pnode.GetChild(1)))

            elif typ == grammar_nt.comparison:
                if pnode.NumChildren() == 1:
                    return self.Expr(pnode.GetChild(0))

                return self._CompareChain(pnode)

            elif typ == grammar_nt.range_expr:
                n = pnode.NumChildren()
                if n == 1:
                    return self.Expr(pnode.GetChild(0))

                if n == 3:
                    return expr.Range(self.Expr(pnode.GetChild(0)),
                                      pnode.GetChild(1).tok,
                                      self.Expr(pnode.GetChild(2)))

                raise AssertionError(n)

            elif typ == grammar_nt.expr:
                # expr: xor_expr ('|' xor_expr)*
                return self._LeftAssoc(pnode)

            if typ == grammar_nt.xor_expr:
                # xor_expr: and_expr ('xor' and_expr)*
                return self._LeftAssoc(pnode)

            if typ == grammar_nt.and_expr:  # a & b
                # and_expr: shift_expr ('&' shift_expr)*
                return self._LeftAssoc(pnode)

            elif typ == grammar_nt.shift_expr:
                # shift_expr: arith_expr (('<<'|'>>') arith_expr)*
                return self._LeftAssoc(pnode)

            elif typ == grammar_nt.arith_expr:
                # arith_expr: term (('+'|'-') term)*
                return self._LeftAssoc(pnode)

            elif typ == grammar_nt.term:
                # term: factor (('*'|'/'|'div'|'mod') factor)*
                return self._LeftAssoc(pnode)

            elif typ == grammar_nt.factor:
                # factor: ('+'|'-'|'~') factor | power
                # the power would have already been reduced
                if pnode.NumChildren() == 1:
                    return self.Expr(pnode.GetChild(0))

                assert pnode.NumChildren() == 2
                op = pnode.GetChild(0)
                e = pnode.GetChild(1)

                assert isinstance(op.tok, Token)
                return expr.Unary(op.tok, self.Expr(e))

            elif typ == grammar_nt.power:
                # power: atom trailer* ['**' factor]

                node = self.Expr(pnode.GetChild(0))
                if pnode.NumChildren() == 1:  # No trailers
                    return node

                # Support a->startswith(b) and mydict.key
                n = pnode.NumChildren()
                i = 1
                while i < n and ISNONTERMINAL(pnode.GetChild(i).typ):
                    node = self._Trailer(node, pnode.GetChild(i))
                    i += 1

                if i != n:  # ['**' factor]
                    op_tok = pnode.GetChild(i).tok
                    assert op_tok.id == Id.Arith_DStar, op_tok
                    factor = self.Expr(pnode.GetChild(i + 1))
                    node = expr.Binary(op_tok, node, factor)

                return node

            elif typ == grammar_nt.literal_expr:
                inner = self.Expr(pnode.GetChild(1))
                return expr.Literal(inner)

            elif typ == grammar_nt.eggex:
                return self._Eggex(pnode)

            elif typ == grammar_nt.ysh_expr_sub:
                return self.Expr(pnode.GetChild(0))

            #
            # YSH Lexer Modes
            #
            # These nodes wrap already-parsed sublanguage ASTs; the cast
            # reinterprets the stored tok as the richer node type.

            elif typ == grammar_nt.sh_array_literal:
                return cast(ShArrayLiteral, pnode.GetChild(1).tok)

            elif typ == grammar_nt.old_sh_array_literal:
                return cast(ShArrayLiteral, pnode.GetChild(1).tok)

            elif typ == grammar_nt.sh_command_sub:
                return cast(CommandSub, pnode.GetChild(1).tok)

            elif typ == grammar_nt.braced_var_sub:
                return cast(BracedVarSub, pnode.GetChild(1).tok)

            elif typ == grammar_nt.dq_string:
                s = cast(DoubleQuoted, pnode.GetChild(1).tok)
                # sugar: ^"..." is short for ^["..."]
                if pnode.GetChild(0).tok.id == Id.Left_CaretDoubleQuote:
                    return expr.Literal(s)
                return s

            elif typ == grammar_nt.sq_string:
                return cast(SingleQuoted, pnode.GetChild(1).tok)

            elif typ == grammar_nt.simple_var_sub:
                tok = pnode.GetChild(0).tok

                if tok.id == Id.VSub_DollarName:  # $foo is disallowed
                    bare = lexer.TokenSliceLeft(tok, 1)
                    p_die(
                        'In expressions, remove $ and use `%s`, or sometimes "$%s"'
                        % (bare, bare), tok)

                # $? is allowed
                return NameTok(tok, lexer.TokenSliceLeft(tok, 1))

            else:
                nt_name = self.number2symbol[typ]
                raise AssertionError("PNode type %d (%s) wasn't handled" %
                                     (typ, nt_name))

        else:  # Terminals should have a token
            id_ = tok.id

            if id_ == Id.Expr_Name:
                return expr.Var(tok, lexer.TokenVal(tok))

            tok_str = lexer.TokenVal(tok)

            # Remove underscores from 1_000_000. The lexer is responsible for
            # validation.
            c_under = tok_str.replace('_', '')

            if id_ == Id.Expr_DecInt:
                try:
                    cval = value.Int(mops.FromStr(c_under))  # type: value_t
                except ValueError:
                    p_die('Decimal int constant is too large', tok)
            elif id_ == Id.Expr_BinInt:
                assert c_under[:2] in ('0b', '0B'), c_under
                try:
                    cval = value.Int(mops.FromStr(c_under[2:], 2))
                except ValueError:
                    p_die('Binary int constant is too large', tok)
            elif id_ == Id.Expr_OctInt:
                assert c_under[:2] in ('0o', '0O'), c_under
                try:
                    cval = value.Int(mops.FromStr(c_under[2:], 8))
                except ValueError:
                    p_die('Octal int constant is too large', tok)
            elif id_ == Id.Expr_HexInt:
                assert c_under[:2] in ('0x', '0X'), c_under
                try:
                    cval = value.Int(mops.FromStr(c_under[2:], 16))
                except ValueError:
                    p_die('Hex int constant is too large', tok)

            elif id_ == Id.Expr_Float:
                # Note: float() in mycpp/gc_builtins.cc currently uses strtod
                # I think this never raises ValueError, because the lexer
                # should only accept strings that strtod() does?
                cval = value.Float(float(c_under))

            elif id_ == Id.Expr_Null:
                cval = value.Null
            elif id_ == Id.Expr_True:
                cval = value.Bool(True)
            elif id_ == Id.Expr_False:
                cval = value.Bool(False)

            # What to do with the char constants?
            # \n \u{3bc} #'a'
            # Are they integers or strings?
            #
            # Integers could be ord(\n), or strings could chr(\n)
            # Or just remove them, with ord(u'\n') and chr(u'\n')
            #
            # I think this relies on small string optimization. If we have it,
            # then 1-4 byte characters are efficient, and don't require heap
            # allocation.

            elif id_ == Id.Char_OneChar:
                # TODO: look up integer directly?
                cval = num.ToBig(ord(consts.LookupCharC(tok_str[1])))
            elif id_ == Id.Char_UBraced:
                hex_str = tok_str[3:-1]  # \u{123}
                # ValueError shouldn't happen because lexer validates
                cval = value.Int(mops.FromStr(hex_str, 16))

            # This could be a char integer? Not sure
            elif id_ == Id.Char_Pound:
                # TODO: accept UTF-8 code point instead of single byte
                byte = tok_str[2]  # the a in #'a'
                cval = num.ToBig(ord(byte))  # It's an integer

            else:
                raise AssertionError(Id_str(id_))

            return expr.Const(tok, cval)
803
804 def _ArrayItem(self, p_node):
805 # type: (PNode) -> expr_t
806 assert p_node.typ == grammar_nt.array_item
807
808 child0 = p_node.GetChild(0)
809 typ0 = child0.typ
810 if ISNONTERMINAL(typ0):
811 return self.Expr(child0)
812 else:
813 if child0.tok.id == Id.Op_LParen: # (x+1)
814 return self.Expr(p_node.GetChild(1))
815 return self.Expr(child0) # $1 ${x} etc.
816
    def _LhsExprList(self, p_node):
        # type: (PNode) -> List[y_lhs_t]
        """lhs_list: expr (',' expr)*

        Validates that each expression is an assignable place: a variable,
        a subscript, or a dot-attribute.  Anything else is a parse error.
        """
        assert p_node.typ == grammar_nt.lhs_list

        lhs_list = []  # type: List[y_lhs_t]
        n = p_node.NumChildren()
        for i in xrange(0, n, 2):  # was children[::2]
            p = p_node.GetChild(i)

            e = self.Expr(p)
            UP_e = e
            with tagswitch(e) as case:
                if case(expr_e.Var):
                    e = cast(expr.Var, UP_e)
                    lhs_list.append(NameTok(e.left, e.name))

                elif case(expr_e.Subscript):
                    e = cast(Subscript, UP_e)
                    lhs_list.append(e)

                elif case(expr_e.Attribute):
                    e = cast(Attribute, UP_e)
                    if e.op.id != Id.Expr_Dot:
                        # e.g. setvar obj->method is not valid
                        p_die("Can't assign to this attribute expr", e.op)
                    lhs_list.append(e)

                else:
                    pass  # work around mycpp bug

                    # TODO: could blame arbitrary expr_t, but this works most
                    # of the time
                    if p.tok:
                        blame = p.tok  # type: loc_t
                    else:
                        blame = loc.Missing
                    p_die("Can't assign to this expression", blame)

        return lhs_list
857
858 def MakeVarDecl(self, p_node):
859 # type: (PNode) -> command.VarDecl
860 """
861 ysh_var_decl: name_type_list ['=' testlist] end_stmt
862 """
863 assert p_node.typ == grammar_nt.ysh_var_decl
864
865 lhs = self._NameTypeList(p_node.GetChild(0)) # could be a tuple
866
867 # This syntax is confusing, and different than JavaScript
868 # var x, y = 1, 2
869 # But this is useful:
870 # var flag, i = parseArgs(spec, argv)
871
872 n = p_node.NumChildren()
873 if n >= 3:
874 rhs = self.Expr(p_node.GetChild(2))
875 else:
876 rhs = None
877
878 # The caller should fill in the keyword token.
879 return command.VarDecl(None, lhs, rhs)
880
881 def MakeMutation(self, p_node):
882 # type: (PNode) -> command.Mutation
883 """Parse tree to LST
884
885 ysh_mutation: lhs_list (augassign | '=') testlist end_stmt
886 """
887 typ = p_node.typ
888 assert typ == grammar_nt.ysh_mutation
889
890 lhs_list = self._LhsExprList(p_node.GetChild(0)) # could be a tuple
891 op_tok = p_node.GetChild(1).tok
892 if len(lhs_list) > 1 and op_tok.id != Id.Arith_Equal:
893 p_die('Multiple assignment must use =', op_tok)
894 rhs = self.Expr(p_node.GetChild(2))
895 return command.Mutation(None, lhs_list, op_tok, rhs)
896
897 def _EggexFlag(self, p_node):
898 # type: (PNode) -> EggexFlag
899 n = p_node.NumChildren()
900 if n == 1:
901 return EggexFlag(False, p_node.GetChild(0).tok)
902 elif n == 2:
903 return EggexFlag(True, p_node.GetChild(1).tok)
904 else:
905 raise AssertionError()
906
    def _Eggex(self, p_node):
        # type: (PNode) -> Eggex
        """
        eggex: '/' regex [';' re_flag* [';' Expr_Name] ] '/'

        Walks the optional flag / translation-preference section after the
        first ';'.
        """
        n = p_node.NumChildren()
        left = p_node.GetChild(0).tok
        regex = self._Regex(p_node.GetChild(1))

        flags = []  # type: List[EggexFlag]
        trans_pref = None  # type: Optional[Token]

        i = 2
        current = p_node.GetChild(i)
        if current.typ == Id.Op_Semi:  # first ';' introduces flags
            i += 1
            while True:
                current = p_node.GetChild(i)
                if not ISNONTERMINAL(current.typ):
                    break  # hit ';' or the closing '/'
                flags.append(self._EggexFlag(current))
                i += 1

            if current.typ == Id.Op_Semi:  # second ';' introduces the name
                i += 1
                trans_pref = p_node.GetChild(i).tok

        # Canonicalize and validate flags for ERE only. Default is ERE.
        if trans_pref is None or lexer.TokenVal(trans_pref) == 'ERE':
            canonical_flags = regex_translate.CanonicalFlags(flags)
        else:
            canonical_flags = None

        return Eggex(left, regex, flags, trans_pref, canonical_flags)
941
942 def YshCasePattern(self, pnode):
943 # type: (PNode) -> pat_t
944 assert pnode.typ == grammar_nt.ysh_case_pat, pnode
945
946 pattern = pnode.GetChild(0)
947 typ = pattern.typ
948 if typ == Id.Op_LParen:
949 # pat_expr or pat_else
950 pattern = pnode.GetChild(1)
951 typ = pattern.typ
952
953 if typ == grammar_nt.pat_else:
954 return pat.Else
955 elif typ == grammar_nt.pat_exprs:
956 exprs = [] # type: List[expr_t]
957 for i in xrange(pattern.NumChildren()):
958 child = pattern.GetChild(i)
959 if child.typ == grammar_nt.expr:
960 expr = self.Expr(child)
961 exprs.append(expr)
962 return pat.YshExprs(exprs)
963
964 elif typ == grammar_nt.eggex:
965 return self._Eggex(pattern)
966
967 raise NotImplementedError()
968
    def _Argument(self, p_node, after_semi, arglist):
        # type: (PNode, bool, ArgList) -> None
        """Parse tree to LST

        argument: (
          test [comp_for]
        | test '=' test  # named arg
        | '...' test  # var args
        )

        Appends into arglist.pos_args or arglist.named_args in place;
        after_semi says whether we're past the ';' (named-arg section).
        """
        pos_args = arglist.pos_args
        named_args = arglist.named_args

        assert p_node.typ == grammar_nt.argument, p_node
        n = p_node.NumChildren()
        if n == 1:  # a plain positional argument
            child = p_node.GetChild(0)
            if after_semi:
                p_die('Positional args must come before the semi-colon',
                      child.tok)
            arg = self.Expr(child)
            pos_args.append(arg)
            return

        if n == 2:
            # Note: We allow multiple spreads, just like Julia. They are
            # concatenated as in lists and dicts.
            tok0 = p_node.GetChild(0).tok
            if tok0.id == Id.Expr_Ellipsis:
                spread_expr = expr.Spread(tok0, self.Expr(p_node.GetChild(1)))
                if after_semi:  # f(; ... named)
                    named_args.append(NamedArg(None, spread_expr))
                else:  # f(...named)
                    pos_args.append(spread_expr)
                return

            if p_node.GetChild(1).typ == grammar_nt.comp_for:
                child = p_node.GetChild(0)
                if after_semi:
                    p_die('Positional args must come before the semi-colon',
                          child.tok)

                elt = self.Expr(child)
                comp = self._CompFor(p_node.GetChild(1))
                arg = expr.GeneratorExp(elt, [comp])
                pos_args.append(arg)
                return

            raise AssertionError()

        if n == 3:  # named args can come before or after the semicolon
            n1 = NamedArg(
                p_node.GetChild(0).tok, self.Expr(p_node.GetChild(2)))
            named_args.append(n1)
            return

        raise AssertionError()
1026
1027 def _ArgGroup(self, p_node, after_semi, arglist):
1028 # type: (PNode, bool, ArgList) -> None
1029 """
1030 arg_group: argument (',' argument)* [',']
1031 """
1032 for i in xrange(p_node.NumChildren()):
1033 p_child = p_node.GetChild(i)
1034 if ISNONTERMINAL(p_child.typ):
1035 self._Argument(p_child, after_semi, arglist)
1036
1037 def _ArgList(self, p_node, arglist):
1038 # type: (PNode, ArgList) -> None
1039 """For both funcs and procs
1040
1041 arglist: [arg_group] [';' arg_group]
1042 """
1043 n = p_node.NumChildren()
1044 if n == 0:
1045 return
1046
1047 p0 = p_node.GetChild(0)
1048 i = 0
1049 if ISNONTERMINAL(p0.typ):
1050 self._ArgGroup(p0, False, arglist)
1051 i += 1
1052
1053 if n >= 2:
1054 arglist.semi_tok = p_node.GetChild(i).tok
1055 self._ArgGroup(p_node.GetChild(i + 1), True, arglist)
1056
1057 def ToArgList(self, pnode, arglist):
1058 # type: (PNode, ArgList) -> None
1059 """
1060 ysh_eager_arglist: '(' [arglist] ')'
1061 ysh_lazy_arglist: '[' [arglist] ']'
1062 """
1063 if pnode.NumChildren() == 2: # f()
1064 return
1065
1066 assert pnode.NumChildren() == 3
1067 p = pnode.GetChild(1) # the X in '( X )'
1068
1069 assert p.typ == grammar_nt.arglist
1070 self._ArgList(p, arglist)
1071
1072 def _TypeExpr(self, pnode):
1073 # type: (PNode) -> TypeExpr
1074 """
1075 type_expr: Expr_Name [ '[' type_expr (',' type_expr)* ']' ]
1076 """
1077 assert pnode.typ == grammar_nt.type_expr, pnode.typ
1078
1079 #self.p_printer.Print(pnode)
1080
1081 ty = TypeExpr.CreateNull() # don't allocate children
1082
1083 ty.tok = pnode.GetChild(0).tok
1084 ty.name = lexer.TokenVal(ty.tok)
1085
1086 n = pnode.NumChildren()
1087 if n == 1:
1088 return ty
1089
1090 ty.params = []
1091 i = 2
1092 while i < n:
1093 p = self._TypeExpr(pnode.GetChild(i))
1094 ty.params.append(p)
1095 i += 2 # skip comma
1096
1097 return ty
1098
1099 def _Param(self, pnode):
1100 # type: (PNode) -> Param
1101 """
1102 param: Expr_Name [type_expr] ['=' expr]
1103 """
1104 assert pnode.typ == grammar_nt.param
1105
1106 name_tok = pnode.GetChild(0).tok
1107 n = pnode.NumChildren()
1108
1109 assert name_tok.id == Id.Expr_Name, name_tok
1110
1111 default_val = None # type: expr_t
1112 type_ = None # type: TypeExpr
1113
1114 #self.p_printer.Print(pnode)
1115
1116 if n == 1:
1117 # proc p(a)
1118 pass
1119
1120 elif n == 2:
1121 # proc p(a Int)
1122 type_ = self._TypeExpr(pnode.GetChild(1))
1123
1124 elif n == 3:
1125 # proc p(a = 3)
1126 default_val = self.Expr(pnode.GetChild(2))
1127
1128 elif n == 4:
1129 # proc p(a Int = 3)
1130 type_ = self._TypeExpr(pnode.GetChild(1))
1131 default_val = self.Expr(pnode.GetChild(3))
1132
1133 return Param(name_tok, lexer.TokenVal(name_tok), type_, default_val)
1134
    def _ParamGroup(self, p_node):
        # type: (PNode) -> ParamGroup
        """
        param_group:
          (param ',')*
          [ (param | '...' Expr_Name) [,] ]

        Returns the regular params plus an optional rest (...) param.
        """
        assert p_node.typ == grammar_nt.param_group, p_node

        #self.p_printer.Print(p_node)

        params = []  # type: List[Param]
        rest_of = None  # type: Optional[RestParam]

        n = p_node.NumChildren()
        i = 0
        while i < n:
            child = p_node.GetChild(i)
            if child.typ == grammar_nt.param:
                params.append(self._Param(child))
            elif child.tok.id == Id.Expr_Ellipsis:
                # '...' is followed by the rest-param's name.
                tok = p_node.GetChild(i + 1).tok
                rest_of = RestParam(tok, lexer.TokenVal(tok))
            i += 2  # step over the ',' separator
            #log('i %d n %d', i, n)

        return ParamGroup(params, rest_of)
1162
1163 def Proc(self, p_node):
1164 # type: (PNode) -> proc_sig_t
1165 """
1166 ysh_proc: (
1167 [ '('
1168 [ param_group ] # word params, with defaults
1169 [ ';' [ param_group ] ] # positional typed params, with defaults
1170 [ ';' [ param_group ] ] # named params, with defaults
1171 [ ';' Expr_Name ] # optional block param, with no type or default
1172 ')'
1173 ]
1174 '{' # opening { for pgen2
1175 )
1176 """
1177 typ = p_node.typ
1178 assert typ == grammar_nt.ysh_proc
1179
1180 #self.p_printer.Print(p_node)
1181
1182 n = p_node.NumChildren()
1183 if n == 1: # proc f {
1184 return proc_sig.Open
1185
1186 if n == 3: # proc f () {
1187 sig = proc_sig.Closed.CreateNull(alloc_lists=True) # no params
1188
1189 # proc f( three param groups, and block group )
1190 sig = proc_sig.Closed.CreateNull(alloc_lists=True) # no params
1191
1192 # Word args
1193 i = 1
1194 child = p_node.GetChild(i)
1195 if child.typ == grammar_nt.param_group:
1196 sig.word = self._ParamGroup(p_node.GetChild(i))
1197
1198 # Validate word args
1199 for word in sig.word.params:
1200 if word.type:
1201 if word.type.name not in ('Str', 'Ref'):
1202 p_die('Word params may only have type Str or Ref',
1203 word.type.tok)
1204 if word.type.params is not None:
1205 p_die('Unexpected type parameters', word.type.tok)
1206
1207 i += 2
1208 else:
1209 i += 1
1210
1211 #log('i %d n %d', i, n)
1212 if i >= n:
1213 return sig
1214
1215 # Positional args
1216 child = p_node.GetChild(i)
1217 if child.typ == grammar_nt.param_group:
1218 sig.positional = self._ParamGroup(p_node.GetChild(i))
1219 i += 2
1220 else:
1221 i += 1
1222
1223 #log('i %d n %d', i, n)
1224 if i >= n:
1225 return sig
1226
1227 # Keyword args
1228 child = p_node.GetChild(i)
1229 if child.typ == grammar_nt.param_group:
1230 sig.named = self._ParamGroup(p_node.GetChild(i))
1231 i += 2
1232 else:
1233 i += 1
1234
1235 #log('i %d n %d', i, n)
1236 if i >= n:
1237 return sig
1238
1239 child = p_node.GetChild(i)
1240 if child.typ == grammar_nt.param_group:
1241 group = self._ParamGroup(p_node.GetChild(i))
1242 params = group.params
1243 if len(params) > 1:
1244 p_die('Only 1 block param is allowed', params[1].blame_tok)
1245 if group.rest_of:
1246 p_die("Rest param isn't allowed for blocks",
1247 group.rest_of.blame_tok)
1248
1249 if len(params) == 1:
1250 if params[0].type:
1251 if params[0].type.name != 'Command':
1252 p_die('Block param must have type Command',
1253 params[0].type.tok)
1254 if params[0].type.params is not None:
1255 p_die('Unexpected type parameters', params[0].type.tok)
1256
1257 sig.block_param = params[0]
1258
1259 return sig
1260
1261 def YshFunc(self, p_node, out):
1262 # type: (PNode, Func) -> None
1263 """Parse tree to LST
1264
1265 ysh_func: Expr_Name '(' [param_group] [';' param_group] ')'
1266 """
1267 assert p_node.typ == grammar_nt.ysh_func
1268
1269 #self.p_printer.Print(p_node)
1270
1271 out.name = p_node.GetChild(0).tok
1272
1273 n = p_node.NumChildren()
1274 i = 2 # after (
1275
1276 child = p_node.GetChild(i)
1277 if child.typ == grammar_nt.param_group:
1278 out.positional = self._ParamGroup(child)
1279 i += 2 # skip past ;
1280 else:
1281 i += 1
1282
1283 if i >= n:
1284 return
1285
1286 child = p_node.GetChild(i)
1287 if child.typ == grammar_nt.param_group:
1288 out.named = self._ParamGroup(child)
1289
1290 #
1291 # Regex Language
1292 #
1293
    def _RangeChar(self, p_node):
        # type: (PNode) -> Token
        r"""An endpoint of a range (single char)

        range_char: Expr_Name | Expr_DecInt | sq_string | char_literal
                    a-z 0-9 'a'-'z' \x00-\xff

        Returns the Token holding the single character; dies with
        RANGE_POINT_TOO_LONG when the endpoint has more than one char.
        """
        assert p_node.typ == grammar_nt.range_char, p_node
        typ = p_node.GetChild(0).typ
        if ISNONTERMINAL(typ):
            # 'a' in 'a'-'b'
            if typ == grammar_nt.sq_string:
                # The PNode .tok slot carries a SingleQuoted word part here,
                # hence the cast.
                sq_part = cast(SingleQuoted,
                               p_node.GetChild(0).GetChild(1).tok)
                tokens = sq_part.tokens
                # Can happen with multiline single-quoted strings
                if len(tokens) > 1:
                    p_die(RANGE_POINT_TOO_LONG, loc.WordPart(sq_part))
                if tokens[0].length > 1:
                    p_die(RANGE_POINT_TOO_LONG, loc.WordPart(sq_part))
                return tokens[0]

            if typ == grammar_nt.char_literal:
                tok = p_node.GetChild(0).GetChild(0).tok
                return tok

            raise NotImplementedError()
        else:
            # Expr_Name or Expr_DecInt
            tok = p_node.tok
            if tok.id in (Id.Expr_Name, Id.Expr_DecInt):
                # For the a in a-z, 0 in 0-9
                if tok.length != 1:
                    p_die(RANGE_POINT_TOO_LONG, tok)
                return tok

            raise NotImplementedError()
1331
    def _NonRangeChars(self, p_node):
        # type: (PNode) -> class_literal_term_t
        r"""A range_char that stands alone (not part of a '-' range).

        Examples: \" \u1234 '#', as well as bare names looked up as
        POSIX/Perl classes.
        """
        assert p_node.typ == grammar_nt.range_char, p_node
        typ = p_node.GetChild(0).typ
        if ISNONTERMINAL(typ):
            p_child = p_node.GetChild(0)
            if typ == grammar_nt.sq_string:
                # The PNode .tok slot carries a SingleQuoted word part here
                return cast(SingleQuoted, p_child.GetChild(1).tok)

            if typ == grammar_nt.char_literal:
                return class_literal_term.CharLiteral(p_node.GetChild(0).tok)

            raise NotImplementedError()
        else:
            # Look up PerlClass and PosixClass
            return self._NameInClass(None, p_node.GetChild(0).tok)
1349
    def _ClassLiteralTerm(self, p_node):
        # type: (PNode) -> class_literal_term_t
        """Parse tree to LST

        class_literal_term:
            range_char ['-' range_char ]
          | '@' Expr_Name  # splice
          | '!' Expr_Name  # negate char class
          ...
        """
        assert p_node.typ == grammar_nt.class_literal_term, p_node

        first = p_node.GetChild(0)
        typ = first.typ

        if ISNONTERMINAL(typ):
            n = p_node.NumChildren()

            # A single char / class name, e.g. 'a'
            if n == 1 and typ == grammar_nt.range_char:
                return self._NonRangeChars(p_node.GetChild(0))

            # 'a'-'z' etc.
            if n == 3 and p_node.GetChild(1).tok.id == Id.Arith_Minus:
                start = self._RangeChar(p_node.GetChild(0))
                end = self._RangeChar(p_node.GetChild(2))
                return class_literal_term.Range(start, end)

        else:
            # @name splices a previously-defined Eggex's chars
            if first.tok.id == Id.Expr_At:
                tok1 = p_node.GetChild(1).tok
                return class_literal_term.Splice(tok1, lexer.TokenVal(tok1))

            # !name negates a class, e.g. !digit
            if first.tok.id == Id.Expr_Bang:
                return self._NameInClass(
                    p_node.GetChild(0).tok,
                    p_node.GetChild(1).tok)

            raise AssertionError(p_node.GetChild(0).tok.id)

        # Fell through the nonterminal arm: unhandled grammar production
        nt_name = self.number2symbol[typ]
        raise NotImplementedError(nt_name)
1391
1392 def _ClassLiteral(self, p_node):
1393 # type: (PNode) -> List[class_literal_term_t]
1394 """class_literal: '[' class_literal_term+ ']'."""
1395 assert p_node.typ == grammar_nt.class_literal
1396 # skip [ and ]
1397 terms = [] # type: List[class_literal_term_t]
1398 for i in xrange(1, p_node.NumChildren() - 1):
1399 terms.append(self._ClassLiteralTerm(p_node.GetChild(i)))
1400
1401 return terms
1402
1403 def _NameInRegex(self, negated_tok, tok):
1404 # type: (Token, Token) -> re_t
1405 tok_str = lexer.TokenVal(tok)
1406 if tok_str == 'dot':
1407 if negated_tok:
1408 p_die("Can't negate this symbol", tok)
1409 return tok
1410
1411 if tok_str in POSIX_CLASSES:
1412 return PosixClass(negated_tok, tok_str)
1413
1414 perl = PERL_CLASSES.get(tok_str)
1415 if perl is not None:
1416 return PerlClass(negated_tok, perl)
1417
1418 if tok_str[0].isupper(): # e.g. HexDigit
1419 return re.Splice(tok, lexer.TokenVal(tok))
1420
1421 p_die("%r isn't a character class" % tok_str, tok)
1422
1423 def _NameInClass(self, negated_tok, tok):
1424 # type: (Token, Token) -> class_literal_term_t
1425 """Like the above, but 'dot' doesn't mean anything.
1426
1427 And `d` is a literal 'd', not `digit`.
1428 """
1429 tok_str = lexer.TokenVal(tok)
1430
1431 # A bare, unquoted character literal. In the grammar, this is expressed as
1432 # range_char without an ending.
1433
1434 # d is NOT 'digit', it's a literal 'd'!
1435 if len(tok_str) == 1:
1436 # Expr_Name matches VAR_NAME_RE, which starts with [a-zA-Z_]
1437 assert tok.id in (Id.Expr_Name, Id.Expr_DecInt)
1438
1439 if negated_tok: # [~d] is not allowed, only [~digit]
1440 p_die("Can't negate this symbol", tok)
1441 return class_literal_term.CharLiteral(tok)
1442
1443 # digit, word, but not d, w, etc.
1444 if tok_str in POSIX_CLASSES:
1445 return PosixClass(negated_tok, tok_str)
1446
1447 perl = PERL_CLASSES.get(tok_str)
1448 if perl is not None:
1449 return PerlClass(negated_tok, perl)
1450 p_die("%r isn't a character class" % tok_str, tok)
1451
    def _ReAtom(self, p_atom):
        # type: (PNode) -> re_t
        """One atom of an Eggex: literal, class, splice, group, capture, etc.

        re_atom: ( char_literal | ...
        """
        assert p_atom.typ == grammar_nt.re_atom, p_atom.typ

        typ = p_atom.GetChild(0).typ

        if ISNONTERMINAL(typ):
            p_child = p_atom.GetChild(0)
            if typ == grammar_nt.class_literal:
                return re.CharClassLiteral(False, self._ClassLiteral(p_child))

            if typ == grammar_nt.sq_string:
                # The PNode .tok slot carries a SingleQuoted word part here
                return cast(SingleQuoted, p_child.GetChild(1).tok)

            if typ == grammar_nt.char_literal:
                return p_atom.GetChild(0).tok

            raise NotImplementedError(typ)

        else:
            tok = p_atom.GetChild(0).tok

            # Special punctuation
            if tok.id in (Id.Expr_Dot, Id.Arith_Caret, Id.Expr_Dollar):
                return tok

            # TODO: d digit can turn into PosixClass and PerlClass right here!
            # It's parsing.
            if tok.id == Id.Expr_Name:
                return self._NameInRegex(None, tok)

            if tok.id == Id.Expr_Symbol:
                # Validate symbols here, like we validate PerlClass, etc.
                tok_str = lexer.TokenVal(tok)
                if tok_str in ('%start', '%end', 'dot'):
                    return tok
                p_die("Unexpected token %r in regex" % tok_str, tok)

            if tok.id == Id.Expr_At:
                # | '@' Expr_Name
                tok = p_atom.GetChild(1).tok
                return re.Splice(tok, lexer.TokenVal(tok))

            if tok.id == Id.Expr_Bang:
                # | '!' (Expr_Name | class_literal)
                # | '!' '!' Expr_Name (Expr_Name | Expr_DecInt | '(' regex ')')
                n = p_atom.NumChildren()
                if n == 2:
                    typ = p_atom.GetChild(1).typ
                    if ISNONTERMINAL(typ):
                        # negated class literal, e.g. ![a-z]
                        return re.CharClassLiteral(
                            True, self._ClassLiteral(p_atom.GetChild(1)))
                    else:
                        # negated name, e.g. !digit
                        return self._NameInRegex(tok, p_atom.GetChild(1).tok)
                else:
                    # Note: !! conflicts with shell history
                    p_die(
                        "Backtracking with !! isn't implemented (requires Python/PCRE)",
                        p_atom.GetChild(1).tok)

            if tok.id == Id.Op_LParen:
                # | '(' regex ')'

                # Note: in ERE (d+) is the same as <d+>. That is, Group becomes
                # Capture.
                return re.Group(self._Regex(p_atom.GetChild(1)))

            if tok.id == Id.Arith_Less:
                # | '<' 'capture' regex ['as' Expr_Name] [':' Expr_Name] '>'

                n = p_atom.NumChildren()
                # 4, 6, or 8 children depending on which optional parts appear
                assert n == 4 or n == 6 or n == 8, n

                # < capture d+ >
                regex = self._Regex(p_atom.GetChild(2))

                as_name = None  # type: Optional[Token]
                func_name = None  # type: Optional[Token]

                i = 3  # points at any of > as :

                tok = p_atom.GetChild(i).tok
                if tok.id == Id.Expr_As:
                    as_name = p_atom.GetChild(i + 1).tok
                    i += 2

                tok = p_atom.GetChild(i).tok
                if tok.id == Id.Arith_Colon:
                    func_name = p_atom.GetChild(i + 1).tok

                return re.Capture(regex, as_name, func_name)

            if tok.id == Id.Arith_Colon:
                # | ':' '(' regex ')'
                raise NotImplementedError(Id_str(tok.id))

            raise NotImplementedError(Id_str(tok.id))
1552
1553 def _RepeatOp(self, p_repeat):
1554 # type: (PNode) -> re_repeat_t
1555 assert p_repeat.typ == grammar_nt.repeat_op, p_repeat
1556
1557 tok = p_repeat.GetChild(0).tok
1558 id_ = tok.id
1559 # a+
1560 if id_ in (Id.Arith_Plus, Id.Arith_Star, Id.Arith_QMark):
1561 return re_repeat.Op(tok)
1562
1563 if id_ == Id.Op_LBrace:
1564 p_range = p_repeat.GetChild(1)
1565 assert p_range.typ == grammar_nt.repeat_range, p_range
1566
1567 # repeat_range: (
1568 # Expr_DecInt [',']
1569 # | ',' Expr_DecInt
1570 # | Expr_DecInt ',' Expr_DecInt
1571 # )
1572
1573 n = p_range.NumChildren()
1574 if n == 1: # {3}
1575 return re_repeat.Num(p_range.GetChild(0).tok)
1576
1577 if n == 2:
1578 if p_range.GetChild(0).tok.id == Id.Expr_DecInt: # {,3}
1579 return re_repeat.Range(p_range.GetChild(0).tok, None)
1580 else: # {1,}
1581 return re_repeat.Range(None, p_range.GetChild(1).tok)
1582
1583 if n == 3: # {1,3}
1584 return re_repeat.Range(
1585 p_range.GetChild(0).tok,
1586 p_range.GetChild(2).tok)
1587
1588 raise AssertionError(n)
1589
1590 raise AssertionError(id_)
1591
    def _Regex(self, p_node):
        # type: (PNode) -> re_t
        """Transform a regex parse tree into an re_t LST node.

        Handles the 'regex' (alternation) and 're_alt' (sequence with
        optional repetition) nonterminals; anything else is unimplemented.
        """
        typ = p_node.typ

        if typ == grammar_nt.regex:
            # regex: [re_alt] (('|'|'or') re_alt)*

            if p_node.NumChildren() == 1:
                # Single alternative: no Alt wrapper needed
                return self._Regex(p_node.GetChild(0))

            # NOTE: We're losing the | and or operators
            alts = []  # type: List[re_t]
            n = p_node.NumChildren()
            for i in xrange(0, n, 2):  # was children[::2]
                c = p_node.GetChild(i)
                alts.append(self._Regex(c))
            return re.Alt(alts)

        if typ == grammar_nt.re_alt:
            # re_alt: (re_atom [repeat_op])+
            i = 0
            n = p_node.NumChildren()
            seq = []  # type: List[re_t]
            while i < n:
                r = self._ReAtom(p_node.GetChild(i))
                i += 1
                # A repeat_op binds to the atom immediately before it
                if i < n and p_node.GetChild(i).typ == grammar_nt.repeat_op:
                    repeat_op = self._RepeatOp(p_node.GetChild(i))
                    r = re.Repeat(r, repeat_op)
                    i += 1
                seq.append(r)

            # Avoid a one-element Seq wrapper
            if len(seq) == 1:
                return seq[0]
            else:
                return re.Seq(seq)

        nt_name = self.number2symbol[typ]
        raise NotImplementedError(nt_name)
1631
1632
1633# vim: sw=4