OILS / ysh / expr_to_ast.py View on Github | oilshell.org

1669 lines, 1006 significant
1"""expr_to_ast.py."""
2from __future__ import print_function
3
4from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str, Kind
5from _devbuild.gen.syntax_asdl import (
6 Token,
7 SimpleVarSub,
8 loc,
9 loc_t,
10 DoubleQuoted,
11 SingleQuoted,
12 BracedVarSub,
13 CommandSub,
14 ShArrayLiteral,
15 command,
16 expr,
17 expr_e,
18 expr_t,
19 expr_context_e,
20 re,
21 re_t,
22 re_repeat,
23 re_repeat_t,
24 class_literal_term,
25 class_literal_term_t,
26 PosixClass,
27 PerlClass,
28 NameType,
29 y_lhs_t,
30 Comprehension,
31 Subscript,
32 Attribute,
33 proc_sig,
34 proc_sig_t,
35 Param,
36 RestParam,
37 ParamGroup,
38 NamedArg,
39 ArgList,
40 pat,
41 pat_t,
42 TypeExpr,
43 Func,
44 Eggex,
45 EggexFlag,
46 CharCode,
47 CharRange,
48)
49from _devbuild.gen.value_asdl import value, value_t
50from _devbuild.gen import grammar_nt
51from core.error import p_die
52from core import num
53from frontend import consts
54from frontend import lexer
55from mycpp import mops
56from mycpp import mylib
57from mycpp.mylib import log, tagswitch
58from osh import word_compile
59from ysh import expr_parse
60from ysh import regex_translate
61
62from typing import TYPE_CHECKING, Dict, List, Tuple, Optional, cast
63if TYPE_CHECKING:
64 from pgen2.grammar import Grammar
65 from pgen2.pnode import PNode
66
67_ = log
68
69PERL_CLASSES = {
70 'd': 'd',
71 'w': 'w',
72 'word': 'w',
73 's': 's',
74}
75# https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html
76POSIX_CLASSES = [
77 'alnum',
78 'cntrl',
79 'lower',
80 'space',
81 'alpha',
82 'digit',
83 'print',
84 'upper',
85 'blank',
86 'graph',
87 'punct',
88 'xdigit',
89]
90# NOTE: There are also things like \p{Greek} that we could put in the
91# "non-sigil" namespace.
92
93RANGE_POINT_TOO_LONG = "Range start/end shouldn't have more than one character"
94
95# Copied from pgen2/token.py to avoid dependency.
96NT_OFFSET = 256
97
98if mylib.PYTHON:
99
100 def MakeGrammarNames(ysh_grammar):
101 # type: (Grammar) -> Dict[int, str]
102
103 # TODO: Break this dependency
104 from frontend import lexer_def
105
106 names = {}
107
108 #from _devbuild.gen.id_kind_asdl import _Id_str
109 # This is a dictionary
110
111 # _Id_str()
112
113 for id_name, k in lexer_def.ID_SPEC.id_str2int.items():
114 # Hm some are out of range
115 #assert k < 256, (k, id_name)
116
117 # HACK: Cut it off at 256 now! Expr/Arith/Op doesn't go higher than
118 # that. TODO: Change NT_OFFSET? That might affect C code though.
119 # Best to keep everything fed to pgen under 256. This only affects
120 # pretty printing.
121 if k < 256:
122 names[k] = id_name
123
124 for k, v in ysh_grammar.number2symbol.items():
125 # eval_input == 256. Remove?
126 assert k >= 256, (k, v)
127 names[k] = v
128
129 return names
130
131
132def ISNONTERMINAL(x):
133 # type: (int) -> bool
134 assert isinstance(x, int), x
135 return x >= NT_OFFSET
136
137
138class Transformer(object):
139 """Homogeneous parse tree -> heterogeneous AST ("lossless syntax tree")
140
141 pgen2 (Python's LL parser generator) doesn't have semantic actions like yacc,
142 so this "transformer" is the equivalent.
143
144 Files to refer to when modifying this function:
145
146 ysh/grammar.pgen2 (generates _devbuild/gen/grammar_nt.py)
147 frontend/syntax.asdl (generates _devbuild/gen/syntax_asdl.py)
148
149 Related examples:
150
151 opy/compiler2/transformer.py (Python's parse tree -> AST, ~1500 lines)
152 Python-2.7.13/Python/ast.c (the "real" CPython version, ~3600 lines)
153
154 Other:
155 frontend/parse_lib.py (turn on print_parse_tree)
156
157 Public methods:
158 Expr, VarDecl
159 atom, trailer, etc. are private, named after productions in grammar.pgen2.
160 """
161
162 def __init__(self, gr):
163 # type: (Grammar) -> None
164 self.number2symbol = gr.number2symbol
165 if mylib.PYTHON:
166 names = MakeGrammarNames(gr)
167 # print raw nodes
168 self.p_printer = expr_parse.ParseTreePrinter(names)
169
170 def _LeftAssoc(self, p_node):
171 # type: (PNode) -> expr_t
172 """For an associative binary operation.
173
174 Examples:
175 xor_expr: and_expr ('xor' and_expr)*
176 term: factor (('*'|'/'|'%'|'div') factor)*
177
178 3 - 1 - 2 must be grouped as ((3 - 1) - 2).
179 """
180 # Note: Compare the iteractive com_binary() method in
181 # opy/compiler2/transformer.py.
182
183 # Examples:
184 # - The PNode for '3 - 1' will have 3 children
185 # - The PNode for '3 - 1 - 2' will have 5 children
186
187 #self.p_printer.Print(p_node)
188
189 i = 1 # index of the operator
190 n = p_node.NumChildren()
191
192 left = self.Expr(p_node.GetChild(0))
193 while i < n:
194 op = p_node.GetChild(i)
195 right = self.Expr(p_node.GetChild(i + 1))
196
197 # create a new left node
198 left = expr.Binary(op.tok, left, right)
199 i += 2
200
201 return left
202
203 def _Trailer(self, base, p_trailer):
204 # type: (expr_t, PNode) -> expr_t
205 """
206 Trailer: ( '(' [arglist] ')' | '[' subscriptlist ']'
207 | '.' NAME | '->' NAME | '::' NAME
208 )
209 """
210 op_tok = p_trailer.GetChild(0).tok
211
212 if op_tok.id == Id.Op_LParen:
213 lparen = op_tok
214 rparen = p_trailer.GetChild(-1).tok
215 arglist = ArgList(lparen, [], None, [], rparen)
216 if p_trailer.NumChildren() == 2: # ()
217 return expr.FuncCall(base, arglist)
218
219 p = p_trailer.GetChild(1) # the X in ( X )
220 assert p.typ == grammar_nt.arglist # f(x, y)
221 self._ArgList(p, arglist)
222 return expr.FuncCall(base, arglist)
223
224 if op_tok.id == Id.Op_LBracket:
225 p_args = p_trailer.GetChild(1)
226 assert p_args.typ == grammar_nt.subscriptlist
227 n = p_args.NumChildren()
228 if n > 1:
229 p_die("Only 1 subscript is accepted", p_args.GetChild(1).tok)
230
231 a = p_args.GetChild(0)
232 return Subscript(op_tok, base, self._Subscript(a))
233
234 if op_tok.id in (Id.Expr_Dot, Id.Expr_RArrow, Id.Expr_RDArrow):
235 attr = p_trailer.GetChild(1).tok # will be Id.Expr_Name
236 return Attribute(base, op_tok, attr, lexer.TokenVal(attr),
237 expr_context_e.Store)
238
239 raise AssertionError(Id_str(op_tok.id))
240
241 def _DictPair(self, p_node):
242 # type: (PNode) -> Tuple[expr_t, expr_t]
243 """dict_pair: ( Expr_Name [':' test] |
244
245 '[' testlist ']' ':' test )
246 """
247 assert p_node.typ == grammar_nt.dict_pair
248
249 typ = p_node.GetChild(0).typ
250
251 if ISNONTERMINAL(typ): # for sq_string
252 # Note: Could inline these cases instead of going through self.Expr.
253 if typ == grammar_nt.sq_string:
254 key = self.Expr(p_node.GetChild(0)) # type: expr_t
255 elif typ == grammar_nt.dq_string:
256 key = self.Expr(p_node.GetChild(0))
257
258 val = self.Expr(p_node.GetChild(2))
259 return key, val
260
261 tok0 = p_node.GetChild(0).tok
262 id_ = tok0.id
263
264 if id_ == Id.Expr_Name:
265 key_str = value.Str(lexer.TokenVal(tok0))
266 key = expr.Const(tok0, key_str)
267 if p_node.NumChildren() >= 3:
268 val = self.Expr(p_node.GetChild(2))
269 else:
270 val = expr.Implicit
271
272 if id_ == Id.Op_LBracket: # {[x+y]: 'val'}
273 key = self.Expr(p_node.GetChild(1))
274 val = self.Expr(p_node.GetChild(4))
275 return key, val
276
277 return key, val
278
279 def _Dict(self, parent, p_node):
280 # type: (PNode, PNode) -> expr.Dict
281 """Parse tree to LST
282
283 dict: dict_pair (',' dict_pair)* [',']
284 dict2: dict_pair (comma_newline dict_pair)* [comma_newline]
285 """
286 if not ISNONTERMINAL(p_node.typ):
287 assert p_node.tok.id == Id.Op_RBrace
288 return expr.Dict(parent.tok, [], [])
289
290 #assert p_node.typ == grammar_nt.dict
291
292 keys = [] # type: List[expr_t]
293 values = [] # type: List[expr_t]
294
295 n = p_node.NumChildren()
296 for i in xrange(0, n, 2):
297 key, val = self._DictPair(p_node.GetChild(i))
298 keys.append(key)
299 values.append(val)
300
301 return expr.Dict(parent.tok, keys, values)
302
303 def _Tuple(self, parent):
304 # type: (PNode) -> expr_t
305
306 n = parent.NumChildren()
307
308 # (x) -- not a tuple
309 if n == 1:
310 return self.Expr(parent.GetChild(0))
311
312 # x, and (x,) aren't allowed
313 if n == 2:
314 p_die('Invalid trailing comma', parent.GetChild(1).tok)
315
316 elts = [] # type: List[expr_t]
317 for i in xrange(0, n, 2): # skip commas
318 p_node = parent.GetChild(i)
319 elts.append(self.Expr(p_node))
320
321 return expr.Tuple(parent.tok, elts,
322 expr_context_e.Store) # unused expr_context_e
323
324 def _TestlistComp(self, parent, p_node, id0):
325 # type: (PNode, PNode, Id_t) -> expr_t
326 """Parse tree to LST
327
328 testlist_comp:
329 (test|star_expr) ( comp_for | (',' (test|star_expr))* [','] )
330 """
331 assert p_node.typ == grammar_nt.testlist_comp
332 n = p_node.NumChildren()
333 if n > 1 and p_node.GetChild(1).typ == grammar_nt.comp_for:
334 elt = self.Expr(p_node.GetChild(0))
335 comp = self._CompFor(p_node.GetChild(1))
336 if id0 == Id.Op_LParen:
337 return expr.GeneratorExp(elt, [comp])
338 if id0 == Id.Op_LBracket:
339 return expr.ListComp(parent.tok, elt, [comp])
340 raise AssertionError()
341
342 if id0 == Id.Op_LParen:
343 if n == 1: # parenthesized expression like (x+1) or (x)
344 return self.Expr(p_node.GetChild(0))
345
346 # (1,) (1, 2) etc.
347 if p_node.GetChild(1).tok.id == Id.Arith_Comma:
348 return self._Tuple(p_node)
349
350 raise NotImplementedError('testlist_comp')
351
352 if id0 == Id.Op_LBracket:
353 elts = [] # type: List[expr_t]
354 for i in xrange(0, n, 2): # skip commas
355 elts.append(self.Expr(p_node.GetChild(i)))
356
357 return expr.List(parent.tok, elts,
358 expr_context_e.Store) # unused expr_context_e
359
360 raise AssertionError(Id_str(id0))
361
362 def _Atom(self, parent):
363 # type: (PNode) -> expr_t
364 """Handle alternatives of 'atom' where there's more than one child."""
365
366 tok = parent.GetChild(0).tok
367 id_ = tok.id
368 n = parent.NumChildren()
369
370 if id_ == Id.Op_LParen:
371 # atom: '(' [yield_expr|testlist_comp] ')' | ...
372 if n == 2: # () is a tuple
373 assert (parent.GetChild(1).tok.id == Id.Op_RParen
374 ), parent.GetChild(1)
375 return expr.Tuple(tok, [], expr_context_e.Store)
376
377 return self._TestlistComp(parent, parent.GetChild(1), id_)
378
379 if id_ == Id.Op_LBracket:
380 # atom: ... | '[' [testlist_comp] ']' | ...
381
382 if n == 2: # []
383 assert (parent.GetChild(1).tok.id == Id.Op_RBracket
384 ), parent.GetChild(1)
385 return expr.List(tok, [],
386 expr_context_e.Store) # unused expr_context_e
387
388 return self._TestlistComp(parent, parent.GetChild(1), id_)
389
390 if id_ == Id.Op_LBrace:
391 # atom: ... | '{' [Op_Newline] [dict] '}'
392 i = 1
393 if parent.GetChild(i).tok.id == Id.Op_Newline:
394 i += 1
395 return self._Dict(parent, parent.GetChild(i))
396
397 if id_ == Id.Arith_Amp:
398 n = parent.NumChildren()
399 if n >= 3:
400 p_die("Places in containers not implemented yet",
401 parent.GetChild(2).tok)
402
403 name_tok = parent.GetChild(1).tok
404 return expr.Place(name_tok, lexer.TokenVal(name_tok), [])
405
406 if id_ == Id.Expr_Func:
407 # STUB. This should really be a Func, not Lambda.
408 return expr.Lambda([], expr.Implicit)
409
410 # 100 M
411 # Ignoring the suffix for now
412 if id_ == Id.Expr_DecInt:
413 assert n > 1
414 p_die("Units suffix not implemented", parent.GetChild(1).tok)
415 #return self.Expr(parent.GetChild(0))
416
417 # 100.5 M
418 # Ignoring the suffix for now
419 if id_ == Id.Expr_Float:
420 assert n > 1
421 p_die("unix suffix implemented", parent.GetChild(1).tok)
422 #return self.Expr(parent.GetChild(0))
423
424 raise NotImplementedError(Id_str(id_))
425
426 def _NameType(self, p_node):
427 # type: (PNode) -> NameType
428 """
429 name_type: Expr_Name [':'] [type_expr]
430 """
431 name_tok = p_node.GetChild(0).tok
432 typ = None # type: Optional[TypeExpr]
433
434 n = p_node.NumChildren()
435 if n == 2:
436 typ = self._TypeExpr(p_node.GetChild(1))
437 if n == 3:
438 typ = self._TypeExpr(p_node.GetChild(2))
439
440 return NameType(name_tok, lexer.TokenVal(name_tok), typ)
441
442 def _NameTypeList(self, p_node):
443 # type: (PNode) -> List[NameType]
444 """name_type_list: name_type (',' name_type)*"""
445 assert p_node.typ == grammar_nt.name_type_list
446 results = [] # type: List[NameType]
447
448 n = p_node.NumChildren()
449 for i in xrange(0, n, 2): # was children[::2]
450 results.append(self._NameType(p_node.GetChild(i)))
451 return results
452
453 def _CompFor(self, p_node):
454 # type: (PNode) -> Comprehension
455 """comp_for: 'for' exprlist 'in' or_test ['if' or_test]"""
456 lhs = self._NameTypeList(p_node.GetChild(1))
457 iterable = self.Expr(p_node.GetChild(3))
458
459 if p_node.NumChildren() >= 6:
460 cond = self.Expr(p_node.GetChild(5))
461 else:
462 cond = None
463
464 return Comprehension(lhs, iterable, cond)
465
466 def _CompareChain(self, parent):
467 # type: (PNode) -> expr_t
468 """comparison: expr (comp_op expr)*"""
469 cmp_ops = [] # type: List[Token]
470 comparators = [] # type: List[expr_t]
471 left = self.Expr(parent.GetChild(0))
472
473 i = 1
474 n = parent.NumChildren()
475 while i < n:
476 p = parent.GetChild(i)
477 op = p.GetChild(0).tok
478 if p.NumChildren() == 2:
479 # Blame the first token, and change its type
480 if op.id == Id.Expr_Not: # not in
481 op.id = Id.Node_NotIn
482 elif op.id == Id.Expr_Is: # is not
483 op.id = Id.Node_IsNot
484 else:
485 raise AssertionError()
486 else:
487 # is, <, ==, etc.
488 pass
489
490 cmp_ops.append(op)
491 i += 1
492 comparators.append(self.Expr(parent.GetChild(i)))
493 i += 1
494 return expr.Compare(left, cmp_ops, comparators)
495
496 def _Subscript(self, parent):
497 # type: (PNode) -> expr_t
498 """subscript: expr | [expr] ':' [expr]"""
499 typ0 = parent.GetChild(0).typ
500
501 n = parent.NumChildren()
502
503 if ISNONTERMINAL(typ0):
504 if n == 3: # a[1:2]
505 lower = self.Expr(parent.GetChild(0))
506 upper = self.Expr(parent.GetChild(2))
507 elif n == 2: # a[1:]
508 lower = self.Expr(parent.GetChild(0))
509 upper = None
510 else: # a[1]
511 return self.Expr(parent.GetChild(0))
512 else:
513 assert parent.GetChild(0).tok.id == Id.Arith_Colon
514 lower = None
515 if n == 1: # a[:]
516 upper = None
517 else: # a[:3]
518 upper = self.Expr(parent.GetChild(1))
519 return expr.Slice(lower, parent.GetChild(0).tok, upper)
520
521 def Expr(self, pnode):
522 # type: (PNode) -> expr_t
523 """Transform expressions (as opposed to statements)."""
524 typ = pnode.typ
525 tok = pnode.tok
526
527 if ISNONTERMINAL(typ):
528
529 #
530 # Oil Entry Points / Additions
531 #
532
533 if typ == grammar_nt.ysh_expr: # for if/while
534 # ysh_expr: '(' testlist ')'
535 return self.Expr(pnode.GetChild(1))
536
537 if typ == grammar_nt.command_expr:
538 # return_expr: testlist end_stmt
539 return self.Expr(pnode.GetChild(0))
540
541 #
542 # Python-like Expressions / Operators
543 #
544
545 if typ == grammar_nt.atom:
546 if pnode.NumChildren() == 1:
547 return self.Expr(pnode.GetChild(0))
548 return self._Atom(pnode)
549
550 if typ == grammar_nt.testlist:
551 # testlist: test (',' test)* [',']
552 return self._Tuple(pnode)
553
554 if typ == grammar_nt.test:
555 # test: or_test ['if' or_test 'else' test] | lambdef
556 if pnode.NumChildren() == 1:
557 return self.Expr(pnode.GetChild(0))
558
559 # TODO: Handle lambdef
560
561 test = self.Expr(pnode.GetChild(2))
562 body = self.Expr(pnode.GetChild(0))
563 orelse = self.Expr(pnode.GetChild(4))
564 return expr.IfExp(test, body, orelse)
565
566 if typ == grammar_nt.lambdef:
567 # lambdef: '|' [name_type_list] '|' test
568
569 n = pnode.NumChildren()
570 if n == 4:
571 params = self._NameTypeList(pnode.GetChild(1))
572 else:
573 params = []
574
575 body = self.Expr(pnode.GetChild(n - 1))
576 return expr.Lambda(params, body)
577
578 #
579 # Operators with Precedence
580 #
581
582 if typ == grammar_nt.or_test:
583 # or_test: and_test ('or' and_test)*
584 return self._LeftAssoc(pnode)
585
586 if typ == grammar_nt.and_test:
587 # and_test: not_test ('and' not_test)*
588 return self._LeftAssoc(pnode)
589
590 if typ == grammar_nt.not_test:
591 # not_test: 'not' not_test | comparison
592 if pnode.NumChildren() == 1:
593 return self.Expr(pnode.GetChild(0))
594
595 op_tok = pnode.GetChild(0).tok # not
596 return expr.Unary(op_tok, self.Expr(pnode.GetChild(1)))
597
598 elif typ == grammar_nt.comparison:
599 if pnode.NumChildren() == 1:
600 return self.Expr(pnode.GetChild(0))
601
602 return self._CompareChain(pnode)
603
604 elif typ == grammar_nt.range_expr:
605 n = pnode.NumChildren()
606 if n == 1:
607 return self.Expr(pnode.GetChild(0))
608
609 if n == 3:
610 return expr.Range(self.Expr(pnode.GetChild(0)),
611 pnode.GetChild(1).tok,
612 self.Expr(pnode.GetChild(2)))
613
614 raise AssertionError(n)
615
616 elif typ == grammar_nt.expr:
617 # expr: xor_expr ('|' xor_expr)*
618 return self._LeftAssoc(pnode)
619
620 if typ == grammar_nt.xor_expr:
621 # xor_expr: and_expr ('xor' and_expr)*
622 return self._LeftAssoc(pnode)
623
624 if typ == grammar_nt.and_expr: # a & b
625 # and_expr: shift_expr ('&' shift_expr)*
626 return self._LeftAssoc(pnode)
627
628 elif typ == grammar_nt.shift_expr:
629 # shift_expr: arith_expr (('<<'|'>>') arith_expr)*
630 return self._LeftAssoc(pnode)
631
632 elif typ == grammar_nt.arith_expr:
633 # arith_expr: term (('+'|'-') term)*
634 return self._LeftAssoc(pnode)
635
636 elif typ == grammar_nt.term:
637 # term: factor (('*'|'/'|'div'|'mod') factor)*
638 return self._LeftAssoc(pnode)
639
640 elif typ == grammar_nt.factor:
641 # factor: ('+'|'-'|'~') factor | power
642 # the power would have already been reduced
643 if pnode.NumChildren() == 1:
644 return self.Expr(pnode.GetChild(0))
645
646 assert pnode.NumChildren() == 2
647 op = pnode.GetChild(0)
648 e = pnode.GetChild(1)
649
650 assert isinstance(op.tok, Token)
651 return expr.Unary(op.tok, self.Expr(e))
652
653 elif typ == grammar_nt.power:
654 # power: atom trailer* ['**' factor]
655
656 node = self.Expr(pnode.GetChild(0))
657 if pnode.NumChildren() == 1: # No trailers
658 return node
659
660 # Support a->startswith(b) and mydict.key
661 n = pnode.NumChildren()
662 i = 1
663 while i < n and ISNONTERMINAL(pnode.GetChild(i).typ):
664 node = self._Trailer(node, pnode.GetChild(i))
665 i += 1
666
667 if i != n: # ['**' factor]
668 op_tok = pnode.GetChild(i).tok
669 assert op_tok.id == Id.Arith_DStar, op_tok
670 factor = self.Expr(pnode.GetChild(i + 1))
671 node = expr.Binary(op_tok, node, factor)
672
673 return node
674
675 elif typ == grammar_nt.literal_expr:
676 inner = self.Expr(pnode.GetChild(1))
677 return expr.Literal(inner)
678
679 elif typ == grammar_nt.eggex:
680 return self._Eggex(pnode)
681
682 elif typ == grammar_nt.ysh_expr_sub:
683 return self.Expr(pnode.GetChild(0))
684
685 #
686 # YSH Lexer Modes
687 #
688
689 elif typ == grammar_nt.sh_array_literal:
690 return cast(ShArrayLiteral, pnode.GetChild(1).tok)
691
692 elif typ == grammar_nt.old_sh_array_literal:
693 return cast(ShArrayLiteral, pnode.GetChild(1).tok)
694
695 elif typ == grammar_nt.sh_command_sub:
696 return cast(CommandSub, pnode.GetChild(1).tok)
697
698 elif typ == grammar_nt.braced_var_sub:
699 return cast(BracedVarSub, pnode.GetChild(1).tok)
700
701 elif typ == grammar_nt.dq_string:
702 s = cast(DoubleQuoted, pnode.GetChild(1).tok)
703 # sugar: ^"..." is short for ^["..."]
704 if pnode.GetChild(0).tok.id == Id.Left_CaretDoubleQuote:
705 return expr.Literal(s)
706 return s
707
708 elif typ == grammar_nt.sq_string:
709 return cast(SingleQuoted, pnode.GetChild(1).tok)
710
711 elif typ == grammar_nt.simple_var_sub:
712 tok = pnode.GetChild(0).tok
713
714 if tok.id == Id.VSub_DollarName: # $foo is disallowed
715 bare = lexer.TokenSliceLeft(tok, 1)
716 p_die(
717 'In expressions, remove $ and use `%s`, or sometimes "$%s"'
718 % (bare, bare), tok)
719
720 # $? is allowed
721 return SimpleVarSub(tok)
722
723 else:
724 nt_name = self.number2symbol[typ]
725 raise AssertionError("PNode type %d (%s) wasn't handled" %
726 (typ, nt_name))
727
728 else: # Terminals should have a token
729 id_ = tok.id
730
731 if id_ == Id.Expr_Name:
732 return expr.Var(tok, lexer.TokenVal(tok))
733
734 tok_str = lexer.TokenVal(tok)
735
736 # Remove underscores from 1_000_000. The lexer is responsible for
737 # validation.
738 c_under = tok_str.replace('_', '')
739
740 if id_ == Id.Expr_DecInt:
741 try:
742 cval = value.Int(mops.FromStr(c_under)) # type: value_t
743 except ValueError:
744 p_die('Decimal int constant is too large', tok)
745 elif id_ == Id.Expr_BinInt:
746 assert c_under[:2] in ('0b', '0B'), c_under
747 try:
748 cval = value.Int(mops.FromStr(c_under[2:], 2))
749 except ValueError:
750 p_die('Binary int constant is too large', tok)
751 elif id_ == Id.Expr_OctInt:
752 assert c_under[:2] in ('0o', '0O'), c_under
753 try:
754 cval = value.Int(mops.FromStr(c_under[2:], 8))
755 except ValueError:
756 p_die('Octal int constant is too large', tok)
757 elif id_ == Id.Expr_HexInt:
758 assert c_under[:2] in ('0x', '0X'), c_under
759 try:
760 cval = value.Int(mops.FromStr(c_under[2:], 16))
761 except ValueError:
762 p_die('Hex int constant is too large', tok)
763
764 elif id_ == Id.Expr_Float:
765 # Note: float() in mycpp/gc_builtins.cc currently uses strtod
766 # I think this never raises ValueError, because the lexer
767 # should only accept strings that strtod() does?
768 cval = value.Float(float(c_under))
769
770 elif id_ == Id.Expr_Null:
771 cval = value.Null
772 elif id_ == Id.Expr_True:
773 cval = value.Bool(True)
774 elif id_ == Id.Expr_False:
775 cval = value.Bool(False)
776
777 # What to do with the char constants?
778 # \n \u{3bc} #'a'
779 # Are they integers or strings?
780 #
781 # Integers could be ord(\n), or strings could chr(\n)
782 # Or just remove them, with ord(u'\n') and chr(u'\n')
783 #
784 # I think this relies on small string optimization. If we have it,
785 # then 1-4 byte characters are efficient, and don't require heap
786 # allocation.
787
788 elif id_ == Id.Char_OneChar:
789 # TODO: look up integer directly?
790 cval = num.ToBig(ord(consts.LookupCharC(tok_str[1])))
791 elif id_ == Id.Char_UBraced:
792 hex_str = tok_str[3:-1] # \u{123}
793 # ValueError shouldn't happen because lexer validates
794 cval = value.Int(mops.FromStr(hex_str, 16))
795
796 # This could be a char integer? Not sure
797 elif id_ == Id.Char_Pound:
798 # TODO: accept UTF-8 code point instead of single byte
799 byte = tok_str[2] # the a in #'a'
800 cval = num.ToBig(ord(byte)) # It's an integer
801
802 else:
803 raise AssertionError(Id_str(id_))
804
805 return expr.Const(tok, cval)
806
807 def _ArrayItem(self, p_node):
808 # type: (PNode) -> expr_t
809 assert p_node.typ == grammar_nt.array_item
810
811 child0 = p_node.GetChild(0)
812 typ0 = child0.typ
813 if ISNONTERMINAL(typ0):
814 return self.Expr(child0)
815 else:
816 if child0.tok.id == Id.Op_LParen: # (x+1)
817 return self.Expr(p_node.GetChild(1))
818 return self.Expr(child0) # $1 ${x} etc.
819
820 def _LhsExprList(self, p_node):
821 # type: (PNode) -> List[y_lhs_t]
822 """lhs_list: expr (',' expr)*"""
823 assert p_node.typ == grammar_nt.lhs_list
824
825 lhs_list = [] # type: List[y_lhs_t]
826 n = p_node.NumChildren()
827 for i in xrange(0, n, 2): # was children[::2]
828 p = p_node.GetChild(i)
829
830 e = self.Expr(p)
831 UP_e = e
832 with tagswitch(e) as case:
833 if case(expr_e.Var):
834 e = cast(expr.Var, UP_e)
835 lhs_list.append(e.left)
836
837 elif case(expr_e.Subscript):
838 e = cast(Subscript, UP_e)
839 lhs_list.append(e)
840
841 elif case(expr_e.Attribute):
842 e = cast(Attribute, UP_e)
843 if e.op.id != Id.Expr_Dot:
844 # e.g. setvar obj->method is not valid
845 p_die("Can't assign to this attribute expr", e.op)
846 lhs_list.append(e)
847
848 else:
849 pass # work around mycpp bug
850
851 # TODO: could blame arbitary expr_t, bu this works most of
852 # the time
853 if p.tok:
854 blame = p.tok # type: loc_t
855 else:
856 blame = loc.Missing
857 p_die("Can't assign to this expression", blame)
858
859 return lhs_list
860
861 def MakeVarDecl(self, p_node):
862 # type: (PNode) -> command.VarDecl
863 """
864 ysh_var_decl: name_type_list ['=' testlist] end_stmt
865 """
866 assert p_node.typ == grammar_nt.ysh_var_decl
867
868 lhs = self._NameTypeList(p_node.GetChild(0)) # could be a tuple
869
870 # This syntax is confusing, and different than JavaScript
871 # var x, y = 1, 2
872 # But this is useful:
873 # var flag, i = parseArgs(spec, argv)
874
875 n = p_node.NumChildren()
876 if n >= 3:
877 rhs = self.Expr(p_node.GetChild(2))
878 else:
879 rhs = None
880
881 # The caller should fill in the keyword token.
882 return command.VarDecl(None, lhs, rhs)
883
884 def MakeMutation(self, p_node):
885 # type: (PNode) -> command.Mutation
886 """Parse tree to LST
887
888 ysh_mutation: lhs_list (augassign | '=') testlist end_stmt
889 """
890 typ = p_node.typ
891 assert typ == grammar_nt.ysh_mutation
892
893 lhs_list = self._LhsExprList(p_node.GetChild(0)) # could be a tuple
894 op_tok = p_node.GetChild(1).tok
895 if len(lhs_list) > 1 and op_tok.id != Id.Arith_Equal:
896 p_die('Multiple assignment must use =', op_tok)
897 rhs = self.Expr(p_node.GetChild(2))
898 return command.Mutation(None, lhs_list, op_tok, rhs)
899
900 def _EggexFlag(self, p_node):
901 # type: (PNode) -> EggexFlag
902 n = p_node.NumChildren()
903 if n == 1:
904 return EggexFlag(False, p_node.GetChild(0).tok)
905 elif n == 2:
906 return EggexFlag(True, p_node.GetChild(1).tok)
907 else:
908 raise AssertionError()
909
910 def _Eggex(self, p_node):
911 # type: (PNode) -> Eggex
912 """
913 eggex: '/' regex [';' re_flag* [';' Expr_Name] ] '/'
914 """
915 n = p_node.NumChildren()
916 left = p_node.GetChild(0).tok
917 regex = self._Regex(p_node.GetChild(1))
918
919 flags = [] # type: List[EggexFlag]
920 trans_pref = None # type: Optional[Token]
921
922 i = 2
923 current = p_node.GetChild(i)
924 if current.typ == Id.Op_Semi:
925 i += 1
926 while True:
927 current = p_node.GetChild(i)
928 if not ISNONTERMINAL(current.typ):
929 break
930 flags.append(self._EggexFlag(current))
931 i += 1
932
933 if current.typ == Id.Op_Semi:
934 i += 1
935 trans_pref = p_node.GetChild(i).tok
936
937 # Canonicalize and validate flags for ERE only. Default is ERE.
938 if trans_pref is None or lexer.TokenVal(trans_pref) == 'ERE':
939 canonical_flags = regex_translate.CanonicalFlags(flags)
940 else:
941 canonical_flags = None
942
943 return Eggex(left, regex, flags, trans_pref, canonical_flags)
944
945 def YshCasePattern(self, pnode):
946 # type: (PNode) -> pat_t
947 assert pnode.typ == grammar_nt.ysh_case_pat, pnode
948
949 pattern = pnode.GetChild(0)
950 typ = pattern.typ
951 if typ == Id.Op_LParen:
952 # pat_expr or pat_else
953 pattern = pnode.GetChild(1)
954 typ = pattern.typ
955
956 if typ == grammar_nt.pat_else:
957 return pat.Else
958 elif typ == grammar_nt.pat_exprs:
959 exprs = [] # type: List[expr_t]
960 for i in xrange(pattern.NumChildren()):
961 child = pattern.GetChild(i)
962 if child.typ == grammar_nt.expr:
963 expr = self.Expr(child)
964 exprs.append(expr)
965 return pat.YshExprs(exprs)
966
967 elif typ == grammar_nt.eggex:
968 return self._Eggex(pattern)
969
970 raise NotImplementedError()
971
972 def _Argument(self, p_node, after_semi, arglist):
973 # type: (PNode, bool, ArgList) -> None
974 """Parse tree to LST
975
976 argument: (
977 test [comp_for]
978 | test '=' test # named arg
979 | '...' test # var args
980 )
981 """
982 pos_args = arglist.pos_args
983 named_args = arglist.named_args
984
985 assert p_node.typ == grammar_nt.argument, p_node
986 n = p_node.NumChildren()
987 if n == 1:
988 child = p_node.GetChild(0)
989 if after_semi:
990 p_die('Positional args must come before the semi-colon',
991 child.tok)
992 arg = self.Expr(child)
993 pos_args.append(arg)
994 return
995
996 if n == 2:
997 # Note: We allow multiple spreads, just like Julia. They are
998 # concatenated as in lists and dicts.
999 tok0 = p_node.GetChild(0).tok
1000 if tok0.id == Id.Expr_Ellipsis:
1001 spread_expr = expr.Spread(tok0, self.Expr(p_node.GetChild(1)))
1002 if after_semi: # f(; ... named)
1003 named_args.append(NamedArg(None, spread_expr))
1004 else: # f(...named)
1005 pos_args.append(spread_expr)
1006 return
1007
1008 if p_node.GetChild(1).typ == grammar_nt.comp_for:
1009 child = p_node.GetChild(0)
1010 if after_semi:
1011 p_die('Positional args must come before the semi-colon',
1012 child.tok)
1013
1014 elt = self.Expr(child)
1015 comp = self._CompFor(p_node.GetChild(1))
1016 arg = expr.GeneratorExp(elt, [comp])
1017 pos_args.append(arg)
1018 return
1019
1020 raise AssertionError()
1021
1022 if n == 3: # named args can come before or after the semicolon
1023 n1 = NamedArg(
1024 p_node.GetChild(0).tok, self.Expr(p_node.GetChild(2)))
1025 named_args.append(n1)
1026 return
1027
1028 raise AssertionError()
1029
1030 def _ArgGroup(self, p_node, after_semi, arglist):
1031 # type: (PNode, bool, ArgList) -> None
1032 """
1033 arg_group: argument (',' argument)* [',']
1034 """
1035 for i in xrange(p_node.NumChildren()):
1036 p_child = p_node.GetChild(i)
1037 if ISNONTERMINAL(p_child.typ):
1038 self._Argument(p_child, after_semi, arglist)
1039
1040 def _ArgList(self, p_node, arglist):
1041 # type: (PNode, ArgList) -> None
1042 """For both funcs and procs
1043
1044 arglist: [arg_group] [';' arg_group]
1045 """
1046 n = p_node.NumChildren()
1047 if n == 0:
1048 return
1049
1050 p0 = p_node.GetChild(0)
1051 i = 0
1052 if ISNONTERMINAL(p0.typ):
1053 self._ArgGroup(p0, False, arglist)
1054 i += 1
1055
1056 if n >= 2:
1057 arglist.semi_tok = p_node.GetChild(i).tok
1058 self._ArgGroup(p_node.GetChild(i + 1), True, arglist)
1059
1060 def ToArgList(self, pnode, arglist):
1061 # type: (PNode, ArgList) -> None
1062 """
1063 ysh_eager_arglist: '(' [arglist] ')'
1064 ysh_lazy_arglist: '[' [arglist] ']'
1065 """
1066 if pnode.NumChildren() == 2: # f()
1067 return
1068
1069 assert pnode.NumChildren() == 3
1070 p = pnode.GetChild(1) # the X in '( X )'
1071
1072 assert p.typ == grammar_nt.arglist
1073 self._ArgList(p, arglist)
1074
1075 def _TypeExpr(self, pnode):
1076 # type: (PNode) -> TypeExpr
1077 """
1078 type_expr: Expr_Name [ '[' type_expr (',' type_expr)* ']' ]
1079 """
1080 assert pnode.typ == grammar_nt.type_expr, pnode.typ
1081
1082 #self.p_printer.Print(pnode)
1083
1084 ty = TypeExpr.CreateNull() # don't allocate children
1085
1086 ty.tok = pnode.GetChild(0).tok
1087 ty.name = lexer.TokenVal(ty.tok)
1088
1089 n = pnode.NumChildren()
1090 if n == 1:
1091 return ty
1092
1093 ty.params = []
1094 i = 2
1095 while i < n:
1096 p = self._TypeExpr(pnode.GetChild(i))
1097 ty.params.append(p)
1098 i += 2 # skip comma
1099
1100 return ty
1101
1102 def _Param(self, pnode):
1103 # type: (PNode) -> Param
1104 """
1105 param: Expr_Name [type_expr] ['=' expr]
1106 """
1107 assert pnode.typ == grammar_nt.param
1108
1109 name_tok = pnode.GetChild(0).tok
1110 n = pnode.NumChildren()
1111
1112 assert name_tok.id == Id.Expr_Name, name_tok
1113
1114 default_val = None # type: expr_t
1115 type_ = None # type: TypeExpr
1116
1117 #self.p_printer.Print(pnode)
1118
1119 if n == 1:
1120 # proc p(a)
1121 pass
1122
1123 elif n == 2:
1124 # proc p(a Int)
1125 type_ = self._TypeExpr(pnode.GetChild(1))
1126
1127 elif n == 3:
1128 # proc p(a = 3)
1129 default_val = self.Expr(pnode.GetChild(2))
1130
1131 elif n == 4:
1132 # proc p(a Int = 3)
1133 type_ = self._TypeExpr(pnode.GetChild(1))
1134 default_val = self.Expr(pnode.GetChild(3))
1135
1136 return Param(name_tok, lexer.TokenVal(name_tok), type_, default_val)
1137
1138 def _ParamGroup(self, p_node):
1139 # type: (PNode) -> ParamGroup
1140 """
1141 param_group:
1142 (param ',')*
1143 [ (param | '...' Expr_Name) [,] ]
1144 """
1145 assert p_node.typ == grammar_nt.param_group, p_node
1146
1147 #self.p_printer.Print(p_node)
1148
1149 params = [] # type: List[Param]
1150 rest_of = None # type: Optional[RestParam]
1151
1152 n = p_node.NumChildren()
1153 i = 0
1154 while i < n:
1155 child = p_node.GetChild(i)
1156 if child.typ == grammar_nt.param:
1157 params.append(self._Param(child))
1158 elif child.tok.id == Id.Expr_Ellipsis:
1159 tok = p_node.GetChild(i + 1).tok
1160 rest_of = RestParam(tok, lexer.TokenVal(tok))
1161 i += 2
1162 #log('i %d n %d', i, n)
1163
1164 return ParamGroup(params, rest_of)
1165
1166 def Proc(self, p_node):
1167 # type: (PNode) -> proc_sig_t
1168 """
1169 ysh_proc: (
1170 [ '('
1171 [ param_group ] # word params, with defaults
1172 [ ';' [ param_group ] ] # positional typed params, with defaults
1173 [ ';' [ param_group ] ] # named params, with defaults
1174 [ ';' Expr_Name ] # optional block param, with no type or default
1175 ')'
1176 ]
1177 '{' # opening { for pgen2
1178 )
1179 """
1180 typ = p_node.typ
1181 assert typ == grammar_nt.ysh_proc
1182
1183 #self.p_printer.Print(p_node)
1184
1185 n = p_node.NumChildren()
1186 if n == 1: # proc f {
1187 return proc_sig.Open
1188
1189 if n == 3: # proc f () {
1190 sig = proc_sig.Closed.CreateNull(alloc_lists=True) # no params
1191
1192 # proc f( three param groups, and block group )
1193 sig = proc_sig.Closed.CreateNull(alloc_lists=True) # no params
1194
1195 # Word args
1196 i = 1
1197 child = p_node.GetChild(i)
1198 if child.typ == grammar_nt.param_group:
1199 sig.word = self._ParamGroup(p_node.GetChild(i))
1200
1201 # Validate word args
1202 for word in sig.word.params:
1203 if word.type:
1204 if word.type.name not in ('Str', 'Ref'):
1205 p_die('Word params may only have type Str or Ref',
1206 word.type.tok)
1207 if word.type.params is not None:
1208 p_die('Unexpected type parameters', word.type.tok)
1209
1210 i += 2
1211 else:
1212 i += 1
1213
1214 #log('i %d n %d', i, n)
1215 if i >= n:
1216 return sig
1217
1218 # Positional args
1219 child = p_node.GetChild(i)
1220 if child.typ == grammar_nt.param_group:
1221 sig.positional = self._ParamGroup(p_node.GetChild(i))
1222 i += 2
1223 else:
1224 i += 1
1225
1226 #log('i %d n %d', i, n)
1227 if i >= n:
1228 return sig
1229
1230 # Keyword args
1231 child = p_node.GetChild(i)
1232 if child.typ == grammar_nt.param_group:
1233 sig.named = self._ParamGroup(p_node.GetChild(i))
1234 i += 2
1235 else:
1236 i += 1
1237
1238 #log('i %d n %d', i, n)
1239 if i >= n:
1240 return sig
1241
1242 child = p_node.GetChild(i)
1243 if child.typ == grammar_nt.param_group:
1244 group = self._ParamGroup(p_node.GetChild(i))
1245 params = group.params
1246 if len(params) > 1:
1247 p_die('Only 1 block param is allowed', params[1].blame_tok)
1248 if group.rest_of:
1249 p_die("Rest param isn't allowed for blocks",
1250 group.rest_of.blame_tok)
1251
1252 if len(params) == 1:
1253 if params[0].type:
1254 if params[0].type.name != 'Command':
1255 p_die('Block param must have type Command',
1256 params[0].type.tok)
1257 if params[0].type.params is not None:
1258 p_die('Unexpected type parameters', params[0].type.tok)
1259
1260 sig.block_param = params[0]
1261
1262 return sig
1263
1264 def YshFunc(self, p_node, out):
1265 # type: (PNode, Func) -> None
1266 """Parse tree to LST
1267
1268 ysh_func: Expr_Name '(' [param_group] [';' param_group] ')'
1269 """
1270 assert p_node.typ == grammar_nt.ysh_func
1271
1272 #self.p_printer.Print(p_node)
1273
1274 out.name = p_node.GetChild(0).tok
1275
1276 n = p_node.NumChildren()
1277 i = 2 # after (
1278
1279 child = p_node.GetChild(i)
1280 if child.typ == grammar_nt.param_group:
1281 out.positional = self._ParamGroup(child)
1282 i += 2 # skip past ;
1283 else:
1284 i += 1
1285
1286 if i >= n:
1287 return
1288
1289 child = p_node.GetChild(i)
1290 if child.typ == grammar_nt.param_group:
1291 out.named = self._ParamGroup(child)
1292
1293 #
1294 # Regex Language
1295 #
1296
1297 def _RangeCharSingleQuoted(self, p_node):
1298 # type: (PNode) -> Optional[CharCode]
1299
1300 assert p_node.typ == grammar_nt.range_char, p_node
1301 typ = p_node.GetChild(0).typ
1302
1303 # 'a' in 'a'-'b'
1304 if ISNONTERMINAL(typ) and typ == grammar_nt.sq_string:
1305 sq_part = cast(SingleQuoted, p_node.GetChild(0).GetChild(1).tok)
1306 n = len(sq_part.sval)
1307 if n == 0:
1308 p_die("Quoted range char can't be empty",
1309 loc.WordPart(sq_part))
1310 elif n == 1:
1311 return CharCode(sq_part.left, ord(sq_part.sval[0]), False)
1312 else:
1313 p_die(RANGE_POINT_TOO_LONG, loc.WordPart(sq_part))
1314 return None
1315
1316 def _OtherRangeToken(self, p_node):
1317 # type: (PNode) -> Token
1318 """An endpoint of a range (single char)
1319
1320 range_char: Expr_Name | Expr_DecInt | sq_string | char_literal
1321 a-z 0-9 'a'-'z' \x00-\xff
1322 """
1323 assert p_node.typ == grammar_nt.range_char, p_node
1324
1325 typ = p_node.GetChild(0).typ
1326 if ISNONTERMINAL(typ):
1327 # \x00 in /[\x00 - \x20]/
1328 assert typ == grammar_nt.char_literal, typ
1329 tok = p_node.GetChild(0).GetChild(0).tok
1330 return tok
1331
1332 tok = p_node.tok
1333 # a in a-z is Expr_Name
1334 # 0 in 0-9 is Expr_DecInt
1335 assert tok.id in (Id.Expr_Name, Id.Expr_DecInt), tok
1336
1337 if tok.length != 1:
1338 p_die(RANGE_POINT_TOO_LONG, tok)
1339 return tok
1340
1341 def _NonRangeChars(self, p_node):
1342 # type: (PNode) -> class_literal_term_t
1343 """
1344 \" \u1234 '#'
1345 """
1346 assert p_node.typ == grammar_nt.range_char, p_node
1347 typ = p_node.GetChild(0).typ
1348 if ISNONTERMINAL(typ):
1349 p_child = p_node.GetChild(0)
1350 if typ == grammar_nt.sq_string:
1351 return cast(SingleQuoted, p_child.GetChild(1).tok)
1352
1353 if typ == grammar_nt.char_literal:
1354 tok = p_node.GetChild(0).tok
1355 return word_compile.EvalCharLiteralForRegex(tok)
1356
1357 raise NotImplementedError()
1358 else:
1359 # Look up PerlClass and PosixClass
1360 return self._NameInClass(None, p_node.GetChild(0).tok)
1361
1362 def _ClassLiteralTerm(self, p_node):
1363 # type: (PNode) -> class_literal_term_t
1364 """Parse tree to LST
1365
1366 class_literal_term:
1367 range_char ['-' range_char ]
1368 | '@' Expr_Name # splice
1369 | '!' Expr_Name # negate char class
1370 ...
1371 """
1372 assert p_node.typ == grammar_nt.class_literal_term, p_node
1373
1374 first = p_node.GetChild(0)
1375 typ = first.typ
1376
1377 if ISNONTERMINAL(typ):
1378 n = p_node.NumChildren()
1379
1380 if n == 1 and typ == grammar_nt.range_char:
1381 return self._NonRangeChars(p_node.GetChild(0))
1382
1383 # 'a'-'z' etc.
1384 if n == 3 and p_node.GetChild(1).tok.id == Id.Arith_Minus:
1385 left = p_node.GetChild(0)
1386 right = p_node.GetChild(2)
1387
1388 code1 = self._RangeCharSingleQuoted(left)
1389 if code1 is None:
1390 tok1 = self._OtherRangeToken(left)
1391 code1 = word_compile.EvalCharLiteralForRegex(tok1)
1392
1393 code2 = self._RangeCharSingleQuoted(right)
1394 if code2 is None:
1395 tok2 = self._OtherRangeToken(right)
1396 code2 = word_compile.EvalCharLiteralForRegex(tok2)
1397 return CharRange(code1, code2)
1398
1399 else:
1400 if first.tok.id == Id.Expr_At:
1401 tok1 = p_node.GetChild(1).tok
1402 return class_literal_term.Splice(tok1, lexer.TokenVal(tok1))
1403
1404 if first.tok.id == Id.Expr_Bang:
1405 return self._NameInClass(
1406 p_node.GetChild(0).tok,
1407 p_node.GetChild(1).tok)
1408
1409 raise AssertionError(p_node.GetChild(0).tok.id)
1410
1411 nt_name = self.number2symbol[typ]
1412 raise NotImplementedError(nt_name)
1413
1414 def _ClassLiteral(self, p_node):
1415 # type: (PNode) -> List[class_literal_term_t]
1416 """class_literal: '[' class_literal_term+ ']'."""
1417 assert p_node.typ == grammar_nt.class_literal
1418 # skip [ and ]
1419 terms = [] # type: List[class_literal_term_t]
1420 for i in xrange(1, p_node.NumChildren() - 1):
1421 terms.append(self._ClassLiteralTerm(p_node.GetChild(i)))
1422
1423 return terms
1424
1425 def _NameInRegex(self, negated_tok, tok):
1426 # type: (Token, Token) -> re_t
1427 tok_str = lexer.TokenVal(tok)
1428 if tok_str == 'dot':
1429 if negated_tok:
1430 p_die("Can't negate this symbol", tok)
1431 return re.Primitive(tok, Id.Re_Dot)
1432
1433 if tok_str in POSIX_CLASSES:
1434 return PosixClass(negated_tok, tok_str)
1435
1436 perl = PERL_CLASSES.get(tok_str)
1437 if perl is not None:
1438 return PerlClass(negated_tok, perl)
1439
1440 if tok_str[0].isupper(): # e.g. HexDigit
1441 return re.Splice(tok, lexer.TokenVal(tok))
1442
1443 p_die("%r isn't a character class" % tok_str, tok)
1444
1445 def _NameInClass(self, negated_tok, tok):
1446 # type: (Token, Token) -> class_literal_term_t
1447 """Like the above, but 'dot' and 'd' don't mean anything within []"""
1448 tok_str = lexer.TokenVal(tok)
1449
1450 # A bare, unquoted character literal. In the grammar, this is expressed as
1451 # range_char without an ending.
1452
1453 # d is NOT 'digit', it's a literal 'd'!
1454 if len(tok_str) == 1:
1455 # Expr_Name matches VAR_NAME_RE, which starts with [a-zA-Z_]
1456 assert tok.id in (Id.Expr_Name, Id.Expr_DecInt)
1457
1458 if negated_tok: # [~d] is not allowed, only [~digit]
1459 p_die("Can't negate this symbol", tok)
1460 return word_compile.EvalCharLiteralForRegex(tok)
1461
1462 # digit, word, but not d, w, etc.
1463 if tok_str in POSIX_CLASSES:
1464 return PosixClass(negated_tok, tok_str)
1465
1466 perl = PERL_CLASSES.get(tok_str)
1467 if perl is not None:
1468 return PerlClass(negated_tok, perl)
1469 p_die("%r isn't a character class" % tok_str, tok)
1470
1471 def _ReAtom(self, p_atom):
1472 # type: (PNode) -> re_t
1473 """
1474 re_atom: ( char_literal | ...
1475 """
1476 assert p_atom.typ == grammar_nt.re_atom, p_atom.typ
1477
1478 typ = p_atom.GetChild(0).typ
1479
1480 if ISNONTERMINAL(typ):
1481 p_child = p_atom.GetChild(0)
1482 if typ == grammar_nt.class_literal:
1483 return re.CharClassLiteral(False, self._ClassLiteral(p_child))
1484
1485 if typ == grammar_nt.sq_string:
1486 return cast(SingleQuoted, p_child.GetChild(1).tok)
1487
1488 if typ == grammar_nt.char_literal:
1489 tok = p_atom.GetChild(0).tok
1490 # Must be Id.Char_{OneChar,Hex,UBraced}
1491 assert consts.GetKind(tok.id) == Kind.Char
1492 s = word_compile.EvalCStringToken(tok.id, lexer.TokenVal(tok))
1493 return re.LiteralChars(tok, s)
1494
1495 raise NotImplementedError(typ)
1496
1497 else:
1498 tok = p_atom.GetChild(0).tok
1499
1500 # Special punctuation
1501 if tok.id == Id.Expr_Dot: # .
1502 return re.Primitive(tok, Id.Re_Dot)
1503
1504 if tok.id == Id.Arith_Caret: # ^
1505 return re.Primitive(tok, Id.Re_Start)
1506
1507 if tok.id == Id.Expr_Dollar: # $
1508 return re.Primitive(tok, Id.Re_End)
1509
1510 if tok.id == Id.Expr_Name:
1511 # d digit -> PosixClass PerlClass etc.
1512 return self._NameInRegex(None, tok)
1513
1514 if tok.id == Id.Expr_Symbol:
1515 # Validate symbols here, like we validate PerlClass, etc.
1516 tok_str = lexer.TokenVal(tok)
1517 if tok_str == '%start':
1518 return re.Primitive(tok, Id.Re_Start)
1519 if tok_str == '%end':
1520 return re.Primitive(tok, Id.Re_End)
1521 p_die("Unexpected token %r in regex" % tok_str, tok)
1522
1523 if tok.id == Id.Expr_At:
1524 # | '@' Expr_Name
1525 tok = p_atom.GetChild(1).tok
1526 return re.Splice(tok, lexer.TokenVal(tok))
1527
1528 if tok.id == Id.Expr_Bang:
1529 # | '!' (Expr_Name | class_literal)
1530 # | '!' '!' Expr_Name (Expr_Name | Expr_DecInt | '(' regex ')')
1531 n = p_atom.NumChildren()
1532 if n == 2:
1533 typ = p_atom.GetChild(1).typ
1534 if ISNONTERMINAL(typ):
1535 return re.CharClassLiteral(
1536 True, self._ClassLiteral(p_atom.GetChild(1)))
1537 else:
1538 return self._NameInRegex(tok, p_atom.GetChild(1).tok)
1539 else:
1540 # Note: !! conflicts with shell history
1541 p_die(
1542 "Backtracking with !! isn't implemented (requires Python/PCRE)",
1543 p_atom.GetChild(1).tok)
1544
1545 if tok.id == Id.Op_LParen:
1546 # | '(' regex ')'
1547
1548 # Note: in ERE (d+) is the same as <d+>. That is, Group becomes
1549 # Capture.
1550 return re.Group(self._Regex(p_atom.GetChild(1)))
1551
1552 if tok.id == Id.Arith_Less:
1553 # | '<' 'capture' regex ['as' Expr_Name] [':' Expr_Name] '>'
1554
1555 n = p_atom.NumChildren()
1556 assert n == 4 or n == 6 or n == 8, n
1557
1558 # < capture d+ >
1559 regex = self._Regex(p_atom.GetChild(2))
1560
1561 as_name = None # type: Optional[Token]
1562 func_name = None # type: Optional[Token]
1563
1564 i = 3 # points at any of > as :
1565
1566 tok = p_atom.GetChild(i).tok
1567 if tok.id == Id.Expr_As:
1568 as_name = p_atom.GetChild(i + 1).tok
1569 i += 2
1570
1571 tok = p_atom.GetChild(i).tok
1572 if tok.id == Id.Arith_Colon:
1573 func_name = p_atom.GetChild(i + 1).tok
1574
1575 return re.Capture(regex, as_name, func_name)
1576
1577 if tok.id == Id.Arith_Colon:
1578 # | ':' '(' regex ')'
1579 raise NotImplementedError(Id_str(tok.id))
1580
1581 raise NotImplementedError(Id_str(tok.id))
1582
1583 def _RepeatOp(self, p_repeat):
1584 # type: (PNode) -> re_repeat_t
1585 assert p_repeat.typ == grammar_nt.repeat_op, p_repeat
1586
1587 tok = p_repeat.GetChild(0).tok
1588 id_ = tok.id
1589 # a+
1590 if id_ in (Id.Arith_Plus, Id.Arith_Star, Id.Arith_QMark):
1591 return tok
1592
1593 if id_ == Id.Op_LBrace:
1594 p_range = p_repeat.GetChild(1)
1595 assert p_range.typ == grammar_nt.repeat_range, p_range
1596
1597 # repeat_range: (
1598 # Expr_DecInt [',']
1599 # | ',' Expr_DecInt
1600 # | Expr_DecInt ',' Expr_DecInt
1601 # )
1602
1603 n = p_range.NumChildren()
1604 if n == 1: # {3}
1605 tok = p_range.GetChild(0).tok
1606 return tok # different operator than + * ?
1607
1608 if n == 2:
1609 if p_range.GetChild(0).tok.id == Id.Expr_DecInt: # {,3}
1610 left = p_range.GetChild(0).tok
1611 return re_repeat.Range(left, lexer.TokenVal(left), '',
1612 None)
1613 else: # {1,}
1614 right = p_range.GetChild(1).tok
1615 return re_repeat.Range(None, '', lexer.TokenVal(right),
1616 right)
1617
1618 if n == 3: # {1,3}
1619 left = p_range.GetChild(0).tok
1620 right = p_range.GetChild(2).tok
1621 return re_repeat.Range(left, lexer.TokenVal(left),
1622 lexer.TokenVal(right), right)
1623
1624 raise AssertionError(n)
1625
1626 raise AssertionError(id_)
1627
1628 def _Regex(self, p_node):
1629 # type: (PNode) -> re_t
1630 typ = p_node.typ
1631
1632 if typ == grammar_nt.regex:
1633 # regex: [re_alt] (('|'|'or') re_alt)*
1634
1635 if p_node.NumChildren() == 1:
1636 return self._Regex(p_node.GetChild(0))
1637
1638 # NOTE: We're losing the | and or operators
1639 alts = [] # type: List[re_t]
1640 n = p_node.NumChildren()
1641 for i in xrange(0, n, 2): # was children[::2]
1642 c = p_node.GetChild(i)
1643 alts.append(self._Regex(c))
1644 return re.Alt(alts)
1645
1646 if typ == grammar_nt.re_alt:
1647 # re_alt: (re_atom [repeat_op])+
1648 i = 0
1649 n = p_node.NumChildren()
1650 seq = [] # type: List[re_t]
1651 while i < n:
1652 r = self._ReAtom(p_node.GetChild(i))
1653 i += 1
1654 if i < n and p_node.GetChild(i).typ == grammar_nt.repeat_op:
1655 repeat_op = self._RepeatOp(p_node.GetChild(i))
1656 r = re.Repeat(r, repeat_op)
1657 i += 1
1658 seq.append(r)
1659
1660 if len(seq) == 1:
1661 return seq[0]
1662 else:
1663 return re.Seq(seq)
1664
1665 nt_name = self.number2symbol[typ]
1666 raise NotImplementedError(nt_name)
1667
1668
1669# vim: sw=4