1 | """expr_parse.py."""
2 | from __future__ import print_function
3 |
4 | from _devbuild.gen.syntax_asdl import (loc, Token, DoubleQuoted, SingleQuoted,
5 | CommandSub, ShArrayLiteral,
6 | CompoundWord, word_part_t, word_e)
7 | from _devbuild.gen.id_kind_asdl import Id, Kind, Id_str
8 | from _devbuild.gen.types_asdl import lex_mode_e
9 |
10 | from core import ui
11 | from core.error import p_die
12 | from frontend import consts
13 | from frontend import lexer
14 | from frontend import reader
15 | from mycpp import mylib
16 | from mycpp.mylib import log, tagswitch
17 | from osh import braces
18 | from osh import word_
19 | from osh import word_compile
20 | from pgen2 import parse
21 | from pgen2.pnode import PNodeAllocator
22 |
23 | _ = log
24 |
from typing import TYPE_CHECKING, Any, Dict, Tuple, List, cast, Optional

# The names below are referenced only from `# type:` comments in this file.
# Importing them under TYPE_CHECKING keeps them out of the runtime import
# graph; it is also the only use of the TYPE_CHECKING name imported above.
if TYPE_CHECKING:
    from frontend.lexer import Lexer
    from frontend.parse_lib import ParseContext
    from pgen2.grammar import Grammar
    from pgen2.pnode import PNode
31 |
if mylib.PYTHON:

    class ParseTreePrinter(object):
        """Writes a PNode tree as indented text, one node per line."""

        def __init__(self, names):
            # type: (Dict[int, str]) -> None
            # names: looked up with pnode.typ to get the label printed for
            # each node.
            self.names = names
            self.f = mylib.Stdout()

        def _Print(self, pnode, indent, i):
            # type: (PNode, int, int) -> None
            """Write one line for `pnode`, then recurse into its children.

            Args:
              pnode: The node to print.
              indent: Nesting depth; the line is prefixed with that many
                copies of the indent string.
              i: Index of this node among its siblings (printed verbatim).
            """
            # NOTE:
            # - why isn't 'tok' None for PRODUCTIONS?  There is some
            #   redundancy to get rid of.
            tok = pnode.tok
            if not tok:
                text = '-'
            elif isinstance(tok, Token):
                text = lexer.TokenVal(tok)
            else:
                # e.g. CommandSub for x = $(echo hi)
                text = repr(tok)

            prefix = ' ' * indent
            line = '%s%d %s %s\n' % (prefix, i, self.names[pnode.typ], text)
            self.f.write(line)

            kids = pnode.children
            if kids is None:
                return

            # A separate name for the loop index avoids rebinding parameter i.
            for child_index, child in enumerate(kids):
                self._Print(child, indent + 1, child_index)

        def Print(self, pnode):
            # type: (PNode) -> None
            """Print the tree rooted at `pnode`, starting at depth 0."""
            self._Print(pnode, 0, 0)
65 |
66 |
67 | def _Classify(gr, tok):
68 | # type: (Grammar, Token) -> int
69 |
70 | # We have to match up what ParserGenerator.make_grammar() did when
71 | # calling make_label() and make_first(). See classify() in
72 | # opy/pgen2/driver.py.
73 |
74 | id_ = tok.id # mycpp fix: we need C++ to do uint16_t -> int conversion
75 |
76 | # TODO: use something more efficient than a Dict
77 | if id_ in gr.tokens:
78 | return gr.tokens[id_]
79 |
80 | if id_ == Id.Unknown_DEqual:
81 | p_die('Use === to be exact, or ~== to convert types', tok)
82 |
83 | if id_ == Id.Unknown_Tok:
84 | type_str = ''
85 | else:
86 | type_str = ' (%s)' % ui.PrettyId(tok.id)
87 | p_die('Unexpected token in expression mode%s' % type_str, tok)
88 |
89 |
# Newlines are ignored between these pairs.
#
# Maps a token Id to the amount it adds to the running `balance` counter in
# _PushYshTokens(): openers add 1, closers subtract 1.  Ids that are absent
# contribute 0 (the lookup there uses .get(id, 0)).
# yapf: disable
_OTHER_BALANCE = {

    # Parenthesized expressions (tuples) and func/proc parameter lists
    Id.Op_LParen: 1,
    Id.Op_RParen: -1,
    Id.Op_LBracket: 1,
    Id.Op_RBracket: -1,

    # Dicts are {}, and the grammar respects Op_Newline.
}
# yapf: enable
103 |
104 |
def _PushYshTokens(parse_ctx, gr, p, lex):
    # type: (ParseContext, Grammar, parse.Parser, Lexer) -> Token
    """Push tokens onto pgen2's parser.

    Reads tokens from `lex` in lex_mode_e.Expr and feeds them to `p` until
    p.addtoken() returns true.  Constructs that belong to the word/command
    languages (array literals, command subs, quoted strings, ${}) are parsed
    by nested word/command parsers that share `lex`; the resulting node is
    pushed as a single Id.Expr_CastedDummy token.

    Args:
      parse_ctx: Supplies the arena and creates the nested parsers.
      gr: Grammar whose `tokens` table provides the pgen2 labels.
      p: The pgen2 push parser.  ExprParser.Parse() calls p.setup() before
        calling this function.
      lex: Lexer to read from.

    Returns:
      The last token so it can be reused/seen by the CommandParser.

    Raises:
      Whatever p_die() raises, for tokens the grammar has no label for and
      for malformed array literals.
      NOTE(review): p.addtoken() presumably raises parse.ParseError on a
      grammar mismatch; ExprParser.Parse() catches that type around this
      call -- confirm against pgen2.
    """
    #log('keywords = %s', gr.keywords)
    #log('tokens = %s', gr.tokens)

    # When set, the next loop iteration uses this token instead of reading
    # from the lexer.  The quoted-string and ${ branches below store the
    # token returned by the word parser here.
    last_token = None  # type: Optional[Token]
    prev_was_newline = False

    balance = 0  # to ignore newlines

    while True:
        if last_token:  # e.g. left over from WordParser
            tok = last_token
            #log('last_token = %s', last_token)
            last_token = None
        else:
            tok = lex.Read(lex_mode_e.Expr)
            #log('tok = %s', tok)

        # Comments and whitespace.  Newlines aren't ignored.
        if consts.GetKind(tok.id) == Kind.Ignored:
            continue

        # For multiline lists, maps, etc.
        if tok.id == Id.Op_Newline:
            if balance > 0:
                #log('*** SKIPPING NEWLINE')
                continue
            # Eliminate duplicate newline tokens.  It makes the grammar
            # simpler, and it's consistent with CPython's lexer and our own
            # WordParser.
            if prev_was_newline:
                continue
            prev_was_newline = True
        else:
            prev_was_newline = False

        balance += _OTHER_BALANCE.get(tok.id, 0)
        #log('BALANCE after seeing %s = %d', tok.id, balance)

        if tok.id == Id.Op_LParen:
            # For nesting inside $()
            lex.PushHint(Id.Op_RParen, Id.Op_RParen)

        #if tok.id == Id.Expr_Name and tok.val in KEYWORDS:
        #  tok.id = KEYWORDS[tok.val]
        #  log('Replaced with %s', tok.id)

        assert tok.id < 256, Id_str(tok.id)

        ilabel = _Classify(gr, tok)
        #log('tok = %s, ilabel = %d', tok, ilabel)

        if p.addtoken(tok.id, tok, ilabel):
            return tok

        #
        # Mutually recursive calls into the command/word parsers.
        #

        if tok.id in (Id.Left_ColonPipe,
                      Id.Left_PercentParen):  # :| %( LEGACY!
            left_tok = tok
            if tok.id == Id.Left_PercentParen:
                lex.PushHint(Id.Op_RParen, Id.Right_ShArrayLiteral)

            # Blame the opening token
            line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
            w_parser = parse_ctx.MakeWordParser(lex, line_reader)
            words = []  # type: List[CompoundWord]
            close_tok = None  # type: Optional[Token]
            done = False
            while not done:
                w = w_parser.ReadWord(lex_mode_e.ShCommand)
                with tagswitch(w) as case:
                    if case(word_e.Operator):
                        tok = cast(Token, w)
                        if tok.id == Id.Right_ShArrayLiteral:
                            # Got ) but the literal was not opened by %(, so
                            # it was opened by :| and needs | to close.
                            if left_tok.id != Id.Left_PercentParen:
                                p_die('Expected | to close', left_tok)
                            close_tok = tok
                            done = True  # can't use break here
                        elif tok.id == Id.Op_Pipe:
                            # Got | but the literal was opened by %(
                            if left_tok.id != Id.Left_ColonPipe:
                                p_die('Expected ) to close', left_tok)
                            close_tok = tok
                            done = True  # can't use break here
                        elif tok.id == Id.Op_Newline:  # internal newlines allowed
                            continue
                        else:
                            p_die('Unexpected token in array literal',
                                  loc.Word(w))

                    elif case(word_e.Compound):
                        words.append(cast(CompoundWord, w))

                    else:
                        raise AssertionError()

            words2 = braces.BraceDetectAll(words)
            words3 = word_.TildeDetectAll(words2)

            typ = Id.Expr_CastedDummy

            lit_part = ShArrayLiteral(left_tok, words3, close_tok)
            opaque = cast(Token, lit_part)  # HACK for expr_to_ast
            done = p.addtoken(typ, opaque, gr.tokens[typ])
            assert not done  # can't end the expression

            # Now push the closing ) or |.  Use close_tok's own id, the same
            # way the command sub branch below uses right_token.id.
            ilabel = _Classify(gr, close_tok)
            done = p.addtoken(close_tok.id, close_tok, ilabel)
            assert not done  # can't end the expression

            continue

        # $(  @(  ^(
        if tok.id in (Id.Left_DollarParen, Id.Left_AtParen,
                      Id.Left_CaretParen):

            left_token = tok

            lex.PushHint(Id.Op_RParen, Id.Eof_RParen)
            line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
            c_parser = parse_ctx.MakeParserForCommandSub(
                line_reader, lex, Id.Eof_RParen)
            node = c_parser.ParseCommandSub()
            # A little gross: Copied from osh/word_parse.py
            right_token = c_parser.w_parser.cur_token

            cs_part = CommandSub(left_token, node, right_token)

            typ = Id.Expr_CastedDummy
            opaque = cast(Token, cs_part)  # HACK for expr_to_ast
            done = p.addtoken(typ, opaque, gr.tokens[typ])
            assert not done  # can't end the expression

            # Now push the closing )
            ilabel = _Classify(gr, right_token)
            done = p.addtoken(right_token.id, right_token, ilabel)
            assert not done  # can't end the expression

            continue

        # "   """   and ^"
        if tok.id in (Id.Left_DoubleQuote, Id.Left_TDoubleQuote,
                      Id.Left_CaretDoubleQuote):
            left_token = tok
            line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
            w_parser = parse_ctx.MakeWordParser(lex, line_reader)

            parts = []  # type: List[word_part_t]
            # The returned token is also left in last_token, so the next
            # iteration processes it instead of reading from the lexer.
            last_token = w_parser.ReadDoubleQuoted(left_token, parts)
            expr_dq_part = DoubleQuoted(left_token, parts, last_token)

            typ = Id.Expr_CastedDummy
            opaque = cast(Token, expr_dq_part)  # HACK for expr_to_ast
            done = p.addtoken(typ, opaque, gr.tokens[typ])
            assert not done  # can't end the expression

            continue

        # ${
        if tok.id == Id.Left_DollarBrace:
            left_token = tok
            line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
            w_parser = parse_ctx.MakeWordParser(lex, line_reader)

            # last_token is picked up by the next loop iteration.
            part, last_token = w_parser.ReadBracedVarSub(left_token)

            # It's casted word_part.BracedVarSub -> dummy -> expr.BracedVarSub!
            typ = Id.Expr_CastedDummy
            opaque = cast(Token, part)  # HACK for expr_to_ast
            done = p.addtoken(typ, opaque, gr.tokens[typ])
            assert not done  # can't end the expression

            continue

        # 'x'  '''x'''
        # r'x' r'''x'''
        # u'x' u'''x'''
        # b'x' b'''x'''
        # $'x'
        if tok.id in (Id.Left_SingleQuote, Id.Left_TSingleQuote,
                      Id.Left_RSingleQuote, Id.Left_RTSingleQuote,
                      Id.Left_USingleQuote, Id.Left_UTSingleQuote,
                      Id.Left_BSingleQuote, Id.Left_BTSingleQuote,
                      Id.Left_DollarSingleQuote):
            # Pick the lexer mode for the string body from the opening quote.
            if tok.id == Id.Left_DollarSingleQuote:
                sq_mode = lex_mode_e.SQ_C
            elif tok.id in (Id.Left_USingleQuote, Id.Left_UTSingleQuote,
                            Id.Left_BSingleQuote, Id.Left_BTSingleQuote):
                sq_mode = lex_mode_e.J8_Str
            else:
                sq_mode = lex_mode_e.SQ_Raw

            left_token = tok
            line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
            w_parser = parse_ctx.MakeWordParser(lex, line_reader)

            tokens = []  # type: List[Token]
            # last_token is picked up by the next loop iteration.
            last_token = w_parser.ReadSingleQuoted(sq_mode, left_token, tokens,
                                                   True)

            sval = word_compile.EvalSingleQuoted2(left_token.id, tokens)
            sq_part = SingleQuoted(left_token, sval, last_token)

            typ = Id.Expr_CastedDummy
            opaque = cast(Token, sq_part)  # HACK for expr_to_ast
            done = p.addtoken(typ, opaque, gr.tokens[typ])
            assert not done  # can't end the expression
            continue

    else:
        # We never broke out -- EOF is too soon (how can this happen???)
        # NOTE: this function contains no `break`, so this clause of the
        # `while True` loop is never executed.
        raise parse.ParseError("incomplete input", tok.id, tok)
324 |
325 |
class ExprParser(object):
    """Runs a pgen2 push parser over tokens read from a Lexer."""

    def __init__(self, parse_ctx, gr):
        # type: (ParseContext, Grammar) -> None
        self.parse_ctx = parse_ctx
        self.gr = gr
        # One parser instance, set up again on every Parse() call.
        self.push_parser = parse.Parser(gr)
        # Passed to push_parser.setup(); assigned by ctx_PNodeAllocator.
        self.pnode_alloc = None  # type: Optional[PNodeAllocator]

    def Parse(self, lexer, start_symbol):
        # type: (Lexer, int) -> Tuple[PNode, Token]
        """Parse one expression starting from `start_symbol`.

        Returns:
          (root PNode of the parse tree, last token pushed).

        A parse.ParseError from the token-pushing loop is reported through
        p_die(), blaming the token stored on the exception.
        """
        pgen_parser = self.push_parser

        # Reuse the parser
        pgen_parser.setup(start_symbol, self.pnode_alloc)
        try:
            last_token = _PushYshTokens(self.parse_ctx, self.gr, pgen_parser,
                                        lexer)
        except parse.ParseError as e:
            #log('ERROR %s', e)
            # TODO:
            # - Describe what lexer mode we're in (Invalid syntax in regex)
            #   - Maybe say where the mode started
            # - Id.Unknown_Tok could say "This character is invalid"

            # ParseError has a "too much input" case but I haven't been able
            # to tickle it.  Maybe it's because of the Eof tokens?
            msg = 'Syntax error in expression (near %s)' % ui.PrettyId(
                e.tok.id)
            p_die(msg, e.tok)

        return pgen_parser.rootnode, last_token
360 |
361 |
class ctx_PNodeAllocator(object):
    """Context manager that gives an ExprParser a PNodeAllocator.

    The allocator is created and attached in the constructor.  On exit it is
    cleared and the parser's pnode_alloc is set back to None.
    """

    def __init__(self, ep):
        # type: (ExprParser) -> None
        ep.pnode_alloc = PNodeAllocator()
        self.expr_parser = ep

    def __enter__(self):
        # type: () -> None
        pass

    def __exit__(self, type, value, traceback):
        # type: (Any, Any, Any) -> None
        ep = self.expr_parser
        ep.pnode_alloc.Clear()
        ep.pnode_alloc = None