1"""expr_parse.py."""
from __future__ import print_function

from _devbuild.gen.syntax_asdl import (loc, Token, DoubleQuoted, SingleQuoted,
                                       CommandSub, ShArrayLiteral,
                                       CompoundWord, word_part_t, word_e)
from _devbuild.gen.id_kind_asdl import Id, Kind, Id_str
from _devbuild.gen.types_asdl import lex_mode_e

from core import ui
from core.error import p_die
from frontend import consts
from frontend import lexer
from frontend import reader
from mycpp import mylib
from mycpp.mylib import log, tagswitch
from osh import braces
from osh import word_
from pgen2 import parse
from pgen2.pnode import PNodeAllocator

_ = log

from typing import TYPE_CHECKING, Any, Dict, Tuple, List, cast, Optional
if TYPE_CHECKING:
    from frontend.lexer import Lexer
    from frontend.parse_lib import ParseContext
    from pgen2.grammar import Grammar
    from pgen2.pnode import PNode

if mylib.PYTHON:

    class ParseTreePrinter(object):
        """Prints a tree of PNode instances."""

        def __init__(self, names):
            # type: (Dict[int, str]) -> None
            self.names = names
            self.f = mylib.Stdout()

        def _Print(self, pnode, indent, i):
            # type: (PNode, int, int) -> None

            ind = '  ' * indent
            # NOTE:
            # - why isn't 'tok' None for PRODUCTIONS?  There is some
            #   redundancy to get rid of.
            if pnode.tok:
                if isinstance(pnode.tok, Token):
                    v = lexer.TokenVal(pnode.tok)
                else:
                    # e.g. CommandSub for x = $(echo hi)
                    v = repr(pnode.tok)
            else:
                v = '-'
            self.f.write('%s%d %s %s\n' % (ind, i, self.names[pnode.typ], v))
            if pnode.children is not None:
                for i, child in enumerate(pnode.children):
                    self._Print(child, indent + 1, i)

        def Print(self, pnode):
            # type: (PNode) -> None
            self._Print(pnode, 0, 0)
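
    # Usage sketch (illustrative, not in the original file): 'names' maps
    # pgen2 type integers to readable names, e.g. built from the grammar's
    # number2symbol dict plus token names.
    #
    #     printer = ParseTreePrinter(names)
    #     printer.Print(root_pnode)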


def _Classify(gr, tok):
    # type: (Grammar, Token) -> int

    # We have to match up what ParserGenerator.make_grammar() did when
    # calling make_label() and make_first().  See classify() in
    # opy/pgen2/driver.py.

    # TODO: use something more efficient than a Dict
    if tok.id in gr.tokens:
        return gr.tokens[tok.id]

    if tok.id == Id.Unknown_DEqual:
        p_die('Use === to be exact, or ~== to convert types', tok)

    if tok.id == Id.Unknown_Tok:
        type_str = ''
    else:
        type_str = ' (%s)' % ui.PrettyId(tok.id)
    p_die('Unexpected token in expression mode%s' % type_str, tok)


# Newlines are ignored between these pairs.
# yapf: disable
_OTHER_BALANCE = {

    # Parenthesized expressions (tuples) and func/proc parameter lists,
    # plus list literals and subscripts
    Id.Op_LParen: 1,
    Id.Op_RParen: -1,
    Id.Op_LBracket: 1,
    Id.Op_RBracket: -1,

    # Dicts are {}, and the grammar respects Op_Newline.
}
# yapf: enable
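
# Illustration (descriptive comment, not in the original source): with this
# table, _PushOilTokens() keeps a balance count and skips Op_Newline tokens
# while it's positive, so a multiline expression like
#
#     var x = [
#       1,
#       2,
#     ]
#
# parses the same as 'var x = [1, 2]'.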


def _PushOilTokens(parse_ctx, gr, p, lex):
    # type: (ParseContext, Grammar, parse.Parser, Lexer) -> Token
    """Push tokens onto pgen2's parser.

    Returns the last token so it can be reused/seen by the CommandParser.
    """
    #log('keywords = %s', gr.keywords)
    #log('tokens = %s', gr.tokens)

    last_token = None  # type: Optional[Token]
    prev_was_newline = False

    balance = 0  # to ignore newlines

    while True:
        if last_token:  # e.g. left over from WordParser
            tok = last_token
            #log('last_token = %s', last_token)
            last_token = None
        else:
            tok = lex.Read(lex_mode_e.Expr)
            #log('tok = %s', tok)

        # Comments and whitespace.  Newlines aren't ignored.
        if consts.GetKind(tok.id) == Kind.Ignored:
            continue

        # For multiline lists, maps, etc.
        if tok.id == Id.Op_Newline:
            if balance > 0:
                #log('*** SKIPPING NEWLINE')
                continue
            # Eliminate duplicate newline tokens.  It makes the grammar
            # simpler, and it's consistent with CPython's lexer and our own
            # WordParser.
            if prev_was_newline:
                continue
            prev_was_newline = True
        else:
            prev_was_newline = False

        balance += _OTHER_BALANCE.get(tok.id, 0)
        #log('BALANCE after seeing %s = %d', tok.id, balance)

        if tok.id == Id.Op_LParen:
            # For nesting inside $()
            lex.PushHint(Id.Op_RParen, Id.Op_RParen)

        #if tok.id == Id.Expr_Name and tok.val in KEYWORDS:
        #  tok.id = KEYWORDS[tok.val]
        #  log('Replaced with %s', tok.id)

        assert tok.id < 256, Id_str(tok.id)

        ilabel = _Classify(gr, tok)
        #log('tok = %s, ilabel = %d', tok, ilabel)

        if p.addtoken(tok.id, tok, ilabel):
            return tok

        #
        # Mutually recursive calls into the command/word parsers.
        #

        if tok.id in (Id.Left_ColonPipe,
                      Id.Left_PercentParen):  # :|  %(  LEGACY!
            left_tok = tok
            if tok.id == Id.Left_PercentParen:
                lex.PushHint(Id.Op_RParen, Id.Right_ShArrayLiteral)

            # Blame the opening token
            line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
            w_parser = parse_ctx.MakeWordParser(lex, line_reader)
            words = []  # type: List[CompoundWord]
            close_tok = None  # type: Optional[Token]
            done = False
            while not done:
                w = w_parser.ReadWord(lex_mode_e.ShCommand)
                with tagswitch(w) as case:
                    if case(word_e.Operator):
                        tok = cast(Token, w)
                        if tok.id == Id.Right_ShArrayLiteral:
                            if left_tok.id != Id.Left_PercentParen:
                                p_die('Expected | to close', left_tok)
                            close_tok = tok
                            done = True  # can't use break here
                        elif tok.id == Id.Op_Pipe:
                            if left_tok.id != Id.Left_ColonPipe:
                                p_die('Expected ) to close', left_tok)
                            close_tok = tok
                            done = True  # can't use break here
                        elif tok.id == Id.Op_Newline:  # internal newlines allowed
                            continue
                        else:
                            p_die('Unexpected token in array literal',
                                  loc.Word(w))

                    elif case(word_e.Compound):
                        words.append(cast(CompoundWord, w))

                    else:
                        raise AssertionError()

            words2 = braces.BraceDetectAll(words)
            words3 = word_.TildeDetectAll(words2)

            typ = Id.Expr_CastedDummy

            lit_part = ShArrayLiteral(left_tok, words3, close_tok)
            opaque = cast(Token, lit_part)  # HACK for expr_to_ast
            done = p.addtoken(typ, opaque, gr.tokens[typ])
            assert not done  # can't end the expression

            # Now push the closing ) or |
            ilabel = _Classify(gr, close_tok)
            done = p.addtoken(close_tok.id, close_tok, ilabel)
            assert not done  # can't end the expression

            continue
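
        # Example of what this branch consumes (illustrative, not from the
        # original source):
        #
        #     var fruits = :| apple 'banana' |
        #
        # The words between the delimiters are read in ShCommand mode, brace-
        # and tilde-detected, and pushed as one ShArrayLiteral node.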

        # $(  @(  ^(
        if tok.id in (Id.Left_DollarParen, Id.Left_AtParen,
                      Id.Left_CaretParen):

            left_token = tok

            lex.PushHint(Id.Op_RParen, Id.Eof_RParen)
            line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
            c_parser = parse_ctx.MakeParserForCommandSub(line_reader, lex,
                                                         Id.Eof_RParen)
            node = c_parser.ParseCommandSub()
            # A little gross: Copied from osh/word_parse.py
            right_token = c_parser.w_parser.cur_token

            cs_part = CommandSub(left_token, node, right_token)

            typ = Id.Expr_CastedDummy
            opaque = cast(Token, cs_part)  # HACK for expr_to_ast
            done = p.addtoken(typ, opaque, gr.tokens[typ])
            assert not done  # can't end the expression

            # Now push the closing )
            ilabel = _Classify(gr, right_token)
            done = p.addtoken(right_token.id, right_token, ilabel)
            assert not done  # can't end the expression

            continue
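
        # Example (illustrative): in 'var x = $(echo hi)', the command
        # 'echo hi' is parsed by a full CommandParser, wrapped in a
        # CommandSub part, and pushed along with its closing ) token.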

        # "  """  ^"
        if tok.id in (Id.Left_DoubleQuote, Id.Left_TDoubleQuote,
                      Id.Left_CaretDoubleQuote):
            left_token = tok
            line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
            w_parser = parse_ctx.MakeWordParser(lex, line_reader)

            parts = []  # type: List[word_part_t]
            last_token = w_parser.ReadDoubleQuoted(left_token, parts)
            expr_dq_part = DoubleQuoted(left_token, parts, last_token)

            typ = Id.Expr_CastedDummy
            opaque = cast(Token, expr_dq_part)  # HACK for expr_to_ast
            done = p.addtoken(typ, opaque, gr.tokens[typ])
            assert not done  # can't end the expression

            continue
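
        # Example (illustrative): "hello $name" or a multiline """ string is
        # read by the word parser into a DoubleQuoted part, substitutions and
        # all, then pushed as a single token.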

        # ${
        if tok.id == Id.Left_DollarBrace:
            left_token = tok
            line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
            w_parser = parse_ctx.MakeWordParser(lex, line_reader)

            part, last_token = w_parser.ReadBracedVarSub(left_token)

            # It's casted word_part.BracedVarSub -> dummy -> expr.BracedVarSub!
            typ = Id.Expr_CastedDummy
            opaque = cast(Token, part)  # HACK for expr_to_ast
            done = p.addtoken(typ, opaque, gr.tokens[typ])
            assert not done  # can't end the expression

            continue
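
        # Example (illustrative): a braced var sub used directly in an
        # expression, e.g. 'var y = ${x}', takes this path.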

        # 'x'   '''x'''
        # r'x'  r'''x'''
        # u'x'  u'''x'''
        # b'x'  b'''x'''
        # $'x'
        if tok.id in (Id.Left_SingleQuote, Id.Left_TSingleQuote,
                      Id.Left_RSingleQuote, Id.Left_RTSingleQuote,
                      Id.Left_USingleQuote, Id.Left_UTSingleQuote,
                      Id.Left_BSingleQuote, Id.Left_BTSingleQuote,
                      Id.Left_DollarSingleQuote):
            if tok.id == Id.Left_DollarSingleQuote:
                sq_mode = lex_mode_e.SQ_C
            elif tok.id in (Id.Left_USingleQuote, Id.Left_UTSingleQuote,
                            Id.Left_BSingleQuote, Id.Left_BTSingleQuote):
                sq_mode = lex_mode_e.J8_Str
            else:
                sq_mode = lex_mode_e.SQ_Raw

            left_token = tok
            line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
            w_parser = parse_ctx.MakeWordParser(lex, line_reader)

            tokens = []  # type: List[Token]
            last_token = w_parser.ReadSingleQuoted(sq_mode, left_token, tokens,
                                                   True)

            sq_part = SingleQuoted(left_token, tokens, last_token)

            typ = Id.Expr_CastedDummy
            opaque = cast(Token, sq_part)  # HACK for expr_to_ast
            done = p.addtoken(typ, opaque, gr.tokens[typ])
            assert not done  # can't end the expression
            continue
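
        # Examples (illustrative): 'abc' and r'\d+' use SQ_Raw mode, u'mu'
        # and b'bytes' use J8_Str mode, and $'\n' uses the C-style SQ_C mode
        # chosen above.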

        else:
            # We never broke out -- EOF is too soon (how can this happen???)
            raise parse.ParseError('incomplete input', tok.id, tok)


class ExprParser(object):
    """A wrapper around a pgen2 parser."""

    def __init__(self, parse_ctx, gr):
        # type: (ParseContext, Grammar) -> None
        self.parse_ctx = parse_ctx
        self.gr = gr
        # Reused multiple times.
        self.push_parser = parse.Parser(gr)
        self.pnode_alloc = None  # type: Optional[PNodeAllocator]

    def Parse(self, lexer, start_symbol):
        # type: (Lexer, int) -> Tuple[PNode, Token]

        # Reuse the parser
        self.push_parser.setup(start_symbol, self.pnode_alloc)
        try:
            last_token = _PushOilTokens(self.parse_ctx, self.gr,
                                        self.push_parser, lexer)
        except parse.ParseError as e:
            #log('ERROR %s', e)
            # TODO:
            # - Describe what lexer mode we're in (Invalid syntax in regex)
            # - Maybe say where the mode started
            # - Id.Unknown_Tok could say "This character is invalid"

            # ParseError has a "too much input" case, but I haven't been able
            # to tickle it.  Maybe it's because of the Eof tokens?

            p_die(
                'Syntax error in expression (near %s)' % ui.PrettyId(e.tok.id),
                e.tok)

        return self.push_parser.rootnode, last_token


class ctx_PNodeAllocator(object):
    """Context manager that owns the PNodeAllocator for one parse."""

    def __init__(self, ep):
        # type: (ExprParser) -> None
        self.expr_parser = ep
        self.expr_parser.pnode_alloc = PNodeAllocator()

    def __enter__(self):
        # type: () -> None
        pass

    def __exit__(self, type, value, traceback):
        # type: (Any, Any, Any) -> None
        self.expr_parser.pnode_alloc.Clear()
        self.expr_parser.pnode_alloc = None
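
# Usage sketch (illustrative, not in the original file; 'parse_ctx', 'gr',
# and 'lex' are assumed to come from the surrounding frontend, and
# 'start_symbol' is a nonterminal id from the generated grammar):
#
#     e_parser = ExprParser(parse_ctx, gr)
#     with ctx_PNodeAllocator(e_parser):
#         pnode, last_token = e_parser.Parse(lex, start_symbol)
#
# The PNodes live only as long as the context manager, so expr_to_ast should
# walk 'pnode' before __exit__ calls Clear().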