ysh/expr_parse.py

OILS / ysh / expr_parse.py View on Github | oilshell.org

376 lines, 227 significant

1	"""expr_parse.py."""
2	from __future__ import print_function
3
4	from _devbuild.gen.syntax_asdl import (loc, Token, DoubleQuoted, SingleQuoted,
5	CommandSub, ShArrayLiteral,
6	CompoundWord, word_part_t, word_e)
7	from _devbuild.gen.id_kind_asdl import Id, Kind, Id_str
8	from _devbuild.gen.types_asdl import lex_mode_e
9
10	from core import ui
11	from core.error import p_die
12	from frontend import consts
13	from frontend import lexer
14	from frontend import reader
15	from mycpp import mylib
16	from mycpp.mylib import log, tagswitch
17	from osh import braces
18	from osh import word_
19	from osh import word_compile
20	from pgen2 import parse
21	from pgen2.pnode import PNodeAllocator
22
23	_ = log
24
25	from typing import TYPE_CHECKING, Any, Dict, Tuple, List, cast, Optional
26	if TYPE_CHECKING:
27	from frontend.lexer import Lexer
28	from frontend.parse_lib import ParseContext
29	from pgen2.grammar import Grammar
30	from pgen2.pnode import PNode
31
if mylib.PYTHON:

    class ParseTreePrinter(object):
        """Writes a PNode tree as indented text, one line per node."""

        def __init__(self, names):
            # type: (Dict[int, str]) -> None
            # names is indexed by pnode.typ to get the label printed for a node.
            self.names = names
            self.f = mylib.Stdout()

        def _Print(self, pnode, indent, i):
            # type: (PNode, int, int) -> None
            """Write one line for pnode, then recurse into its children.

            Args:
              pnode: the node to print
              indent: number of leading spaces for this node's line
              i: index of this node among its siblings (printed on the line)
            """
            # NOTE:
            # - why isn't 'tok' None for PRODUCTIONS?  There is some
            #   redundancy to get rid of.
            tok = pnode.tok
            if not tok:
                val = '-'
            elif isinstance(tok, Token):
                val = lexer.TokenVal(tok)
            else:
                # e.g. CommandSub for x = $(echo hi)
                val = repr(tok)

            line = '%s%d %s %s\n' % (' ' * indent, i, self.names[pnode.typ],
                                     val)
            self.f.write(line)

            kids = pnode.children
            if kids is None:
                return

            child_indent = indent + 1
            for child_index, child in enumerate(kids):
                self._Print(child, child_indent, child_index)

        def Print(self, pnode):
            # type: (PNode) -> None
            """Print the whole tree rooted at pnode, starting at indent 0."""
            self._Print(pnode, 0, 0)
65
66
67	def _Classify(gr, tok):
68	# type: (Grammar, Token) -> int
69
70	# We have to match up what ParserGenerator.make_grammar() did when
71	# calling make_label() and make_first(). See classify() in
72	# opy/pgen2/driver.py.
73
74	id_ = tok.id # mycpp fix: we need C++ to do uint16_t -> int conversion
75
76	# TODO: use something more efficient than a Dict
77	if id_ in gr.tokens:
78	return gr.tokens[id_]
79
80	if id_ == Id.Unknown_DEqual:
81	p_die('Use === to be exact, or ~== to convert types', tok)
82
83	if id_ == Id.Unknown_Tok:
84	type_str = ''
85	else:
86	type_str = ' (%s)' % ui.PrettyId(tok.id)
87	p_die('Unexpected token in expression mode%s' % type_str, tok)
88
89
# Amount each token adds to the nesting counter kept by _PushYshTokens().
# While that counter is positive, Op_Newline tokens are skipped, so newlines
# are ignored between these pairs.
# yapf: disable
_OTHER_BALANCE = {
    # Parenthesized expressions (tuples) and func/proc parameter lists
    Id.Op_LParen: 1, Id.Op_RParen: -1,
    Id.Op_LBracket: 1, Id.Op_RBracket: -1,

    # No entries for braces: dicts are {}, and the grammar respects
    # Op_Newline.
}
# yapf: enable
103
104
def _PushYshTokens(parse_ctx, gr, p, lex):
    # type: (ParseContext, Grammar, parse.Parser, Lexer) -> Token
    """Push tokens onto pgen2's parser.

    Tokens are read from `lex` in lex_mode_e.Expr, filtered, classified with
    _Classify(), and fed to `p` via p.addtoken() until addtoken() returns a
    true value.

    Filtering done here, before the grammar sees anything:
    - Tokens of Kind.Ignored are dropped.
    - Op_Newline is dropped while `balance` is positive (see _OTHER_BALANCE),
      and a run of consecutive Op_Newline tokens is collapsed to one.

    After certain "left" tokens have been pushed, control passes to a word
    parser or command parser created from `parse_ctx` on the same lexer.  The
    object it produces is cast to Token and pushed as Id.Expr_CastedDummy.

    Args:
      parse_ctx: used to create the nested word/command parsers and for its
        arena
      gr: grammar whose `tokens` table maps an Id to a label
      p: the pgen2 parser being fed; the caller must already have set it up
      lex: lexer shared with the nested parsers

    Returns:
      The last token so it can be reused/seen by the CommandParser.

    Raises:
      Parse errors reported through p_die().  ExprParser.Parse also expects
      parse.ParseError to come out of this function.
    """
    #log('keywords = %s', gr.keywords)
    #log('tokens = %s', gr.tokens)

    # When set, this token is consumed by the next loop iteration instead of
    # reading a new one from the lexer.  The quote and ${ branches below set
    # it to the token returned by the word parser.
    last_token = None  # type: Optional[Token]
    prev_was_newline = False

    balance = 0  # to ignore newlines

    while True:
        if last_token:  # e.g. left over from WordParser
            tok = last_token
            #log('last_token = %s', last_token)
            last_token = None
        else:
            tok = lex.Read(lex_mode_e.Expr)
            #log('tok = %s', tok)

        # Comments and whitespace.  Newlines aren't ignored.
        if consts.GetKind(tok.id) == Kind.Ignored:
            continue

        # For multiline lists, maps, etc.
        if tok.id == Id.Op_Newline:
            if balance > 0:
                #log('*** SKIPPING NEWLINE')
                continue
            # Eliminate duplicate newline tokens.  It makes the grammar
            # simpler, and it's consistent with CPython's lexer and our own
            # WordParser.
            if prev_was_newline:
                continue
            prev_was_newline = True
        else:
            prev_was_newline = False

        # Tokens that aren't in _OTHER_BALANCE leave the counter unchanged.
        balance += _OTHER_BALANCE.get(tok.id, 0)
        #log('BALANCE after seeing %s = %d', tok.id, balance)

        if tok.id == Id.Op_LParen:
            # For nesting inside $()
            lex.PushHint(Id.Op_RParen, Id.Op_RParen)

        #if tok.id == Id.Expr_Name and tok.val in KEYWORDS:
        #    tok.id = KEYWORDS[tok.val]
        #    log('Replaced with %s', tok.id)

        assert tok.id < 256, Id_str(tok.id)

        ilabel = _Classify(gr, tok)
        #log('tok = %s, ilabel = %d', tok, ilabel)

        # A true result means the parser is finished; this is the only place
        # the function returns.
        if p.addtoken(tok.id, tok, ilabel):
            return tok

        #
        # Mutually recursive calls into the command/word parsers.
        #
        # Note that the opening token was already pushed above; each branch
        # below pushes what follows it.
        #

        if tok.id in (Id.Left_ColonPipe,
                      Id.Left_PercentParen):  # :\| %( LEGACY!
            left_tok = tok
            if tok.id == Id.Left_PercentParen:
                lex.PushHint(Id.Op_RParen, Id.Right_ShArrayLiteral)

            # Blame the opening token
            line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
            w_parser = parse_ctx.MakeWordParser(lex, line_reader)
            words = []  # type: List[CompoundWord]
            close_tok = None  # type: Optional[Token]
            done = False
            # Collect words until the closing token that matches left_tok.
            while not done:
                w = w_parser.ReadWord(lex_mode_e.ShCommand)
                with tagswitch(w) as case:
                    if case(word_e.Operator):
                        # NOTE: this rebinds the outer loop's `tok`.
                        tok = cast(Token, w)
                        if tok.id == Id.Right_ShArrayLiteral:
                            # NOTE(review): the message names ')' although
                            # this branch fires when the opener was NOT %( --
                            # confirm the wording is intended.
                            if left_tok.id != Id.Left_PercentParen:
                                p_die('Expected ) to close', left_tok)
                            close_tok = tok
                            done = True  # can't use break here
                        elif tok.id == Id.Op_Pipe:
                            if left_tok.id != Id.Left_ColonPipe:
                                p_die('Expected ) to close', left_tok)
                            close_tok = tok
                            done = True  # can't use break here
                        elif tok.id == Id.Op_Newline:  # internal newlines allowed
                            continue
                        else:
                            p_die('Unexpected token in array literal',
                                  loc.Word(w))

                    elif case(word_e.Compound):
                        words.append(cast(CompoundWord, w))

                    else:
                        raise AssertionError()

            words2 = braces.BraceDetectAll(words)
            words3 = word_.TildeDetectAll(words2)

            typ = Id.Expr_CastedDummy

            lit_part = ShArrayLiteral(left_tok, words3, close_tok)
            opaque = cast(Token, lit_part)  # HACK for expr_to_ast
            done = p.addtoken(typ, opaque, gr.tokens[typ])
            assert not done  # can't end the expression

            # Now push the closing )
            # `tok` is the same object as close_tok here: the word loop above
            # exits right after assigning close_tok = tok.
            ilabel = _Classify(gr, close_tok)
            done = p.addtoken(tok.id, close_tok, ilabel)
            assert not done  # can't end the expression

            continue

        # $(  @(  ^(
        if tok.id in (Id.Left_DollarParen, Id.Left_AtParen,
                      Id.Left_CaretParen):

            left_token = tok

            lex.PushHint(Id.Op_RParen, Id.Eof_RParen)
            line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
            c_parser = parse_ctx.MakeParserForCommandSub(
                line_reader, lex, Id.Eof_RParen)
            node = c_parser.ParseCommandSub()
            # A little gross: Copied from osh/word_parse.py
            right_token = c_parser.w_parser.cur_token

            cs_part = CommandSub(left_token, node, right_token)

            typ = Id.Expr_CastedDummy
            opaque = cast(Token, cs_part)  # HACK for expr_to_ast
            done = p.addtoken(typ, opaque, gr.tokens[typ])
            assert not done  # can't end the expression

            # Now push the closing )
            ilabel = _Classify(gr, right_token)
            done = p.addtoken(right_token.id, right_token, ilabel)
            assert not done  # can't end the expression

            continue

        # "   """   and ^"
        if tok.id in (Id.Left_DoubleQuote, Id.Left_TDoubleQuote,
                      Id.Left_CaretDoubleQuote):
            left_token = tok
            line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
            w_parser = parse_ctx.MakeWordParser(lex, line_reader)

            parts = []  # type: List[word_part_t]
            # last_token is not pushed here; the top of the loop picks it up
            # on the next iteration.
            last_token = w_parser.ReadDoubleQuoted(left_token, parts)
            expr_dq_part = DoubleQuoted(left_token, parts, last_token)

            typ = Id.Expr_CastedDummy
            opaque = cast(Token, expr_dq_part)  # HACK for expr_to_ast
            done = p.addtoken(typ, opaque, gr.tokens[typ])
            assert not done  # can't end the expression

            continue

        # ${
        if tok.id == Id.Left_DollarBrace:
            left_token = tok
            line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
            w_parser = parse_ctx.MakeWordParser(lex, line_reader)

            # As with quotes, last_token is consumed by the next iteration.
            part, last_token = w_parser.ReadBracedVarSub(left_token)

            # It's casted word_part.BracedVarSub -> dummy -> expr.BracedVarSub!
            typ = Id.Expr_CastedDummy
            opaque = cast(Token, part)  # HACK for expr_to_ast
            done = p.addtoken(typ, opaque, gr.tokens[typ])
            assert not done  # can't end the expression

            continue

        # 'x'  '''x'''
        # r'x' r'''x'''
        # u'x' u'''x'''
        # b'x' b'''x'''
        # $'x'
        if tok.id in (Id.Left_SingleQuote, Id.Left_TSingleQuote,
                      Id.Left_RSingleQuote, Id.Left_RTSingleQuote,
                      Id.Left_USingleQuote, Id.Left_UTSingleQuote,
                      Id.Left_BSingleQuote, Id.Left_BTSingleQuote,
                      Id.Left_DollarSingleQuote):
            # The opening quote style selects the lexer mode for the body.
            if tok.id == Id.Left_DollarSingleQuote:
                sq_mode = lex_mode_e.SQ_C
            elif tok.id in (Id.Left_USingleQuote, Id.Left_UTSingleQuote,
                            Id.Left_BSingleQuote, Id.Left_BTSingleQuote):
                sq_mode = lex_mode_e.J8_Str
            else:
                sq_mode = lex_mode_e.SQ_Raw

            left_token = tok
            line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
            w_parser = parse_ctx.MakeWordParser(lex, line_reader)

            tokens = []  # type: List[Token]
            # As with double quotes, last_token is consumed by the next
            # iteration.
            last_token = w_parser.ReadSingleQuoted(sq_mode, left_token, tokens,
                                                   True)

            sval = word_compile.EvalSingleQuoted2(left_token.id, tokens)
            sq_part = SingleQuoted(left_token, sval, last_token)

            typ = Id.Expr_CastedDummy
            opaque = cast(Token, sq_part)  # HACK for expr_to_ast
            done = p.addtoken(typ, opaque, gr.tokens[typ])
            assert not done  # can't end the expression
            continue

    else:
        # This clause belongs to `while True`, and the loop body contains no
        # `break`, so nothing visible here can reach it.
        # We never broke out -- EOF is too soon (how can this happen???)
        raise parse.ParseError("incomplete input", tok.id, tok)
324
325
class ExprParser(object):
    """Drives a single pgen2 push parser that is reused across Parse() calls."""

    def __init__(self, parse_ctx, gr):
        # type: (ParseContext, Grammar) -> None
        self.parse_ctx = parse_ctx
        self.gr = gr
        # One parser instance; Parse() re-initializes it with setup() each
        # time instead of constructing a new one.
        self.push_parser = parse.Parser(gr)
        # Passed to setup() on each Parse().  It is None unless something
        # assigns an allocator to it.
        self.pnode_alloc = None  # type: Optional[PNodeAllocator]

    def Parse(self, lexer, start_symbol):
        # type: (Lexer, int) -> Tuple[PNode, Token]
        """Parse one expression starting at start_symbol.

        Returns:
          (root node of the parse tree, last token that was pushed)

        A parse.ParseError from the token pusher is reported through p_die(),
        blaming the token carried by the error.
        """
        parser = self.push_parser

        # Reuse the parser
        parser.setup(start_symbol, self.pnode_alloc)
        try:
            final_tok = _PushYshTokens(self.parse_ctx, self.gr, parser, lexer)
        except parse.ParseError as err:
            #log('ERROR %s', err)
            # TODO:
            # - Describe what lexer mode we're in (Invalid syntax in regex)
            #   - Maybe say where the mode started
            # - Id.Unknown_Tok could say "This character is invalid"

            # ParseError has a "too much input" case but I haven't been able
            # to tickle it.  Maybe it's because of the Eof tokens?

            bad_tok = err.tok
            p_die(
                'Syntax error in expression (near %s)' %
                ui.PrettyId(bad_tok.id), bad_tok)

        return parser.rootnode, final_tok
360
361
class ctx_PNodeAllocator(object):
    """Context manager that gives an ExprParser a PNodeAllocator for a scope.

    The allocator is installed when this object is constructed, not in
    __enter__.  On exit it is cleared and the parser's pnode_alloc is set back
    to None.
    """

    def __init__(self, ep):
        # type: (ExprParser) -> None
        ep.pnode_alloc = PNodeAllocator()
        self.expr_parser = ep

    def __enter__(self):
        # type: () -> None
        # Nothing to do: the allocator was installed by __init__.
        pass

    def __exit__(self, type, value, traceback):
        # type: (Any, Any, Any) -> None
        owner = self.expr_parser
        owner.pnode_alloc.Clear()
        owner.pnode_alloc = None