1"""expr_parse.py."""
from __future__ import print_function

from _devbuild.gen.syntax_asdl import (loc, Token, DoubleQuoted, SingleQuoted,
                                       CommandSub, ShArrayLiteral,
                                       CompoundWord, word_part_t, word_e)
from _devbuild.gen.id_kind_asdl import Id, Kind, Id_str
from _devbuild.gen.types_asdl import lex_mode_e

from core import ui
from core.error import p_die
from frontend import consts
from frontend import lexer
from frontend import reader
from mycpp import mylib
from mycpp.mylib import log, tagswitch
from osh import braces
from osh import word_
from pgen2 import parse
from pgen2.pnode import PNodeAllocator

_ = log

from typing import TYPE_CHECKING, Any, Dict, Tuple, List, cast, Optional
if TYPE_CHECKING:
    from frontend.lexer import Lexer
    from frontend.parse_lib import ParseContext
    from pgen2.grammar import Grammar
    from pgen2.pnode import PNode

if mylib.PYTHON:

    class ParseTreePrinter(object):
        """Prints a tree of PNode instances."""

        def __init__(self, names):
            # type: (Dict[int, str]) -> None
            self.names = names
            self.f = mylib.Stdout()

        def _Print(self, pnode, indent, i):
            # type: (PNode, int, int) -> None

            ind = '  ' * indent
            # NOTE:
            # - why isn't 'tok' None for PRODUCTIONS?  There is some
            #   redundancy to get rid of.
            if pnode.tok:
                if isinstance(pnode.tok, Token):
                    v = lexer.TokenVal(pnode.tok)
                else:
                    # e.g. CommandSub for x = $(echo hi)
                    v = repr(pnode.tok)
            else:
                v = '-'
            self.f.write('%s%d %s %s\n' % (ind, i, self.names[pnode.typ], v))
            if pnode.children is not None:
                for i, child in enumerate(pnode.children):
                    self._Print(child, indent + 1, i)

        def Print(self, pnode):
            # type: (PNode) -> None
            self._Print(pnode, 0, 0)
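
    # Usage sketch (illustrative, not in the original file): 'names' maps
    # pgen2 type integers to readable names, e.g. built from the grammar's
    # number2symbol dict plus token names.
    #
    #     printer = ParseTreePrinter(names)
    #     printer.Print(root_pnode)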


def _Classify(gr, tok):
    # type: (Grammar, Token) -> int

    # We have to match up what ParserGenerator.make_grammar() did when
    # calling make_label() and make_first().  See classify() in
    # opy/pgen2/driver.py.

    # TODO: use something more efficient than a Dict
    if tok.id in gr.tokens:
        return gr.tokens[tok.id]

    if tok.id == Id.Unknown_DEqual:
        p_die('Use === to be exact, or ~== to convert types', tok)

    if tok.id == Id.Unknown_Tok:
        type_str = ''
    else:
        type_str = ' (%s)' % ui.PrettyId(tok.id)
    p_die('Unexpected token in expression mode%s' % type_str, tok)


# Newlines are ignored between these pairs.
# yapf: disable
_OTHER_BALANCE = {

    # Parenthesized expressions (tuples) and func/proc parameter lists,
    # plus list literals and subscripts
    Id.Op_LParen: 1,
    Id.Op_RParen: -1,
    Id.Op_LBracket: 1,
    Id.Op_RBracket: -1,

    # Dicts are {}, and the grammar respects Op_Newline.
}
# yapf: enable
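
# Illustration (descriptive comment, not in the original source): with this
# table, _PushOilTokens() keeps a balance count and skips Op_Newline tokens
# while it's positive, so a multiline expression like
#
#     var x = [
#       1,
#       2,
#     ]
#
# parses the same as 'var x = [1, 2]'.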


def _PushOilTokens(parse_ctx, gr, p, lex):
    # type: (ParseContext, Grammar, parse.Parser, Lexer) -> Token
    """Push tokens onto pgen2's parser.

    Returns the last token so it can be reused/seen by the CommandParser.
    """
    #log('keywords = %s', gr.keywords)
    #log('tokens = %s', gr.tokens)

    last_token = None  # type: Optional[Token]
    prev_was_newline = False

    balance = 0  # to ignore newlines

    while True:
        if last_token:  # e.g. left over from WordParser
            tok = last_token
            #log('last_token = %s', last_token)
            last_token = None
        else:
            tok = lex.Read(lex_mode_e.Expr)
            #log('tok = %s', tok)

        # Comments and whitespace.  Newlines aren't ignored.
        if consts.GetKind(tok.id) == Kind.Ignored:
            continue

        # For multiline lists, maps, etc.
        if tok.id == Id.Op_Newline:
            if balance > 0:
                #log('*** SKIPPING NEWLINE')
                continue
            # Eliminate duplicate newline tokens.  It makes the grammar
            # simpler, and it's consistent with CPython's lexer and our own
            # WordParser.
            if prev_was_newline:
                continue
            prev_was_newline = True
        else:
            prev_was_newline = False

        balance += _OTHER_BALANCE.get(tok.id, 0)
        #log('BALANCE after seeing %s = %d', tok.id, balance)

        if tok.id == Id.Op_LParen:
            # For nesting inside $()
            lex.PushHint(Id.Op_RParen, Id.Op_RParen)

        #if tok.id == Id.Expr_Name and tok.val in KEYWORDS:
        #  tok.id = KEYWORDS[tok.val]
        #  log('Replaced with %s', tok.id)

        assert tok.id < 256, Id_str(tok.id)

        ilabel = _Classify(gr, tok)
        #log('tok = %s, ilabel = %d', tok, ilabel)

        if p.addtoken(tok.id, tok, ilabel):
            return tok

        #
        # Mutually recursive calls into the command/word parsers.
        #

        if tok.id in (Id.Left_ColonPipe,
                      Id.Left_PercentParen):  # :|  %(  LEGACY!
            left_tok = tok
            if tok.id == Id.Left_PercentParen:
                lex.PushHint(Id.Op_RParen, Id.Right_ShArrayLiteral)

            # Blame the opening token
            line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
            w_parser = parse_ctx.MakeWordParser(lex, line_reader)
            words = []  # type: List[CompoundWord]
            close_tok = None  # type: Optional[Token]
            done = False
            while not done:
                w = w_parser.ReadWord(lex_mode_e.ShCommand)
                with tagswitch(w) as case:
                    if case(word_e.Operator):
                        tok = cast(Token, w)
                        if tok.id == Id.Right_ShArrayLiteral:
                            if left_tok.id != Id.Left_PercentParen:
                                p_die('Expected | to close', left_tok)
                            close_tok = tok
                            done = True  # can't use break here
                        elif tok.id == Id.Op_Pipe:
                            if left_tok.id != Id.Left_ColonPipe:
                                p_die('Expected ) to close', left_tok)
                            close_tok = tok
                            done = True  # can't use break here
                        elif tok.id == Id.Op_Newline:  # internal newlines allowed
                            continue
                        else:
                            p_die('Unexpected token in array literal',
                                  loc.Word(w))

                    elif case(word_e.Compound):
                        words.append(cast(CompoundWord, w))

                    else:
                        raise AssertionError()

            words2 = braces.BraceDetectAll(words)
            words3 = word_.TildeDetectAll(words2)

            typ = Id.Expr_CastedDummy

            lit_part = ShArrayLiteral(left_tok, words3, close_tok)
            opaque = cast(Token, lit_part)  # HACK for expr_to_ast
            done = p.addtoken(typ, opaque, gr.tokens[typ])
            assert not done  # can't end the expression

            # Now push the closing ) or |
            ilabel = _Classify(gr, close_tok)
            done = p.addtoken(close_tok.id, close_tok, ilabel)
            assert not done  # can't end the expression

            continue
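
        # Example of what this branch consumes (illustrative, not from the
        # original source):
        #
        #     var fruits = :| apple 'banana' |
        #
        # The words between the delimiters are read in ShCommand mode, brace-
        # and tilde-detected, and pushed as one ShArrayLiteral node.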

        # $(  @(  ^(
        if tok.id in (Id.Left_DollarParen, Id.Left_AtParen,
                      Id.Left_CaretParen):

            left_token = tok

            lex.PushHint(Id.Op_RParen, Id.Eof_RParen)
            line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
            c_parser = parse_ctx.MakeParserForCommandSub(line_reader, lex,
                                                         Id.Eof_RParen)
            node = c_parser.ParseCommandSub()
            # A little gross: Copied from osh/word_parse.py
            right_token = c_parser.w_parser.cur_token

            cs_part = CommandSub(left_token, node, right_token)

            typ = Id.Expr_CastedDummy
            opaque = cast(Token, cs_part)  # HACK for expr_to_ast
            done = p.addtoken(typ, opaque, gr.tokens[typ])
            assert not done  # can't end the expression

            # Now push the closing )
            ilabel = _Classify(gr, right_token)
            done = p.addtoken(right_token.id, right_token, ilabel)
            assert not done  # can't end the expression

            continue
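
        # Example (illustrative): in 'var x = $(echo hi)', the command
        # 'echo hi' is parsed by a full CommandParser, wrapped in a
        # CommandSub part, and pushed along with its closing ) token.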

        # "  """  ^"
        if tok.id in (Id.Left_DoubleQuote, Id.Left_TDoubleQuote,
                      Id.Left_CaretDoubleQuote):
            left_token = tok
            line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
            w_parser = parse_ctx.MakeWordParser(lex, line_reader)

            parts = []  # type: List[word_part_t]
            last_token = w_parser.ReadDoubleQuoted(left_token, parts)
            expr_dq_part = DoubleQuoted(left_token, parts, last_token)

            typ = Id.Expr_CastedDummy
            opaque = cast(Token, expr_dq_part)  # HACK for expr_to_ast
            done = p.addtoken(typ, opaque, gr.tokens[typ])
            assert not done  # can't end the expression

            continue
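
        # Example (illustrative): "hello $name" or a multiline """ string is
        # read by the word parser into a DoubleQuoted part, substitutions and
        # all, then pushed as a single token.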

        # ${
        if tok.id == Id.Left_DollarBrace:
            left_token = tok
            line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
            w_parser = parse_ctx.MakeWordParser(lex, line_reader)

            part, last_token = w_parser.ReadBracedVarSub(left_token)

            # It's casted word_part.BracedVarSub -> dummy -> expr.BracedVarSub!
            typ = Id.Expr_CastedDummy
            opaque = cast(Token, part)  # HACK for expr_to_ast
            done = p.addtoken(typ, opaque, gr.tokens[typ])
            assert not done  # can't end the expression

            continue
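
        # Example (illustrative): a braced var sub used directly in an
        # expression, e.g. 'var y = ${x}', takes this path.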

        # 'x'   '''x'''
        # r'x'  r'''x'''
        # u'x'  u'''x'''
        # b'x'  b'''x'''
        # $'x'
        if tok.id in (Id.Left_SingleQuote, Id.Left_TSingleQuote,
                      Id.Left_RSingleQuote, Id.Left_RTSingleQuote,
                      Id.Left_USingleQuote, Id.Left_UTSingleQuote,
                      Id.Left_BSingleQuote, Id.Left_BTSingleQuote,
                      Id.Left_DollarSingleQuote):
            if tok.id == Id.Left_DollarSingleQuote:
                sq_mode = lex_mode_e.SQ_C
            elif tok.id in (Id.Left_USingleQuote, Id.Left_UTSingleQuote,
                            Id.Left_BSingleQuote, Id.Left_BTSingleQuote):
                sq_mode = lex_mode_e.J8_Str
            else:
                sq_mode = lex_mode_e.SQ_Raw

            left_token = tok
            line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
            w_parser = parse_ctx.MakeWordParser(lex, line_reader)

            tokens = []  # type: List[Token]
            last_token = w_parser.ReadSingleQuoted(sq_mode, left_token, tokens,
                                                   True)

            sq_part = SingleQuoted(left_token, tokens, last_token)

            typ = Id.Expr_CastedDummy
            opaque = cast(Token, sq_part)  # HACK for expr_to_ast
            done = p.addtoken(typ, opaque, gr.tokens[typ])
            assert not done  # can't end the expression
            continue
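
        # Examples (illustrative): 'abc' and r'\d+' use SQ_Raw mode, u'mu'
        # and b'bytes' use J8_Str mode, and $'\n' uses the C-style SQ_C mode
        # chosen above.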

        else:
            # We never broke out -- EOF is too soon (how can this happen???)
            raise parse.ParseError('incomplete input', tok.id, tok)


class ExprParser(object):
    """A wrapper around a pgen2 parser."""

    def __init__(self, parse_ctx, gr):
        # type: (ParseContext, Grammar) -> None
        self.parse_ctx = parse_ctx
        self.gr = gr
        # Reused multiple times.
        self.push_parser = parse.Parser(gr)
        self.pnode_alloc = None  # type: Optional[PNodeAllocator]

    def Parse(self, lexer, start_symbol):
        # type: (Lexer, int) -> Tuple[PNode, Token]

        # Reuse the parser
        self.push_parser.setup(start_symbol, self.pnode_alloc)
        try:
            last_token = _PushOilTokens(self.parse_ctx, self.gr,
                                        self.push_parser, lexer)
        except parse.ParseError as e:
            #log('ERROR %s', e)
            # TODO:
            # - Describe what lexer mode we're in (Invalid syntax in regex)
            # - Maybe say where the mode started
            # - Id.Unknown_Tok could say "This character is invalid"

            # ParseError has a "too much input" case, but I haven't been able
            # to tickle it.  Maybe it's because of the Eof tokens?

            p_die(
                'Syntax error in expression (near %s)' % ui.PrettyId(e.tok.id),
                e.tok)

        return self.push_parser.rootnode, last_token


class ctx_PNodeAllocator(object):
    """Context manager that owns the PNodeAllocator for one parse."""

    def __init__(self, ep):
        # type: (ExprParser) -> None
        self.expr_parser = ep
        self.expr_parser.pnode_alloc = PNodeAllocator()

    def __enter__(self):
        # type: () -> None
        pass

    def __exit__(self, type, value, traceback):
        # type: (Any, Any, Any) -> None
        self.expr_parser.pnode_alloc.Clear()
        self.expr_parser.pnode_alloc = None
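
# Usage sketch (illustrative, not in the original file; 'parse_ctx', 'gr',
# and 'lex' are assumed to come from the surrounding frontend, and
# 'start_symbol' is a nonterminal id from the generated grammar):
#
#     e_parser = ExprParser(parse_ctx, gr)
#     with ctx_PNodeAllocator(e_parser):
#         pnode, last_token = e_parser.Parse(lex, start_symbol)
#
# The PNodes live only as long as the context manager, so expr_to_ast should
# walk 'pnode' before __exit__ calls Clear().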