OILS / osh / word_parse.py

2108 lines, 1123 significant
# Copyright 2016 Andy Chu. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
"""
word_parse.py - Parse the shell word language.

Hairy example:

    hi$((1 + 2))"$(echo hi)"${var:-__"$(echo default)"__}

Substitutions can be nested, but which inner subs are allowed depends on the
outer sub.  Notes:

lex_mode_e.ShCommand (_ReadUnquotedLeftParts)
  All subs and quotes are allowed:
    $v ${v}   $() ``   $(())   '' ""   $'' $""   <() >()

lex_mode_e.DQ (_ReadDoubleQuotedLeftParts)
  Var, Command, Arith, but no quotes.
    $v ${v}   $() ``   $(())
  No process substitution.

lex_mode_e.Arith
  Similar to DQ: Var, Command, and Arith sub, but no process sub.  bash
  doesn't allow quotes, but OSH does.  We allow ALL FOUR kinds of quotes,
  because we need those for associative array indexing.

lex_mode_e.VSub_ArgUnquoted
  Like ShCommand, everything is allowed (even process substitutions), but we
  stop at }, and space is SIGNIFICANT.

  Example: ${a:- b }

  ${X:-$v}  ${X:-${v}}  ${X:-$(echo hi)}  ${X:-`echo hi`}  ${X:-$((1+2))}
  ${X:-'single'}  ${X:-"double"}  ${X:-$'\n'}  ${X:-<(echo hi)}

lex_mode_e.VSub_ArgDQ
  In contrast to DQ, VSub_ArgDQ accepts nested "" and $'' and $"", e.g.
  "${x:-"default"}".

  In contrast, VSub_ArgUnquoted respects single quotes and process
  substitution.

  It's weird that double quotes are allowed.  Space is also significant here,
  e.g. "${x:-a "b"}".
"""

from _devbuild.gen import grammar_nt
from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str, Kind
from _devbuild.gen.types_asdl import (lex_mode_t, lex_mode_e)
from _devbuild.gen.syntax_asdl import (
    BoolParamBox,
    Token,
    NameTok,
    loc,
    source,
    DoubleQuoted,
    SingleQuoted,
    BracedVarSub,
    CommandSub,
    ShArrayLiteral,
    AssocPair,
    bracket_op,
    bracket_op_t,
    suffix_op,
    suffix_op_t,
    rhs_word,
    rhs_word_e,
    rhs_word_t,
    word_e,
    word_t,
    CompoundWord,
    word_part,
    word_part_t,
    y_lhs_e,
    arith_expr_t,
    command,
    expr,
    expr_e,
    expr_t,
    pat_t,
    ArgList,
    Proc,
    Func,
    Subscript,
    Attribute,
)
from core import alloc
from core.error import p_die
from mycpp.mylib import log
from core import pyutil
from core import ui
from frontend import consts
from frontend import lexer
from frontend import reader
from osh import tdop
from osh import arith_parse
from osh import braces
from osh import word_
from osh import word_compile
from mycpp.mylib import tagswitch

from typing import List, Optional, Tuple, cast
from typing import TYPE_CHECKING
if TYPE_CHECKING:
    from frontend.lexer import Lexer
    from frontend.parse_lib import ParseContext
    from frontend.reader import _Reader
    from osh.cmd_parse import VarChecker

unused1 = log
unused2 = Id_str

KINDS_THAT_END_WORDS = [Kind.Eof, Kind.WS, Kind.Op, Kind.Right]


class WordEmitter(object):
    """Common interface for [ and [["""

    def __init__(self):
        # type: () -> None
        """Empty constructor for mycpp."""
        pass

    def ReadWord(self, lex_mode):
        # type: (lex_mode_t) -> word_t
        raise NotImplementedError()


class WordParser(WordEmitter):

    def __init__(self, parse_ctx, lexer, line_reader):
        # type: (ParseContext, Lexer, _Reader) -> None
        self.parse_ctx = parse_ctx
        self.lexer = lexer
        self.line_reader = line_reader
        self.arena = line_reader.arena

        self.parse_opts = parse_ctx.parse_opts
        self.a_parser = tdop.TdopParser(arith_parse.Spec(), self,
                                        self.parse_opts)
        self.Reset()

    def Init(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Used to parse arithmetic, see ParseContext."""
        self.next_lex_mode = lex_mode

    def Reset(self):
        # type: () -> None
        """Called by interactive loop."""
        # For _GetToken()
        self.cur_token = None  # type: Token
        self.token_kind = Kind.Undefined
        self.token_type = Id.Undefined_Tok

        self.next_lex_mode = lex_mode_e.ShCommand

        # Boolean mutated by CommandParser via word_.ctx_EmitDocToken.  For
        # ### doc comments
        self.emit_doc_token = False
        # Boolean mutated by CommandParser via word_.ctx_Multiline.  '...'
        # starts multiline mode.
        self.multiline = False

        # For detecting invalid \n\n in multiline mode.  Counts what we got
        # directly from the lexer.
        self.newline_state = 0
        # For consolidating \n\n -> \n for the CALLER.  This simplifies the
        # parsers that consume words.
        self.returned_newline = False

        # For integration with pgen2
        self.buffered_word = None  # type: word_t

    def _GetToken(self):
        # type: () -> None
        """Call this when you need to make a decision based on any of:

        self.token_type
        self.token_kind
        self.cur_token      # contents
        """
        if self.next_lex_mode != lex_mode_e.Undefined:
            self.cur_token = self.lexer.Read(self.next_lex_mode)
            self.token_type = self.cur_token.id
            self.token_kind = consts.GetKind(self.token_type)

            # number of consecutive newlines, ignoring whitespace
            if self.token_type == Id.Op_Newline:
                self.newline_state += 1
            elif self.token_kind != Kind.WS:
                self.newline_state = 0

            self.parse_ctx.trail.AppendToken(self.cur_token)  # For completion
            self.next_lex_mode = lex_mode_e.Undefined

    def _SetNext(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Set the next lex state, but don't actually read a token.

        We need this for proper interactive parsing.
        """
        self.next_lex_mode = lex_mode

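    # Illustrative sketch (not from the original file): _SetNext() only
    # records the mode; the token is actually consumed by the next
    # _GetToken() call.  A caller that wants to skip one token in arithmetic
    # mode would do:
    #
    #     self._SetNext(lex_mode_e.Arith)  # decide how to lex the next token
    #     self._GetToken()                 # now cur_token/token_type are valid
    #
    # Deferring the Read() this way is what makes interactive parsing work:
    # no new line is pulled from the reader until a decision requires it.
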
    def _ReadVarOpArg(self, arg_lex_mode):
        # type: (lex_mode_t) -> rhs_word_t

        # NOTE: Operators like | and < are not treated as special, so
        # ${a:- | >} is valid, even when unquoted.
        self._SetNext(arg_lex_mode)
        self._GetToken()

        w = self._ReadVarOpArg2(arg_lex_mode, Id.Undefined_Tok,
                                True)  # empty_ok

        # If the Compound has no parts, and we're in a double-quoted VarSub
        # arg, and empty_ok, then return Empty.  This is so it can evaluate to
        # the empty string and not get elided.
        #
        # Examples:
        # - "${s:-}", "${s/%pat/}"
        # It's similar to LooksLikeShAssignment where we turn x= into x=''.
        # And it has the same potential problem of not having Token location
        # info.
        #
        # NOTE: empty_ok is False only for the PatSub pattern, which means
        # we'll return a Compound with no parts, which is explicitly checked
        # with a custom error message.
        if len(w.parts) == 0 and arg_lex_mode == lex_mode_e.VSub_ArgDQ:
            return rhs_word.Empty

        return w

    def _ReadVarOpArg2(self, arg_lex_mode, eof_type, empty_ok):
        # type: (lex_mode_t, Id_t, bool) -> CompoundWord
        """Return a CompoundWord.

        Helper function for _ReadVarOpArg and used directly by
        _ReadPatSubVarOp.
        """
        w = self._ReadCompoundWord3(arg_lex_mode, eof_type, empty_ok)
        #log('w %s', w)
        tilde = word_.TildeDetect(w)
        if tilde:
            w = tilde
        return w

    def _ReadSliceVarOp(self):
        # type: () -> suffix_op.Slice
        """VarOf ':' ArithExpr (':' ArithExpr )?"""
        self._SetNext(lex_mode_e.Arith)
        self._GetToken()
        cur_id = self.token_type  # e.g. Id.Arith_Colon

        if self.token_type == Id.Arith_Colon:  # A pun for Id.VOp2_Colon
            # no beginning specified
            begin = None  # type: Optional[arith_expr_t]
        else:
            begin = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()

        if cur_id == Id.Arith_RBrace:
            no_length = None  # type: Optional[arith_expr_t]  # No length specified
            return suffix_op.Slice(begin, no_length)

        # Id.Arith_Colon is a pun for Id.VOp2_Colon
        if cur_id == Id.Arith_Colon:
            self._SetNext(lex_mode_e.Arith)
            length = self._ReadArithExpr(Id.Arith_RBrace)
            return suffix_op.Slice(begin, length)

        p_die("Expected : or } in slice", self.cur_token)
        raise AssertionError()  # for MyPy

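    # Illustrative examples (not from the original file) of what
    # _ReadSliceVarOp produces:
    #
    #     ${s:1:2}  -> suffix_op.Slice(begin=1,    length=2)
    #     ${s:1}    -> suffix_op.Slice(begin=1,    length=None)
    #     ${s::2}   -> suffix_op.Slice(begin=None, length=2)
    #
    # The begin/length fields hold arith_expr_t nodes, so ${s:x+1:y*2} works
    # the same way.
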
    def _ReadPatSubVarOp(self):
        # type: () -> suffix_op.PatSub
        """Looking at the first '/' after VarOf:

        VarSub = ...
               | VarOf '/' Match ( '/' WORD? )?
        Match  = '/' WORD   # can't be empty
               | '#' WORD?  # may be empty
               | '%' WORD?
        """
        slash_tok = self.cur_token  # location info
        replace_mode = Id.Undefined_Tok  # bizarre syntax / # %

        self._SetNext(lex_mode_e.VSub_ArgUnquoted)  # advance past /

        self._GetToken()
        if self.token_type == Id.Right_DollarBrace:
            pat = CompoundWord([])
            return suffix_op.PatSub(pat, rhs_word.Empty, replace_mode,
                                    slash_tok)

        if self.token_type in (Id.Lit_Slash, Id.Lit_Pound, Id.Lit_Percent):
            replace_mode = self.token_type
            self._SetNext(lex_mode_e.VSub_ArgUnquoted)

        # Bash quirk:
        # echo ${x/#/replace} has an empty pattern
        # echo ${x////replace} is non-empty; it means echo ${x//'/'/replace}
        empty_ok = replace_mode != Id.Lit_Slash
        pat = self._ReadVarOpArg2(lex_mode_e.VSub_ArgUnquoted, Id.Lit_Slash,
                                  empty_ok)
        #log('pat 1 %r', pat)

        if self.token_type == Id.Lit_Slash:
            # read until }
            replace = self._ReadVarOpArg(
                lex_mode_e.VSub_ArgUnquoted)  # type: rhs_word_t
            #log('r 1 %r', replace)
        else:
            # e.g. ${v/a} is the same as ${v/a/} -- empty replacement string
            replace = rhs_word.Empty

        self._GetToken()
        if self.token_type != Id.Right_DollarBrace:
            # This happens on invalid code
            p_die(
                "Expected } after replacement string, got %s" %
                ui.PrettyId(self.token_type), self.cur_token)

        return suffix_op.PatSub(pat, replace, replace_mode, slash_tok)

    def _ReadSubscript(self):
        # type: () -> bracket_op_t
        """Subscript = '[' ('@' | '*' | ArithExpr) ']'"""
        # Lookahead to see if we get @ or *.  Otherwise read a full arithmetic
        # expression.
        next_id = self.lexer.LookPastSpace(lex_mode_e.Arith)
        if next_id in (Id.Lit_At, Id.Arith_Star):
            op = bracket_op.WholeArray(next_id)  # type: bracket_op_t

            self._SetNext(lex_mode_e.Arith)  # skip past [
            self._GetToken()
            self._SetNext(lex_mode_e.Arith)  # skip past @
            self._GetToken()
        else:
            self._SetNext(lex_mode_e.Arith)  # skip past [
            anode = self._ReadArithExpr(Id.Arith_RBracket)
            op = bracket_op.ArrayIndex(anode)

        if self.token_type != Id.Arith_RBracket:  # Should be looking at ]
            p_die('Expected ] to close subscript', self.cur_token)

        self._SetNext(lex_mode_e.VSub_2)  # skip past ]
        self._GetToken()  # Needed to be in the same spot as no subscript

        return op

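    # Illustrative examples (not from the original file):
    #
    #     ${a[i+1]}  -> bracket_op.ArrayIndex(arith node for i+1)
    #     ${a[@]}    -> bracket_op.WholeArray(Id.Lit_At)
    #     ${a[*]}    -> bracket_op.WholeArray(Id.Arith_Star)
    #
    # Note that @ and * take the LookPastSpace fast path above, while any
    # other subscript goes through the full arithmetic parser.
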
    def _ParseVarOf(self):
        # type: () -> BracedVarSub
        """
        VarOf = NAME Subscript?
              | NUMBER      # no subscript allowed, none of these are arrays
                            # ${@[1]} doesn't work, even though slicing does
              | VarSymbol
        """
        self._GetToken()
        name_token = self.cur_token
        self._SetNext(lex_mode_e.VSub_2)

        self._GetToken()  # Check for []
        if self.token_type == Id.VOp2_LBracket:
            bracket_op = self._ReadSubscript()
        else:
            bracket_op = None

        part = BracedVarSub.CreateNull()
        part.token = name_token
        part.var_name = lexer.TokenVal(name_token)
        part.bracket_op = bracket_op
        return part

    def _ParseVarExpr(self, arg_lex_mode, allow_query=False):
        # type: (lex_mode_t, bool) -> BracedVarSub
        """Start parsing at the op -- we already skipped past the name."""
        part = self._ParseVarOf()

        self._GetToken()
        if self.token_type == Id.Right_DollarBrace:
            return part  # no ops

        op_kind = self.token_kind

        if op_kind == Kind.VTest:
            tok = self.cur_token
            arg_word = self._ReadVarOpArg(arg_lex_mode)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            part.suffix_op = suffix_op.Unary(tok, arg_word)

        elif op_kind == Kind.VOpOil:
            tok = self.cur_token
            arg_word = self._ReadVarOpArg(arg_lex_mode)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            UP_arg_word = arg_word
            with tagswitch(arg_word) as case:
                if case(rhs_word_e.Empty):
                    pass
                elif case(rhs_word_e.Compound):
                    arg_word = cast(CompoundWord, UP_arg_word)
                    # This handles ${x|html} and ${x %.3f} now
                    # However I think ${x %.3f} should be statically parsed?
                    # It can enter the printf lexer modes.
                    ok, arg, quoted = word_.StaticEval(arg_word)
                    if not ok or quoted:
                        p_die('Expected a constant argument',
                              loc.Word(arg_word))

            part.suffix_op = suffix_op.Static(tok, arg)

        elif op_kind == Kind.VOp0:
            part.suffix_op = self.cur_token  # Nullary
            self._SetNext(lex_mode_e.VSub_2)  # Expecting }
            self._GetToken()

        elif op_kind == Kind.VOp1:  # % %% # ## etc.
            tok = self.cur_token
            # Weird exception that all shells have: these operators take a
            # glob pattern, so they're lexed as VSub_ArgUnquoted, not
            # VSub_ArgDQ
            arg_word = self._ReadVarOpArg(lex_mode_e.VSub_ArgUnquoted)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            part.suffix_op = suffix_op.Unary(tok, arg_word)

        elif op_kind == Kind.VOp2:  # / : [ ]
            if self.token_type == Id.VOp2_Slash:
                patsub_op = self._ReadPatSubVarOp()  # type: suffix_op_t
                part.suffix_op = patsub_op

                # Checked by the method above
                assert self.token_type == Id.Right_DollarBrace, self.cur_token

            elif self.token_type == Id.VOp2_Colon:
                part.suffix_op = self._ReadSliceVarOp()
                # NOTE: } in arithmetic mode.
                if self.token_type != Id.Arith_RBrace:
                    # Token seems off; doesn't point to X in ${a:1:2 X
                    p_die('Expected } to close ${', self.cur_token)

            else:
                # TODO: Does this ever happen?
                p_die('Unexpected token in ${} (%s)' % 'VOp2', self.cur_token)

        elif op_kind == Kind.VOp3:  # ${prefix@} etc.
            if allow_query:
                part.suffix_op = self.cur_token  # Nullary
                self._SetNext(lex_mode_e.VSub_2)  # Expecting }
                self._GetToken()
            else:
                p_die("Unexpected token in ${} (%s)" % 'VOp3', self.cur_token)

        # NOTE: Arith_RBrace is for slicing, because it reads } in arithmetic
        # mode.  It's redundantly checked above.
        if self.token_type not in (Id.Right_DollarBrace, Id.Arith_RBrace):
            # ${a.} or ${!a.}
            p_die('Expected } to close ${', self.cur_token)

        # Now look for ops
        return part

    def ReadBracedVarSub(self, left_token):
        # type: (Token) -> Tuple[BracedVarSub, Token]
        """For YSH expressions like var x = ${x:-"default"}."""
        part = self._ReadBracedVarSub(left_token, d_quoted=False)
        last_token = self.cur_token
        return part, last_token

    def _ReadBracedVarSub(self, left_token, d_quoted):
        # type: (Token, bool) -> BracedVarSub
        """For the ${} expression language.

        NAME        = [a-zA-Z_][a-zA-Z0-9_]*
        NUMBER      = [0-9]+                    # ${10}, ${11}, ...

        Subscript   = '[' ('@' | '*' | ArithExpr) ']'
        VarSymbol   = '!' | '@' | '#' | ...
        VarOf       = NAME Subscript?
                    | NUMBER   # no subscript allowed, none of these are
                               # arrays; ${@[1]} doesn't work, even though
                               # slicing does
                    | VarSymbol

        NULLARY_OP  = '@Q' | '@E' | '@P' | '@A' | '@a'  # VOp0

        TEST_OP     = '-' | ':-' | '=' | ':=' | '+' | ':+' | '?' | ':?'
        STRIP_OP    = '#' | '##' | '%' | '%%'
        CASE_OP     = ',' | ',,' | '^' | '^^'
        UnaryOp     = TEST_OP | STRIP_OP | CASE_OP

        YSH_UNARY   = '|' | ' '               # ${x|html} and ${x %.3f};
                                              # SPACE is the operator, not %
        Match       = ('/' | '#' | '%') WORD  # match all / prefix / suffix
        VarExpr     = VarOf
                    | VarOf NULLARY_OP
                    | VarOf UnaryOp WORD
                    | VarOf YSH_UNARY STATIC_WORD
                    | VarOf ':' ArithExpr (':' ArithExpr )?
                    | VarOf '/' Match '/' WORD

        LengthExpr  = '#' VarOf    # can't apply operators after length

        RefOrKeys   = '!' VarExpr  # CAN apply operators after a named ref
                                   # ${!ref[0]} vs ${!keys[@]} resolved later

        PrefixQuery = '!' NAME ('*' | '@')  # list variable names with a
                                            # prefix

        BuiltinSub  = '.' WORD+    # ${.myproc 'builtin' $sub}

        VarSub      = LengthExpr
                    | RefOrKeys
                    | PrefixQuery
                    | VarExpr
                    | BuiltinSub

        NOTES:
        - Arithmetic expressions are used twice, inside subscripts ${a[x+1]}
          and slicing ${a:x+1:y+2}
        - ${#} and ${!} need LL(2) lookahead (considering how my tokenizer
          works)
        - @ and * are technically arithmetic expressions in this
          implementation
        - We don't account for bash 4.4: ${param@operator} -- Q E P A a.
          Note that it's also vectorized.

        Strictness over bash:
        - echo ${a[0][0]} doesn't do anything useful, so we disallow it from
          the grammar
        - ! and # prefixes can't be composed, even though named refs can be
          composed with other operators
        - '#' means 4 different things: length prefix, VarSymbol, UnaryOp to
          strip a prefix, and it can also be a literal part of WORD.

        From the parser's point of view, the prefix # can't be combined with
        UnaryOp/slicing/matching, and the ! can.  However

        - ${a[@]:1:2} is not allowed
        - ${#a[@]:1:2} is allowed, but gives the wrong answer
        """
        if d_quoted:
            arg_lex_mode = lex_mode_e.VSub_ArgDQ
        else:
            arg_lex_mode = lex_mode_e.VSub_ArgUnquoted

        self._SetNext(lex_mode_e.VSub_1)
        self._GetToken()

        ty = self.token_type
        first_tok = self.cur_token

        if ty == Id.VSub_Pound:
            # Disambiguate
            next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
            if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
                # e.g. a name, '#' is the prefix
                self._SetNext(lex_mode_e.VSub_1)
                part = self._ParseVarOf()

                self._GetToken()
                if self.token_type != Id.Right_DollarBrace:
                    p_die('Expected } after length expression',
                          self.cur_token)

                part.prefix_op = first_tok

            else:  # not a prefix, '#' is the variable
                part = self._ParseVarExpr(arg_lex_mode)

        elif ty == Id.VSub_Bang:
            next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
            if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
                # e.g. a name, '!' is the prefix
                # ${!a} -- this is a ref
                # ${!3} -- this is a ref
                # ${!a[1]} -- this is a ref
                # ${!a[@]} -- this is a keys
                # No lookahead -- do it in a second step, or at runtime
                self._SetNext(lex_mode_e.VSub_1)
                part = self._ParseVarExpr(arg_lex_mode, allow_query=True)

                part.prefix_op = first_tok

            else:  # not a prefix, '!' is the variable
                part = self._ParseVarExpr(arg_lex_mode)

        elif ty == Id.VSub_Dot:
            # Note: this will become a new builtin_sub type, so this method
            # must return word_part_t rather than BracedVarSub.  I don't
            # think that should cause problems.
            p_die('TODO: ${.myproc builtin sub}', self.cur_token)

        # VS_NAME, VS_NUMBER, symbol that isn't # or !
        elif self.token_kind == Kind.VSub:
            part = self._ParseVarExpr(arg_lex_mode)

        else:
            # e.g. ${^}
            p_die('Unexpected token in ${}', self.cur_token)

        part.left = left_token  # attach the argument
        part.right = self.cur_token
        return part

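    # Illustrative examples (not from the original file) of how the prefix
    # disambiguation above plays out:
    #
    #     ${#s}   -> VSub_Pound, then a name follows: '#' is a length
    #                prefix_op
    #     ${#}    -> VSub_Pound, but LookPastSpace sees }: '#' is the
    #                variable $#
    #     ${!ref} -> VSub_Bang as a nameref prefix_op
    #     ${!}    -> VSub_Bang as the variable $!
    #
    # This is the LL(2) lookahead mentioned in the grammar notes above.
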
    def _ReadSingleQuoted(self, left_token, lex_mode):
        # type: (Token, lex_mode_t) -> SingleQuoted
        """Internal method to read a word_part."""
        tokens = []  # type: List[Token]
        # In command mode, we never disallow backslashes like '\'
        self.ReadSingleQuoted(lex_mode, left_token, tokens, False)
        right_quote = self.cur_token
        node = SingleQuoted(left_token, tokens, right_quote)
        return node

    def ReadSingleQuoted(self, lex_mode, left_token, tokens, is_ysh_expr):
        # type: (lex_mode_t, Token, List[Token], bool) -> Token
        """Appends to tokens

        Used by expr_parse.py
        """

        # echo '\' is allowed, but x = '\' is invalid, in favor of x = r'\'
        no_backslashes = is_ysh_expr and left_token.id == Id.Left_SingleQuote

        expected_end_tokens = 3 if left_token.id in (
            Id.Left_TSingleQuote, Id.Left_RTSingleQuote,
            Id.Left_UTSingleQuote, Id.Left_BTSingleQuote) else 1
        num_end_tokens = 0

        while num_end_tokens < expected_end_tokens:
            self._SetNext(lex_mode)
            self._GetToken()

            # Kind.Char emitted in DOLLAR_SQ state
            if self.token_kind in (Kind.Lit, Kind.Char):
                tok = self.cur_token
                # Happens in lex_mode_e.SQ: 'one\two' is ambiguous, should be
                # r'one\two' or c'one\\two'
                if no_backslashes and '\\' in tok.tval:
                    p_die(
                        r"Strings with backslashes should look like r'\n' or u'\n' or b'\n'",
                        tok)

                if is_ysh_expr:
                    # Disallow var x = $'\001'.  Arguably we don't need these
                    # checks because u'\u{1}' is the way to write it.
                    if self.token_type == Id.Char_Octal3:
                        p_die(
                            r"Use \xhh or \u{...} instead of octal escapes in YSH strings",
                            tok)

                    if self.token_type == Id.Char_Hex and self.cur_token.length != 4:
                        # disallow \xH
                        p_die(
                            r'Invalid hex escape in YSH string (must be \xHH)',
                            tok)

                tokens.append(tok)

            elif self.token_kind == Kind.Unknown:
                tok = self.cur_token
                assert tok.id == Id.Unknown_Backslash, tok

                # x = $'\z' is disallowed; ditto for echo $'\z' if
                # shopt -u parse_backslash
                if is_ysh_expr or not self.parse_opts.parse_backslash():
                    p_die(
                        "Invalid char escape in C-style string literal (OILS-ERR-11)",
                        tok)

                tokens.append(tok)

            elif self.token_kind == Kind.Eof:
                p_die('Unexpected EOF in single-quoted string that began here',
                      left_token)

            elif self.token_kind == Kind.Right:
                # assume Id.Right_SingleQuote
                num_end_tokens += 1
                tokens.append(self.cur_token)

            else:
                raise AssertionError(self.cur_token)

            if self.token_kind != Kind.Right:
                num_end_tokens = 0  # we need three in a ROW

        if expected_end_tokens == 1:
            tokens.pop()
        elif expected_end_tokens == 3:  # Get rid of spurious end tokens
            tokens.pop()
            tokens.pop()
            tokens.pop()

        # Remove space from ''' r''' $''' in both expression mode and command
        # mode
        if left_token.id in (Id.Left_TSingleQuote, Id.Left_RTSingleQuote,
                             Id.Left_UTSingleQuote, Id.Left_BTSingleQuote):
            word_compile.RemoveLeadingSpaceSQ(tokens)

        # Validation after lexing - same 2 checks in j8.LexerDecoder
        is_u_string = left_token.id in (Id.Left_USingleQuote,
                                        Id.Left_UTSingleQuote)

        for tok in tokens:
            # u'\yff' is not valid, but b'\yff' is
            if is_u_string and tok.id == Id.Char_YHex:
                p_die(
                    r"%s escapes not allowed in u'' strings" %
                    lexer.TokenVal(tok), tok)
            # \u{dc00} isn't valid
            if tok.id == Id.Char_UBraced:
                h = lexer.TokenSlice(tok, 3, -1)  # \u{123456}
                i = int(h, 16)
                if 0xD800 <= i and i < 0xE000:
                    p_die(
                        r"%s escape is illegal because it's in the surrogate range"
                        % lexer.TokenVal(tok), tok)

        return self.cur_token

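    # Illustrative trace (not from the original file) of the end-token
    # counting above.  After the opening ''' of '''a''', the loop sees:
    #
    #     token    num_end_tokens
    #     a        0
    #     '        1
    #     '        2
    #     '        3   -> loop exits; the 3 quote tokens are then popped
    #
    # A lone ' inside the string resets the count, which is why the closing
    # quotes must be CONSECUTIVE.
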
    def _ReadDoubleQuotedLeftParts(self):
        # type: () -> word_part_t
        """Read substitution parts in a double quoted context."""
        if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick):
            return self._ReadCommandSub(self.token_type, d_quoted=True)

        if self.token_type == Id.Left_DollarBrace:
            return self._ReadBracedVarSub(self.cur_token, d_quoted=True)

        if self.token_type == Id.Left_DollarDParen:
            return self._ReadArithSub()

        if self.token_type == Id.Left_DollarBracket:
            return self._ReadExprSub(lex_mode_e.DQ)

        raise AssertionError(self.cur_token)

    def _ReadYshSingleQuoted(self, left_id):
        # type: (Id_t) -> CompoundWord
        """Read YSH style strings

        r'' u'' b''
        r''' ''' u''' ''' b''' '''
        """
        #log('BEF self.cur_token %s', self.cur_token)
        if left_id == Id.Left_RSingleQuote:
            lexer_mode = lex_mode_e.SQ_Raw
            triple_left_id = Id.Left_RTSingleQuote
        elif left_id == Id.Left_USingleQuote:
            lexer_mode = lex_mode_e.J8_Str
            triple_left_id = Id.Left_UTSingleQuote
        elif left_id == Id.Left_BSingleQuote:
            lexer_mode = lex_mode_e.J8_Str
            triple_left_id = Id.Left_BTSingleQuote
        else:
            raise AssertionError(left_id)

        # Needed for syntax checks
        left_tok = self.cur_token
        left_tok.id = left_id

        sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)

        if (len(sq_part.tokens) == 0 and self.lexer.ByteLookAhead() == "'"):
            self._SetNext(lex_mode_e.ShCommand)
            self._GetToken()

            assert self.token_type == Id.Left_SingleQuote
            # HACK: magically transform the third ' in u''' to
            # Id.Left_UTSingleQuote, so that ''' is the terminator
            left_tok = self.cur_token
            left_tok.id = triple_left_id

            # Handles stripping leading whitespace
            sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)

        # Advance and validate
        self._SetNext(lex_mode_e.ShCommand)

        self._GetToken()
        if self.token_kind not in KINDS_THAT_END_WORDS:
            p_die('Unexpected token after YSH single-quoted string',
                  self.cur_token)

        return CompoundWord([sq_part])

    def _ReadUnquotedLeftParts(self, triple_out):
        # type: (Optional[BoolParamBox]) -> word_part_t
        """Read substitutions and quoted strings (for lex_mode_e.ShCommand).

        If triple_out is set, then we try parsing triple quoted strings,
        and set its value to True if we got one.
        """
        if self.token_type in (Id.Left_DoubleQuote, Id.Left_DollarDoubleQuote):
            # Note: $"" is a synonym for "".  It might make sense if it added
            # \n \0 \x00 \u{123} etc.  But that's not what bash does!
            dq_part = self._ReadDoubleQuoted(self.cur_token)
            # Got empty word "" and there's a " after
            if (triple_out and len(dq_part.parts) == 0 and
                    self.lexer.ByteLookAhead() == '"'):

                self._SetNext(lex_mode_e.ShCommand)
                self._GetToken()
                # HACK: magically transform the third " in """ to
                # Id.Left_TDoubleQuote, so that """ is the terminator
                left_dq_token = self.cur_token
                left_dq_token.id = Id.Left_TDoubleQuote
                triple_out.b = True  # let caller know we got it
                return self._ReadDoubleQuoted(left_dq_token)

            return dq_part

        if self.token_type in (Id.Left_SingleQuote, Id.Left_RSingleQuote,
                               Id.Left_DollarSingleQuote):
            if self.token_type == Id.Left_SingleQuote:
                lexer_mode = lex_mode_e.SQ_Raw
                triple_left_id = Id.Left_TSingleQuote
            elif self.token_type == Id.Left_RSingleQuote:
                lexer_mode = lex_mode_e.SQ_Raw
                triple_left_id = Id.Left_RTSingleQuote
            else:
                lexer_mode = lex_mode_e.SQ_C
                # there is no such thing as $'''
                triple_left_id = Id.Undefined_Tok

            sq_part = self._ReadSingleQuoted(self.cur_token, lexer_mode)

            # Got empty '' or r'' and there's a ' after
            # u'' and b'' are handled in _ReadYshSingleQuoted
            if (triple_left_id != Id.Undefined_Tok and
                    triple_out is not None and len(sq_part.tokens) == 0 and
                    self.lexer.ByteLookAhead() == "'"):

                self._SetNext(lex_mode_e.ShCommand)
                self._GetToken()

                # HACK: magically transform the third ' in ''' to
                # Id.Left_TSingleQuote, so that ''' is the terminator
                left_sq_token = self.cur_token
                left_sq_token.id = triple_left_id

                triple_out.b = True  # let caller know we got it
                return self._ReadSingleQuoted(left_sq_token, lexer_mode)

            return sq_part

        if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick,
                               Id.Left_ProcSubIn, Id.Left_ProcSubOut):
            return self._ReadCommandSub(self.token_type, d_quoted=False)

        if self.token_type == Id.Left_DollarBrace:
            return self._ReadBracedVarSub(self.cur_token, d_quoted=False)

        if self.token_type == Id.Left_DollarDParen:
            return self._ReadArithSub()

        if self.token_type == Id.Left_DollarBracket:
            return self._ReadExprSub(lex_mode_e.ShCommand)

        raise AssertionError(self.cur_token)

    def _ReadExtGlob(self):
        # type: () -> word_part.ExtGlob
        """
        Grammar:
          Item    = CompoundWord | EPSILON  # important: @(foo|) is allowed
          LEFT    = '@(' | '*(' | '+(' | '?(' | '!('
          RIGHT   = ')'
          ExtGlob = LEFT (Item '|')* Item RIGHT  # ITEM may be empty
          Compound includes ExtGlob
        """
        left_token = self.cur_token
        right_token = None  # type: Token
        arms = []  # type: List[CompoundWord]

        self.lexer.PushHint(Id.Op_RParen, Id.Right_ExtGlob)
        self._SetNext(lex_mode_e.ExtGlob)  # advance past LEFT

        read_word = False  # did we just read a word?  To handle @(||).

        while True:
            self._GetToken()

            if self.token_type == Id.Right_ExtGlob:
                if not read_word:
                    arms.append(CompoundWord([]))
                right_token = self.cur_token
                break

            elif self.token_type == Id.Op_Pipe:
                if not read_word:
                    arms.append(CompoundWord([]))
                read_word = False
                self._SetNext(lex_mode_e.ExtGlob)

            # lex mode EXTGLOB should only produce these 4 kinds of tokens
            elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub,
                                     Kind.ExtGlob):
                w = self._ReadCompoundWord(lex_mode_e.ExtGlob)
                arms.append(w)
                read_word = True

            elif self.token_kind == Kind.Eof:
                p_die('Unexpected EOF reading extended glob that began here',
                      left_token)

            else:
                raise AssertionError(self.cur_token)

        return word_part.ExtGlob(left_token, arms, right_token)

    def _ReadLikeDQ(self, left_token, is_ysh_expr, out_parts):
        # type: (Optional[Token], bool, List[word_part_t]) -> None
        """
        Args:
          left_token: A token if we are reading a double quoted part, or None
            if we're reading a here doc.
          is_ysh_expr: Whether to disallow backticks and invalid char escapes
          out_parts: list of word_part to append to
        """
        if left_token:
            expected_end_tokens = 3 if left_token.id == Id.Left_TDoubleQuote else 1
        else:
            expected_end_tokens = 1000  # here doc will break

        num_end_tokens = 0
        while num_end_tokens < expected_end_tokens:
            self._SetNext(lex_mode_e.DQ)
            self._GetToken()

            if self.token_kind == Kind.Lit:
                if self.token_type == Id.Lit_EscapedChar:
                    tok = self.cur_token
                    ch = lexer.TokenSliceLeft(tok, 1)
                    part = word_part.EscapedLiteral(tok,
                                                    ch)  # type: word_part_t
                else:
                    if self.token_type == Id.Lit_BadBackslash:
                        # echo "\z" is OK in shell, but 'x = "\z" is a syntax
                        # error in YSH.
                        # Slight hole: We don't catch 'x = ${undef:-"\z"}
                        # because of the recursion (unless parse_backslash)
                        if (is_ysh_expr or
                                not self.parse_opts.parse_backslash()):
                            p_die(
                                "Invalid char escape in double quoted string (OILS-ERR-12)",
                                self.cur_token)
                    elif self.token_type == Id.Lit_Dollar:
                        if is_ysh_expr or not self.parse_opts.parse_dollar():
                            p_die("Literal $ should be quoted like \$",
                                  self.cur_token)

                    part = self.cur_token
                out_parts.append(part)

            elif self.token_kind == Kind.Left:
                if self.token_type == Id.Left_Backtick and is_ysh_expr:
                    p_die("Invalid backtick: use $(cmd) or \\` in YSH strings",
                          self.cur_token)

                part = self._ReadDoubleQuotedLeftParts()
                out_parts.append(part)

            elif self.token_kind == Kind.VSub:
                tok = self.cur_token
                part = NameTok(tok, lexer.TokenSliceLeft(tok, 1))
                out_parts.append(part)
                # NOTE: parsing "$f(x)" would BREAK CODE.  Could add a mode
                # for it later.

            elif self.token_kind == Kind.Right:
                assert self.token_type == Id.Right_DoubleQuote, self.token_type
                if left_token:
                    num_end_tokens += 1

                # In a here doc, the right quote is literal!
                out_parts.append(self.cur_token)

            elif self.token_kind == Kind.Eof:
                if left_token:
                    p_die(
                        'Unexpected EOF reading double-quoted string that began here',
                        left_token)
                else:  # here docs will have an EOF in their token stream
                    break

            else:
                raise AssertionError(self.cur_token)

            if self.token_kind != Kind.Right:
                num_end_tokens = 0  # """ must be CONSECUTIVE

        if expected_end_tokens == 1:
            out_parts.pop()
        elif expected_end_tokens == 3:
            out_parts.pop()
            out_parts.pop()
            out_parts.pop()

        # Remove space from """ in both expression mode and command mode
        if left_token and left_token.id == Id.Left_TDoubleQuote:
            word_compile.RemoveLeadingSpaceDQ(out_parts)

        # Return nothing, since we appended to 'out_parts'

    def _ReadDoubleQuoted(self, left_token):
        # type: (Token) -> DoubleQuoted
        """Helper function for "hello $name".

        Also ${foo%%a b c}  # treat this as double quoted until you hit }
        """
        parts = []  # type: List[word_part_t]
        self._ReadLikeDQ(left_token, False, parts)

        right_quote = self.cur_token
        return DoubleQuoted(left_token, parts, right_quote)

    def ReadDoubleQuoted(self, left_token, parts):
        # type: (Token, List[word_part_t]) -> Token
        """For expression mode.

        Read var x = "${dir:-}/$name"; etc.
        """
        self._ReadLikeDQ(left_token, True, parts)
        return self.cur_token

    def _ReadCommandSub(self, left_id, d_quoted=False):
        # type: (Id_t, bool) -> CommandSub
        """
        NOTE: This is not in the grammar, because word parts aren't in the
        grammar!

        command_sub = '$(' command_list ')'
                    | '@(' command_list ')'
                    | '<(' command_list ')'
                    | '>(' command_list ')'
                    | ` command_list `
        """
        left_token = self.cur_token

        # Set the lexer in a state so ) becomes the EOF token.
        if left_id in (Id.Left_DollarParen, Id.Left_AtParen,
                       Id.Left_ProcSubIn, Id.Left_ProcSubOut):
            self._SetNext(lex_mode_e.ShCommand)  # advance past $( etc.

            right_id = Id.Eof_RParen
            self.lexer.PushHint(Id.Op_RParen, right_id)
            c_parser = self.parse_ctx.MakeParserForCommandSub(
                self.line_reader, self.lexer, right_id)
            # NOTE: This doesn't use something like main_loop because we
            # don't want to interleave parsing and execution!  Unlike
            # 'source' and 'eval'.
            node = c_parser.ParseCommandSub()

            right_token = c_parser.w_parser.cur_token

        elif left_id == Id.Left_Backtick and self.parse_ctx.do_lossless:
            # NOTE: This is an APPROXIMATE solution for translation ONLY.
            # See test/osh2oil.

            right_id = Id.Eof_Backtick
            self.lexer.PushHint(Id.Left_Backtick, right_id)
            c_parser = self.parse_ctx.MakeParserForCommandSub(
                self.line_reader, self.lexer, right_id)
            node = c_parser.ParseCommandSub()
            right_token = c_parser.w_parser.cur_token

        elif left_id == Id.Left_Backtick:
            if not self.parse_opts.parse_backticks():
                p_die('Use $(cmd) instead of backticks (parse_backticks)',
                      left_token)

            self._SetNext(lex_mode_e.Backtick)  # advance past `

            parts = []  # type: List[str]
            while True:
                self._GetToken()
                #log("TOK %s", self.cur_token)

                if self.token_type == Id.Backtick_Quoted:
                    # Remove leading \
                    parts.append(lexer.TokenSliceLeft(self.cur_token, 1))

                elif self.token_type == Id.Backtick_DoubleQuote:
                    # Compatibility: If backticks are double quoted, then
                    # double quotes within them have to be \"
                    # Shells aren't smart enough to match nested " and `
                    # quotes (but OSH is)
                    if d_quoted:
                        # Remove leading \
                        parts.append(lexer.TokenSliceLeft(self.cur_token, 1))
                    else:
                        parts.append(lexer.TokenVal(self.cur_token))

                elif self.token_type == Id.Backtick_Other:
                    parts.append(lexer.TokenVal(self.cur_token))

                elif self.token_type == Id.Backtick_Right:
                    break

                elif self.token_type == Id.Eof_Real:
                    # Note: this parse error is in the ORIGINAL context.  No
                    # code_str yet.
                    p_die('Unexpected EOF while looking for closing backtick',
                          left_token)

                else:
                    raise AssertionError(self.cur_token)

                self._SetNext(lex_mode_e.Backtick)

            # Calculate right SPID on CommandSub BEFORE re-parsing.
            right_token = self.cur_token

            code_str = ''.join(parts)
            #log('code %r', code_str)

            # NOTE: This is similar to how we parse aliases in
            # osh/cmd_parse.py.  It won't have the same location info as
            # MakeParserForCommandSub(), because the lexer is different.
            arena = self.parse_ctx.arena
            #arena = alloc.Arena()
            line_reader = reader.StringLineReader(code_str, arena)
            c_parser = self.parse_ctx.MakeOshParser(line_reader)
            src = source.Reparsed('backticks', left_token, right_token)
            with alloc.ctx_SourceCode(arena, src):
                node = c_parser.ParseCommandSub()

        else:
            raise AssertionError(left_id)

        return CommandSub(left_token, node, right_token)

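    # Illustrative example (not from the original file) of the backtick
    # compatibility path above.  Given:
    #
    #     echo `echo \`hostname\``
    #
    # the Backtick_Quoted tokens have their leading \ stripped, so code_str
    # becomes 'echo `hostname`', which is then re-parsed with a fresh
    # StringLineReader.  That's why the error locations are only approximate
    # (source.Reparsed), unlike the $(...) path, which reuses the real lexer.
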
    def _ReadExprSub(self, lex_mode):
        # type: (lex_mode_t) -> word_part.ExprSub
        """$[d->key] $[obj.method()] etc."""
        left_token = self.cur_token

        self._SetNext(lex_mode_e.Expr)
        enode, right_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.ysh_expr_sub)

        self._SetNext(lex_mode)  # Move past ]
        return word_part.ExprSub(left_token, enode, right_token)

    def ParseVarDecl(self, kw_token):
        # type: (Token) -> command.VarDecl
        """
        oil_var_decl: name_type_list '=' testlist end_stmt

        Note that assignments must end with \n ; } or EOF.  Unlike shell
        assignments, we disallow:

        var x = 42 | wc -l
        var x = 42 && echo hi
        """
        self._SetNext(lex_mode_e.Expr)
        enode, last_token = self.parse_ctx.ParseVarDecl(kw_token, self.lexer)
        # Hack to move } from what the Expr lexer mode gives to what
        # CommandParser wants
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace

        # Let the CommandParser see the Op_Semi or Op_Newline.
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)  # always back to this
        return enode

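    # Illustrative sketch (not from the original file): after a YSH
    # sub-parser runs, the word parser hands the final token back to
    # CommandParser through self.buffered_word.  E.g. for
    #
    #     var x = 42; echo hi
    #
    # the expression parser consumes through ';' (Op_Semi), which is buffered
    # here so the command parser still sees the statement terminator.
    # Retagging Op_RBrace as Lit_RBrace plays the same role when the
    # declaration ends a brace group.
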
    def ParseMutation(self, kw_token, var_checker):
        # type: (Token, VarChecker) -> command.Mutation
        """
        setvar i = 42
        setvar i += 1
        setvar a[i] = 42
        setvar a[i] += 1
        setvar d.key = 42
        setvar d.key += 1
        """
        self._SetNext(lex_mode_e.Expr)
        enode, last_token = self.parse_ctx.ParseMutation(kw_token, self.lexer)
        # Hack to move } from what the Expr lexer mode gives to what
        # CommandParser wants
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace

        for lhs in enode.lhs:
            UP_lhs = lhs
            with tagswitch(lhs) as case:
                if case(y_lhs_e.Var):
                    lhs = cast(NameTok, UP_lhs)
                    var_checker.Check(kw_token.id, lhs.var_name, lhs.left)

                # Note: this does not cover cases like
                #     setvar (a[0])[1] = v
                #     setvar (d.key).other = v
                # This leaks into catching all typos statically, which may be
                # possible if 'use' makes all names explicit.
                elif case(y_lhs_e.Subscript):
                    lhs = cast(Subscript, UP_lhs)
                    if lhs.obj.tag() == expr_e.Var:
                        v = cast(expr.Var, lhs.obj)
                        var_checker.Check(kw_token.id, v.name, v.left)

                elif case(y_lhs_e.Attribute):
                    lhs = cast(Attribute, UP_lhs)
                    if lhs.obj.tag() == expr_e.Var:
                        v = cast(expr.Var, lhs.obj)
                        var_checker.Check(kw_token.id, v.name, v.left)

        # Let the CommandParser see the Op_Semi or Op_Newline.
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)  # always back to this
        return enode

    def ParseBareDecl(self):
        # type: () -> expr_t
        """
        x = {name: val}
        """
        self._SetNext(lex_mode_e.Expr)
        self._GetToken()
        enode, last_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.command_expr)
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)
        return enode

    def ParseYshExprForCommand(self):
        # type: () -> expr_t

        # Fudge for this case
        #     for x in(y) {
        # versus
        #     for x in (y) {
        #
        # In the former case, ReadWord on 'in' puts the lexer past (.
        # Also see LookPastSpace in CommandParser.
        # A simpler solution would be nicer.

        if self.token_type == Id.Op_LParen:
            self.lexer.MaybeUnreadOne()

        enode, _ = self.parse_ctx.ParseYshExpr(self.lexer,
                                               grammar_nt.ysh_expr)

        self._SetNext(lex_mode_e.ShCommand)
        return enode

    def ParseCommandExpr(self):
        # type: () -> expr_t
        """
        = 1+2
        """
        enode, last_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.command_expr)

        # In some cases, such as the case statement, we expect *the lexer* to
        # be pointing at the token right after the expression.  But the
        # expression parser must have read to the `last_token`.  Unreading
        # places the lexer back in the expected state.  I.e.:
        #
        #     case (x) {                      case (x) {
        #       (else) { = x }                  (else) { = x }
        #              ^ The lexer is here            ^ Unread to here
        #     }                               }
        assert last_token.id in (Id.Op_Newline, Id.Eof_Real, Id.Op_Semi,
                                 Id.Op_RBrace), last_token
        if last_token.id != Id.Eof_Real:
            # Eof_Real is the only token we cannot unread
            self.lexer.MaybeUnreadOne()

        return enode

    def ParseProc(self, node):
        # type: (Proc) -> None

        # proc name-with-hyphens() must be accepted
        self._SetNext(lex_mode_e.ShCommand)
        self._GetToken()
        # example: 'proc f[' gets you Lit_ArrayLhsOpen
        if self.token_type != Id.Lit_Chars:
            p_die('Invalid proc name %s' % ui.PrettyToken(self.cur_token),
                  self.cur_token)

        # TODO: validate this more.  Disallow proc 123 { }, which isn't
        # disallowed for shell functions.  Similar to IsValidVarName().
        node.name = self.cur_token

        last_token = self.parse_ctx.ParseProc(self.lexer, node)

        # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
        assert last_token.id == Id.Op_LBrace
        last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        self._SetNext(lex_mode_e.ShCommand)

    def ParseFunc(self, node):
        # type: (Func) -> None
        last_token = self.parse_ctx.ParseFunc(self.lexer, node)

        # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
        assert last_token.id == Id.Op_LBrace
        last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        self._SetNext(lex_mode_e.ShCommand)

    def ParseYshCasePattern(self):
        # type: () -> Tuple[pat_t, Token]
        pat, left_tok, last_token = self.parse_ctx.ParseYshCasePattern(
            self.lexer)

        if last_token.id == Id.Op_LBrace:
            last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        return pat, left_tok

    def NewlineOkForYshCase(self):
        # type: () -> Id_t
        """Check for optional newline and consume it.

        This is a special case of `_NewlineOk` which fixes some "off-by-one"
        issues that crop up while parsing YSH case arms.  For more details,
        see #oil-dev > Progress On YSH Case Grammar on Zulip.

        Returns a token id, which distinguishes between:

            word   { echo word }
            (3)    { echo expr }
            /e/    { echo eggex }
            }      # right brace
        """
        while True:
            next_id = self.lexer.LookAheadOne(lex_mode_e.Expr)

            # Cannot lookahead past lines
            if next_id == Id.Unknown_Tok:
                self.lexer.MoveToNextLine()
                continue

            next_kind = consts.GetKind(next_id)
            if next_id != Id.Op_Newline and next_kind != Kind.Ignored:
                break

            self.lexer.Read(lex_mode_e.Expr)

        if next_id in (Id.Op_RBrace, Id.Op_LParen, Id.Arith_Slash):
            self._SetNext(lex_mode_e.Expr)  # Continue in expression mode
        else:
            # Consume the trailing Op_Newline
            self._SetNext(lex_mode_e.ShCommand)
            self._GetToken()

        return next_id

    def _ReadArithExpr(self, end_id):
        # type: (Id_t) -> arith_expr_t
        """Read and parse an arithmetic expression in various contexts.

        $(( 1+2 ))
        (( a=1+2 ))
        ${a[ 1+2 ]}
        ${a : 1+2 : 1+2}

        See tests/arith-context.test.sh for ambiguous cases.

        ${a[a[0]]} is valid  # VS_RBRACKET vs Id.Arith_RBracket

        ${s : a<b?0:1 : 1}   # VS_COLON vs Id.Arith_Colon

        See the assertion in ArithParser.Parse() -- unexpected extra input.
        """
        # calls self.ReadWord(lex_mode_e.Arith)
        anode = self.a_parser.Parse()
        cur_id = self.a_parser.CurrentId()
        if end_id != Id.Undefined_Tok and cur_id != end_id:
            p_die(
                'Unexpected token after arithmetic expression (%s != %s)' %
                (ui.PrettyId(cur_id), ui.PrettyId(end_id)),
                loc.Word(self.a_parser.cur_word))
        return anode

    def _ReadArithSub(self):
        # type: () -> word_part.ArithSub
        """Read an arith substitution, which contains an arith expression,
        e.g. $((a + 1)).
        """
        left_tok = self.cur_token

        # The second ) needs to be disambiguated in stuff like:
        #     $(echo $(( 1+2 )) )
        self.lexer.PushHint(Id.Op_RParen, Id.Right_DollarDParen)

        # NOTE: To disambiguate $(( as arith sub vs. command sub and
        # subshell, we could save the lexer/reader state here, and retry if
        # the arithmetic parse fails.  But we can almost always catch this at
        # parse time.  There could be some exceptions like:
        #     $((echo * foo))  # looks like multiplication
        #     $((echo / foo))  # looks like division

        self._SetNext(lex_mode_e.Arith)
        anode = self._ReadArithExpr(Id.Arith_RParen)

        # TODO: This could be DQ or Arith too
        self._SetNext(lex_mode_e.ShCommand)

        # PROBLEM: $(echo $(( 1 + 2 )) )
        # Two right parens break the Id.Eof_RParen scheme
        self._GetToken()
        if self.token_type != Id.Right_DollarDParen:
            p_die('Expected second ) to end arith sub', self.cur_token)

        right_tok = self.cur_token
        return word_part.ArithSub(left_tok, anode, right_tok)

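    # Illustrative example (not from the original file) of the PushHint
    # above, for the input:
    #
    #     $(echo $(( 1 + 2 )) )
    #
    # The arithmetic parser consumes the first ')' as Id.Arith_RParen; the
    # pushed hint then makes the second ')' lex as Id.Right_DollarDParen
    # instead of Id.Op_RParen, so it isn't mistaken for the Id.Eof_RParen
    # that ends the enclosing command sub.
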
    def ReadDParen(self):
        # type: () -> Tuple[arith_expr_t, Token]
        """Read ((1+ 2)) -- command context.

        We're using the word parser because it's very similar to
        _ReadArithExpr above.

        This also returns the terminating `Op_DRightParen` token for use as
        location tracking.
        """
        # The second ) needs to be disambiguated, as in _ReadArithSub above.
        # TODO: Be consistent with ReadForExpression below and use
        # lex_mode_e.Arith?  Then you can get rid of this.
        self.lexer.PushHint(Id.Op_RParen, Id.Op_DRightParen)

        self._SetNext(lex_mode_e.Arith)
        anode = self._ReadArithExpr(Id.Arith_RParen)

        self._SetNext(lex_mode_e.ShCommand)

        # PROBLEM: $(echo $(( 1 + 2 )) )
        self._GetToken()
        right = self.cur_token
        if self.token_type != Id.Op_DRightParen:
            p_die('Expected second ) to end arith statement', self.cur_token)

        self._SetNext(lex_mode_e.ShCommand)

        return anode, right

    def _SetNextNonSpace(self):
        # type: () -> None
        """Same logic as _ReadWord, but for ReadForExpression."""
        while True:
            self._SetNext(lex_mode_e.Arith)
            self._GetToken()
            if self.token_kind not in (Kind.Ignored, Kind.WS):
                break

    def ReadForExpression(self):
        # type: () -> command.ForExpr
        """Read ((i=0; i<5; ++i)) -- part of command context."""
        self._SetNextNonSpace()  # skip over ((

        self._GetToken()
        cur_id = self.token_type  # for end of arith expressions

        if cur_id == Id.Arith_Semi:  # for (( ; i < 10; i++ ))
            init_node = None  # type: Optional[arith_expr_t]
        else:
            init_node = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()
        self._SetNextNonSpace()

        # It's odd to keep track of both cur_id and self.token_type in this
        # function, but it works, and is tested in 'test/parse_error.sh
        # arith-integration'
        if cur_id != Id.Arith_Semi:  # for (( x=0 b; ... ))
            p_die("Expected ; here", loc.Word(self.a_parser.cur_word))

        self._GetToken()
        cur_id = self.token_type

        if cur_id == Id.Arith_Semi:  # for (( ; ; i++ ))
            cond_node = None  # type: Optional[arith_expr_t]
        else:
            cond_node = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()
        self._SetNextNonSpace()

        if cur_id != Id.Arith_Semi:  # for (( x=0; x<5 b ))
            p_die("Expected ; here", loc.Word(self.a_parser.cur_word))

        self._GetToken()
        cur_id = self.token_type

        if cur_id == Id.Arith_RParen:  # for (( ; ; ))
            update_node = None  # type: Optional[arith_expr_t]
        else:
            update_node = self._ReadArithExpr(Id.Arith_RParen)
        self._SetNextNonSpace()

        self._GetToken()
        if self.token_type != Id.Arith_RParen:
            p_die('Expected ) to end for loop expression', self.cur_token)
        self._SetNext(lex_mode_e.ShCommand)

        # redirects is None, will be assigned in CommandEvaluator
        node = command.ForExpr.CreateNull()
        node.init = init_node
        node.cond = cond_node
        node.update = update_node
        return node

    def _ReadArrayLiteral(self):
        # type: () -> word_part_t
        """a=(1 2 3)

        TODO: See osh/cmd_parse.py:164 for Id.Lit_ArrayLhsOpen, for a[x++]=1

        We want:

        A=(['x']=1 ["x"]=2 [$x$y]=3)

        Maybe allow this as a literal string?  Because I think I've seen it
        before?  Or maybe force people to patch to learn the rule.

        A=([x]=4)

        Starts with Lit_Other '[', and then it has Lit_ArrayLhsClose
        Maybe enforce that ALL have keys or NONE of them have keys.
        """
        self._SetNext(lex_mode_e.ShCommand)  # advance past (
        self._GetToken()
        if self.cur_token.id != Id.Op_LParen:
            p_die('Expected ( after =', self.cur_token)
        left_token = self.cur_token
        right_token = None  # type: Token

        # MUST use a new word parser (with same lexer).
        w_parser = self.parse_ctx.MakeWordParser(self.lexer, self.line_reader)
        words = []  # type: List[CompoundWord]
        done = False
        while not done:
            w = w_parser.ReadWord(lex_mode_e.ShCommand)
            with tagswitch(w) as case:
                if case(word_e.Operator):
                    tok = cast(Token, w)
                    if tok.id == Id.Right_ShArrayLiteral:
                        right_token = tok
                        done = True  # can't use break here
                    # Unlike command parsing, array parsing allows embedded
                    # \n.
                    elif tok.id == Id.Op_Newline:
                        continue
                    else:
                        p_die('Unexpected token in array literal',
                              loc.Word(w))

                elif case(word_e.Compound):
                    words.append(cast(CompoundWord, w))

                else:
                    raise AssertionError()

        if len(words) == 0:  # a=() is empty indexed array
            # Needed for type safety, doh
            no_words = []  # type: List[word_t]
            node = ShArrayLiteral(left_token, no_words, right_token)
            return node

        pairs = []  # type: List[AssocPair]
        # If the first one is a key/value pair, then the rest are assumed to
        # be.
        pair = word_.DetectAssocPair(words[0])
        if pair:
            pairs.append(pair)

            n = len(words)
            for i in xrange(1, n):
                w2 = words[i]
                pair = word_.DetectAssocPair(w2)
                if not pair:
                    p_die("Expected associative array pair", loc.Word(w2))

                pairs.append(pair)

            # invariant List?
            return word_part.BashAssocLiteral(left_token, pairs, right_token)

        # Brace detection for arrays but NOT associative arrays
        words2 = braces.BraceDetectAll(words)
        words3 = word_.TildeDetectAll(words2)
        return ShArrayLiteral(left_token, words3, right_token)

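    # Illustrative examples (not from the original file):
    #
    #     a=(1 2 3)        -> ShArrayLiteral, after brace/tilde detection
    #     a=()             -> ShArrayLiteral with no words
    #     A=([k]=v [j]=w)  -> word_part.BashAssocLiteral with 2 AssocPairs
    #
    # Detection is driven entirely by the FIRST word: if it parses as a
    # key/value pair, every remaining word must too, or it's a parse error.
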
1594 def ParseProcCallArgs(self, start_symbol):
1595 # type: (int) -> ArgList
1596 """ json write (x) """
1597 self.lexer.MaybeUnreadOne()
1598
1599 arg_list = ArgList.CreateNull(alloc_lists=True)
1600 arg_list.left = self.cur_token
1601 self.parse_ctx.ParseYshArgList(self.lexer, arg_list, start_symbol)
1602 return arg_list
1603
1604 def _MaybeReadWordPart(self, is_first, lex_mode, parts):
1605 # type: (bool, lex_mode_t, List[word_part_t]) -> bool
1606 """Helper for _ReadCompoundWord3."""
1607 done = False
1608
1609 if self.token_type == Id.Lit_EscapedChar:
1610 tok = self.cur_token
1611 assert tok.length == 2
1612 ch = lexer.TokenSliceLeft(tok, 1)
1613 if not self.parse_opts.parse_backslash():
1614 if not pyutil.IsValidCharEscape(ch):
1615 p_die('Invalid char escape in unquoted word (OILS-ERR-13)',
1616 self.cur_token)
1617
1618 part = word_part.EscapedLiteral(self.cur_token,
1619 ch) # type: word_part_t
1620 else:
1621 part = self.cur_token
1622
1623 if is_first and self.token_type == Id.Lit_VarLike: # foo=
1624 parts.append(part)
1625 # Unfortunately it's awkward to pull the check for a=(1 2) up to
1626 # _ReadWord.
1627 next_id = self.lexer.LookPastSpace(lex_mode)
1628 if next_id == Id.Op_LParen:
1629 self.lexer.PushHint(Id.Op_RParen, Id.Right_ShArrayLiteral)
1630 part2 = self._ReadArrayLiteral()
1631 parts.append(part2)
1632
1633 # Array literal must be the last part of the word.
1634 self._SetNext(lex_mode)
1635 self._GetToken()
1636 # EOF, whitespace, newline, Right_Subshell
1637 if self.token_kind not in KINDS_THAT_END_WORDS:
1638 p_die('Unexpected token after array literal',
1639 self.cur_token)
1640 done = True
1641
1642 elif (is_first and self.parse_opts.parse_at() and
1643 self.token_type == Id.Lit_Splice):
1644
1645 splice_tok = self.cur_token
1646 part2 = word_part.Splice(splice_tok,
1647 lexer.TokenSliceLeft(splice_tok, 1))
1648
1649 parts.append(part2)
1650
1651 # @words must be the last part of the word
1652 self._SetNext(lex_mode)
1653 self._GetToken()
1654 # EOF, whitespace, newline, Right_Subshell
1655 if self.token_kind not in KINDS_THAT_END_WORDS:
1656 p_die('Unexpected token after array splice', self.cur_token)
1657 done = True
1658
1659 elif (is_first and self.parse_opts.parse_at() and
1660 self.token_type == Id.Lit_AtLBracket): # @[split(x)]
1661 part2 = self._ReadExprSub(lex_mode_e.DQ)
1662 parts.append(part2)
1663
1664 # @[split(x)]
1665 self._SetNext(lex_mode)
1666 self._GetToken()
1667 # EOF, whitespace, newline, Right_Subshell
1668 if self.token_kind not in KINDS_THAT_END_WORDS:
1669 p_die('Unexpected token after Expr splice', self.cur_token)
1670 done = True
1671
1672 elif (is_first and self.parse_opts.parse_at() and
1673 self.token_type == Id.Lit_AtLBraceDot):
1674 p_die('TODO: @{.myproc builtin sub}', self.cur_token)
1675
1676 elif (is_first and self.parse_opts.parse_at_all() and
1677 self.token_type == Id.Lit_At):
1678 # Because $[x] ${x} and perhaps $/x/ are reserved, it makes sense for @
1679 # at the beginning of a word to be reserved.
1680
1681 # Although should we relax 'echo @' ? I'm tempted to have a shortcut for
1682 # @_argv and
1683 p_die('Literal @ starting a word must be quoted (parse_at_all)',
1684 self.cur_token)
1685
1686 else:
1687 # not a literal with lookahead; append it
1688 parts.append(part)
1689
1690 return done
1691
1692 def _ReadCompoundWord(self, lex_mode):
1693 # type: (lex_mode_t) -> CompoundWord
1694 return self._ReadCompoundWord3(lex_mode, Id.Undefined_Tok, True)
1695
1696 def _ReadCompoundWord3(self, lex_mode, eof_type, empty_ok):
1697 # type: (lex_mode_t, Id_t, bool) -> CompoundWord
1698 """
1699 Precondition: Looking at the first token of the first word part
1700 Postcondition: Looking at the token after, e.g. space or operator
1701
1702 NOTE: eof_type is necessary because / is a literal, i.e. Lit_Slash, but it
1703 could be an operator delimiting a compound word. Can we change lexer modes
1704 and remove this special case?
1705 """
1706 w = CompoundWord([])
1707 num_parts = 0
1708 brace_count = 0
1709 done = False
1710 is_triple_quoted = None # type: Optional[BoolParamBox]
1711
1712 while not done:
1713 self._GetToken()
1714
1715 allow_done = empty_ok or num_parts != 0
1716 if allow_done and self.token_type == eof_type:
1717 done = True # e.g. for ${foo//pat/replace}
1718
1719 # Keywords like "for" are treated like literals
1720 elif self.token_kind in (Kind.Lit, Kind.History, Kind.KW,
1721 Kind.ControlFlow, Kind.BoolUnary,
1722 Kind.BoolBinary):
1723
1724 # Syntax error for { and }
1725 if self.token_type == Id.Lit_LBrace:
1726 brace_count += 1
1727 elif self.token_type == Id.Lit_RBrace:
1728 brace_count -= 1
1729 elif self.token_type == Id.Lit_Dollar:
1730 if not self.parse_opts.parse_dollar():
1731 if num_parts == 0 and lex_mode == lex_mode_e.ShCommand:
1732 next_byte = self.lexer.ByteLookAhead()
1733 # TODO: switch lexer modes and parse $/d+/. But not ${a:-$/d+/}
1734 if next_byte == '/':
1735 #log('next_byte %r', next_byte)
1736 pass
1737
1738 p_die('Literal $ should be quoted like \$',
1739 self.cur_token)
1740
1741 done = self._MaybeReadWordPart(num_parts == 0, lex_mode,
1742 w.parts)
1743
1744 elif self.token_kind == Kind.VSub:
1745 vsub_token = self.cur_token
1746
1747 part = NameTok(vsub_token,
1748 lexer.TokenSliceLeft(vsub_token,
1749 1)) # type: word_part_t
1750 w.parts.append(part)
1751
1752 elif self.token_kind == Kind.ExtGlob:
1753 # If parse_at, we can take over @( to start @(seq 3)
1754                # Users can also use ,(*.py|*.sh)
1755 if (self.parse_opts.parse_at() and
1756 self.token_type == Id.ExtGlob_At and num_parts == 0):
1757 cs_part = self._ReadCommandSub(Id.Left_AtParen,
1758 d_quoted=False)
1759 # RARE mutation of tok.id!
1760 cs_part.left_token.id = Id.Left_AtParen
1761 part = cs_part # for type safety
1762
1763 # Same check as _MaybeReadWordPart. @(seq 3)x is illegal, just like
1764 # a=(one two)x and @arrayfunc(3)x.
1765 self._GetToken()
1766 if self.token_kind not in KINDS_THAT_END_WORDS:
1767 p_die('Unexpected token after @()', self.cur_token)
1768 done = True
1769
1770 else:
1771 part = self._ReadExtGlob()
1772 w.parts.append(part)
1773
1774 elif self.token_kind == Kind.Left:
1775 try_triple_quote = (self.parse_opts.parse_triple_quote() and
1776 lex_mode == lex_mode_e.ShCommand and
1777 num_parts == 0)
1778
1779 # Save allocation
1780 if try_triple_quote:
1781 is_triple_quoted = BoolParamBox(False)
1782
1783 part = self._ReadUnquotedLeftParts(is_triple_quoted)
1784 w.parts.append(part)
1785
1786 # NOT done yet, will advance below
1787 elif self.token_kind == Kind.Right:
1788 # Still part of the word; will be done on the next iter.
1789 if self.token_type == Id.Right_DoubleQuote:
1790 pass
1791 # Never happens, no PushHint for this case.
1792 #elif self.token_type == Id.Right_DollarParen:
1793 # pass
1794 elif self.token_type == Id.Right_Subshell:
1795 # LEXER HACK for (case x in x) ;; esac )
1796 # Rewind before it's used
1797 assert self.next_lex_mode == lex_mode_e.Undefined
1798 if self.lexer.MaybeUnreadOne():
1799 self.lexer.PushHint(Id.Op_RParen, Id.Right_Subshell)
1800 self._SetNext(lex_mode)
1801 done = True
1802 else:
1803 done = True
1804
1805 elif self.token_kind == Kind.Ignored:
1806 done = True
1807
1808 else:
1809                # LEXER HACK for an unbalanced case clause. 'case foo in esac' is valid,
1810                # so while testing for ESAC we may read ) before getting a chance to
1811                # PushHint(Id.Op_RParen, Id.Right_CasePat). So here we unread one
1812                # token and translate it again.
1813
1814 # We get Id.Op_RParen at top level: case x in x) ;; esac
1815 # We get Id.Eof_RParen inside ComSub: $(case x in x) ;; esac )
1816 if self.token_type in (Id.Op_RParen, Id.Eof_RParen):
1817 # Rewind before it's used
1818 assert self.next_lex_mode == lex_mode_e.Undefined
1819 if self.lexer.MaybeUnreadOne():
1820 if self.token_type == Id.Eof_RParen:
1821 # Redo translation
1822 self.lexer.PushHint(Id.Op_RParen, Id.Eof_RParen)
1823 self._SetNext(lex_mode)
1824
1825 done = True # anything we don't recognize means we're done
1826
1827 if not done:
1828 self._SetNext(lex_mode)
1829 num_parts += 1
1830
1831 if (self.parse_opts.parse_brace() and num_parts > 1 and
1832 brace_count != 0):
1833 # accept { and }, but not foo{
1834 p_die(
1835 'Word has unbalanced { }. Maybe add a space or quote it like \{',
1836 loc.Word(w))
1837
1838 if is_triple_quoted and is_triple_quoted.b and num_parts > 1:
1839 p_die('Unexpected parts after triple quoted string',
1840 loc.WordPart(w.parts[-1]))
1841
1842 return w
1843
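    # A standalone sketch (assuming nothing from this module) of the brace
    # balance rule above: literal { and } within a single multi-part word must
    # balance, so '{' alone (one part) is fine, but 'foo{' is rejected.

    def _DemoBraceBalance(parts):
        # type: (List[str]) -> int
        count = 0
        for p in parts:
            if p == '{':
                count += 1
            elif p == '}':
                count -= 1
        return count

    assert _DemoBraceBalance(['{', 'a', '}']) == 0  # balanced: no error
    assert _DemoBraceBalance(['foo', '{']) == 1     # unbalanced: p_die above
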
1844 def _ReadArithWord(self):
1845 # type: () -> Optional[word_t]
1846 """ Helper for ReadArithWord() """
1847 self._GetToken()
1848
1849 if self.token_kind == Kind.Unknown:
1850 # e.g. happened during dynamic parsing of unset 'a[$foo]' in gherkin
1851 p_die(
1852 'Unexpected token while parsing arithmetic: %r' %
1853 lexer.TokenVal(self.cur_token), self.cur_token)
1854
1855 elif self.token_kind == Kind.Eof:
1856 return self.cur_token
1857
1858 elif self.token_kind == Kind.Ignored:
1859 # Space should be ignored.
1860 self._SetNext(lex_mode_e.Arith)
1861 return None
1862
1863 elif self.token_kind in (Kind.Arith, Kind.Right):
1864 # Id.Right_DollarDParen IS just a normal token, handled by ArithParser
1865 self._SetNext(lex_mode_e.Arith)
1866 return self.cur_token
1867
1868 elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub):
1869 return self._ReadCompoundWord(lex_mode_e.Arith)
1870
1871 else:
1872 raise AssertionError(self.cur_token)
1873
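    # A runnable summary of the dispatch above, with strings standing in for
    # the Kind enum -- a sketch of the decision table, not the real control
    # flow:
    #
    #   Unknown           -> p_die
    #   Eof               -> return the token, no advance
    #   Ignored           -> None, i.e. skip it and ask again
    #   Arith / Right     -> advance and return the token
    #   Lit / Left / VSub -> read a whole compound word

    def _DemoArithAction(kind):
        # type: (str) -> str
        if kind == 'Unknown':
            return 'p_die'
        if kind == 'Eof':
            return 'token'
        if kind == 'Ignored':
            return 'retry'
        if kind in ('Arith', 'Right'):
            return 'token'
        if kind in ('Lit', 'Left', 'VSub'):
            return 'compound-word'
        raise AssertionError(kind)

    assert _DemoArithAction('Ignored') == 'retry'
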
1874 def _ReadWord(self, word_mode):
1875 # type: (lex_mode_t) -> Optional[word_t]
1876 """Helper function for ReadWord()."""
1877
1878 # Change the pseudo lexer mode to a real lexer mode
1879 if word_mode == lex_mode_e.ShCommandBrack:
1880 lex_mode = lex_mode_e.ShCommand
1881 else:
1882 lex_mode = word_mode
1883
1884 self._GetToken()
1885
1886 if self.token_kind == Kind.Eof:
1887 # No advance
1888 return self.cur_token
1889
1890 # Allow Arith for ) at end of for loop?
1891 elif self.token_kind in (Kind.Op, Kind.Redir, Kind.Arith):
1892 self._SetNext(lex_mode)
1893
1894 # Newlines are complicated. See 3x2 matrix in the comment about
1895 # self.multiline and self.newline_state above.
1896 if self.token_type == Id.Op_Newline:
1897 if self.multiline:
1898 if self.newline_state > 1:
1899 # This points at a blank line, but at least it gives the line number
1900 p_die('Invalid blank line in multiline mode',
1901 self.cur_token)
1902 return None
1903
1904 if self.returned_newline: # skip
1905 return None
1906
1907 return self.cur_token
1908
1909 elif self.token_kind == Kind.Right:
1910 if self.token_type not in (Id.Right_Subshell, Id.Right_ShFunction,
1911 Id.Right_CasePat,
1912 Id.Right_ShArrayLiteral):
1913 raise AssertionError(self.cur_token)
1914
1915 self._SetNext(lex_mode)
1916 return self.cur_token
1917
1918 elif self.token_kind in (Kind.Ignored, Kind.WS):
1919 self._SetNext(lex_mode)
1920 return None
1921
1922 else:
1923 assert self.token_kind in (Kind.VSub, Kind.Lit, Kind.History,
1924 Kind.Left, Kind.KW, Kind.ControlFlow,
1925 Kind.BoolUnary, Kind.BoolBinary,
1926 Kind.ExtGlob), 'Unhandled token kind'
1927
1928 if (word_mode == lex_mode_e.ShCommandBrack and
1929 self.parse_opts.parse_bracket() and
1930 self.token_type == Id.Lit_LBracket):
1931 # Change [ from Kind.Lit -> Kind.Op
1932 # So CommandParser can treat
1933 # assert [42 === x]
1934 # like
1935 # json write (x)
1936 bracket_word = self.cur_token
1937 bracket_word.id = Id.Op_LBracket
1938
1939 self._SetNext(lex_mode)
1940 return bracket_word
1941
1942 # We're beginning a word. If we see Id.Lit_Pound, change to
1943 # lex_mode_e.Comment and read until end of line.
1944 if self.token_type == Id.Lit_Pound:
1945 self._SetNext(lex_mode_e.Comment)
1946 self._GetToken()
1947
1948 # NOTE: The # could be the last character in the file. It can't be
1949 # Eof_{RParen,Backtick} because #) and #` are comments.
1950 assert self.token_type in (Id.Ignored_Comment, Id.Eof_Real), \
1951 self.cur_token
1952
1953 # The next iteration will go into Kind.Ignored and set lex state to
1954 # lex_mode_e.ShCommand/etc.
1955 return None # tell ReadWord() to try again after comment
1956
1957 elif self.token_type == Id.Lit_TPound: ### doc comment
1958 self._SetNext(lex_mode_e.Comment)
1959 self._GetToken()
1960
1961 if self.token_type == Id.Ignored_Comment and self.emit_doc_token:
1962 return self.cur_token
1963
1964 return None # tell ReadWord() to try again after comment
1965
1966 else:
1967 # r'' u'' b''
1968 if (self.token_type == Id.Lit_Chars and
1969 self.lexer.LookAheadOne(
1970 lex_mode_e.ShCommand) == Id.Left_SingleQuote):
1971
1972                    # When shopt -s parse_ysh_string (checked below):
1973 # echo r'hi' is like echo 'hi'
1974 #
1975 # echo u'\u{3bc}' b'\yff' works
1976
1977 if (self.parse_opts.parse_ysh_string() and
1978 self.cur_token.tval in ('r', 'u', 'b')):
1979
1980 if self.cur_token.tval == 'r':
1981 left_id = Id.Left_RSingleQuote
1982 elif self.cur_token.tval == 'u':
1983 left_id = Id.Left_USingleQuote
1984 else:
1985 left_id = Id.Left_BSingleQuote
1986
1987 # skip the r, and then 'foo' will be read as normal
1988 self._SetNext(lex_mode_e.ShCommand)
1989
1990 self._GetToken()
1991 assert self.token_type == Id.Left_SingleQuote, self.token_type
1992
1993 # Read the word in a different lexer mode
1994 return self._ReadYshSingleQuoted(left_id)
1995
1996 return self._ReadCompoundWord(lex_mode)
1997
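    # Self-contained sketch of the prefix detection above: 'r' alone is an
    # ordinary word (e.g. a command name), so deciding that r'...' starts a
    # YSH string takes exactly one token of lookahead.  Plain string peeking
    # stands in for lexer.LookAheadOne().

    def _DemoYshStringPrefix(s):
        # type: (str) -> Optional[str]
        if len(s) >= 2 and s[0] in ('r', 'u', 'b') and s[1] == "'":
            return s[0]  # which single-quote flavor to read
        return None

    assert _DemoYshStringPrefix("r'hi'") == 'r'
    assert _DemoYshStringPrefix("rm") is None  # 'r' here is just a command
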
1998 def ParseVarRef(self):
1999 # type: () -> BracedVarSub
2000 """DYNAMIC parsing of what's inside ${!ref}
2001
2002 # Same as VarOf production
2003 VarRefExpr = VarOf EOF
2004 """
2005 self._SetNext(lex_mode_e.VSub_1)
2006
2007 self._GetToken()
2008 if self.token_kind != Kind.VSub:
2009 p_die('Expected var name', self.cur_token)
2010
2011 part = self._ParseVarOf()
2012 # NOTE: no ${ } means no part.left and part.right
2013 part.left = part.token # cheat to make test pass
2014 part.right = part.token
2015
2016 self._GetToken()
2017 if self.token_type != Id.Eof_Real:
2018 p_die('Expected end of var ref expression', self.cur_token)
2019 return part
2020
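    # Shell-level example of the dynamic parsing above:
    #
    #   x=PATH
    #   echo ${!x}    # the VALUE of x, the string 'PATH', is re-parsed as a
    #                 # var ref, then dereferenced
    #
    # Since the grammar is just VarOf EOF, a value like 'PATH junk' fails
    # with 'Expected end of var ref expression'.
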
2021 def LookPastSpace(self):
2022 # type: () -> Id_t
2023 """Look ahead to the next token.
2024
2025 For the CommandParser to recognize
2026 array= (1 2 3)
2027 YSH for ( versus bash for ((
2028 YSH if ( versus if test
2029 YSH while ( versus while test
2030 YSH bare assignment 'grep =' versus 'grep foo'
2031 """
2032 assert self.token_type != Id.Undefined_Tok
2033 if self.cur_token.id == Id.WS_Space:
2034 id_ = self.lexer.LookPastSpace(lex_mode_e.ShCommand)
2035 else:
2036 id_ = self.cur_token.id
2037 return id_
2038
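    # A minimal sketch of the lookahead contract (the list-based "lexer"
    # below is hypothetical): one token is already buffered, so we either
    # inspect it directly, or, if it's a space, ask what comes after the
    # spaces without consuming anything.

    def _DemoLookPastSpace(buffered, rest):
        # type: (str, List[str]) -> str
        if buffered == ' ':
            for tok in rest:
                if tok != ' ':
                    return tok
            return 'Eof'
        return buffered

    assert _DemoLookPastSpace('(', ['1', ')']) == '('  # already buffered
    assert _DemoLookPastSpace(' ', [' ', '(']) == '('  # skip spaces first
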
2039 def LookAheadFuncParens(self):
2040 # type: () -> bool
2041 """Special lookahead for f( ) { echo hi; } to check for ( )"""
2042 assert self.token_type != Id.Undefined_Tok
2043
2044 # We have to handle 2 cases because we buffer a token
2045 if self.cur_token.id == Id.Op_LParen: # saw funcname(
2046 return self.lexer.LookAheadFuncParens(1) # go back one char
2047
2048 elif self.cur_token.id == Id.WS_Space: # saw funcname WHITESPACE
2049 return self.lexer.LookAheadFuncParens(0)
2050
2051 else:
2052 return False
2053
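    # The two buffering cases above, concretely:
    #
    #   f() { echo hi; }      cur_token is '(', so back up one char first
    #   f () { echo hi; }     cur_token is the space; no backup is needed
    #
    # Either way the question is the same: is the next non-space text '()'
    # or '( )'?
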
2054 def ReadWord(self, word_mode):
2055 # type: (lex_mode_t) -> word_t
2056 """Read the next word, using the given lexer mode.
2057
2058 This is a stateful wrapper for the stateless _ReadWord function.
2059 """
2060 assert word_mode in (lex_mode_e.ShCommand, lex_mode_e.ShCommandBrack,
2061 lex_mode_e.DBracket, lex_mode_e.BashRegex)
2062
2063 if self.buffered_word: # For integration with pgen2
2064 w = self.buffered_word
2065 self.buffered_word = None
2066 else:
2067 while True:
2068 w = self._ReadWord(word_mode)
2069 if w is not None:
2070 break
2071
2072 self.returned_newline = (word_.CommandId(w) == Id.Op_Newline)
2073 return w
2074
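    # Sketch of the push-back protocol used for the pgen2 integration (the
    # class below is hypothetical, not part of this module): a caller that
    # reads one word too many can stash it in buffered_word and receive the
    # same word again on the next call.

    class _DemoWordSource(object):

        def __init__(self, words):
            # type: (List[str]) -> None
            self.words = words
            self.buffered = None  # type: Optional[str]

        def Read(self):
            # type: () -> str
            if self.buffered is not None:
                w = self.buffered
                self.buffered = None
                return w
            return self.words.pop(0)

    src = _DemoWordSource(['ls', '|', 'wc'])
    w = src.Read()             # 'ls'
    src.buffered = w           # push it back
    assert src.Read() == 'ls'  # same word again, before '|'
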
2075 def ReadArithWord(self):
2076 # type: () -> word_t
2077 while True:
2078 w = self._ReadArithWord()
2079 if w is not None:
2080 break
2081 return w
2082
2083 def ReadHereDocBody(self, parts):
2084 # type: (List[word_part_t]) -> None
2085 """
2086 A here doc is like a double quoted context, except " isn't special.
2087 """
2088 self._ReadLikeDQ(None, False, parts)
2089 # Returns nothing
2090
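    # Example: in
    #
    #   cat <<EOF
    #   double "quotes" are literal, but $x and $(date) are substituted
    #   EOF
    #
    # the body is parsed like a double-quoted string in which '"' has no
    # special meaning.
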
2091 def ReadForPlugin(self):
2092 # type: () -> CompoundWord
2093 """For $PS1, $PS4, etc.
2094
2095 This is just like reading a here doc line. "\n" is allowed, as
2096 well as the typical substitutions ${x} $(echo hi) $((1 + 2)).
2097 """
2098 w = CompoundWord([])
2099 self._ReadLikeDQ(None, False, w.parts)
2100 return w
2101
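    # Example plugin input: PS1='\u@\h $(date +%H:%M) $'.  The $(date ...)
    # command sub is parsed here, while backslash escapes like \u stay as
    # plain word parts, presumably expanded later by the prompt evaluator.
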
2102 def EmitDocToken(self, b):
2103 # type: (bool) -> None
2104 self.emit_doc_token = b
2105
2106 def Multiline(self, b):
2107 # type: (bool) -> None
2108 self.multiline = b