# Copyright 2016 Andy Chu. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
"""
word_parse.py - Parse the shell word language.

Hairy example:

    hi$((1 + 2))"$(echo hi)"${var:-__"$(echo default)"__}

Substitutions can be nested, but which inner subs are allowed depends on the
outer sub.  Notes:

lex_mode_e.ShCommand (_ReadUnquotedLeftParts)
  All subs and quotes are allowed:
    $v  ${v}  $()  ``  $(())  ''  ""  $''  $""  <()  >()

lex_mode_e.DQ (_ReadDoubleQuotedLeftParts)
  Var, Command, Arith, but no quotes.
    $v  ${v}  $()  ``  $(())
  No process substitution.

lex_mode_e.Arith
  Similar to DQ: Var, Command, and Arith sub, but no process sub.  bash doesn't
  allow quotes, but OSH does.  We allow ALL FOUR kinds of quotes, because we
  need those for associative array indexing.

lex_mode_e.VSub_ArgUnquoted
  Like ShCommand, everything is allowed (even process substitutions), but we
  stop at }, and space is SIGNIFICANT.

  Example: ${a:- b }

    ${X:-$v}  ${X:-${v}}  ${X:-$(echo hi)}  ${X:-`echo hi`}  ${X:-$((1+2))}
    ${X:-'single'}  ${X:-"double"}  ${X:-$'\n'}  ${X:-<(echo hi)}

lex_mode_e.VSub_ArgDQ
  In contrast to DQ, VSub_ArgDQ accepts nested "" and $'' and $"", e.g.
  "${x:-"default"}".

  In contrast, VSub_ArgUnquoted respects single quotes and process
  substitution.

  It's weird that double quotes are allowed.  Space is also significant here,
  e.g. "${x:-a "b"}".
"""

from _devbuild.gen import grammar_nt
from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str, Kind
from _devbuild.gen.types_asdl import (lex_mode_t, lex_mode_e)
from _devbuild.gen.syntax_asdl import (
    BoolParamBox,
    Token,
    SimpleVarSub,
    loc,
    source,
    DoubleQuoted,
    SingleQuoted,
    BracedVarSub,
    CommandSub,
    ShArrayLiteral,
    AssocPair,
    bracket_op,
    bracket_op_t,
    suffix_op,
    suffix_op_t,
    rhs_word,
    rhs_word_e,
    rhs_word_t,
    word_e,
    word_t,
    CompoundWord,
    word_part,
    word_part_t,
    y_lhs_e,
    arith_expr_t,
    command,
    expr,
    expr_e,
    expr_t,
    pat_t,
    ArgList,
    Proc,
    Func,
    Subscript,
    Attribute,
)
from core import alloc
from core.error import p_die
from mycpp.mylib import log
from core import pyutil
from core import ui
from frontend import consts
from frontend import lexer
from frontend import reader
from osh import tdop
from osh import arith_parse
from osh import braces
from osh import word_
from osh import word_compile
from mycpp.mylib import tagswitch

from typing import List, Optional, Tuple, cast
from typing import TYPE_CHECKING
if TYPE_CHECKING:
    from frontend.lexer import Lexer
    from frontend.parse_lib import ParseContext
    from frontend.reader import _Reader
    from osh.cmd_parse import VarChecker

unused1 = log
unused2 = Id_str

KINDS_THAT_END_WORDS = [Kind.Eof, Kind.WS, Kind.Op, Kind.Right]

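# Illustrative examples of the Kinds that end a word, under ordinary shell
# lexing as described in the module docstring (a sketch, not exhaustive):
#
#   echo foo bar    # Kind.WS: the space ends the word 'foo'
#   echo foo; ls    # Kind.Op: ';' ends the word 'foo'
#   $(echo foo)     # Kind.Right: ')' ends the last word of the sub
#   echo foo<EOF>   # Kind.Eof: end of input ends the word

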
class WordEmitter(object):
    """Common interface for [ and [["""

    def __init__(self):
        # type: () -> None
        """Empty constructor for mycpp."""
        pass

    def ReadWord(self, lex_mode):
        # type: (lex_mode_t) -> word_t
        raise NotImplementedError()


class WordParser(WordEmitter):

    def __init__(self, parse_ctx, lexer, line_reader):
        # type: (ParseContext, Lexer, _Reader) -> None
        self.parse_ctx = parse_ctx
        self.lexer = lexer
        self.line_reader = line_reader
        self.arena = line_reader.arena

        self.parse_opts = parse_ctx.parse_opts
        self.a_parser = tdop.TdopParser(arith_parse.Spec(), self,
                                        self.parse_opts)
        self.Reset()

    def Init(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Used to parse arithmetic, see ParseContext."""
        self.next_lex_mode = lex_mode

    def Reset(self):
        # type: () -> None
        """Called by interactive loop."""
        # For _GetToken()
        self.cur_token = None  # type: Token
        self.token_kind = Kind.Undefined
        self.token_type = Id.Undefined_Tok

        self.next_lex_mode = lex_mode_e.ShCommand

        # Boolean mutated by CommandParser via word_.ctx_EmitDocToken.  For ###
        # doc comments
        self.emit_doc_token = False
        # Boolean mutated by CommandParser via word_.ctx_Multiline.  '...' starts
        # multiline mode.
        self.multiline = False

        # For detecting invalid \n\n in multiline mode.  Counts what we got
        # directly from the lexer.
        self.newline_state = 0
        # For consolidating \n\n -> \n for the CALLER.  This simplifies the
        # parsers that consume words.
        self.returned_newline = False

        # For integration with pgen2
        self.buffered_word = None  # type: word_t

    def _GetToken(self):
        # type: () -> None
        """Call this when you need to make a decision based on any of:

        self.token_type
        self.token_kind
        self.cur_token
        """
        if self.next_lex_mode == lex_mode_e.Undefined:
            return  # _SetNext() not called, so do nothing

        is_fake = self.next_lex_mode == lex_mode_e.BashRegexFakeInner
        real_mode = (lex_mode_e.BashRegex if is_fake else self.next_lex_mode)

        self.cur_token = self.lexer.Read(real_mode)

        # MUTATE TOKEN for fake lexer mode.
        # This is for crazy stuff bash allows, like [[ s =~ (< >) ]]
        if (is_fake and self.cur_token.id
                in (Id.WS_Space, Id.BashRegex_AllowedInParens)):
            self.cur_token.id = Id.Lit_Chars

        self.token_type = self.cur_token.id
        self.token_kind = consts.GetKind(self.token_type)

        # number of consecutive newlines, ignoring whitespace
        if self.token_type == Id.Op_Newline:
            self.newline_state += 1
        elif self.token_kind != Kind.WS:
            self.newline_state = 0

        self.parse_ctx.trail.AppendToken(self.cur_token)  # For completion
        self.next_lex_mode = lex_mode_e.Undefined

    def _SetNext(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Set the next lex state, but don't actually read a token.

        We need this for proper interactive parsing.
        """
        self.next_lex_mode = lex_mode

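    # A sketch of the _SetNext() / _GetToken() protocol used throughout this
    # class (illustrative; see the docstrings above):
    #
    #   self._SetNext(lex_mode_e.Arith)  # record the mode only; no input read
    #   self._GetToken()                 # NOW the lexer reads a token in Arith mode
    #   self._GetToken()                 # no-op: next_lex_mode is Undefined again
    #
    # Deferring the read until a decision is needed is what makes interactive
    # parsing work: we don't pull in a new line before the caller asks for it.
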
    def _ReadVarOpArg(self, arg_lex_mode):
        # type: (lex_mode_t) -> rhs_word_t

        # NOTE: Operators like | and < are not treated as special, so ${a:- | >} is
        # valid, even when unquoted.
        self._SetNext(arg_lex_mode)
        self._GetToken()

        w = self._ReadVarOpArg2(arg_lex_mode, Id.Undefined_Tok,
                                True)  # empty_ok

        # If the Compound has no parts, and we're in a double-quoted VarSub
        # arg, and empty_ok, then return Empty.  This is so it can evaluate to
        # the empty string and not get elided.
        #
        # Examples:
        # - "${s:-}", "${s/%pat/}"
        # It's similar to LooksLikeShAssignment where we turn x= into x=''.  And it
        # has the same potential problem of not having Token location info.
        #
        # NOTE: empty_ok is False only for the PatSub pattern, which means we'll
        # return a Compound with no parts, which is explicitly checked with a
        # custom error message.
        if len(w.parts) == 0 and arg_lex_mode == lex_mode_e.VSub_ArgDQ:
            return rhs_word.Empty

        return w

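    # Illustrative cases for the Empty-vs-elided logic above:
    #
    #   "${s:-}"       # double-quoted arg: returns rhs_word.Empty, so it
    #                  # evaluates to '' instead of being elided
    #   "${s/%pat/}"   # same idea for an empty replacement
    #
    # The PatSub pattern is the one caller that passes empty_ok=False; there a
    # Compound with no parts is rejected with a custom error message.
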
    def _ReadVarOpArg2(self, arg_lex_mode, eof_type, empty_ok):
        # type: (lex_mode_t, Id_t, bool) -> CompoundWord
        """Return a CompoundWord.

        Helper function for _ReadVarOpArg and used directly by
        _ReadPatSubVarOp.
        """
        w = self._ReadCompoundWord3(arg_lex_mode, eof_type, empty_ok)
        #log('w %s', w)
        tilde = word_.TildeDetect(w)
        if tilde:
            w = tilde
        return w

    def _ReadSliceVarOp(self):
        # type: () -> suffix_op.Slice
        """VarOf ':' ArithExpr (':' ArithExpr )?"""
        self._SetNext(lex_mode_e.Arith)
        self._GetToken()
        cur_id = self.token_type  # e.g. Id.Arith_Colon

        if self.token_type == Id.Arith_Colon:  # A pun for Id.VOp2_Colon
            # no beginning specified
            begin = None  # type: Optional[arith_expr_t]
        else:
            begin = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()

        if cur_id == Id.Arith_RBrace:
            no_length = None  # type: Optional[arith_expr_t]  # No length specified
            return suffix_op.Slice(begin, no_length)

        # Id.Arith_Colon is a pun for Id.VOp2_Colon
        if cur_id == Id.Arith_Colon:
            self._SetNext(lex_mode_e.Arith)
            length = self._ReadArithExpr(Id.Arith_RBrace)
            return suffix_op.Slice(begin, length)

        p_die("Expected : or } in slice", self.cur_token)
        raise AssertionError()  # for MyPy

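    # Examples for the slice grammar VarOf ':' ArithExpr (':' ArithExpr)?
    # (illustrative):
    #
    #   ${s:1:2}    # begin and length
    #   ${s:1}      # no length: suffix_op.Slice(begin, None)
    #   ${s::2}     # no begin:  suffix_op.Slice(None, length)
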
    def _ReadPatSubVarOp(self):
        # type: () -> suffix_op.PatSub
        """Looking at the first '/' after VarOf:

        VarSub = ...
               | VarOf '/' Match ( '/' WORD? )?
        Match = '/' WORD   # can't be empty
              | '#' WORD?  # may be empty
              | '%' WORD?
        """
        slash_tok = self.cur_token  # location info
        replace_mode = Id.Undefined_Tok  # bizarre syntax / # %

        self._SetNext(lex_mode_e.VSub_ArgUnquoted)  # advance past /

        self._GetToken()
        if self.token_type == Id.Right_DollarBrace:
            pat = CompoundWord([])
            return suffix_op.PatSub(pat, rhs_word.Empty, replace_mode,
                                    slash_tok)

        if self.token_type in (Id.Lit_Slash, Id.Lit_Pound, Id.Lit_Percent):
            replace_mode = self.token_type
            self._SetNext(lex_mode_e.VSub_ArgUnquoted)

        # Bash quirk:
        # echo ${x/#/replace} has an empty pattern
        # echo ${x////replace} is non-empty; it means echo ${x//'/'/replace}
        empty_ok = replace_mode != Id.Lit_Slash
        pat = self._ReadVarOpArg2(lex_mode_e.VSub_ArgUnquoted, Id.Lit_Slash,
                                  empty_ok)
        #log('pat 1 %r', pat)

        if self.token_type == Id.Lit_Slash:
            # read until }
            replace = self._ReadVarOpArg(
                lex_mode_e.VSub_ArgUnquoted)  # type: rhs_word_t
            #log('r 1 %r', replace)
        else:
            # e.g. ${v/a} is the same as ${v/a/} -- empty replacement string
            replace = rhs_word.Empty

        self._GetToken()
        if self.token_type != Id.Right_DollarBrace:
            # This happens on invalid code
            p_die(
                "Expected } after replacement string, got %s" %
                ui.PrettyId(self.token_type), self.cur_token)

        return suffix_op.PatSub(pat, replace, replace_mode, slash_tok)

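    # Examples for the Match grammar above (illustrative; replace_mode is one
    # of Id.Lit_Slash, Id.Lit_Pound, Id.Lit_Percent when present):
    #
    #   ${x/pat/repl}    # replace the first match
    #   ${x//pat/repl}   # '/' -> replace all matches
    #   ${x/#pat/repl}   # '#' -> match must be a prefix
    #   ${x/%pat/repl}   # '%' -> match must be a suffix
    #   ${x/pat}         # omitted replacement is rhs_word.Empty
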
    def _ReadSubscript(self):
        # type: () -> bracket_op_t
        """ Subscript = '[' ('@' | '*' | ArithExpr) ']' """
        # Lookahead to see if we get @ or *.  Otherwise read a full arithmetic
        # expression.
        next_id = self.lexer.LookPastSpace(lex_mode_e.Arith)
        if next_id in (Id.Lit_At, Id.Arith_Star):
            op = bracket_op.WholeArray(next_id)  # type: bracket_op_t

            self._SetNext(lex_mode_e.Arith)  # skip past [
            self._GetToken()
            self._SetNext(lex_mode_e.Arith)  # skip past @
            self._GetToken()
        else:
            self._SetNext(lex_mode_e.Arith)  # skip past [
            anode = self._ReadArithExpr(Id.Arith_RBracket)
            op = bracket_op.ArrayIndex(anode)

        if self.token_type != Id.Arith_RBracket:  # Should be looking at ]
            p_die('Expected ] to close subscript', self.cur_token)

        self._SetNext(lex_mode_e.VSub_2)  # skip past ]
        self._GetToken()  # Needed to be in the same spot as no subscript

        return op

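    # Examples for Subscript = '[' ('@' | '*' | ArithExpr) ']' (illustrative):
    #
    #   ${a[@]}     # bracket_op.WholeArray(Id.Lit_At)
    #   ${a[*]}     # bracket_op.WholeArray(Id.Arith_Star)
    #   ${a[i+1]}   # bracket_op.ArrayIndex with a parsed arith expression
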
    def _ParseVarOf(self):
        # type: () -> BracedVarSub
        """
        VarOf = NAME Subscript?
              | NUMBER      # no subscript allowed, none of these are arrays
                            # ${@[1]} doesn't work, even though slicing does
              | VarSymbol
        """
        self._GetToken()
        name_token = self.cur_token
        self._SetNext(lex_mode_e.VSub_2)

        self._GetToken()  # Check for []
        if self.token_type == Id.VOp2_LBracket:
            bracket_op = self._ReadSubscript()
        else:
            bracket_op = None

        part = BracedVarSub.CreateNull()
        part.token = name_token
        part.var_name = lexer.TokenVal(name_token)
        part.bracket_op = bracket_op
        return part

    def _ParseVarExpr(self, arg_lex_mode, allow_query=False):
        # type: (lex_mode_t, bool) -> BracedVarSub
        """Start parsing at the op -- we already skipped past the name."""
        part = self._ParseVarOf()

        self._GetToken()
        if self.token_type == Id.Right_DollarBrace:
            return part  # no ops

        op_kind = self.token_kind

        if op_kind == Kind.VTest:
            tok = self.cur_token
            arg_word = self._ReadVarOpArg(arg_lex_mode)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            part.suffix_op = suffix_op.Unary(tok, arg_word)

        elif op_kind == Kind.VOpYsh:
            tok = self.cur_token
            arg_word = self._ReadVarOpArg(arg_lex_mode)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            UP_arg_word = arg_word
            with tagswitch(arg_word) as case:
                if case(rhs_word_e.Empty):
                    pass
                elif case(rhs_word_e.Compound):
                    arg_word = cast(CompoundWord, UP_arg_word)
                    # This handles ${x|html} and ${x %.3f} now
                    # However I think ${x %.3f} should be statically parsed?  It can
                    # enter the printf lexer modes.
                    ok, arg, quoted = word_.StaticEval(arg_word)
                    if not ok or quoted:
                        p_die('Expected a constant argument',
                              loc.Word(arg_word))

            part.suffix_op = suffix_op.Static(tok, arg)

        elif op_kind == Kind.VOp0:
            part.suffix_op = self.cur_token  # Nullary
            self._SetNext(lex_mode_e.VSub_2)  # Expecting }
            self._GetToken()

        elif op_kind == Kind.VOp1:  # % %% # ## etc.
            tok = self.cur_token
            # Weird exception that all shells have: these operators take a glob
            # pattern, so they're lexed as VSub_ArgUnquoted, not VSub_ArgDQ
            arg_word = self._ReadVarOpArg(lex_mode_e.VSub_ArgUnquoted)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            part.suffix_op = suffix_op.Unary(tok, arg_word)

        elif op_kind == Kind.VOp2:  # / : [ ]
            if self.token_type == Id.VOp2_Slash:
                patsub_op = self._ReadPatSubVarOp()  # type: suffix_op_t
                part.suffix_op = patsub_op

                # Checked by the method above
                assert self.token_type == Id.Right_DollarBrace, self.cur_token

            elif self.token_type == Id.VOp2_Colon:
                part.suffix_op = self._ReadSliceVarOp()
                # NOTE: } in arithmetic mode.
                if self.token_type != Id.Arith_RBrace:
                    # Token seems off; doesn't point to X in # ${a:1:2 X
                    p_die('Expected } to close ${', self.cur_token)

            else:
                # TODO: Does this ever happen?
                p_die('Unexpected token in ${} (%s)' % 'VOp2', self.cur_token)

        elif op_kind == Kind.VOp3:  # ${prefix@} etc.
            if allow_query:
                part.suffix_op = self.cur_token  # Nullary
                self._SetNext(lex_mode_e.VSub_2)  # Expecting }
                self._GetToken()
            else:
                p_die("Unexpected token in ${} (%s)" % 'VOp3', self.cur_token)

        # NOTE: Arith_RBrace is for slicing, because it reads } in arithmetic
        # mode.  It's redundantly checked above.
        if self.token_type not in (Id.Right_DollarBrace, Id.Arith_RBrace):
            # ${a.} or ${!a.}
            p_die('Expected } to close ${', self.cur_token)

        # Now look for ops
        return part

    def _ReadZshVarSub(self, left_token):
        # type: (Token) -> word_part.ZshVarSub

        self._SetNext(lex_mode_e.VSub_Zsh)  # Move past ${(foo)

        # Can be empty
        w = self._ReadCompoundWord3(lex_mode_e.VSub_Zsh, Id.Right_DollarBrace,
                                    True)
        self._GetToken()
        return word_part.ZshVarSub(left_token, w, self.cur_token)

    def ReadBracedVarSub(self, left_token):
        # type: (Token) -> Tuple[BracedVarSub, Token]
        """ For YSH expressions like var x = ${x:-"default"}.  """
        part = self._ReadBracedVarSub(left_token, d_quoted=False)
        last_token = self.cur_token
        return part, last_token

    def _ReadBracedVarSub(self, left_token, d_quoted):
        # type: (Token, bool) -> BracedVarSub
        """For the ${} expression language.

        NAME = [a-zA-Z_][a-zA-Z0-9_]*
        NUMBER = [0-9]+                  # ${10}, ${11}, ...

        Subscript = '[' ('@' | '*' | ArithExpr) ']'
        VarSymbol = '!' | '@' | '#' | ...
        VarOf = NAME Subscript?
              | NUMBER      # no subscript allowed, none of these are arrays
                            # ${@[1]} doesn't work, even though slicing does
              | VarSymbol

        NULLARY_OP = '@Q' | '@E' | '@P' | '@A' | '@a'  # VOp0

        TEST_OP = '-' | ':-' | '=' | ':=' | '+' | ':+' | '?' | ':?'
        STRIP_OP = '#' | '##' | '%' | '%%'
        CASE_OP = ',' | ',,' | '^' | '^^'
        UnaryOp = TEST_OP | STRIP_OP | CASE_OP

        YSH_UNARY = '|' | ' '  # ${x|html} and ${x %.3f}.
                               # SPACE is operator not %
        Match = ('/' | '#' | '%') WORD  # match all / prefix / suffix
        VarExpr = VarOf
                | VarOf NULLARY_OP
                | VarOf UnaryOp WORD
                | VarOf YSH_UNARY STATIC_WORD
                | VarOf ':' ArithExpr (':' ArithExpr )?
                | VarOf '/' Match '/' WORD

        LengthExpr = '#' VarOf  # can't apply operators after length

        RefOrKeys = '!' VarExpr  # CAN apply operators after a named ref
                                 # ${!ref[0]} vs ${!keys[@]} resolved later

        PrefixQuery = '!' NAME ('*' | '@')  # list variable names with a prefix

        BuiltinSub = '.' WORD+  # ${.myproc 'builtin' $sub}

        VarSub = LengthExpr
               | RefOrKeys
               | PrefixQuery
               | VarExpr
               | BuiltinSub

        NOTES:
        - Arithmetic expressions are used twice, inside subscripts ${a[x+1]} and
          slicing ${a:x+1:y+2}
        - ${#} and ${!} need LL(2) lookahead (considering how my tokenizer works)
        - @ and * are technically arithmetic expressions in this implementation
        - We don't account for bash 4.4: ${param@operator} -- Q E P A a.  Note that
          it's also vectorized.

        Strictness over bash:
        - echo ${a[0][0]} doesn't do anything useful, so we disallow it from the
          grammar
        - ! and # prefixes can't be composed, even though named refs can be
          composed with other operators
        - '#' means 4 different things: length prefix, VarSymbol, UnaryOp to strip
          a prefix, and it can also be a literal part of WORD.

        From the parser's point of view, the prefix # can't be combined with
        UnaryOp/slicing/matching, and the ! can.  However

        - ${a[@]:1:2} is not allowed
        - ${#a[@]:1:2} is allowed, but gives the wrong answer
        """
        if d_quoted:
            arg_lex_mode = lex_mode_e.VSub_ArgDQ
        else:
            arg_lex_mode = lex_mode_e.VSub_ArgUnquoted

        self._SetNext(lex_mode_e.VSub_1)
        self._GetToken()

        ty = self.token_type
        first_tok = self.cur_token

        if ty == Id.VSub_Pound:
            # Disambiguate
            next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
            if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
                # e.g. a name, '#' is the prefix
                self._SetNext(lex_mode_e.VSub_1)
                part = self._ParseVarOf()

                self._GetToken()
                if self.token_type != Id.Right_DollarBrace:
                    p_die('Expected } after length expression', self.cur_token)

                part.prefix_op = first_tok

            else:  # not a prefix, '#' is the variable
                part = self._ParseVarExpr(arg_lex_mode)

        elif ty == Id.VSub_Bang:
            next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
            if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
                # e.g. a name, '!' is the prefix
                # ${!a} -- this is a ref
                # ${!3} -- this is ref
                # ${!a[1]} -- this is a ref
                # ${!a[@]} -- this is a keys
                # No lookahead -- do it in a second step, or at runtime
                self._SetNext(lex_mode_e.VSub_1)
                part = self._ParseVarExpr(arg_lex_mode, allow_query=True)

                part.prefix_op = first_tok

            else:  # not a prefix, '!' is the variable
                part = self._ParseVarExpr(arg_lex_mode)

        elif ty == Id.VSub_Dot:
            # Note: this will become a new builtin_sub type, so this method must
            # return word_part_t rather than BracedVarSub.  I don't think that
            # should cause problems.
            p_die('TODO: ${.myproc builtin sub}', self.cur_token)

        # VS_NAME, VS_NUMBER, symbol that isn't # or !
        elif self.token_kind == Kind.VSub:
            part = self._ParseVarExpr(arg_lex_mode)

        else:
            # e.g. ${^}
            p_die('Unexpected token in ${}', self.cur_token)

        part.left = left_token  # attach the argument
        part.right = self.cur_token
        return part

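    # A sketch of the LL(2) disambiguation above, using LookPastSpace
    # (illustrative):
    #
    #   ${#}        # '#' is the variable itself (number of args)
    #   ${#s}       # '#' is the length prefix: prefix_op is set
    #   ${!x}       # '!' prefix: a named ref
    #   ${!a[@]}    # '!' prefix: the keys of 'a' (resolved in a later step)
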
    def _ReadSingleQuoted(self, left_token, lex_mode):
        # type: (Token, lex_mode_t) -> SingleQuoted
        """Internal method to read a word_part."""
        tokens = []  # type: List[Token]
        # In command mode, we never disallow backslashes like '\'
        right_quote = self.ReadSingleQuoted(lex_mode, left_token, tokens,
                                            False)
        sval = word_compile.EvalSingleQuoted2(left_token.id, tokens)
        node = SingleQuoted(left_token, sval, right_quote)
        return node

    def ReadSingleQuoted(self, lex_mode, left_token, out_tokens, is_ysh_expr):
        # type: (lex_mode_t, Token, List[Token], bool) -> Token
        """Appends to out_tokens; returns last token

        Used by expr_parse.py
        """
        # TODO: Remove and use out_tokens
        tokens = []  # type: List[Token]

        # echo '\' is allowed, but x = '\' is invalid, in favor of x = r'\'
        no_backslashes = is_ysh_expr and left_token.id == Id.Left_SingleQuote

        expected_end_tokens = 3 if left_token.id in (
            Id.Left_TSingleQuote, Id.Left_RTSingleQuote, Id.Left_UTSingleQuote,
            Id.Left_BTSingleQuote) else 1
        num_end_tokens = 0

        while num_end_tokens < expected_end_tokens:
            self._SetNext(lex_mode)
            self._GetToken()

            # Kind.Char emitted in lex_mode.SQ_C
            if self.token_kind in (Kind.Lit, Kind.Char):
                tok = self.cur_token
                # Happens in lex_mode_e.SQ: 'one\two' is ambiguous, should be
                # r'one\two' or c'one\\two'
                if no_backslashes and lexer.TokenContains(tok, '\\'):
                    p_die(
                        r"Strings with backslashes should look like r'\n' or u'\n' or b'\n'",
                        tok)

                if is_ysh_expr:
                    # Disallow var x = $'\001'.  Arguably we don't need these
                    # checks because u'\u{1}' is the way to write it.
                    if self.token_type == Id.Char_Octal3:
                        p_die(
                            r"Use \xhh or \u{...} instead of octal escapes in YSH strings",
                            tok)

                    if self.token_type == Id.Char_Hex and self.cur_token.length != 4:
                        # disallow \xH
                        p_die(
                            r'Invalid hex escape in YSH string (must be \xHH)',
                            tok)

                tokens.append(tok)

            elif self.token_kind == Kind.Unknown:
                tok = self.cur_token
                assert tok.id == Id.Unknown_Backslash, tok

                # x = $'\z' is disallowed; ditto for echo $'\z' if shopt -u parse_backslash
                if is_ysh_expr or not self.parse_opts.parse_backslash():
                    p_die(
                        "Invalid char escape in C-style string literal (OILS-ERR-11)",
                        tok)

                tokens.append(tok)

            elif self.token_kind == Kind.Eof:
                p_die('Unexpected EOF in single-quoted string that began here',
                      left_token)

            elif self.token_kind == Kind.Right:
                # assume Id.Right_SingleQuote
                num_end_tokens += 1
                tokens.append(self.cur_token)

            else:
                raise AssertionError(self.cur_token)

            if self.token_kind != Kind.Right:
                num_end_tokens = 0  # we need three in a ROW

        if expected_end_tokens == 1:
            tokens.pop()
        elif expected_end_tokens == 3:  # Get rid of spurious end tokens
            tokens.pop()
            tokens.pop()
            tokens.pop()

        # Remove space from '''  r'''  $''' in both expression mode and command mode
        if left_token.id in (Id.Left_TSingleQuote, Id.Left_RTSingleQuote,
                             Id.Left_UTSingleQuote, Id.Left_BTSingleQuote):
            word_compile.RemoveLeadingSpaceSQ(tokens)

        # Validation after lexing - same 2 checks in j8.LexerDecoder
        is_u_string = left_token.id in (Id.Left_USingleQuote,
                                        Id.Left_UTSingleQuote)

        for tok in tokens:
            # u'\yff' is not valid, but b'\yff' is
            if is_u_string and tok.id == Id.Char_YHex:
                p_die(
                    r"%s escapes not allowed in u'' strings" %
                    lexer.TokenVal(tok), tok)
            # \u{dc00} isn't valid
            if tok.id == Id.Char_UBraced:
                h = lexer.TokenSlice(tok, 3, -1)  # \u{123456}
                i = int(h, 16)
                if 0xD800 <= i and i < 0xE000:
                    p_die(
                        r"%s escape is illegal because it's in the surrogate range"
                        % lexer.TokenVal(tok), tok)

        out_tokens.extend(tokens)
        return self.cur_token

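    # A sketch of the end-token counting above for triple-quoted strings
    # (illustrative):
    #
    #   r'''foo'''
    #          ^^^ three consecutive Kind.Right tokens; num_end_tokens must
    #              reach 3 IN A ROW, then the spurious ones are popped.
    #
    # For plain '...' and r'...', expected_end_tokens is 1.
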
    def _ReadDoubleQuotedLeftParts(self):
        # type: () -> word_part_t
        """Read substitution parts in a double quoted context."""
        if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick):
            return self._ReadCommandSub(self.token_type, d_quoted=True)

        if self.token_type == Id.Left_DollarBrace:
            return self._ReadBracedVarSub(self.cur_token, d_quoted=True)

        if self.token_type == Id.Left_DollarDParen:
            return self._ReadArithSub()

        if self.token_type == Id.Left_DollarBracket:
            return self._ReadExprSub(lex_mode_e.DQ)

        raise AssertionError(self.cur_token)

    def _ReadYshSingleQuoted(self, left_id):
        # type: (Id_t) -> CompoundWord
        """Read YSH style strings

        r''  u''  b''
        r''' '''  u''' '''  b''' '''
        """
        #log('BEF self.cur_token %s', self.cur_token)
        if left_id == Id.Left_RSingleQuote:
            lexer_mode = lex_mode_e.SQ_Raw
            triple_left_id = Id.Left_RTSingleQuote
        elif left_id == Id.Left_USingleQuote:
            lexer_mode = lex_mode_e.J8_Str
            triple_left_id = Id.Left_UTSingleQuote
        elif left_id == Id.Left_BSingleQuote:
            lexer_mode = lex_mode_e.J8_Str
            triple_left_id = Id.Left_BTSingleQuote
        else:
            raise AssertionError(left_id)

        # Needed for syntax checks
        left_tok = self.cur_token
        left_tok.id = left_id

        sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)

        if (len(sq_part.sval) == 0 and self.lexer.ByteLookAhead() == "'"):
            self._SetNext(lex_mode_e.ShCommand)
            self._GetToken()

            assert self.token_type == Id.Left_SingleQuote
            # HACK: magically transform the third ' in u''' to
            # Id.Left_UTSingleQuote, so that ''' is the terminator
            left_tok = self.cur_token
            left_tok.id = triple_left_id

            # Handles stripping leading whitespace
            sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)

        # Advance and validate
        self._SetNext(lex_mode_e.ShCommand)

        self._GetToken()
        if self.token_kind not in KINDS_THAT_END_WORDS:
            p_die('Unexpected token after YSH single-quoted string',
                  self.cur_token)

        return CompoundWord([sq_part])

    def _ReadUnquotedLeftParts(self, triple_out):
        # type: (Optional[BoolParamBox]) -> word_part_t
        """Read substitutions and quoted strings (for lex_mode_e.ShCommand).

        If triple_out is set, then we try parsing triple quoted strings,
        and set its value to True if we got one.
        """
        if self.token_type in (Id.Left_DoubleQuote, Id.Left_DollarDoubleQuote):
            # Note: $"" is a synonym for "".  It might make sense if it added
            # \n \0 \x00 \u{123} etc.  But that's not what bash does!
            dq_part = self._ReadDoubleQuoted(self.cur_token)
            # Got empty word "" and there's a " after
            if (triple_out and len(dq_part.parts) == 0 and
                    self.lexer.ByteLookAhead() == '"'):

                self._SetNext(lex_mode_e.ShCommand)
                self._GetToken()
                # HACK: magically transform the third " in """ to
                # Id.Left_TDoubleQuote, so that """ is the terminator
                left_dq_token = self.cur_token
                left_dq_token.id = Id.Left_TDoubleQuote
                triple_out.b = True  # let caller know we got it
                return self._ReadDoubleQuoted(left_dq_token)

            return dq_part

        if self.token_type in (Id.Left_SingleQuote, Id.Left_RSingleQuote,
                               Id.Left_DollarSingleQuote):
            if self.token_type == Id.Left_SingleQuote:
                lexer_mode = lex_mode_e.SQ_Raw
                triple_left_id = Id.Left_TSingleQuote
            elif self.token_type == Id.Left_RSingleQuote:
                lexer_mode = lex_mode_e.SQ_Raw
                triple_left_id = Id.Left_RTSingleQuote
            else:
                lexer_mode = lex_mode_e.SQ_C
                # there is no such thing as $'''
                triple_left_id = Id.Undefined_Tok

            sq_part = self._ReadSingleQuoted(self.cur_token, lexer_mode)

            # Got empty '' or r'' and there's a ' after
            # u'' and b'' are handled in _ReadYshSingleQuoted
            if (triple_left_id != Id.Undefined_Tok and
                    triple_out is not None and len(sq_part.sval) == 0 and
                    self.lexer.ByteLookAhead() == "'"):

                self._SetNext(lex_mode_e.ShCommand)
                self._GetToken()

                # HACK: magically transform the third ' in ''' to
                # Id.Left_TSingleQuote, so that ''' is the terminator
                left_sq_token = self.cur_token
                left_sq_token.id = triple_left_id

                triple_out.b = True  # let caller know we got it
                return self._ReadSingleQuoted(left_sq_token, lexer_mode)

            return sq_part

        if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick,
                               Id.Left_ProcSubIn, Id.Left_ProcSubOut):
            return self._ReadCommandSub(self.token_type, d_quoted=False)

        if self.token_type == Id.Left_DollarBrace:
            return self._ReadBracedVarSub(self.cur_token, d_quoted=False)

        if self.token_type == Id.Left_DollarDParen:
            return self._ReadArithSub()

        if self.token_type == Id.Left_DollarBracket:
            return self._ReadExprSub(lex_mode_e.ShCommand)

        if self.token_type == Id.Left_DollarBraceZsh:
            return self._ReadZshVarSub(self.cur_token)

        raise AssertionError(self.cur_token)

    def _ReadExtGlob(self):
        # type: () -> word_part.ExtGlob
        """
        Grammar:
          Item = CompoundWord | EPSILON  # important: @(foo|) is allowed
          LEFT = '@(' | '*(' | '+(' | '?(' | '!('
          RIGHT = ')'
          ExtGlob = LEFT (Item '|')* Item RIGHT  # ITEM may be empty
          Compound includes ExtGlob
        """
        left_token = self.cur_token
        right_token = None  # type: Token
        arms = []  # type: List[CompoundWord]

        self.lexer.PushHint(Id.Op_RParen, Id.Right_ExtGlob)
        self._SetNext(lex_mode_e.ExtGlob)  # advance past LEFT

        read_word = False  # did we just read a word?  To handle @(||).

        while True:
            self._GetToken()

            if self.token_type == Id.Right_ExtGlob:
                if not read_word:
                    arms.append(CompoundWord([]))
                right_token = self.cur_token
                break

            elif self.token_type == Id.Op_Pipe:
                if not read_word:
                    arms.append(CompoundWord([]))
                read_word = False
                self._SetNext(lex_mode_e.ExtGlob)

            # lex_mode_e.ExtGlob should only produce these 4 kinds of tokens
            elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub,
                                     Kind.ExtGlob):
                w = self._ReadCompoundWord(lex_mode_e.ExtGlob)
                arms.append(w)
                read_word = True

            elif self.token_kind == Kind.Eof:
                p_die('Unexpected EOF reading extended glob that began here',
                      left_token)

            else:
                raise AssertionError(self.cur_token)

        return word_part.ExtGlob(left_token, arms, right_token)

    def _ReadBashRegexGroup(self):
        # type: () -> word_part.BashRegexGroup
        """
        Grammar:
          BashRegexGroup = '(' WORD? ')'
        """
        left_token = self.cur_token
        assert left_token.id == Id.BashRegex_LParen, left_token

        right_token = None  # type: Token
        arms = []  # type: List[CompoundWord]

        self.lexer.PushHint(Id.Op_RParen, Id.Right_BashRegexGroup)
        self._SetNext(lex_mode_e.BashRegexFakeInner)  # advance past LEFT

        self._GetToken()
        if self.token_type == Id.Right_BashRegexGroup:  # empty ()
            return word_part.BashRegexGroup(left_token, None, self.cur_token)

        # lex_mode_e.BashRegex should only produce these 4 kinds of tokens
        if self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub, Kind.BashRegex):
            # Fake lexer mode that translates Id.WS_Space to Id.Lit_Chars
            # To allow bash style [[ s =~ (a b) ]]
            w = self._ReadCompoundWord(lex_mode_e.BashRegexFakeInner)
            arms.append(w)

            self._GetToken()
            if self.token_type != Id.Right_BashRegexGroup:
                p_die('Expected ) to close bash regex group', self.cur_token)

            return word_part.BashRegexGroup(left_token, w, self.cur_token)

        p_die('Expected word after ( opening bash regex group', self.cur_token)

    def _ReadLikeDQ(self, left_token, is_ysh_expr, out_parts):
        # type: (Optional[Token], bool, List[word_part_t]) -> None
        """
        Args:
          left_token: A token if we are reading a double quoted part, or None if
            we're reading a here doc.
          is_ysh_expr: Whether to disallow backticks and invalid char escapes
          out_parts: list of word_part to append to
        """
        if left_token:
            expected_end_tokens = 3 if left_token.id == Id.Left_TDoubleQuote else 1
        else:
            expected_end_tokens = 1000  # here doc will break

        num_end_tokens = 0
        while num_end_tokens < expected_end_tokens:
            self._SetNext(lex_mode_e.DQ)
            self._GetToken()

            if self.token_kind == Kind.Lit:
                if self.token_type == Id.Lit_EscapedChar:
                    tok = self.cur_token
                    ch = lexer.TokenSliceLeft(tok, 1)
                    part = word_part.EscapedLiteral(tok,
                                                    ch)  # type: word_part_t
                else:
                    if self.token_type == Id.Lit_BadBackslash:
                        # echo "\z" is OK in shell, but 'x = "\z" is a syntax error in
                        # YSH.
                        # Slight hole: We don't catch 'x = ${undef:-"\z"} because of the
                        # recursion (unless parse_backslash)
                        if (is_ysh_expr or
                                not self.parse_opts.parse_backslash()):
                            p_die(
                                "Invalid char escape in double quoted string (OILS-ERR-12)",
                                self.cur_token)
                    elif self.token_type == Id.Lit_Dollar:
                        if is_ysh_expr or not self.parse_opts.parse_dollar():
                            p_die("Literal $ should be quoted like \$",
                                  self.cur_token)

                    part = self.cur_token
                out_parts.append(part)

            elif self.token_kind == Kind.Left:
                if self.token_type == Id.Left_Backtick and is_ysh_expr:
                    p_die("Invalid backtick: use $(cmd) or \\` in YSH strings",
                          self.cur_token)

                part = self._ReadDoubleQuotedLeftParts()
                out_parts.append(part)

            elif self.token_kind == Kind.VSub:
                tok = self.cur_token
                part = SimpleVarSub(tok)
                out_parts.append(part)
                # NOTE: parsing "$f(x)" would BREAK CODE.  Could add a mode for
                # it later.

            elif self.token_kind == Kind.Right:
                assert self.token_type == Id.Right_DoubleQuote, self.token_type
                if left_token:
                    num_end_tokens += 1

                # In a here doc, the right quote is literal!
                out_parts.append(self.cur_token)

            elif self.token_kind == Kind.Eof:
                if left_token:
                    p_die(
                        'Unexpected EOF reading double-quoted string that began here',
                        left_token)
                else:  # here docs will have an EOF in their token stream
                    break

            else:
                raise AssertionError(self.cur_token)

            if self.token_kind != Kind.Right:
                num_end_tokens = 0  # """ must be CONSECUTIVE

        if expected_end_tokens == 1:
            out_parts.pop()
        elif expected_end_tokens == 3:
            out_parts.pop()
            out_parts.pop()
            out_parts.pop()

        # Remove space from """ in both expression mode and command mode
        if left_token and left_token.id == Id.Left_TDoubleQuote:
            word_compile.RemoveLeadingSpaceDQ(out_parts)

        # Return nothing, since we appended to 'out_parts'

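    # The left_token=None case above is how here docs share this loop
    # (illustrative):
    #
    #   cat <<EOF
    #   hello $name       # parsed by _ReadLikeDQ(None, False, parts)
    #   a "quoted" part   # Kind.Right: the " is appended as a literal
    #   EOF               # Kind.Eof breaks out of the loop
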
    def _ReadDoubleQuoted(self, left_token):
        # type: (Token) -> DoubleQuoted
        """Helper function for "hello $name".

        Args:
          left_token: the opening " token.  Stopping at } and the here doc
            context are handled by _ReadLikeDQ.

        Also ${foo%%a b c}  # treat this as double quoted, until you hit }
        """
        parts = []  # type: List[word_part_t]
        self._ReadLikeDQ(left_token, False, parts)

        right_quote = self.cur_token
        return DoubleQuoted(left_token, parts, right_quote)

    def ReadDoubleQuoted(self, left_token, parts):
        # type: (Token, List[word_part_t]) -> Token
        """For expression mode.

        Read var x = "${dir:-}/$name"; etc.
        """
        self._ReadLikeDQ(left_token, True, parts)
        return self.cur_token

    def _ReadCommandSub(self, left_id, d_quoted=False):
        # type: (Id_t, bool) -> CommandSub
        """
        NOTE: This is not in the grammar, because word parts aren't in the grammar!

        command_sub = '$(' command_list ')'
                    | '@(' command_list ')'
                    | '<(' command_list ')'
                    | '>(' command_list ')'
                    | ` command_list `
        """
        left_token = self.cur_token

        # Set the lexer in a state so ) becomes the EOF token.
        if left_id in (Id.Left_DollarParen, Id.Left_AtParen, Id.Left_ProcSubIn,
                       Id.Left_ProcSubOut):
            self._SetNext(lex_mode_e.ShCommand)  # advance past $( etc.

            right_id = Id.Eof_RParen
            self.lexer.PushHint(Id.Op_RParen, right_id)
            c_parser = self.parse_ctx.MakeParserForCommandSub(
                self.line_reader, self.lexer, right_id)
            # NOTE: This doesn't use something like main_loop because we don't want
            # to interleave parsing and execution!  Unlike 'source' and 'eval'.
            node = c_parser.ParseCommandSub()

            right_token = c_parser.w_parser.cur_token

        elif left_id == Id.Left_Backtick and self.parse_ctx.do_lossless:
            # NOTE: This is an APPROXIMATE solution for translation ONLY.  See
            # test/osh2oil.

            right_id = Id.Eof_Backtick
            self.lexer.PushHint(Id.Left_Backtick, right_id)
            c_parser = self.parse_ctx.MakeParserForCommandSub(
                self.line_reader, self.lexer, right_id)
            node = c_parser.ParseCommandSub()
            right_token = c_parser.w_parser.cur_token

        elif left_id == Id.Left_Backtick:
            if not self.parse_opts.parse_backticks():
                p_die('Use $(cmd) instead of backticks (parse_backticks)',
                      left_token)

            self._SetNext(lex_mode_e.Backtick)  # advance past `

            parts = []  # type: List[str]
            while True:
                self._GetToken()
                #log("TOK %s", self.cur_token)

                if self.token_type == Id.Backtick_Quoted:
                    # Remove leading \
                    parts.append(lexer.TokenSliceLeft(self.cur_token, 1))

                elif self.token_type == Id.Backtick_DoubleQuote:
                    # Compatibility: If backticks are double quoted, then double quotes
                    # within them have to be \"
                    # Shells aren't smart enough to match nested " and ` quotes (but OSH
                    # is)
                    if d_quoted:
                        # Remove leading \
                        parts.append(lexer.TokenSliceLeft(self.cur_token, 1))
                    else:
                        parts.append(lexer.TokenVal(self.cur_token))

                elif self.token_type == Id.Backtick_Other:
                    parts.append(lexer.TokenVal(self.cur_token))

                elif self.token_type == Id.Backtick_Right:
                    break

                elif self.token_type == Id.Eof_Real:
                    # Note: this parse error is in the ORIGINAL context.  No code_str yet.
                    p_die('Unexpected EOF while looking for closing backtick',
                          left_token)

                else:
                    raise AssertionError(self.cur_token)

                self._SetNext(lex_mode_e.Backtick)

            # Calculate right SPID on CommandSub BEFORE re-parsing.
            right_token = self.cur_token

            code_str = ''.join(parts)
            #log('code %r', code_str)

            # NOTE: This is similar to how we parse aliases in osh/cmd_parse.py.  It
            # won't have the same location info as MakeParserForCommandSub(), because
            # the lexer is different.
            arena = self.parse_ctx.arena
            #arena = alloc.Arena()
            line_reader = reader.StringLineReader(code_str, arena)
            c_parser = self.parse_ctx.MakeOshParser(line_reader)
            src = source.Reparsed('backticks', left_token, right_token)
            with alloc.ctx_SourceCode(arena, src):
                node = c_parser.ParseCommandSub()

        else:
            raise AssertionError(left_id)

        return CommandSub(left_token, node, right_token)

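    # Examples for the three cases above (illustrative):
    #
    #   $(echo hi)        # parsed in place; PushHint turns ) into Id.Eof_RParen
    #   `echo hi`         # tokens are collected, \` and \" unescaped, and the
    #                     # resulting string is re-parsed as a command
    #   "`echo \"x\"`"    # d_quoted=True: the inner \" is unescaped to "
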
    def _ReadExprSub(self, lex_mode):
        # type: (lex_mode_t) -> word_part.ExprSub
        """$[d->key]  $[obj.method()]  etc."""
        left_token = self.cur_token

        self._SetNext(lex_mode_e.Expr)
        enode, right_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.ysh_expr_sub)

        self._SetNext(lex_mode)  # Move past ]
        return word_part.ExprSub(left_token, enode, right_token)

    def ParseVarDecl(self, kw_token):
        # type: (Token) -> command.VarDecl
        """
        oil_var_decl: name_type_list '=' testlist end_stmt

        Note that assignments must end with \n  ;  }  or EOF.  Unlike shell
        assignments, we disallow:

            var x = 42 | wc -l
            var x = 42 && echo hi
        """
        self._SetNext(lex_mode_e.Expr)
        enode, last_token = self.parse_ctx.ParseVarDecl(kw_token, self.lexer)
        # Hack to move } from what the Expr lexer mode gives to what
        # CommandParser wants
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace

        # Let the CommandParser see the Op_Semi or Op_Newline.
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)  # always back to this
        return enode

    def ParseMutation(self, kw_token, var_checker):
        # type: (Token, VarChecker) -> command.Mutation
        """
        setvar i = 42
        setvar i += 1
        setvar a[i] = 42
        setvar a[i] += 1
        setvar d.key = 42
        setvar d.key += 1
        """
        self._SetNext(lex_mode_e.Expr)
        enode, last_token = self.parse_ctx.ParseMutation(kw_token, self.lexer)
        # Hack to move } from what the Expr lexer mode gives to what
        # CommandParser wants
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace

        for lhs in enode.lhs:
            UP_lhs = lhs
            with tagswitch(lhs) as case:
                if case(y_lhs_e.Var):
                    lhs = cast(Token, UP_lhs)
                    var_checker.Check(kw_token.id, lexer.LazyStr(lhs), lhs)

                # Note: this does not cover cases like
                #   setvar (a[0])[1] = v
                #   setvar (d.key).other = v
                # This leaks into catching all typos statically, which may be
                # possible if 'use' makes all names explicit.
                elif case(y_lhs_e.Subscript):
                    lhs = cast(Subscript, UP_lhs)
                    if lhs.obj.tag() == expr_e.Var:
                        v = cast(expr.Var, lhs.obj)
                        var_checker.Check(kw_token.id, v.name, v.left)

                elif case(y_lhs_e.Attribute):
                    lhs = cast(Attribute, UP_lhs)
                    if lhs.obj.tag() == expr_e.Var:
                        v = cast(expr.Var, lhs.obj)
                        var_checker.Check(kw_token.id, v.name, v.left)

        # Let the CommandParser see the Op_Semi or Op_Newline.
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)  # always back to this
        return enode

    def ParseBareDecl(self):
        # type: () -> expr_t
        """
        x = {name: val}
        """
        self._SetNext(lex_mode_e.Expr)
        self._GetToken()
        enode, last_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.command_expr)
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)
        return enode

    def ParseYshExprForCommand(self):
        # type: () -> expr_t

        # Fudge for this case
        #  for x in(y) {
        # versus
        #  for x in (y) {
        #
        # In the former case, ReadWord on 'in' puts the lexer past (.
        # Also see LookPastSpace in CommandParser.
        # A simpler solution would be nicer.

        if self.token_type == Id.Op_LParen:
            self.lexer.MaybeUnreadOne()

        enode, _ = self.parse_ctx.ParseYshExpr(self.lexer, grammar_nt.ysh_expr)

        self._SetNext(lex_mode_e.ShCommand)
        return enode

    def ParseCommandExpr(self):
        # type: () -> expr_t
        """
        = 1+2
        """
        enode, last_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.command_expr)

        # In some cases, such as the case statement, we expect *the lexer* to be
        # pointing at the token right after the expression.  But the expression
        # parser must have read to the `last_token`.  Unreading places the lexer
        # back in the expected state.  I.e.:
        #
        #   case (x) {                 case (x) {
        #     (else) { = x }             (else) { = x }
        #            ^ The lexer                ^ Unread to here
        #              is here
        #   }                          }
        assert last_token.id in (Id.Op_Newline, Id.Eof_Real, Id.Op_Semi,
                                 Id.Op_RBrace), last_token
        if last_token.id != Id.Eof_Real:
            # Eof_Real is the only token we cannot unread
            self.lexer.MaybeUnreadOne()

        return enode

    def ParseProc(self, node):
        # type: (Proc) -> None

        # proc name-with-hyphens() must be accepted
        self._SetNext(lex_mode_e.ShCommand)
        self._GetToken()
        # example: 'proc f[' gets you Lit_ArrayLhsOpen
        if self.token_type != Id.Lit_Chars:
            p_die('Invalid proc name %s' % ui.PrettyToken(self.cur_token),
                  self.cur_token)

        # TODO: validate this more.  Disallow proc 123 { }, which isn't disallowed
        # for shell functions.  Similar to IsValidVarName().
        node.name = self.cur_token

        last_token = self.parse_ctx.ParseProc(self.lexer, node)

        # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
        assert last_token.id == Id.Op_LBrace
        last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        self._SetNext(lex_mode_e.ShCommand)

    def ParseFunc(self, node):
        # type: (Func) -> None
        last_token = self.parse_ctx.ParseFunc(self.lexer, node)

        # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
        assert last_token.id == Id.Op_LBrace
        last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        self._SetNext(lex_mode_e.ShCommand)

    def ParseYshCasePattern(self):
        # type: () -> Tuple[pat_t, Token]
        pat, left_tok, last_token = self.parse_ctx.ParseYshCasePattern(
            self.lexer)

        if last_token.id == Id.Op_LBrace:
            last_token.id = Id.Lit_LBrace
            self.buffered_word = last_token

        return pat, left_tok

    def NewlineOkForYshCase(self):
        # type: () -> Id_t
        """Check for optional newline and consume it.

        This is a special case of `_NewlineOk` which fixes some "off-by-one"
        issues which crop up while parsing YSH case arms.  For more details, see
        #oil-dev > Progress On YSH Case Grammar on zulip.

        Returns a token id which is filled with the choice of

          word           { echo word }
          (3)            { echo expr }
          /e/            { echo eggex }
          }              # right brace
        """
        while True:
            next_id = self.lexer.LookAheadOne(lex_mode_e.Expr)

            # Cannot lookahead past lines
            if next_id == Id.Unknown_Tok:
                self.lexer.MoveToNextLine()
                continue

            next_kind = consts.GetKind(next_id)
            if next_id != Id.Op_Newline and next_kind != Kind.Ignored:
                break

            self.lexer.Read(lex_mode_e.Expr)

        if next_id in (Id.Op_RBrace, Id.Op_LParen, Id.Arith_Slash):
            self._SetNext(lex_mode_e.Expr)  # Continue in expression mode
        else:
            # Consume the trailing Op_Newline
            self._SetNext(lex_mode_e.ShCommand)
            self._GetToken()

        return next_id

    def _ReadArithExpr(self, end_id):
        # type: (Id_t) -> arith_expr_t
        """Read and parse an arithmetic expression in various contexts.

        $(( 1+2 ))
        (( a=1+2 ))
        ${a[ 1+2 ]}
        ${a : 1+2 : 1+2}

        See tests/arith-context.test.sh for ambiguous cases.

        ${a[a[0]]} is valid  # VS_RBRACKET vs Id.Arith_RBracket

        ${s : a<b?0:1 : 1}  # VS_COLON vs Id.Arith_Colon

        See the assertion in ArithParser.Parse() -- unexpected extra input.
        """
        # calls self.ReadWord(lex_mode_e.Arith)
        anode = self.a_parser.Parse()
        cur_id = self.a_parser.CurrentId()
        if end_id != Id.Undefined_Tok and cur_id != end_id:
            p_die(
                'Unexpected token after arithmetic expression (%s != %s)' %
                (ui.PrettyId(cur_id), ui.PrettyId(end_id)),
                loc.Word(self.a_parser.cur_word))
        return anode

    def _ReadArithSub(self):
        # type: () -> word_part.ArithSub
        """Read an arith substitution, which contains an arith expression, e.g.

        $((a + 1)).
        """
        left_tok = self.cur_token

        # The second one needs to be disambiguated in stuff like:
        #   $(echo $(( 1+2 )) )
        self.lexer.PushHint(Id.Op_RParen, Id.Right_DollarDParen)

        # NOTE: To disambiguate $(( as arith sub vs. command sub and subshell, we
        # could save the lexer/reader state here, and retry if the arithmetic parse
        # fails.  But we can almost always catch this at parse time.  There could
        # be some exceptions like:
        #   $((echo * foo))  # looks like multiplication
        #   $((echo / foo))  # looks like division

        self._SetNext(lex_mode_e.Arith)
        anode = self._ReadArithExpr(Id.Arith_RParen)

        # TODO: This could be DQ or Arith too
        self._SetNext(lex_mode_e.ShCommand)

        # PROBLEM: $(echo $(( 1 + 2 )) )
        # Two right parens break the Id.Eof_RParen scheme
        self._GetToken()
        if self.token_type != Id.Right_DollarDParen:
            p_die('Expected second ) to end arith sub', self.cur_token)

        right_tok = self.cur_token
        return word_part.ArithSub(left_tok, anode, right_tok)

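    # Example of the PushHint disambiguation above (illustrative):
    #
    #   $(echo $(( 1 + 2 )) )
    #                    ^^ the inner )) must close the arith sub, not the
    #                       command sub, so Op_RParen is re-mapped to
    #                       Id.Right_DollarDParen
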
    def ReadDParen(self):
        # type: () -> Tuple[arith_expr_t, Token]
        """Read ((1+ 2))  -- command context.

        We're using the word parser because it's very similar to _ReadArithExpr
        above.

        This also returns the terminating `Op_DRightParen` token for use as location
        tracking.
        """
        # The second ) needs to be disambiguated, as in _ReadArithSub above.
        # TODO: Be consistent with ReadForExpression below and use lex_mode_e.Arith?
        # Then you can get rid of this.
        self.lexer.PushHint(Id.Op_RParen, Id.Op_DRightParen)

        self._SetNext(lex_mode_e.Arith)
        anode = self._ReadArithExpr(Id.Arith_RParen)

        self._SetNext(lex_mode_e.ShCommand)

        # PROBLEM: $(echo $(( 1 + 2 )) )
        self._GetToken()
        right = self.cur_token
        if self.token_type != Id.Op_DRightParen:
            p_die('Expected second ) to end arith statement', self.cur_token)

        self._SetNext(lex_mode_e.ShCommand)

        return anode, right

    def _SetNextNonSpace(self):
        # type: () -> None
        """Same logic as _ReadWord, but for ReadForExpression."""
        while True:
            self._SetNext(lex_mode_e.Arith)
            self._GetToken()
            if self.token_kind not in (Kind.Ignored, Kind.WS):
                break

    def ReadForExpression(self):
        # type: () -> command.ForExpr
        """Read ((i=0; i<5; ++i)) -- part of command context."""
        self._SetNextNonSpace()  # skip over ((

        self._GetToken()
        cur_id = self.token_type  # for end of arith expressions

        if cur_id == Id.Arith_Semi:  # for (( ; i < 10; i++ ))
            init_node = None  # type: Optional[arith_expr_t]
        else:
            init_node = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()
        self._SetNextNonSpace()

        # It's odd to keep track of both cur_id and self.token_type in this
        # function, but it works, and is tested in 'test/parse_error.sh
        # arith-integration'
        if cur_id != Id.Arith_Semi:  # for (( x=0 b; ... ))
            p_die("Expected ; here", loc.Word(self.a_parser.cur_word))

        self._GetToken()
        cur_id = self.token_type

        if cur_id == Id.Arith_Semi:  # for (( ; ; i++ ))
            cond_node = None  # type: Optional[arith_expr_t]
        else:
            cond_node = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()
        self._SetNextNonSpace()

        if cur_id != Id.Arith_Semi:  # for (( x=0; x<5 b ))
            p_die("Expected ; here", loc.Word(self.a_parser.cur_word))

        self._GetToken()
        cur_id = self.token_type

        if cur_id == Id.Arith_RParen:  # for (( ; ; ))
            update_node = None  # type: Optional[arith_expr_t]
        else:
            update_node = self._ReadArithExpr(Id.Arith_RParen)
        self._SetNextNonSpace()

        self._GetToken()
        if self.token_type != Id.Arith_RParen:
            p_die('Expected ) to end for loop expression', self.cur_token)
        self._SetNext(lex_mode_e.ShCommand)

        # redirects is None, will be assigned in CommandEvaluator
        node = command.ForExpr.CreateNull()
        node.init = init_node
        node.cond = cond_node
        node.update = update_node
        return node

    def _ReadArrayLiteral(self):
        # type: () -> word_part_t
        """a=(1 2 3)

        TODO: See osh/cmd_parse.py:164 for Id.Lit_ArrayLhsOpen, for a[x++]=1

        We want:

            A=(['x']=1 ["x"]=2 [$x$y]=3)

        Maybe allow this as a literal string?  Because I think I've seen it before?
        Or maybe force people to patch to learn the rule.

            A=([x]=4)

        Starts with Lit_Other '[', and then it has Lit_ArrayLhsClose
        Maybe enforce that ALL have keys or NONE have keys.
        """
        self._SetNext(lex_mode_e.ShCommand)  # advance past (
        self._GetToken()
        if self.cur_token.id != Id.Op_LParen:
            p_die('Expected ( after =', self.cur_token)
        left_token = self.cur_token
        right_token = None  # type: Token

        # MUST use a new word parser (with same lexer).
        w_parser = self.parse_ctx.MakeWordParser(self.lexer, self.line_reader)
        words = []  # type: List[CompoundWord]
        done = False
        while not done:
            w = w_parser.ReadWord(lex_mode_e.ShCommand)
            with tagswitch(w) as case:
                if case(word_e.Operator):
                    tok = cast(Token, w)
                    if tok.id == Id.Right_ShArrayLiteral:
                        right_token = tok
                        done = True  # can't use break here
                    # Unlike command parsing, array parsing allows embedded \n.
                    elif tok.id == Id.Op_Newline:
                        continue
                    else:
                        p_die('Unexpected token in array literal', loc.Word(w))

                elif case(word_e.Compound):
                    words.append(cast(CompoundWord, w))

                else:
                    raise AssertionError()

        if len(words) == 0:  # a=() is empty indexed array
            # Needed for type safety, doh
            no_words = []  # type: List[word_t]
            node = ShArrayLiteral(left_token, no_words, right_token)
            return node

        pairs = []  # type: List[AssocPair]
        # If the first one is a key/value pair, then the rest are assumed to be.
        pair = word_.DetectAssocPair(words[0])
        if pair:
            pairs.append(pair)

            n = len(words)
            for i in xrange(1, n):
                w2 = words[i]
                pair = word_.DetectAssocPair(w2)
                if not pair:
                    p_die("Expected associative array pair", loc.Word(w2))

                pairs.append(pair)

            # invariant List?
            return word_part.BashAssocLiteral(left_token, pairs, right_token)

        # Brace detection for arrays but NOT associative arrays
        words2 = braces.BraceDetectAll(words)
        words3 = word_.TildeDetectAll(words2)
        return ShArrayLiteral(left_token, words3, right_token)

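    # Examples for the literals handled above (illustrative):
    #
    #   a=(1 2 3)          # ShArrayLiteral, with brace and tilde detection
    #   a=()               # empty indexed array
    #   A=([k]=v [j]=w)    # BashAssocLiteral; the first word decides whether
    #                      # the whole literal is associative
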
    def ParseProcCallArgs(self, start_symbol):
        # type: (int) -> ArgList
        """ json write (x) """
        self.lexer.MaybeUnreadOne()

        arg_list = ArgList.CreateNull(alloc_lists=True)
        arg_list.left = self.cur_token
        self.parse_ctx.ParseProcCallArgs(self.lexer, arg_list, start_symbol)
        return arg_list

1670 def _MaybeReadWordPart(self, is_first, lex_mode, parts):
1671 # type: (bool, lex_mode_t, List[word_part_t]) -> bool
1672 """Helper for _ReadCompoundWord3."""
1673 done = False
1674
1675 if self.token_type == Id.Lit_EscapedChar:
1676 tok = self.cur_token
1677 assert tok.length == 2
1678 ch = lexer.TokenSliceLeft(tok, 1)
1679 if not self.parse_opts.parse_backslash():
1680 if not pyutil.IsValidCharEscape(ch):
1681 p_die('Invalid char escape in unquoted word (OILS-ERR-13)',
1682 self.cur_token)
1683
1684 part = word_part.EscapedLiteral(self.cur_token,
1685 ch) # type: word_part_t
1686 else:
1687 part = self.cur_token
1688
1689 if is_first and self.token_type == Id.Lit_VarLike: # foo=
1690 parts.append(part)
1691 # Unfortunately it's awkward to pull the check for a=(1 2) up to
1692 # _ReadWord.
1693 next_id = self.lexer.LookPastSpace(lex_mode)
1694 if next_id == Id.Op_LParen:
1695 self.lexer.PushHint(Id.Op_RParen, Id.Right_ShArrayLiteral)
1696 part2 = self._ReadArrayLiteral()
1697 parts.append(part2)
1698
1699 # Array literal must be the last part of the word.
1700 self._SetNext(lex_mode)
1701 self._GetToken()
1702 # EOF, whitespace, newline, Right_Subshell
1703 if self.token_kind not in KINDS_THAT_END_WORDS:
1704 p_die('Unexpected token after array literal',
1705 self.cur_token)
1706 done = True
1707
1708 elif (is_first and self.parse_opts.parse_at() and
1709 self.token_type == Id.Lit_Splice):
1710
1711 splice_tok = self.cur_token
1712 part2 = word_part.Splice(splice_tok,
1713 lexer.TokenSliceLeft(splice_tok, 1))
1714
1715 parts.append(part2)
1716
1717 # @words must be the last part of the word
1718 self._SetNext(lex_mode)
1719 self._GetToken()
1720 # EOF, whitespace, newline, Right_Subshell
1721 if self.token_kind not in KINDS_THAT_END_WORDS:
1722 p_die('Unexpected token after array splice', self.cur_token)
1723 done = True
1724
        elif (is_first and self.parse_opts.parse_at() and
              self.token_type == Id.Lit_AtLBracket):  # @[split(x)]
            part2 = self._ReadExprSub(lex_mode_e.DQ)
            parts.append(part2)

            # @[split(x)] must be the last part of the word
            self._SetNext(lex_mode)
            self._GetToken()
            # EOF, whitespace, newline, Right_Subshell
            if self.token_kind not in KINDS_THAT_END_WORDS:
                p_die('Unexpected token after Expr splice', self.cur_token)
            done = True

        elif (is_first and self.parse_opts.parse_at() and
              self.token_type == Id.Lit_AtLBraceDot):
            p_die('TODO: @{.myproc builtin sub}', self.cur_token)

        elif (is_first and self.parse_opts.parse_at_all() and
              self.token_type == Id.Lit_At):
            # Because $[x] ${x} and perhaps $/x/ are reserved, it makes sense for @
            # at the beginning of a word to be reserved.

            # Although, should we relax 'echo @'?  I'm tempted to have a
            # shortcut for @_argv.
            p_die('Literal @ starting a word must be quoted (parse_at_all)',
                  self.cur_token)

        else:
            # not a literal with lookahead; append it
            parts.append(part)

        return done

    def _ReadCompoundWord(self, lex_mode):
        # type: (lex_mode_t) -> CompoundWord
        return self._ReadCompoundWord3(lex_mode, Id.Undefined_Tok, True)

    def _ReadCompoundWord3(self, lex_mode, eof_type, empty_ok):
        # type: (lex_mode_t, Id_t, bool) -> CompoundWord
        """
        Precondition: Looking at the first token of the first word part
        Postcondition: Looking at the token after, e.g. space or operator

        NOTE: eof_type is necessary because / is a literal, i.e. Lit_Slash, but it
        could be an operator delimiting a compound word.  Can we change lexer
        modes and remove this special case?
        """
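        # Illustrative example of eof_type: while parsing the pattern in
        #     ${foo//pat/replace}
        # this is called with eof_type == Id.Lit_Slash, so the / before
        # 'replace' ends the word instead of being a literal.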
        w = CompoundWord([])
        num_parts = 0
        brace_count = 0
        done = False
        is_triple_quoted = None  # type: Optional[BoolParamBox]

        while not done:
            self._GetToken()

            allow_done = empty_ok or num_parts != 0
            if allow_done and self.token_type == eof_type:
                done = True  # e.g. for ${foo//pat/replace}

            # Keywords like "for" are treated like literals
            elif self.token_kind in (Kind.Lit, Kind.History, Kind.KW,
                                     Kind.ControlFlow, Kind.BoolUnary,
                                     Kind.BoolBinary):

                # Count { and } so unbalanced braces are a syntax error below
                if self.token_type == Id.Lit_LBrace:
                    brace_count += 1
                elif self.token_type == Id.Lit_RBrace:
                    brace_count -= 1
                elif self.token_type == Id.Lit_Dollar:
                    if not self.parse_opts.parse_dollar():
                        if num_parts == 0 and lex_mode == lex_mode_e.ShCommand:
                            next_byte = self.lexer.ByteLookAhead()
                            # TODO: switch lexer modes and parse $/d+/.  But not ${a:-$/d+/}
                            if next_byte == '/':
                                #log('next_byte %r', next_byte)
                                pass

                        p_die('Literal $ should be quoted like \$',
                              self.cur_token)

                done = self._MaybeReadWordPart(num_parts == 0, lex_mode,
                                               w.parts)

            elif self.token_kind == Kind.VSub:
                vsub_token = self.cur_token

                part = SimpleVarSub(vsub_token)  # type: word_part_t
                w.parts.append(part)

            elif self.token_kind == Kind.ExtGlob:
                # If parse_at, we can take over @( to start @(seq 3)
                # Users can also write ,(*.py|*.sh)
                if (self.parse_opts.parse_at() and
                        self.token_type == Id.ExtGlob_At and num_parts == 0):
                    cs_part = self._ReadCommandSub(Id.Left_AtParen,
                                                   d_quoted=False)
                    # RARE mutation of tok.id!
                    cs_part.left_token.id = Id.Left_AtParen
                    part = cs_part  # for type safety

                    # Same check as _MaybeReadWordPart.  @(seq 3)x is illegal, just like
                    # a=(one two)x and @arrayfunc(3)x.
                    self._GetToken()
                    if self.token_kind not in KINDS_THAT_END_WORDS:
                        p_die('Unexpected token after @()', self.cur_token)
                    done = True

                else:
                    part = self._ReadExtGlob()
                w.parts.append(part)

            elif self.token_kind == Kind.BashRegex:
                if self.token_type == Id.BashRegex_LParen:  # Opening (
                    part = self._ReadBashRegexGroup()
                    w.parts.append(part)
                else:
                    assert self.token_type == Id.BashRegex_AllowedInParens
                    p_die('Invalid token in bash regex', self.cur_token)

            elif self.token_kind == Kind.Left:
                try_triple_quote = (self.parse_opts.parse_triple_quote() and
                                    lex_mode == lex_mode_e.ShCommand and
                                    num_parts == 0)

                # Save allocation
                if try_triple_quote:
                    is_triple_quoted = BoolParamBox(False)

                part = self._ReadUnquotedLeftParts(is_triple_quoted)
                w.parts.append(part)

            # NOT done yet, will advance below
            elif self.token_kind == Kind.Right:
                # Still part of the word; will be done on the next iter.
                if self.token_type == Id.Right_DoubleQuote:
                    pass
                # Never happens, no PushHint for this case.
                #elif self.token_type == Id.Right_DollarParen:
                #  pass
                elif self.token_type == Id.Right_Subshell:
                    # LEXER HACK for (case x in x) ;; esac )
                    # Rewind before it's used
                    assert self.next_lex_mode == lex_mode_e.Undefined
                    if self.lexer.MaybeUnreadOne():
                        self.lexer.PushHint(Id.Op_RParen, Id.Right_Subshell)
                        self._SetNext(lex_mode)
                    done = True
                else:
                    done = True

            elif self.token_kind == Kind.Ignored:
                done = True

            else:
                # LEXER HACK for unbalanced case clause.  'case foo in esac' is valid,
                # so to test for ESAC, we can read ) before getting a chance to
                # PushHint(Id.Op_RParen, Id.Right_CasePat).  So here we unread one
                # token and do it again.

                # We get Id.Op_RParen at top level:    case x in x) ;; esac
                # We get Id.Eof_RParen inside ComSub:  $(case x in x) ;; esac )
                if self.token_type in (Id.Op_RParen, Id.Eof_RParen):
                    # Rewind before it's used
                    assert self.next_lex_mode == lex_mode_e.Undefined
                    if self.lexer.MaybeUnreadOne():
                        if self.token_type == Id.Eof_RParen:
                            # Redo translation
                            self.lexer.PushHint(Id.Op_RParen, Id.Eof_RParen)
                        self._SetNext(lex_mode)

                done = True  # anything we don't recognize means we're done

            if not done:
                self._SetNext(lex_mode)
                num_parts += 1

        if (self.parse_opts.parse_brace() and num_parts > 1 and
                brace_count != 0):
            # accept { and }, but not foo{
            p_die(
                'Word has unbalanced { }.  Maybe add a space or quote it like \{',
                loc.Word(w))
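        # Illustrative example of the check above: with parse_brace,
        #     echo foo{a
        # is a multi-part word with unbalanced braces, which dies above, while
        # a balanced word like {a,b} is accepted.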

        if is_triple_quoted and is_triple_quoted.b and num_parts > 1:
            p_die('Unexpected parts after triple quoted string',
                  loc.WordPart(w.parts[-1]))
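        # Illustrative example of the check above:
        #     echo '''x'''suffix
        # is a triple quoted string with parts after it, which is rejected.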

        if 0:
            from _devbuild.gen.syntax_asdl import word_part_str
            word_key = ' '.join(word_part_str(p.tag()) for p in w.parts)
            WORD_HIST[word_key] += 1
        return w

    def _ReadArithWord(self):
        # type: () -> Optional[word_t]
        """Helper for ReadArithWord()."""
        self._GetToken()

        if self.token_kind == Kind.Unknown:
            # e.g. happened during dynamic parsing of unset 'a[$foo]' in gherkin
            p_die(
                'Unexpected token while parsing arithmetic: %r' %
                lexer.TokenVal(self.cur_token), self.cur_token)

        elif self.token_kind == Kind.Eof:
            return self.cur_token

        elif self.token_kind == Kind.Ignored:
            # Space should be ignored.
            self._SetNext(lex_mode_e.Arith)
            return None

        elif self.token_kind in (Kind.Arith, Kind.Right):
            # Id.Right_DollarDParen IS just a normal token, handled by ArithParser
            self._SetNext(lex_mode_e.Arith)
            return self.cur_token

        elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub):
            return self._ReadCompoundWord(lex_mode_e.Arith)

        else:
            raise AssertionError(self.cur_token)

    def _ReadWord(self, word_mode):
        # type: (lex_mode_t) -> Optional[word_t]
        """Helper function for ReadWord()."""

        # Change the pseudo lexer mode to a real lexer mode
        if word_mode == lex_mode_e.ShCommandFakeBrack:
            lex_mode = lex_mode_e.ShCommand
        else:
            lex_mode = word_mode

        self._GetToken()

        if self.token_kind == Kind.Eof:
            # No advance
            return self.cur_token

        # Allow Arith for ) at end of for loop?
        elif self.token_kind in (Kind.Op, Kind.Redir, Kind.Arith):
            self._SetNext(lex_mode)

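            # Illustrative example: in multiline mode (e.g. a YSH '...' command),
            # a newline merely separates words, but a blank line, i.e.
            # newline_state > 1, is an error, caught below.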
            # Newlines are complicated.  See 3x2 matrix in the comment about
            # self.multiline and self.newline_state above.
            if self.token_type == Id.Op_Newline:
                if self.multiline:
                    if self.newline_state > 1:
                        # This points at a blank line, but at least it gives the line number
                        p_die('Invalid blank line in multiline mode',
                              self.cur_token)
                    return None

                if self.returned_newline:  # skip
                    return None

            return self.cur_token

        elif self.token_kind == Kind.Right:
            if self.token_type not in (Id.Right_Subshell, Id.Right_ShFunction,
                                       Id.Right_CasePat,
                                       Id.Right_ShArrayLiteral):
                raise AssertionError(self.cur_token)

            self._SetNext(lex_mode)
            return self.cur_token

        elif self.token_kind in (Kind.Ignored, Kind.WS):
            self._SetNext(lex_mode)
            return None

        else:
            assert self.token_kind in (Kind.VSub, Kind.Lit, Kind.History,
                                       Kind.Left, Kind.KW, Kind.ControlFlow,
                                       Kind.BoolUnary, Kind.BoolBinary,
                                       Kind.ExtGlob,
                                       Kind.BashRegex), 'Unhandled token kind'

            if (word_mode == lex_mode_e.ShCommandFakeBrack and
                    self.parse_opts.parse_bracket() and
                    self.token_type == Id.Lit_LBracket):
                # Change [ from Kind.Lit -> Kind.Op
                # So CommandParser can treat
                #     assert [42 === x]
                # like
                #     json write (x)
                bracket_word = self.cur_token
                bracket_word.id = Id.Op_LBracket

                self._SetNext(lex_mode)
                return bracket_word

            # We're beginning a word.  If we see Id.Lit_Pound, change to
            # lex_mode_e.Comment and read until end of line.
            if self.token_type == Id.Lit_Pound:
                self._SetNext(lex_mode_e.Comment)
                self._GetToken()

                # NOTE: The # could be the last character in the file.  It can't be
                # Eof_{RParen,Backtick} because #) and #` are comments.
                assert self.token_type in (Id.Ignored_Comment, Id.Eof_Real), \
                    self.cur_token

                # The next iteration will go into Kind.Ignored and set lex state to
                # lex_mode_e.ShCommand/etc.
                return None  # tell ReadWord() to try again after comment

            elif self.token_type == Id.Lit_TPound:  ### doc comment
                self._SetNext(lex_mode_e.Comment)
                self._GetToken()

                if self.token_type == Id.Ignored_Comment and self.emit_doc_token:
                    return self.cur_token

                return None  # tell ReadWord() to try again after comment

            else:
                # r'' u'' b''
                if (self.token_type == Id.Lit_Chars and
                        self.lexer.LookAheadOne(
                            lex_mode_e.ShCommand) == Id.Left_SingleQuote):

                    # When shopt -s parse_raw_string:
                    #     echo r'hi' is like echo 'hi'
                    #
                    #     echo u'\u{3bc}' b'\yff' works

                    tok = self.cur_token
                    if self.parse_opts.parse_ysh_string():
                        if lexer.TokenEquals(tok, 'r'):
                            left_id = Id.Left_RSingleQuote
                        elif lexer.TokenEquals(tok, 'u'):
                            left_id = Id.Left_USingleQuote
                        elif lexer.TokenEquals(tok, 'b'):
                            left_id = Id.Left_BSingleQuote
                        else:
                            left_id = Id.Undefined_Tok

                        if left_id != Id.Undefined_Tok:
                            # skip the r, and then 'foo' will be read as normal
                            self._SetNext(lex_mode_e.ShCommand)

                            self._GetToken()
                            assert self.token_type == Id.Left_SingleQuote, \
                                self.token_type

                            # Read the word in a different lexer mode
                            return self._ReadYshSingleQuoted(left_id)

                return self._ReadCompoundWord(lex_mode)

    def ParseVarRef(self):
        # type: () -> BracedVarSub
        """DYNAMIC parsing of what's inside ${!ref}

        # Same as VarOf production
        VarRefExpr = VarOf EOF
        """
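        # Illustrative example of the dynamic case: given
        #     x=1; ref=x; echo ${!ref}
        # the *value* of ref, the string 'x', is re-parsed here at runtime as
        # a VarOf production.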
        self._SetNext(lex_mode_e.VSub_1)

        self._GetToken()
        if self.token_kind != Kind.VSub:
            p_die('Expected var name', self.cur_token)

        part = self._ParseVarOf()
        # NOTE: no ${ } means no part.left and part.right
        part.left = part.token  # cheat to make test pass
        part.right = part.token

        self._GetToken()
        if self.token_type != Id.Eof_Real:
            p_die('Expected end of var ref expression', self.cur_token)
        return part

    def LookPastSpace(self):
        # type: () -> Id_t
        """Look ahead to the next token.

        For the CommandParser to recognize
            array= (1 2 3)
            YSH for ( versus bash for ((
            YSH if ( versus if test
            YSH while ( versus while test
            YSH bare assignment 'grep =' versus 'grep foo'
        """
        assert self.token_type != Id.Undefined_Tok
        if self.cur_token.id == Id.WS_Space:
            id_ = self.lexer.LookPastSpace(lex_mode_e.ShCommand)
        else:
            id_ = self.cur_token.id
        return id_

    def LookAheadFuncParens(self):
        # type: () -> bool
        """Special lookahead for f( ) { echo hi; } to check for ( )"""
        assert self.token_type != Id.Undefined_Tok

        # We have to handle 2 cases because we buffer a token
        if self.cur_token.id == Id.Op_LParen:  # saw funcname(
            return self.lexer.LookAheadFuncParens(1)  # go back one char

        elif self.cur_token.id == Id.WS_Space:  # saw funcname WHITESPACE
            return self.lexer.LookAheadFuncParens(0)

        else:
            return False

    def ReadWord(self, word_mode):
        # type: (lex_mode_t) -> word_t
        """Read the next word, using the given lexer mode.

        This is a stateful wrapper for the stateless _ReadWord function.
        """
        assert word_mode in (lex_mode_e.ShCommand,
                             lex_mode_e.ShCommandFakeBrack,
                             lex_mode_e.DBracket, lex_mode_e.BashRegex)

        if self.buffered_word:  # For integration with pgen2
            w = self.buffered_word
            self.buffered_word = None
        else:
            while True:
                w = self._ReadWord(word_mode)
                if w is not None:
                    break

        self.returned_newline = (word_.CommandId(w) == Id.Op_Newline)
        return w

    def ReadArithWord(self):
        # type: () -> word_t
        while True:
            w = self._ReadArithWord()
            if w is not None:
                break
        return w

    def ReadHereDocBody(self, parts):
        # type: (List[word_part_t]) -> None
        """
        A here doc is like a double quoted context, except " isn't special.
        """
        self._ReadLikeDQ(None, False, parts)
        # Returns nothing

    def ReadForPlugin(self):
        # type: () -> CompoundWord
        """For $PS1, $PS4, etc.

        This is just like reading a here doc line.  "\n" is allowed, as
        well as the typical substitutions ${x} $(echo hi) $((1 + 2)).
        """
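        # Illustrative example: a prompt value like
        #     PS1='$(date) $ '
        # is read with this method, so the $(date) command sub is parsed just
        # as it would be in a here doc line.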
        w = CompoundWord([])
        self._ReadLikeDQ(None, False, w.parts)
        return w

    def EmitDocToken(self, b):
        # type: (bool) -> None
        self.emit_doc_token = b

    def Multiline(self, b):
        # type: (bool) -> None
        self.multiline = b


if 0:
    import collections
    WORD_HIST = collections.Counter()

# vim: sw=4