OILS / osh / word_parse.py

1# Copyright 2016 Andy Chu. All rights reserved.
2# Licensed under the Apache License, Version 2.0 (the "License");
3# you may not use this file except in compliance with the License.
4# You may obtain a copy of the License at
5#
6# http://www.apache.org/licenses/LICENSE-2.0
7"""
8word_parse.py - Parse the shell word language.
9
10Hairy example:
11
12 hi$((1 + 2))"$(echo hi)"${var:-__"$(echo default)"__}
13
14Substitutions can be nested, but which inner subs are allowed depends on the
15outer sub. Notes:
16
17lex_mode_e.ShCommand (_ReadUnquotedLeftParts)
18 All subs and quotes are allowed:
19 $v ${v} $() `` $(()) '' "" $'' $"" <() >()
20
21lex_mode_e.DQ (_ReadDoubleQuotedLeftParts)
22 Var, Command, Arith, but no quotes.
23 $v ${v} $() `` $(())
24 No process substitution.
25
26lex_mode_e.Arith
27 Similar to DQ: Var, Command, and Arith sub, but no process sub. bash doesn't
28 allow quotes, but OSH does. We allow ALL FOUR kinds of quotes, because we
29 need those for associative array indexing.
30
31lex_mode_e.VSub_ArgUnquoted
32 Like ShCommand, everything is allowed (even process substitutions), but we
33 stop at }, and space is SIGNIFICANT.
34
35 Example: ${a:- b }
36
37 ${X:-$v} ${X:-${v}} ${X:-$(echo hi)} ${X:-`echo hi`} ${X:-$((1+2))}
38 ${X:-'single'} ${X:-"double"} ${X:-$'\n'} ${X:-<(echo hi)}
39
40lex_mode_e.VSub_ArgDQ
41 In contrast to DQ, VSub_ArgDQ accepts nested "" and $'' and $"", e.g.
42 "${x:-"default"}".
43
44 In contrast, VSub_ArgUnquoted respects single quotes and process
45 substitution.
46
47 It's weird that double quotes are allowed. Space is also significant here,
48 e.g. "${x:-a "b"}".
49"""
50
51from _devbuild.gen import grammar_nt
52from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str, Kind
53from _devbuild.gen.types_asdl import (lex_mode_t, lex_mode_e)
54from _devbuild.gen.syntax_asdl import (
55 BoolParamBox,
56 Token,
57 SimpleVarSub,
58 loc,
59 source,
60 DoubleQuoted,
61 SingleQuoted,
62 BracedVarSub,
63 CommandSub,
64 ShArrayLiteral,
65 AssocPair,
66 bracket_op,
67 bracket_op_t,
68 suffix_op,
69 suffix_op_t,
70 rhs_word,
71 rhs_word_e,
72 rhs_word_t,
73 word_e,
74 word_t,
75 CompoundWord,
76 word_part,
77 word_part_t,
78 y_lhs_e,
79 arith_expr_t,
80 command,
81 expr,
82 expr_e,
83 expr_t,
84 pat_t,
85 ArgList,
86 Proc,
87 Func,
88 Subscript,
89 Attribute,
90)
91from core import alloc
92from core.error import p_die
93from mycpp.mylib import log
94from core import pyutil
95from core import ui
96from frontend import consts
97from frontend import lexer
98from frontend import reader
99from osh import tdop
100from osh import arith_parse
101from osh import braces
102from osh import word_
103from osh import word_compile
104from mycpp.mylib import tagswitch
105
106from typing import List, Optional, Tuple, cast
107from typing import TYPE_CHECKING
108if TYPE_CHECKING:
109 from frontend.lexer import Lexer
110 from frontend.parse_lib import ParseContext
111 from frontend.reader import _Reader
112 from osh.cmd_parse import VarChecker
113
114unused1 = log
115unused2 = Id_str
116
117KINDS_THAT_END_WORDS = [Kind.Eof, Kind.WS, Kind.Op, Kind.Right]
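# i.e. a word is terminated by EOF, whitespace, an operator, or a closing
# token like ) or " (Kind.Right)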
118
119
120class WordEmitter(object):
121 """Common interface for [ and [["""
122
123 def __init__(self):
124 # type: () -> None
125 """Empty constructor for mycpp."""
126 pass
127
128 def ReadWord(self, lex_mode):
129 # type: (lex_mode_t) -> word_t
130 raise NotImplementedError()
131
132
133class WordParser(WordEmitter):
134
135 def __init__(self, parse_ctx, lexer, line_reader):
136 # type: (ParseContext, Lexer, _Reader) -> None
137 self.parse_ctx = parse_ctx
138 self.lexer = lexer
139 self.line_reader = line_reader
140 self.arena = line_reader.arena
141
142 self.parse_opts = parse_ctx.parse_opts
143 self.a_parser = tdop.TdopParser(arith_parse.Spec(), self,
144 self.parse_opts)
145 self.Reset()
146
147 def Init(self, lex_mode):
148 # type: (lex_mode_t) -> None
149 """Used to parse arithmetic, see ParseContext."""
150 self.next_lex_mode = lex_mode
151
152 def Reset(self):
153 # type: () -> None
154 """Called by interactive loop."""
155 # For _GetToken()
156 self.cur_token = None # type: Token
157 self.token_kind = Kind.Undefined
158 self.token_type = Id.Undefined_Tok
159
160 self.next_lex_mode = lex_mode_e.ShCommand
161
162 # Boolean mutated by CommandParser via word_.ctx_EmitDocToken. For ### doc
163 # comments
164 self.emit_doc_token = False
165 # Boolean mutated by CommandParser via word_.ctx_Multiline. '...' starts
166 # multiline mode.
167 self.multiline = False
168
169 # For detecting invalid \n\n in multiline mode. Counts what we got
170 # directly from the lexer.
171 self.newline_state = 0
172 # For consolidating \n\n -> \n for the CALLER. This simplifies the parsers
173 # that consume words.
174 self.returned_newline = False
175
176 # For integration with pgen2
177 self.buffered_word = None # type: word_t
178
179 def _GetToken(self):
180 # type: () -> None
181 """Call this when you need to make a decision based on any of:
182
183 self.token_type self.token_kind self.cur_token # contents
184 """
185 if self.next_lex_mode != lex_mode_e.Undefined:
186 self.cur_token = self.lexer.Read(self.next_lex_mode)
187 self.token_type = self.cur_token.id
188 self.token_kind = consts.GetKind(self.token_type)
189
190 # number of consecutive newlines, ignoring whitespace
191 if self.token_type == Id.Op_Newline:
192 self.newline_state += 1
193 elif self.token_kind != Kind.WS:
194 self.newline_state = 0
195
196 self.parse_ctx.trail.AppendToken(self.cur_token) # For completion
197 self.next_lex_mode = lex_mode_e.Undefined
198
199 def _SetNext(self, lex_mode):
200 # type: (lex_mode_t) -> None
201 """Set the next lex state, but don't actually read a token.
202
203 We need this for proper interactive parsing.
204 """
205 self.next_lex_mode = lex_mode
206
207 def _ReadVarOpArg(self, arg_lex_mode):
208 # type: (lex_mode_t) -> rhs_word_t
209
210 # NOTE: Operators like | and < are not treated as special, so ${a:- | >} is
211 # valid, even when unquoted.
212 self._SetNext(arg_lex_mode)
213 self._GetToken()
214
215 w = self._ReadVarOpArg2(arg_lex_mode, Id.Undefined_Tok,
216 True) # empty_ok
217
218 # If the Compound has no parts, and we're in a double-quoted VarSub
219 # arg, and empty_ok, then return Empty. This is so it can evaluate to
220 # the empty string and not get elided.
221 #
222 # Examples:
223 # - "${s:-}", "${s/%pat/}"
224 # It's similar to LooksLikeShAssignment where we turn x= into x=''. And it
225 # has the same potential problem of not having Token location info.
226 #
227 # NOTE: empty_ok is False only for the PatSub pattern, which means we'll
228 # return a Compound with no parts, which is explicitly checked with a
229 # custom error message.
230 if len(w.parts) == 0 and arg_lex_mode == lex_mode_e.VSub_ArgDQ:
231 return rhs_word.Empty
232
233 return w
234
235 def _ReadVarOpArg2(self, arg_lex_mode, eof_type, empty_ok):
236 # type: (lex_mode_t, Id_t, bool) -> CompoundWord
237 """Return a CompoundWord.
238
239 Helper function for _ReadVarOpArg and used directly by
240 _ReadPatSubVarOp.
241 """
242 w = self._ReadCompoundWord3(arg_lex_mode, eof_type, empty_ok)
243 #log('w %s', w)
244 tilde = word_.TildeDetect(w)
245 if tilde:
246 w = tilde
247 return w
248
249 def _ReadSliceVarOp(self):
250 # type: () -> suffix_op.Slice
251 """VarOf ':' ArithExpr (':' ArithExpr )?"""
252 self._SetNext(lex_mode_e.Arith)
253 self._GetToken()
254 cur_id = self.token_type # e.g. Id.Arith_Colon
255
256 if self.token_type == Id.Arith_Colon: # A pun for Id.VOp2_Colon
257 # no beginning specified
258 begin = None # type: Optional[arith_expr_t]
259 else:
260 begin = self.a_parser.Parse()
261 cur_id = self.a_parser.CurrentId()
262
263 if cur_id == Id.Arith_RBrace:
264 no_length = None # type: Optional[arith_expr_t] # No length specified
265 return suffix_op.Slice(begin, no_length)
266
267 # Id.Arith_Colon is a pun for Id.VOp2_Colon
268 if cur_id == Id.Arith_Colon:
269 self._SetNext(lex_mode_e.Arith)
270 length = self._ReadArithExpr(Id.Arith_RBrace)
271 return suffix_op.Slice(begin, length)
272
273 p_die("Expected : or } in slice", self.cur_token)
274 raise AssertionError() # for MyPy
275
276 def _ReadPatSubVarOp(self):
277 # type: () -> suffix_op.PatSub
278 """Looking at the first '/' after VarOf:
279
280 VarSub = ...
281 | VarOf '/' Match ( '/' WORD? )?
282 Match = '/' WORD # can't be empty
283 | '#' WORD? # may be empty
284 | '%' WORD?
285 """
286 slash_tok = self.cur_token # location info
287 replace_mode = Id.Undefined_Tok # bizarre syntax / # %
288
289 self._SetNext(lex_mode_e.VSub_ArgUnquoted) # advance past /
290
291 self._GetToken()
292 if self.token_type == Id.Right_DollarBrace:
293 pat = CompoundWord([])
294 return suffix_op.PatSub(pat, rhs_word.Empty, replace_mode,
295 slash_tok)
296
297 if self.token_type in (Id.Lit_Slash, Id.Lit_Pound, Id.Lit_Percent):
298 replace_mode = self.token_type
299 self._SetNext(lex_mode_e.VSub_ArgUnquoted)
300
301 # Bash quirk:
302 # echo ${x/#/replace} has an empty pattern
303 # echo ${x////replace} is non-empty; it means echo ${x//'/'/replace}
304 empty_ok = replace_mode != Id.Lit_Slash
305 pat = self._ReadVarOpArg2(lex_mode_e.VSub_ArgUnquoted, Id.Lit_Slash,
306 empty_ok)
307 #log('pat 1 %r', pat)
308
309 if self.token_type == Id.Lit_Slash:
310 # read until }
311 replace = self._ReadVarOpArg(
312 lex_mode_e.VSub_ArgUnquoted) # type: rhs_word_t
313 #log('r 1 %r', replace)
314 else:
315 # e.g. ${v/a} is the same as ${v/a/} -- empty replacement string
316 replace = rhs_word.Empty
317
318 self._GetToken()
319 if self.token_type != Id.Right_DollarBrace:
320 # This happens on invalid code
321 p_die(
322 "Expected } after replacement string, got %s" %
323 ui.PrettyId(self.token_type), self.cur_token)
324
325 return suffix_op.PatSub(pat, replace, replace_mode, slash_tok)
326
327 def _ReadSubscript(self):
328 # type: () -> bracket_op_t
329 """ Subscript = '[' ('@' | '*' | ArithExpr) ']' """
330 # Lookahead to see if we get @ or *. Otherwise read a full arithmetic
331 # expression.
332 next_id = self.lexer.LookPastSpace(lex_mode_e.Arith)
333 if next_id in (Id.Lit_At, Id.Arith_Star):
334 op = bracket_op.WholeArray(next_id) # type: bracket_op_t
335
336 self._SetNext(lex_mode_e.Arith) # skip past [
337 self._GetToken()
338 self._SetNext(lex_mode_e.Arith) # skip past @
339 self._GetToken()
340 else:
341 self._SetNext(lex_mode_e.Arith) # skip past [
342 anode = self._ReadArithExpr(Id.Arith_RBracket)
343 op = bracket_op.ArrayIndex(anode)
344
345 if self.token_type != Id.Arith_RBracket: # Should be looking at ]
346 p_die('Expected ] to close subscript', self.cur_token)
347
348 self._SetNext(lex_mode_e.VSub_2) # skip past ]
349 self._GetToken() # Needed to be in the same spot as no subscript
350
351 return op
352
353 def _ParseVarOf(self):
354 # type: () -> BracedVarSub
355 """
356 VarOf = NAME Subscript?
357 | NUMBER # no subscript allowed, none of these are arrays
358 # ${@[1]} doesn't work, even though slicing does
359 | VarSymbol
360 """
361 self._GetToken()
362 name_token = self.cur_token
363 self._SetNext(lex_mode_e.VSub_2)
364
365 self._GetToken() # Check for []
366 if self.token_type == Id.VOp2_LBracket:
367 bracket_op = self._ReadSubscript()
368 else:
369 bracket_op = None
370
371 part = BracedVarSub.CreateNull()
372 part.token = name_token
373 part.var_name = lexer.TokenVal(name_token)
374 part.bracket_op = bracket_op
375 return part
376
377 def _ParseVarExpr(self, arg_lex_mode, allow_query=False):
378 # type: (lex_mode_t, bool) -> BracedVarSub
379 """Start parsing at the op -- we already skipped past the name."""
380 part = self._ParseVarOf()
381
382 self._GetToken()
383 if self.token_type == Id.Right_DollarBrace:
384 return part # no ops
385
386 op_kind = self.token_kind
387
388 if op_kind == Kind.VTest:
389 tok = self.cur_token
390 arg_word = self._ReadVarOpArg(arg_lex_mode)
391 if self.token_type != Id.Right_DollarBrace:
392 p_die('Expected } to close ${', self.cur_token)
393
394 part.suffix_op = suffix_op.Unary(tok, arg_word)
395
396 elif op_kind == Kind.VOpYsh:
397 tok = self.cur_token
398 arg_word = self._ReadVarOpArg(arg_lex_mode)
399 if self.token_type != Id.Right_DollarBrace:
400 p_die('Expected } to close ${', self.cur_token)
401
402 UP_arg_word = arg_word
403 with tagswitch(arg_word) as case:
404 if case(rhs_word_e.Empty):
405 pass
406 elif case(rhs_word_e.Compound):
407 arg_word = cast(CompoundWord, UP_arg_word)
408 # This handles ${x|html} and ${x %.3f} now
409 # However I think ${x %.3f} should be statically parsed? It can enter
410 # the printf lexer modes.
411 ok, arg, quoted = word_.StaticEval(arg_word)
412 if not ok or quoted:
413 p_die('Expected a constant argument',
414 loc.Word(arg_word))
415
416 part.suffix_op = suffix_op.Static(tok, arg)
417
418 elif op_kind == Kind.VOp0:
419 part.suffix_op = self.cur_token # Nullary
420 self._SetNext(lex_mode_e.VSub_2) # Expecting }
421 self._GetToken()
422
423 elif op_kind == Kind.VOp1: # % %% # ## etc.
424 tok = self.cur_token
425 # Weird exception that all shells have: these operators take a glob
426 # pattern, so they're lexed as VSub_ArgUnquoted, not VSub_ArgDQ
427 arg_word = self._ReadVarOpArg(lex_mode_e.VSub_ArgUnquoted)
428 if self.token_type != Id.Right_DollarBrace:
429 p_die('Expected } to close ${', self.cur_token)
430
431 part.suffix_op = suffix_op.Unary(tok, arg_word)
432
433 elif op_kind == Kind.VOp2: # / : [ ]
434 if self.token_type == Id.VOp2_Slash:
435 patsub_op = self._ReadPatSubVarOp() # type: suffix_op_t
436 part.suffix_op = patsub_op
437
438 # Checked by the method above
439 assert self.token_type == Id.Right_DollarBrace, self.cur_token
440
441 elif self.token_type == Id.VOp2_Colon:
442 part.suffix_op = self._ReadSliceVarOp()
443 # NOTE: } in arithmetic mode.
444 if self.token_type != Id.Arith_RBrace:
445 # Token seems off; doesn't point to X in ${a:1:2 X
446 p_die('Expected } to close ${', self.cur_token)
447
448 else:
449 # TODO: Does this ever happen?
450 p_die('Unexpected token in ${} (%s)' % 'VOp2', self.cur_token)
451
452 elif op_kind == Kind.VOp3: # ${prefix@} etc.
453 if allow_query:
454 part.suffix_op = self.cur_token # Nullary
455 self._SetNext(lex_mode_e.VSub_2) # Expecting }
456 self._GetToken()
457 else:
458 p_die("Unexpected token in ${} (%s)" % 'VOp3', self.cur_token)
459
460 # NOTE: Arith_RBrace is for slicing, because it reads } in arithmetic
461 # mode. It's redundantly checked above.
462 if self.token_type not in (Id.Right_DollarBrace, Id.Arith_RBrace):
463 # ${a.} or ${!a.}
464 p_die('Expected } to close ${', self.cur_token)
465
466 # Now look for ops
467 return part
468
469 def ReadBracedVarSub(self, left_token):
470 # type: (Token) -> Tuple[BracedVarSub, Token]
471 """ For YSH expressions like var x = ${x:-"default"}. """
472 part = self._ReadBracedVarSub(left_token, d_quoted=False)
473 last_token = self.cur_token
474 return part, last_token
475
476 def _ReadBracedVarSub(self, left_token, d_quoted):
477 # type: (Token, bool) -> BracedVarSub
478 """For the ${} expression language.
479
480 NAME = [a-zA-Z_][a-zA-Z0-9_]*
481 NUMBER = [0-9]+ # ${10}, ${11}, ...
482
483 Subscript = '[' ('@' | '*' | ArithExpr) ']'
484 VarSymbol = '!' | '@' | '#' | ...
485 VarOf = NAME Subscript?
486 | NUMBER # no subscript allowed, none of these are arrays
487 # ${@[1]} doesn't work, even though slicing does
488 | VarSymbol
489
490 NULLARY_OP = '@Q' | '@E' | '@P' | '@A' | '@a' # VOp0
491
492 TEST_OP = '-' | ':-' | '=' | ':=' | '+' | ':+' | '?' | ':?'
493 STRIP_OP = '#' | '##' | '%' | '%%'
494 CASE_OP = ',' | ',,' | '^' | '^^'
495 UnaryOp = TEST_OP | STRIP_OP | CASE_OP
496
497 YSH_UNARY = '|' | ' ' # ${x|html} and ${x %.3f}.
498 # SPACE is operator not %
499 Match = ('/' | '#' | '%') WORD # match all / prefix / suffix
500 VarExpr = VarOf
501 | VarOf NULLARY_OP
502 | VarOf UnaryOp WORD
503 | VarOf YSH_UNARY STATIC_WORD
504 | VarOf ':' ArithExpr (':' ArithExpr )?
505 | VarOf '/' Match '/' WORD
506
507 LengthExpr = '#' VarOf # can't apply operators after length
508
509 RefOrKeys = '!' VarExpr # CAN apply operators after a named ref
510 # ${!ref[0]} vs ${!keys[@]} resolved later
511
512 PrefixQuery = '!' NAME ('*' | '@') # list variable names with a prefix
513
514 BuiltinSub = '.' WORD+ # ${.myproc 'builtin' $sub}
515
516 VarSub = LengthExpr
517 | RefOrKeys
518 | PrefixQuery
519 | VarExpr
520 | BuiltinSub
521
522 NOTES:
523 - Arithmetic expressions are used twice, inside subscripts ${a[x+1]} and
524 slicing ${a:x+1:y+2}
525 - ${#} and ${!} need LL(2) lookahead (considering how my tokenizer works)
526 - @ and * are technically arithmetic expressions in this implementation
527 - We don't account for bash 4.4: ${param@operator} -- Q E P A a. Note that
528 it's also vectorized.
529
530 Strictness over bash:
531 - echo ${a[0][0]} doesn't do anything useful, so we disallow it from the
532 grammar
533 - ! and # prefixes can't be composed, even though named refs can be
534 composed with other operators
535 - '#' means 4 different things: length prefix, VarSymbol, UnaryOp to strip
536 a prefix, and it can also be a literal part of WORD.
537
538 From the parser's point of view, the prefix # can't be combined with
539 UnaryOp/slicing/matching, and the ! can. However
540
541 - ${a[@]:1:2} is not allowed
542 - ${#a[@]:1:2} is allowed, but gives the wrong answer
543 """
544 if d_quoted:
545 arg_lex_mode = lex_mode_e.VSub_ArgDQ
546 else:
547 arg_lex_mode = lex_mode_e.VSub_ArgUnquoted
548
549 self._SetNext(lex_mode_e.VSub_1)
550 self._GetToken()
551
552 ty = self.token_type
553 first_tok = self.cur_token
554
555 if ty == Id.VSub_Pound:
556 # Disambiguate
557 next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
558 if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
559 # e.g. a name, '#' is the prefix
560 self._SetNext(lex_mode_e.VSub_1)
561 part = self._ParseVarOf()
562
563 self._GetToken()
564 if self.token_type != Id.Right_DollarBrace:
565 p_die('Expected } after length expression', self.cur_token)
566
567 part.prefix_op = first_tok
568
569 else: # not a prefix, '#' is the variable
570 part = self._ParseVarExpr(arg_lex_mode)
571
572 elif ty == Id.VSub_Bang:
573 next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
574 if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
575 # e.g. a name, '!' is the prefix
576 # ${!a} -- this is a ref
577 # ${!3} -- this is ref
578 # ${!a[1]} -- this is a ref
579 # ${!a[@]} -- this is a keys
580 # No lookahead -- do it in a second step, or at runtime
581 self._SetNext(lex_mode_e.VSub_1)
582 part = self._ParseVarExpr(arg_lex_mode, allow_query=True)
583
584 part.prefix_op = first_tok
585
586 else: # not a prefix, '!' is the variable
587 part = self._ParseVarExpr(arg_lex_mode)
588
589 elif ty == Id.VSub_Dot:
590 # Note: this will become a new builtin_sub type, so this method must
591 # return word_part_t rather than BracedVarSub. I don't think that
592 # should cause problems.
593 p_die('TODO: ${.myproc builtin sub}', self.cur_token)
594
595 # VS_NAME, VS_NUMBER, symbol that isn't # or !
596 elif self.token_kind == Kind.VSub:
597 part = self._ParseVarExpr(arg_lex_mode)
598
599 else:
600 # e.g. ${^}
601 p_die('Unexpected token in ${}', self.cur_token)
602
603 part.left = left_token # attach the argument
604 part.right = self.cur_token
605 return part
606
607 def _ReadSingleQuoted(self, left_token, lex_mode):
608 # type: (Token, lex_mode_t) -> SingleQuoted
609 """Internal method to read a word_part."""
610 tokens = [] # type: List[Token]
611 # In command mode, we never disallow backslashes like '\'
612 right_quote = self.ReadSingleQuoted(lex_mode, left_token, tokens,
613 False)
614 sval = word_compile.EvalSingleQuoted2(left_token.id, tokens)
615 node = SingleQuoted(left_token, sval, right_quote)
616 return node
617
618 def ReadSingleQuoted(self, lex_mode, left_token, out_tokens, is_ysh_expr):
619 # type: (lex_mode_t, Token, List[Token], bool) -> Token
620 """Appends to out_tokens; returns last token
621
622 Used by expr_parse.py
623 """
624 # TODO: Remove and use out_tokens
625 tokens = [] # type: List[Token]
626
627 # echo '\' is allowed, but x = '\' is invalid, in favor of x = r'\'
628 no_backslashes = is_ysh_expr and left_token.id == Id.Left_SingleQuote
629
630 expected_end_tokens = 3 if left_token.id in (
631 Id.Left_TSingleQuote, Id.Left_RTSingleQuote, Id.Left_UTSingleQuote,
632 Id.Left_BTSingleQuote) else 1
633 num_end_tokens = 0
634
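# A closing ''' must arrive as three CONSECUTIVE Kind.Right tokens; any
# other token in between resets num_end_tokens below.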
635 while num_end_tokens < expected_end_tokens:
636 self._SetNext(lex_mode)
637 self._GetToken()
638
639 # Kind.Char emitted in lex_mode.SQ_C
640 if self.token_kind in (Kind.Lit, Kind.Char):
641 tok = self.cur_token
642 # Happens in lex_mode_e.SQ: 'one\two' is ambiguous, should be
643 # r'one\two' or c'one\\two'
644 if no_backslashes and lexer.TokenContains(tok, '\\'):
645 p_die(
646 r"Strings with backslashes should look like r'\n' or u'\n' or b'\n'",
647 tok)
648
649 if is_ysh_expr:
650 # Disallow var x = $'\001'. Arguably we don't need these
651 # checks because u'\u{1}' is the way to write it.
652 if self.token_type == Id.Char_Octal3:
653 p_die(
654 r"Use \xhh or \u{...} instead of octal escapes in YSH strings",
655 tok)
656
657 if self.token_type == Id.Char_Hex and self.cur_token.length != 4:
658 # disallow \xH
659 p_die(
660 r'Invalid hex escape in YSH string (must be \xHH)',
661 tok)
662
663 tokens.append(tok)
664
665 elif self.token_kind == Kind.Unknown:
666 tok = self.cur_token
667 assert tok.id == Id.Unknown_Backslash, tok
668
669 # x = $'\z' is disallowed; ditto for echo $'\z' if shopt -u parse_backslash
670 if is_ysh_expr or not self.parse_opts.parse_backslash():
671 p_die(
672 "Invalid char escape in C-style string literal (OILS-ERR-11)",
673 tok)
674
675 tokens.append(tok)
676
677 elif self.token_kind == Kind.Eof:
678 p_die('Unexpected EOF in single-quoted string that began here',
679 left_token)
680
681 elif self.token_kind == Kind.Right:
682 # assume Id.Right_SingleQuote
683 num_end_tokens += 1
684 tokens.append(self.cur_token)
685
686 else:
687 raise AssertionError(self.cur_token)
688
689 if self.token_kind != Kind.Right:
690 num_end_tokens = 0 # we need three in a ROW
691
692 if expected_end_tokens == 1:
693 tokens.pop()
694 elif expected_end_tokens == 3: # Get rid of spurious end tokens
695 tokens.pop()
696 tokens.pop()
697 tokens.pop()
698
699 # Remove space from ''' r''' $''' in both expression mode and command mode
700 if left_token.id in (Id.Left_TSingleQuote, Id.Left_RTSingleQuote,
701 Id.Left_UTSingleQuote, Id.Left_BTSingleQuote):
702 word_compile.RemoveLeadingSpaceSQ(tokens)
703
704 # Validation after lexing - same 2 checks in j8.LexerDecoder
705 is_u_string = left_token.id in (Id.Left_USingleQuote,
706 Id.Left_UTSingleQuote)
707
708 for tok in tokens:
709 # u'\yff' is not valid, but b'\yff' is
710 if is_u_string and tok.id == Id.Char_YHex:
711 p_die(
712 r"%s escapes not allowed in u'' strings" %
713 lexer.TokenVal(tok), tok)
714 # \u{dc00} isn't valid
715 if tok.id == Id.Char_UBraced:
716 h = lexer.TokenSlice(tok, 3, -1) # \u{123456}
717 i = int(h, 16)
718 if 0xD800 <= i and i < 0xE000:
719 p_die(
720 r"%s escape is illegal because it's in the surrogate range"
721 % lexer.TokenVal(tok), tok)
722
723 out_tokens.extend(tokens)
724 return self.cur_token
725
726 def _ReadDoubleQuotedLeftParts(self):
727 # type: () -> word_part_t
728 """Read substitution parts in a double quoted context."""
729 if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick):
730 return self._ReadCommandSub(self.token_type, d_quoted=True)
731
732 if self.token_type == Id.Left_DollarBrace:
733 return self._ReadBracedVarSub(self.cur_token, d_quoted=True)
734
735 if self.token_type == Id.Left_DollarDParen:
736 return self._ReadArithSub()
737
738 if self.token_type == Id.Left_DollarBracket:
739 return self._ReadExprSub(lex_mode_e.DQ)
740
741 raise AssertionError(self.cur_token)
742
743 def _ReadYshSingleQuoted(self, left_id):
744 # type: (Id_t) -> CompoundWord
745 """Read YSH style strings
746
747 r'' u'' b''
748 r''' ''' u''' ''' b''' '''
749 """
750 #log('BEF self.cur_token %s', self.cur_token)
751 if left_id == Id.Left_RSingleQuote:
752 lexer_mode = lex_mode_e.SQ_Raw
753 triple_left_id = Id.Left_RTSingleQuote
754 elif left_id == Id.Left_USingleQuote:
755 lexer_mode = lex_mode_e.J8_Str
756 triple_left_id = Id.Left_UTSingleQuote
757 elif left_id == Id.Left_BSingleQuote:
758 lexer_mode = lex_mode_e.J8_Str
759 triple_left_id = Id.Left_BTSingleQuote
760 else:
761 raise AssertionError(left_id)
762
763 # Needed for syntax checks
764 left_tok = self.cur_token
765 left_tok.id = left_id
766
767 sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)
768
769 if (len(sq_part.sval) == 0 and self.lexer.ByteLookAhead() == "'"):
770 self._SetNext(lex_mode_e.ShCommand)
771 self._GetToken()
772
773 assert self.token_type == Id.Left_SingleQuote
774 # HACK: magically transform the third ' in u''' to
775 # Id.Left_UTSingleQuote, so that ''' is the terminator
776 left_tok = self.cur_token
777 left_tok.id = triple_left_id
778
779 # Handles stripping leading whitespace
780 sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)
781
782 # Advance and validate
783 self._SetNext(lex_mode_e.ShCommand)
784
785 self._GetToken()
786 if self.token_kind not in KINDS_THAT_END_WORDS:
787 p_die('Unexpected token after YSH single-quoted string',
788 self.cur_token)
789
790 return CompoundWord([sq_part])
791
792 def _ReadUnquotedLeftParts(self, triple_out):
793 # type: (Optional[BoolParamBox]) -> word_part_t
794 """Read substitutions and quoted strings (for lex_mode_e.ShCommand).
795
796 If triple_out is set, then we try parsing triple quoted strings,
797 and set its value to True if we got one.
798 """
799 if self.token_type in (Id.Left_DoubleQuote, Id.Left_DollarDoubleQuote):
800 # Note: $"" is a synonym for "". It might make sense if it added
801 # \n \0 \x00 \u{123} etc. But that's not what bash does!
802 dq_part = self._ReadDoubleQuoted(self.cur_token)
803 # Got empty word "" and there's a " after
804 if (triple_out and len(dq_part.parts) == 0 and
805 self.lexer.ByteLookAhead() == '"'):
806
807 self._SetNext(lex_mode_e.ShCommand)
808 self._GetToken()
809 # HACK: magically transform the third " in """ to
810 # Id.Left_TDoubleQuote, so that """ is the terminator
811 left_dq_token = self.cur_token
812 left_dq_token.id = Id.Left_TDoubleQuote
813 triple_out.b = True # let caller know we got it
814 return self._ReadDoubleQuoted(left_dq_token)
815
816 return dq_part
817
818 if self.token_type in (Id.Left_SingleQuote, Id.Left_RSingleQuote,
819 Id.Left_DollarSingleQuote):
820 if self.token_type == Id.Left_SingleQuote:
821 lexer_mode = lex_mode_e.SQ_Raw
822 triple_left_id = Id.Left_TSingleQuote
823 elif self.token_type == Id.Left_RSingleQuote:
824 lexer_mode = lex_mode_e.SQ_Raw
825 triple_left_id = Id.Left_RTSingleQuote
826 else:
827 lexer_mode = lex_mode_e.SQ_C
828 # there is no such thing as $'''
829 triple_left_id = Id.Undefined_Tok
830
831 sq_part = self._ReadSingleQuoted(self.cur_token, lexer_mode)
832
833 # Got empty '' or r'' and there's a ' after
834 # u'' and b'' are handled in _ReadYshSingleQuoted
835 if (triple_left_id != Id.Undefined_Tok and
836 triple_out is not None and len(sq_part.sval) == 0 and
837 self.lexer.ByteLookAhead() == "'"):
838
839 self._SetNext(lex_mode_e.ShCommand)
840 self._GetToken()
841
842 # HACK: magically transform the third ' in ''' to
843 # Id.Left_TSingleQuote, so that ''' is the terminator
844 left_sq_token = self.cur_token
845 left_sq_token.id = triple_left_id
846
847 triple_out.b = True # let caller know we got it
848 return self._ReadSingleQuoted(left_sq_token, lexer_mode)
849
850 return sq_part
851
852 if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick,
853 Id.Left_ProcSubIn, Id.Left_ProcSubOut):
854 return self._ReadCommandSub(self.token_type, d_quoted=False)
855
856 if self.token_type == Id.Left_DollarBrace:
857 return self._ReadBracedVarSub(self.cur_token, d_quoted=False)
858
859 if self.token_type == Id.Left_DollarDParen:
860 return self._ReadArithSub()
861
862 if self.token_type == Id.Left_DollarBracket:
863 return self._ReadExprSub(lex_mode_e.ShCommand)
864
865 raise AssertionError(self.cur_token)
866
867 def _ReadExtGlob(self):
868 # type: () -> word_part.ExtGlob
869 """
870 Grammar:
871 Item = CompoundWord | EPSILON # important: @(foo|) is allowed
872 LEFT = '@(' | '*(' | '+(' | '?(' | '!('
873 RIGHT = ')'
874 ExtGlob = LEFT (Item '|')* Item RIGHT # ITEM may be empty
875 Compound includes ExtGlob
876 """
877 left_token = self.cur_token
878 right_token = None # type: Token
879 arms = [] # type: List[CompoundWord]
880
881 self.lexer.PushHint(Id.Op_RParen, Id.Right_ExtGlob)
882 self._SetNext(lex_mode_e.ExtGlob) # advance past LEFT
883
884 read_word = False # did we just read a word? To handle @(||).
885
886 while True:
887 self._GetToken()
888
889 if self.token_type == Id.Right_ExtGlob:
890 if not read_word:
891 arms.append(CompoundWord([]))
892 right_token = self.cur_token
893 break
894
895 elif self.token_type == Id.Op_Pipe:
896 if not read_word:
897 arms.append(CompoundWord([]))
898 read_word = False
899 self._SetNext(lex_mode_e.ExtGlob)
900
901 # lex mode EXTGLOB should only produce these 4 kinds of tokens
902 elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub,
903 Kind.ExtGlob):
904 w = self._ReadCompoundWord(lex_mode_e.ExtGlob)
905 arms.append(w)
906 read_word = True
907
908 elif self.token_kind == Kind.Eof:
909 p_die('Unexpected EOF reading extended glob that began here',
910 left_token)
911
912 else:
913 raise AssertionError(self.cur_token)
914
915 return word_part.ExtGlob(left_token, arms, right_token)
916
917 def _ReadLikeDQ(self, left_token, is_ysh_expr, out_parts):
918 # type: (Optional[Token], bool, List[word_part_t]) -> None
919 """
920 Args:
921 left_token: A token if we are reading a double quoted part, or None if
922 we're reading a here doc.
923 is_ysh_expr: Whether to disallow backticks and invalid char escapes
924 out_parts: list of word_part to append to
925 """
926 if left_token:
927 expected_end_tokens = 3 if left_token.id == Id.Left_TDoubleQuote else 1
928 else:
929 expected_end_tokens = 1000 # here doc will break
930
931 num_end_tokens = 0
932 while num_end_tokens < expected_end_tokens:
933 self._SetNext(lex_mode_e.DQ)
934 self._GetToken()
935
936 if self.token_kind == Kind.Lit:
937 if self.token_type == Id.Lit_EscapedChar:
938 tok = self.cur_token
939 ch = lexer.TokenSliceLeft(tok, 1)
940 part = word_part.EscapedLiteral(tok,
941 ch) # type: word_part_t
942 else:
943 if self.token_type == Id.Lit_BadBackslash:
944 # echo "\z" is OK in shell, but 'x = "\z" is a syntax error in
945 # YSH.
946 # Slight hole: We don't catch 'x = ${undef:-"\z"} because of the
947 # recursion (unless parse_backslash)
948 if (is_ysh_expr or
949 not self.parse_opts.parse_backslash()):
950 p_die(
951 "Invalid char escape in double quoted string (OILS-ERR-12)",
952 self.cur_token)
953 elif self.token_type == Id.Lit_Dollar:
954 if is_ysh_expr or not self.parse_opts.parse_dollar():
955 p_die("Literal $ should be quoted like \$",
956 self.cur_token)
957
958 part = self.cur_token
959 out_parts.append(part)
960
961 elif self.token_kind == Kind.Left:
962 if self.token_type == Id.Left_Backtick and is_ysh_expr:
963 p_die("Invalid backtick: use $(cmd) or \\` in YSH strings",
964 self.cur_token)
965
966 part = self._ReadDoubleQuotedLeftParts()
967 out_parts.append(part)
968
969 elif self.token_kind == Kind.VSub:
970 tok = self.cur_token
971 part = SimpleVarSub(tok)
972 out_parts.append(part)
973 # NOTE: parsing "$f(x)" would BREAK CODE. Could add a mode for it
974 # later.
975
976 elif self.token_kind == Kind.Right:
977 assert self.token_type == Id.Right_DoubleQuote, self.token_type
978 if left_token:
979 num_end_tokens += 1
980
981 # In a here doc, the right quote is literal!
982 out_parts.append(self.cur_token)
983
984 elif self.token_kind == Kind.Eof:
985 if left_token:
986 p_die(
987 'Unexpected EOF reading double-quoted string that began here',
988 left_token)
989 else: # here docs will have an EOF in their token stream
990 break
991
992 else:
993 raise AssertionError(self.cur_token)
994
995 if self.token_kind != Kind.Right:
996 num_end_tokens = 0 # """ must be CONSECUTIVE
997
998 if expected_end_tokens == 1:
999 out_parts.pop()
1000 elif expected_end_tokens == 3:
1001 out_parts.pop()
1002 out_parts.pop()
1003 out_parts.pop()
1004
1005 # Remove space from """ in both expression mode and command mode
1006 if left_token and left_token.id == Id.Left_TDoubleQuote:
1007 word_compile.RemoveLeadingSpaceDQ(out_parts)
1008
1009 # Return nothing, since we appended to 'out_parts'
1010
1011 def _ReadDoubleQuoted(self, left_token):
1012 # type: (Token) -> DoubleQuoted
1013 """Helper function for "hello $name".
1014
1015 Args:
1016 eof_type: for stopping at }, Id.Lit_RBrace
1017 here_doc: Whether we are reading in a here doc context
1018
1019 Also ${foo%%a b c} # treat this as double quoted. until you hit
1020 """
1021 parts = [] # type: List[word_part_t]
1022 self._ReadLikeDQ(left_token, False, parts)
1023
1024 right_quote = self.cur_token
1025 return DoubleQuoted(left_token, parts, right_quote)
1026
1027 def ReadDoubleQuoted(self, left_token, parts):
1028 # type: (Token, List[word_part_t]) -> Token
1029 """For expression mode.
1030
1031 Read var x = "${dir:-}/$name"; etc.
1032 """
1033 self._ReadLikeDQ(left_token, True, parts)
1034 return self.cur_token
1035
1036 def _ReadCommandSub(self, left_id, d_quoted=False):
1037 # type: (Id_t, bool) -> CommandSub
1038 """
1039 NOTE: This is not in the grammar, because word parts aren't in the grammar!
1040
1041 command_sub = '$(' command_list ')'
1042 | '@(' command_list ')'
1043 | '<(' command_list ')'
1044 | '>(' command_list ')'
1045 | ` command_list `
1046 """
1047 left_token = self.cur_token
1048
1049 # Set the lexer in a state so ) becomes the EOF token.
1050 if left_id in (Id.Left_DollarParen, Id.Left_AtParen, Id.Left_ProcSubIn,
1051 Id.Left_ProcSubOut):
1052 self._SetNext(lex_mode_e.ShCommand) # advance past $( etc.
1053
1054 right_id = Id.Eof_RParen
1055 self.lexer.PushHint(Id.Op_RParen, right_id)
1056 c_parser = self.parse_ctx.MakeParserForCommandSub(
1057 self.line_reader, self.lexer, right_id)
1058 # NOTE: This doesn't use something like main_loop because we don't want
1059 # to interleave parsing and execution! Unlike 'source' and 'eval'.
1060 node = c_parser.ParseCommandSub()
1061
1062 right_token = c_parser.w_parser.cur_token
1063
1064 elif left_id == Id.Left_Backtick and self.parse_ctx.do_lossless:
1065 # NOTE: This is an APPROXIMATE solution for translation ONLY. See
1066 # test/osh2oil.
1067
1068 right_id = Id.Eof_Backtick
1069 self.lexer.PushHint(Id.Left_Backtick, right_id)
1070 c_parser = self.parse_ctx.MakeParserForCommandSub(
1071 self.line_reader, self.lexer, right_id)
1072 node = c_parser.ParseCommandSub()
1073 right_token = c_parser.w_parser.cur_token
1074
1075 elif left_id == Id.Left_Backtick:
1076 if not self.parse_opts.parse_backticks():
1077 p_die('Use $(cmd) instead of backticks (parse_backticks)',
1078 left_token)
1079
1080 self._SetNext(lex_mode_e.Backtick) # advance past `
1081
1082 parts = [] # type: List[str]
1083 while True:
1084 self._GetToken()
1085 #log("TOK %s", self.cur_token)
1086
1087 if self.token_type == Id.Backtick_Quoted:
1088 # Remove leading \
1089 parts.append(lexer.TokenSliceLeft(self.cur_token, 1))
1090
1091 elif self.token_type == Id.Backtick_DoubleQuote:
1092 # Compatibility: If backticks are double quoted, then double quotes
1093 # within them have to be \"
1094 # Shells aren't smart enough to match nested " and ` quotes (but OSH
1095 # is)
1096 if d_quoted:
1097 # Remove leading \
1098 parts.append(lexer.TokenSliceLeft(self.cur_token, 1))
1099 else:
1100 parts.append(lexer.TokenVal(self.cur_token))
1101
1102 elif self.token_type == Id.Backtick_Other:
1103 parts.append(lexer.TokenVal(self.cur_token))
1104
1105 elif self.token_type == Id.Backtick_Right:
1106 break
1107
1108 elif self.token_type == Id.Eof_Real:
1109 # Note: this parse error is in the ORIGINAL context. No code_str yet.
1110 p_die('Unexpected EOF while looking for closing backtick',
1111 left_token)
1112
1113 else:
1114 raise AssertionError(self.cur_token)
1115
1116 self._SetNext(lex_mode_e.Backtick)
1117
1118 # Calculate right SPID on CommandSub BEFORE re-parsing.
1119 right_token = self.cur_token
1120
1121 code_str = ''.join(parts)
1122 #log('code %r', code_str)
1123
1124 # NOTE: This is similar to how we parse aliases in osh/cmd_parse.py. It
1125 # won't have the same location info as MakeParserForCommandSub(), because
1126 # the lexer is different.
1127 arena = self.parse_ctx.arena
1128 #arena = alloc.Arena()
1129 line_reader = reader.StringLineReader(code_str, arena)
1130 c_parser = self.parse_ctx.MakeOshParser(line_reader)
1131 src = source.Reparsed('backticks', left_token, right_token)
1132 with alloc.ctx_SourceCode(arena, src):
1133 node = c_parser.ParseCommandSub()
1134
1135 else:
1136 raise AssertionError(left_id)
1137
1138 return CommandSub(left_token, node, right_token)
1139
1140 def _ReadExprSub(self, lex_mode):
1141 # type: (lex_mode_t) -> word_part.ExprSub
1142 """$[d->key] $[obj.method()] etc."""
1143 left_token = self.cur_token
1144
1145 self._SetNext(lex_mode_e.Expr)
1146 enode, right_token = self.parse_ctx.ParseYshExpr(
1147 self.lexer, grammar_nt.ysh_expr_sub)
1148
1149 self._SetNext(lex_mode) # Move past ]
1150 return word_part.ExprSub(left_token, enode, right_token)
1151
1152 def ParseVarDecl(self, kw_token):
1153 # type: (Token) -> command.VarDecl
1154 """
1155 oil_var_decl: name_type_list '=' testlist end_stmt
1156
1157 Note that assignments must end with \n ; } or EOF. Unlike shell
1158 assignments, we disallow:
1159
1160 var x = 42 | wc -l
1161 var x = 42 && echo hi
1162 """
1163 self._SetNext(lex_mode_e.Expr)
1164 enode, last_token = self.parse_ctx.ParseVarDecl(kw_token, self.lexer)
1165 # Hack to move } from what the Expr lexer modes gives to what CommandParser
1166 # wants
1167 if last_token.id == Id.Op_RBrace:
1168 last_token.id = Id.Lit_RBrace
1169
1170 # Let the CommandParser see the Op_Semi or Op_Newline.
1171 self.buffered_word = last_token
1172 self._SetNext(lex_mode_e.ShCommand) # always back to this
1173 return enode
1174
1175 def ParseMutation(self, kw_token, var_checker):
1176 # type: (Token, VarChecker) -> command.Mutation
1177 """
1178 setvar i = 42
1179 setvar i += 1
1180 setvar a[i] = 42
1181 setvar a[i] += 1
1182 setvar d.key = 42
1183 setvar d.key += 1
1184 """
1185 self._SetNext(lex_mode_e.Expr)
1186 enode, last_token = self.parse_ctx.ParseMutation(kw_token, self.lexer)
1187 # Hack to move } from what the Expr lexer modes gives to what CommandParser
1188 # wants
1189 if last_token.id == Id.Op_RBrace:
1190 last_token.id = Id.Lit_RBrace
1191
1192 for lhs in enode.lhs:
1193 UP_lhs = lhs
1194 with tagswitch(lhs) as case:
1195 if case(y_lhs_e.Var):
1196 lhs = cast(Token, UP_lhs)
1197 var_checker.Check(kw_token.id, lexer.LazyStr(lhs), lhs)
1198
1199 # Note: this does not cover cases like
1200 # setvar (a[0])[1] = v
1201 # setvar (d.key).other = v
1202 # This leaks into catching all typos statically, which may be
1203 # possible if 'use' makes all names explicit.
1204 elif case(y_lhs_e.Subscript):
1205 lhs = cast(Subscript, UP_lhs)
1206 if lhs.obj.tag() == expr_e.Var:
1207 v = cast(expr.Var, lhs.obj)
1208 var_checker.Check(kw_token.id, v.name, v.left)
1209
1210 elif case(y_lhs_e.Attribute):
1211 lhs = cast(Attribute, UP_lhs)
1212 if lhs.obj.tag() == expr_e.Var:
1213 v = cast(expr.Var, lhs.obj)
1214 var_checker.Check(kw_token.id, v.name, v.left)
1215
1216 # Let the CommandParser see the Op_Semi or Op_Newline.
1217 self.buffered_word = last_token
1218 self._SetNext(lex_mode_e.ShCommand) # always back to this
1219 return enode
1220
1221 def ParseBareDecl(self):
1222 # type: () -> expr_t
1223 """
1224 x = {name: val}
1225 """
1226 self._SetNext(lex_mode_e.Expr)
1227 self._GetToken()
1228 enode, last_token = self.parse_ctx.ParseYshExpr(
1229 self.lexer, grammar_nt.command_expr)
1230 if last_token.id == Id.Op_RBrace:
1231 last_token.id = Id.Lit_RBrace
1232 self.buffered_word = last_token
1233 self._SetNext(lex_mode_e.ShCommand)
1234 return enode
1235
1236 def ParseYshExprForCommand(self):
1237 # type: () -> expr_t
1238
1239 # Fudge for this case
1240 # for x in(y) {
1241 # versus
1242 # for x in (y) {
1243 #
1244 # In the former case, ReadWord on 'in' puts the lexer past (.
1245 # Also see LookPastSpace in CommandParser.
1246 # A simpler solution would be nicer.
1247
1248 if self.token_type == Id.Op_LParen:
1249 self.lexer.MaybeUnreadOne()
1250
1251 enode, _ = self.parse_ctx.ParseYshExpr(self.lexer, grammar_nt.ysh_expr)
1252
1253 self._SetNext(lex_mode_e.ShCommand)
1254 return enode
1255
1256 def ParseCommandExpr(self):
1257 # type: () -> expr_t
1258 """
1259 = 1+2
1260 """
1261 enode, last_token = self.parse_ctx.ParseYshExpr(
1262 self.lexer, grammar_nt.command_expr)
1263
1264 # In some cases, such as the case statement, we expect *the lexer* to be
1265 # pointing at the token right after the expression. But the expression
1266 # parser must have read to the `last_token`. Unreading places the lexer
1267 # back in the expected state. Ie:
1268 #
1269 # case (x) { case (x) {
1270 # (else) { = x } (else) { = x }
1271 # ^ The lexer is here ^ Unread to here
1272 # } }
1273 assert last_token.id in (Id.Op_Newline, Id.Eof_Real, Id.Op_Semi,
1274 Id.Op_RBrace), last_token
1275 if last_token.id != Id.Eof_Real:
1276 # Eof_Real is the only token we cannot unread
1277 self.lexer.MaybeUnreadOne()
1278
1279 return enode
1280
1281 def ParseProc(self, node):
1282 # type: (Proc) -> None
1283
1284 # proc name-with-hyphens() must be accepted
1285 self._SetNext(lex_mode_e.ShCommand)
1286 self._GetToken()
1287 # example: 'proc f[' gets you Lit_ArrayLhsOpen
1288 if self.token_type != Id.Lit_Chars:
1289 p_die('Invalid proc name %s' % ui.PrettyToken(self.cur_token),
1290 self.cur_token)
1291
1292 # TODO: validate this more. Disallow proc 123 { }, which isn't disallowed
1293 # for shell functions. Similar to IsValidVarName().
1294 node.name = self.cur_token
1295
1296 last_token = self.parse_ctx.ParseProc(self.lexer, node)
1297
1298 # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
1299 assert last_token.id == Id.Op_LBrace
1300 last_token.id = Id.Lit_LBrace
1301 self.buffered_word = last_token
1302
1303 self._SetNext(lex_mode_e.ShCommand)
1304
1305 def ParseFunc(self, node):
1306 # type: (Func) -> None
1307 last_token = self.parse_ctx.ParseFunc(self.lexer, node)
1308
1309 # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
1310 assert last_token.id == Id.Op_LBrace
1311 last_token.id = Id.Lit_LBrace
1312 self.buffered_word = last_token
1313
1314 self._SetNext(lex_mode_e.ShCommand)
1315
1316 def ParseYshCasePattern(self):
1317 # type: () -> Tuple[pat_t, Token]
1318 pat, left_tok, last_token = self.parse_ctx.ParseYshCasePattern(
1319 self.lexer)
1320
1321 if last_token.id == Id.Op_LBrace:
1322 last_token.id = Id.Lit_LBrace
1323 self.buffered_word = last_token
1324
1325 return pat, left_tok
1326
1327 def NewlineOkForYshCase(self):
1328 # type: () -> Id_t
1329 """Check for optional newline and consume it.
1330
1331 This is a special case of `_NewlineOk` which fixes some "off-by-one" issues
1332 that crop up while parsing YSH case arms. For more details, see
1333 #oil-dev > Progress On YSH Case Grammar on Zulip.
1334
1335 Returns a token id which is filled with the choice of
1336
1337 word { echo word }
1338 (3) { echo expr }
1339 /e/ { echo eggex }
1340 } # right brace
1341 """
1342 while True:
1343 next_id = self.lexer.LookAheadOne(lex_mode_e.Expr)
1344
1345 # Cannot lookahead past lines
1346 if next_id == Id.Unknown_Tok:
1347 self.lexer.MoveToNextLine()
1348 continue
1349
1350 next_kind = consts.GetKind(next_id)
1351 if next_id != Id.Op_Newline and next_kind != Kind.Ignored:
1352 break
1353
1354 self.lexer.Read(lex_mode_e.Expr)
1355
1356 if next_id in (Id.Op_RBrace, Id.Op_LParen, Id.Arith_Slash):
1357 self._SetNext(lex_mode_e.Expr) # Continue in expression mode
1358 else:
1359 # Consume the trailing Op_Newline
1360 self._SetNext(lex_mode_e.ShCommand)
1361 self._GetToken()
1362
1363 return next_id
1364
1365 def _ReadArithExpr(self, end_id):
1366 # type: (Id_t) -> arith_expr_t
1367 """Read and parse an arithmetic expression in various contexts.
1368
1369 $(( 1+2 ))
1370 (( a=1+2 ))
1371 ${a[ 1+2 ]}
1372 ${a : 1+2 : 1+2}
1373
1374 See tests/arith-context.test.sh for ambiguous cases.
1375
1376 ${a[a[0]]} is valid # VS_RBRACKET vs Id.Arith_RBracket
1377
1378 ${s : a<b?0:1 : 1} # VS_COLON vs Id.Arith_Colon
1379
1380 See the assertion in ArithParser.Parse() -- unexpected extra input.
1381 """
1382 # calls self.ReadWord(lex_mode_e.Arith)
1383 anode = self.a_parser.Parse()
1384 cur_id = self.a_parser.CurrentId()
1385 if end_id != Id.Undefined_Tok and cur_id != end_id:
1386 p_die(
1387 'Unexpected token after arithmetic expression (%s != %s)' %
1388 (ui.PrettyId(cur_id), ui.PrettyId(end_id)),
1389 loc.Word(self.a_parser.cur_word))
1390 return anode
1391
1392 def _ReadArithSub(self):
1393 # type: () -> word_part.ArithSub
1394 """Read an arith substitution, which contains an arith expression, e.g.
1395
1396 $((a + 1)).
1397 """
1398 left_tok = self.cur_token
1399
1400 # The second ) needs to be disambiguated in stuff like:
1401 # $(echo $(( 1+2 )) )
1402 self.lexer.PushHint(Id.Op_RParen, Id.Right_DollarDParen)
1403
1404 # NOTE: To disambiguate $(( as arith sub vs. command sub and subshell, we
1405 # could save the lexer/reader state here, and retry if the arithmetic parse
1406 # fails. But we can almost always catch this at parse time. There could
1407 # be some exceptions like:
1408 # $((echo * foo)) # looks like multiplication
1409 # $((echo / foo)) # looks like division
1410
1411 self._SetNext(lex_mode_e.Arith)
1412 anode = self._ReadArithExpr(Id.Arith_RParen)
1413
1414 # TODO: This could be DQ or Arith too
1415 self._SetNext(lex_mode_e.ShCommand)
1416
1417 # PROBLEM: $(echo $(( 1 + 2 )) )
1418 # Two right parens break the Id.Eof_RParen scheme
1419 self._GetToken()
1420 if self.token_type != Id.Right_DollarDParen:
1421 p_die('Expected second ) to end arith sub', self.cur_token)
1422
1423 right_tok = self.cur_token
1424 return word_part.ArithSub(left_tok, anode, right_tok)
1425
1426 def ReadDParen(self):
1427 # type: () -> Tuple[arith_expr_t, Token]
1428 """Read ((1+ 2)) -- command context.
1429
1430 We're using the word parser because it's very similar to _ReadArithExpr
1431 above.
1432
1433 This also returns the terminating `Op_DRightParen` token for use as location
1434 tracking.
1435 """
1436 # The second ) needs to be disambiguated in stuff like:
1437 # TODO: Be consistent with ReadForExpression below and use lex_mode_e.Arith?
1438 # Then you can get rid of this.
1439 self.lexer.PushHint(Id.Op_RParen, Id.Op_DRightParen)
1440
1441 self._SetNext(lex_mode_e.Arith)
1442 anode = self._ReadArithExpr(Id.Arith_RParen)
1443
1444 self._SetNext(lex_mode_e.ShCommand)
1445
1446 # PROBLEM: $(echo $(( 1 + 2 )) )
1447 self._GetToken()
1448 right = self.cur_token
1449 if self.token_type != Id.Op_DRightParen:
1450 p_die('Expected second ) to end arith statement', self.cur_token)
1451
1452 self._SetNext(lex_mode_e.ShCommand)
1453
1454 return anode, right
1455
1456 def _SetNextNonSpace(self):
1457 # type: () -> None
1458 """Same logic as _ReadWord, but for ReadForExpression."""
1459 while True:
1460 self._SetNext(lex_mode_e.Arith)
1461 self._GetToken()
1462 if self.token_kind not in (Kind.Ignored, Kind.WS):
1463 break
1464
1465 def ReadForExpression(self):
1466 # type: () -> command.ForExpr
1467 """Read ((i=0; i<5; ++i)) -- part of command context."""
1468 self._SetNextNonSpace() # skip over ((
1469
1470 self._GetToken()
1471 cur_id = self.token_type # for end of arith expressions
1472
1473 if cur_id == Id.Arith_Semi: # for (( ; i < 10; i++ ))
1474 init_node = None # type: Optional[arith_expr_t]
1475 else:
1476 init_node = self.a_parser.Parse()
1477 cur_id = self.a_parser.CurrentId()
1478 self._SetNextNonSpace()
1479
1480 # It's odd to keep track of both cur_id and self.token_type in this
1481 # function, but it works, and is tested in 'test/parse_error.sh
1482 # arith-integration'
1483 if cur_id != Id.Arith_Semi: # for (( x=0 b; ... ))
1484 p_die("Expected ; here", loc.Word(self.a_parser.cur_word))
1485
1486 self._GetToken()
1487 cur_id = self.token_type
1488
1489 if cur_id == Id.Arith_Semi: # for (( ; ; i++ ))
1490 cond_node = None # type: Optional[arith_expr_t]
1491 else:
1492 cond_node = self.a_parser.Parse()
1493 cur_id = self.a_parser.CurrentId()
1494 self._SetNextNonSpace()
1495
1496 if cur_id != Id.Arith_Semi: # for (( x=0; x<5 b ))
1497 p_die("Expected ; here", loc.Word(self.a_parser.cur_word))
1498
1499 self._GetToken()
1500 cur_id = self.token_type
1501
1502 if cur_id == Id.Arith_RParen: # for (( ; ; ))
1503 update_node = None # type: Optional[arith_expr_t]
1504 else:
1505 update_node = self._ReadArithExpr(Id.Arith_RParen)
1506 self._SetNextNonSpace()
1507
1508 self._GetToken()
1509 if self.token_type != Id.Arith_RParen:
1510 p_die('Expected ) to end for loop expression', self.cur_token)
1511 self._SetNext(lex_mode_e.ShCommand)
1512
1513 # redirects is None, will be assigned in CommandEvaluator
1514 node = command.ForExpr.CreateNull()
1515 node.init = init_node
1516 node.cond = cond_node
1517 node.update = update_node
1518 return node
1519
1520 def _ReadArrayLiteral(self):
1521 # type: () -> word_part_t
1522 """a=(1 2 3)
1523
1524 TODO: See osh/cmd_parse.py:164 for Id.Lit_ArrayLhsOpen, for a[x++]=1
1525
1526 We want:
1527
1528 A=(['x']=1 ["x"]=2 [$x$y]=3)
1529
1530 Maybe allow this as a literal string? Because I think I've seen it before?
1531 Or maybe force people to patch to learn the rule.
1532
1533 A=([x]=4)
1534
1535 Starts with Lit_Other '[', and then it has Lit_ArrayLhsClose
1536 Maybe enforce that ALL have keys or NONE of have keys.
1537 """
1538 self._SetNext(lex_mode_e.ShCommand) # advance past (
1539 self._GetToken()
1540 if self.cur_token.id != Id.Op_LParen:
1541 p_die('Expected ( after =', self.cur_token)
1542 left_token = self.cur_token
1543 right_token = None # type: Token
1544
1545 # MUST use a new word parser (with same lexer).
1546 w_parser = self.parse_ctx.MakeWordParser(self.lexer, self.line_reader)
1547 words = [] # type: List[CompoundWord]
1548 done = False
1549 while not done:
1550 w = w_parser.ReadWord(lex_mode_e.ShCommand)
1551 with tagswitch(w) as case:
1552 if case(word_e.Operator):
1553 tok = cast(Token, w)
1554 if tok.id == Id.Right_ShArrayLiteral:
1555 right_token = tok
1556 done = True # can't use break here
1557 # Unlike command parsing, array parsing allows embedded \n.
1558 elif tok.id == Id.Op_Newline:
1559 continue
1560 else:
1561 p_die('Unexpected token in array literal', loc.Word(w))
1562
1563 elif case(word_e.Compound):
1564 words.append(cast(CompoundWord, w))
1565
1566 else:
1567 raise AssertionError()
1568
1569 if len(words) == 0: # a=() is empty indexed array
1570 # Needed for type safety, doh
1571 no_words = [] # type: List[word_t]
1572 node = ShArrayLiteral(left_token, no_words, right_token)
1573 return node
1574
1575 pairs = [] # type: List[AssocPair]
1576 # If the first one is a key/value pair, then the rest are assumed to be.
1577 pair = word_.DetectAssocPair(words[0])
1578 if pair:
1579 pairs.append(pair)
1580
1581 n = len(words)
1582 for i in xrange(1, n):
1583 w2 = words[i]
1584 pair = word_.DetectAssocPair(w2)
1585 if not pair:
1586 p_die("Expected associative array pair", loc.Word(w2))
1587
1588 pairs.append(pair)
1589
1590 # invariant List?
1591 return word_part.BashAssocLiteral(left_token, pairs, right_token)
1592
1593 # Brace detection for arrays but NOT associative arrays
1594 words2 = braces.BraceDetectAll(words)
1595 words3 = word_.TildeDetectAll(words2)
1596 return ShArrayLiteral(left_token, words3, right_token)
1597
1598 def ParseProcCallArgs(self, start_symbol):
1599 # type: (int) -> ArgList
1600 """ json write (x) """
1601 self.lexer.MaybeUnreadOne()
1602
1603 arg_list = ArgList.CreateNull(alloc_lists=True)
1604 arg_list.left = self.cur_token
1605 self.parse_ctx.ParseYshArgList(self.lexer, arg_list, start_symbol)
1606 return arg_list
1607
1608 def _MaybeReadWordPart(self, is_first, lex_mode, parts):
1609 # type: (bool, lex_mode_t, List[word_part_t]) -> bool
1610 """Helper for _ReadCompoundWord3."""
1611 done = False
1612
1613 if self.token_type == Id.Lit_EscapedChar:
1614 tok = self.cur_token
1615 assert tok.length == 2
1616 ch = lexer.TokenSliceLeft(tok, 1)
1617 if not self.parse_opts.parse_backslash():
1618 if not pyutil.IsValidCharEscape(ch):
1619 p_die('Invalid char escape in unquoted word (OILS-ERR-13)',
1620 self.cur_token)
1621
1622 part = word_part.EscapedLiteral(self.cur_token,
1623 ch) # type: word_part_t
1624 else:
1625 part = self.cur_token
1626
1627 if is_first and self.token_type == Id.Lit_VarLike: # foo=
1628 parts.append(part)
1629 # Unfortunately it's awkward to pull the check for a=(1 2) up to
1630 # _ReadWord.
1631 next_id = self.lexer.LookPastSpace(lex_mode)
1632 if next_id == Id.Op_LParen:
1633 self.lexer.PushHint(Id.Op_RParen, Id.Right_ShArrayLiteral)
1634 part2 = self._ReadArrayLiteral()
1635 parts.append(part2)
1636
1637 # Array literal must be the last part of the word.
1638 self._SetNext(lex_mode)
1639 self._GetToken()
1640 # EOF, whitespace, newline, Right_Subshell
1641 if self.token_kind not in KINDS_THAT_END_WORDS:
1642 p_die('Unexpected token after array literal',
1643 self.cur_token)
1644 done = True
1645
1646 elif (is_first and self.parse_opts.parse_at() and
1647 self.token_type == Id.Lit_Splice):
1648
1649 splice_tok = self.cur_token
1650 part2 = word_part.Splice(splice_tok,
1651 lexer.TokenSliceLeft(splice_tok, 1))
1652
1653 parts.append(part2)
1654
1655 # @words must be the last part of the word
1656 self._SetNext(lex_mode)
1657 self._GetToken()
1658 # EOF, whitespace, newline, Right_Subshell
1659 if self.token_kind not in KINDS_THAT_END_WORDS:
1660 p_die('Unexpected token after array splice', self.cur_token)
1661 done = True
1662
1663 elif (is_first and self.parse_opts.parse_at() and
1664 self.token_type == Id.Lit_AtLBracket): # @[split(x)]
1665 part2 = self._ReadExprSub(lex_mode_e.DQ)
1666 parts.append(part2)
1667
1668 # @[split(x)]
1669 self._SetNext(lex_mode)
1670 self._GetToken()
1671 # EOF, whitespace, newline, Right_Subshell
1672 if self.token_kind not in KINDS_THAT_END_WORDS:
1673 p_die('Unexpected token after Expr splice', self.cur_token)
1674 done = True
1675
1676 elif (is_first and self.parse_opts.parse_at() and
1677 self.token_type == Id.Lit_AtLBraceDot):
1678 p_die('TODO: @{.myproc builtin sub}', self.cur_token)
1679
1680 elif (is_first and self.parse_opts.parse_at_all() and
1681 self.token_type == Id.Lit_At):
1682 # Because $[x] ${x} and perhaps $/x/ are reserved, it makes sense for @
1683 # at the beginning of a word to be reserved.
1684
1685 # Although should we relax 'echo @' ? I'm tempted to have a shortcut for
1686 # @_argv and
1687 p_die('Literal @ starting a word must be quoted (parse_at_all)',
1688 self.cur_token)
1689
1690 else:
1691 # not a literal with lookahead; append it
1692 parts.append(part)
1693
1694 return done
1695
1696 def _ReadCompoundWord(self, lex_mode):
1697 # type: (lex_mode_t) -> CompoundWord
1698 return self._ReadCompoundWord3(lex_mode, Id.Undefined_Tok, True)
1699
1700 def _ReadCompoundWord3(self, lex_mode, eof_type, empty_ok):
1701 # type: (lex_mode_t, Id_t, bool) -> CompoundWord
1702 """
1703 Precondition: Looking at the first token of the first word part
1704 Postcondition: Looking at the token after, e.g. space or operator
1705
1706 NOTE: eof_type is necessary because / is a literal, i.e. Lit_Slash, but it
1707 could be an operator delimiting a compound word. Can we change lexer modes
1708 and remove this special case?
1709 """
        w = CompoundWord([])
        num_parts = 0
        brace_count = 0
        done = False
        is_triple_quoted = None  # type: Optional[BoolParamBox]

        while not done:
            self._GetToken()

            allow_done = empty_ok or num_parts != 0
            if allow_done and self.token_type == eof_type:
                done = True  # e.g. for ${foo//pat/replace}

            # Keywords like "for" are treated like literals
            elif self.token_kind in (Kind.Lit, Kind.History, Kind.KW,
                                     Kind.ControlFlow, Kind.BoolUnary,
                                     Kind.BoolBinary):

                # Count { and } so we can flag unbalanced words like foo{ below
                if self.token_type == Id.Lit_LBrace:
                    brace_count += 1
                elif self.token_type == Id.Lit_RBrace:
                    brace_count -= 1
                elif self.token_type == Id.Lit_Dollar:
                    if not self.parse_opts.parse_dollar():
                        if num_parts == 0 and lex_mode == lex_mode_e.ShCommand:
                            next_byte = self.lexer.ByteLookAhead()
                            # TODO: switch lexer modes and parse $/d+/.  But not ${a:-$/d+/}
                            if next_byte == '/':
                                #log('next_byte %r', next_byte)
                                pass

                        p_die('Literal $ should be quoted like \$',
                              self.cur_token)

                done = self._MaybeReadWordPart(num_parts == 0, lex_mode,
                                               w.parts)

            elif self.token_kind == Kind.VSub:
                vsub_token = self.cur_token

                part = SimpleVarSub(vsub_token)  # type: word_part_t
                w.parts.append(part)

            elif self.token_kind == Kind.ExtGlob:
                # If parse_at, we take over @( to start a command sub like
                # @(seq 3).  Users can still write that extended glob as
                # ,(*.py|*.sh).
                if (self.parse_opts.parse_at() and
                        self.token_type == Id.ExtGlob_At and num_parts == 0):
                    cs_part = self._ReadCommandSub(Id.Left_AtParen,
                                                   d_quoted=False)
                    # RARE mutation of tok.id!
                    cs_part.left_token.id = Id.Left_AtParen
                    part = cs_part  # for type safety

                    # Same check as _MaybeReadWordPart.  @(seq 3)x is illegal,
                    # just like a=(one two)x and @arrayfunc(3)x.
                    self._GetToken()
                    if self.token_kind not in KINDS_THAT_END_WORDS:
                        p_die('Unexpected token after @()', self.cur_token)
                    done = True

                else:
                    part = self._ReadExtGlob()
                w.parts.append(part)

            elif self.token_kind == Kind.Left:
                try_triple_quote = (self.parse_opts.parse_triple_quote() and
                                    lex_mode == lex_mode_e.ShCommand and
                                    num_parts == 0)

                # Save an allocation
                if try_triple_quote:
                    is_triple_quoted = BoolParamBox(False)

                part = self._ReadUnquotedLeftParts(is_triple_quoted)
                w.parts.append(part)

            # NOT done yet, will advance below
            elif self.token_kind == Kind.Right:
                # Still part of the word; will be done on the next iter.
                if self.token_type == Id.Right_DoubleQuote:
                    pass
                # Never happens, no PushHint for this case.
                #elif self.token_type == Id.Right_DollarParen:
                #    pass
                elif self.token_type == Id.Right_Subshell:
                    # LEXER HACK for (case x in x) ;; esac )
                    # Rewind before it's used
                    assert self.next_lex_mode == lex_mode_e.Undefined
                    if self.lexer.MaybeUnreadOne():
                        self.lexer.PushHint(Id.Op_RParen, Id.Right_Subshell)
                        self._SetNext(lex_mode)
                    done = True
                else:
                    done = True

            elif self.token_kind == Kind.Ignored:
                done = True

            else:
                # LEXER HACK for unbalanced case clause.  'case foo in esac' is
                # valid, so to test for ESAC, we can read ) before getting a
                # chance to PushHint(Id.Op_RParen, Id.Right_CasePat).  So here
                # we unread one token and do it again.

                # We get Id.Op_RParen at top level: case x in x) ;; esac
                # We get Id.Eof_RParen inside ComSub: $(case x in x) ;; esac )
                if self.token_type in (Id.Op_RParen, Id.Eof_RParen):
                    # Rewind before it's used
                    assert self.next_lex_mode == lex_mode_e.Undefined
                    if self.lexer.MaybeUnreadOne():
                        if self.token_type == Id.Eof_RParen:
                            # Redo translation
                            self.lexer.PushHint(Id.Op_RParen, Id.Eof_RParen)
                        self._SetNext(lex_mode)

                done = True  # anything we don't recognize means we're done

            if not done:
                self._SetNext(lex_mode)
                num_parts += 1

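        # Example of the brace balance check below (assuming parse_brace):
        # 'echo foo{' dies with the error, while balanced 'echo foo{a,b}' and
        # a lone 'echo {' (num_parts == 1) are accepted.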
        if (self.parse_opts.parse_brace() and num_parts > 1 and
                brace_count != 0):
            # accept { and }, but not foo{
            p_die(
                'Word has unbalanced { }. Maybe add a space or quote it like \{',
                loc.Word(w))

        if is_triple_quoted and is_triple_quoted.b and num_parts > 1:
            p_die('Unexpected parts after triple quoted string',
                  loc.WordPart(w.parts[-1]))

        if 0:
            from _devbuild.gen.syntax_asdl import word_part_str
            word_key = ' '.join(word_part_str(p.tag()) for p in w.parts)
            WORD_HIST[word_key] += 1
        return w

    def _ReadArithWord(self):
        # type: () -> Optional[word_t]
        """Helper for ReadArithWord()."""
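        # E.g. for $(( x + 1 )): Kind.Lit 'x' becomes a compound word, '+' is
        # returned as an Arith token, and the space between them returns None,
        # which the ReadArithWord() loop below skips.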
        self._GetToken()

        if self.token_kind == Kind.Unknown:
            # e.g. happened during dynamic parsing of unset 'a[$foo]' in gherkin
            p_die(
                'Unexpected token while parsing arithmetic: %r' %
                lexer.TokenVal(self.cur_token), self.cur_token)

        elif self.token_kind == Kind.Eof:
            return self.cur_token

        elif self.token_kind == Kind.Ignored:
            # Space should be ignored.
            self._SetNext(lex_mode_e.Arith)
            return None

        elif self.token_kind in (Kind.Arith, Kind.Right):
            # Id.Right_DollarDParen IS just a normal token, handled by ArithParser
            self._SetNext(lex_mode_e.Arith)
            return self.cur_token

        elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub):
            return self._ReadCompoundWord(lex_mode_e.Arith)

        else:
            raise AssertionError(self.cur_token)

    def _ReadWord(self, word_mode):
        # type: (lex_mode_t) -> Optional[word_t]
        """Helper function for ReadWord()."""

        # Change the pseudo lexer mode to a real lexer mode
        if word_mode == lex_mode_e.ShCommandBrack:
            lex_mode = lex_mode_e.ShCommand
        else:
            lex_mode = word_mode

        self._GetToken()

        if self.token_kind == Kind.Eof:
            # No advance
            return self.cur_token

        # Allow Arith for ) at end of for loop?
        elif self.token_kind in (Kind.Op, Kind.Redir, Kind.Arith):
            self._SetNext(lex_mode)

            # Newlines are complicated.  See the 3x2 matrix in the comment
            # about self.multiline and self.newline_state above.
            if self.token_type == Id.Op_Newline:
                if self.multiline:
                    if self.newline_state > 1:
                        # This points at a blank line, but at least it gives
                        # the line number
                        p_die('Invalid blank line in multiline mode',
                              self.cur_token)
                    return None

                if self.returned_newline:  # skip
                    return None

            return self.cur_token

        elif self.token_kind == Kind.Right:
            if self.token_type not in (Id.Right_Subshell, Id.Right_ShFunction,
                                       Id.Right_CasePat,
                                       Id.Right_ShArrayLiteral):
                raise AssertionError(self.cur_token)

            self._SetNext(lex_mode)
            return self.cur_token

        elif self.token_kind in (Kind.Ignored, Kind.WS):
            self._SetNext(lex_mode)
            return None

        else:
            assert self.token_kind in (Kind.VSub, Kind.Lit, Kind.History,
                                       Kind.Left, Kind.KW, Kind.ControlFlow,
                                       Kind.BoolUnary, Kind.BoolBinary,
                                       Kind.ExtGlob), 'Unhandled token kind'

            if (word_mode == lex_mode_e.ShCommandBrack and
                    self.parse_opts.parse_bracket() and
                    self.token_type == Id.Lit_LBracket):
                # Change [ from Kind.Lit -> Kind.Op
                # So CommandParser can treat
                #   assert [42 === x]
                # like
                #   json write (x)
                bracket_word = self.cur_token
                bracket_word.id = Id.Op_LBracket

                self._SetNext(lex_mode)
                return bracket_word

            # We're beginning a word.  If we see Id.Lit_Pound, change to
            # lex_mode_e.Comment and read until end of line.
            if self.token_type == Id.Lit_Pound:
                self._SetNext(lex_mode_e.Comment)
                self._GetToken()

                # NOTE: The # could be the last character in the file.  It
                # can't be Eof_{RParen,Backtick} because #) and #` are
                # comments.
                assert self.token_type in (Id.Ignored_Comment, Id.Eof_Real), \
                    self.cur_token

                # The next iteration will go into Kind.Ignored and set the lex
                # state to lex_mode_e.ShCommand/etc.
                return None  # tell ReadWord() to try again after comment

            elif self.token_type == Id.Lit_TPound:  ### doc comment
                self._SetNext(lex_mode_e.Comment)
                self._GetToken()

                if self.token_type == Id.Ignored_Comment and self.emit_doc_token:
                    return self.cur_token

                return None  # tell ReadWord() to try again after comment

            else:
                # r'' u'' b''
                if (self.token_type == Id.Lit_Chars and
                        self.lexer.LookAheadOne(
                            lex_mode_e.ShCommand) == Id.Left_SingleQuote):

                    # When shopt -s parse_raw_string:
                    #     echo r'hi' is like echo 'hi'
                    #
                    #     echo u'\u{3bc}' b'\yff' works

                    tok = self.cur_token
                    if self.parse_opts.parse_ysh_string():
                        if lexer.TokenEquals(tok, 'r'):
                            left_id = Id.Left_RSingleQuote
                        elif lexer.TokenEquals(tok, 'u'):
                            left_id = Id.Left_USingleQuote
                        elif lexer.TokenEquals(tok, 'b'):
                            left_id = Id.Left_BSingleQuote
                        else:
                            left_id = Id.Undefined_Tok

                        if left_id != Id.Undefined_Tok:
                            # skip the r, and then 'foo' will be read as normal
                            self._SetNext(lex_mode_e.ShCommand)

                            self._GetToken()
                            assert self.token_type == Id.Left_SingleQuote, self.token_type

                            # Read the word in a different lexer mode
                            return self._ReadYshSingleQuoted(left_id)

                return self._ReadCompoundWord(lex_mode)

    def ParseVarRef(self):
        # type: () -> BracedVarSub
        """DYNAMIC parsing of what's inside ${!ref}

        # Same as VarOf production
        VarRefExpr = VarOf EOF
        """
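        # Example: with ref='x', bash-style ${!ref} hands the string 'x' to
        # this method, which parses it into a BracedVarSub as if $x had been
        # written.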
        self._SetNext(lex_mode_e.VSub_1)

        self._GetToken()
        if self.token_kind != Kind.VSub:
            p_die('Expected var name', self.cur_token)

        part = self._ParseVarOf()
        # NOTE: no ${ } means no part.left and part.right
        part.left = part.token  # cheat to make test pass
        part.right = part.token

        self._GetToken()
        if self.token_type != Id.Eof_Real:
            p_die('Expected end of var ref expression', self.cur_token)
        return part

    def LookPastSpace(self):
        # type: () -> Id_t
        """Look ahead to the next token.

        For the CommandParser to recognize
           array= (1 2 3)
           YSH for (    versus  bash for ((
           YSH if (     versus  if test
           YSH while (  versus  while test
           YSH bare assignment 'grep =' versus 'grep foo'
        """
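        # For example, after reading the word 'if', the CommandParser can peek
        # past a single space to see Id.Op_LParen for YSH 'if (x)', without
        # consuming any tokens.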
        assert self.token_type != Id.Undefined_Tok
        if self.cur_token.id == Id.WS_Space:
            id_ = self.lexer.LookPastSpace(lex_mode_e.ShCommand)
        else:
            id_ = self.cur_token.id
        return id_

    def LookAheadFuncParens(self):
        # type: () -> bool
        """Special lookahead for f( ) { echo hi; } to check for ( )"""
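        # Roughly: after 'f', return True for '( )' (possibly with spaces
        # between the parens), and False for anything else, e.g. 'f(x)'.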
        assert self.token_type != Id.Undefined_Tok

        # We have to handle 2 cases because we buffer a token
        if self.cur_token.id == Id.Op_LParen:  # saw funcname(
            return self.lexer.LookAheadFuncParens(1)  # go back one char

        elif self.cur_token.id == Id.WS_Space:  # saw funcname WHITESPACE
            return self.lexer.LookAheadFuncParens(0)

        else:
            return False

    def ReadWord(self, word_mode):
        # type: (lex_mode_t) -> word_t
        """Read the next word, using the given lexer mode.

        This is a stateful wrapper for the stateless _ReadWord function.
        """
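        # _ReadWord() returns None for tokens that don't form a word on their
        # own (spaces, comments, skipped newlines), so we loop until we get a
        # real word or Eof.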
        assert word_mode in (lex_mode_e.ShCommand, lex_mode_e.ShCommandBrack,
                             lex_mode_e.DBracket, lex_mode_e.BashRegex)

        if self.buffered_word:  # For integration with pgen2
            w = self.buffered_word
            self.buffered_word = None
        else:
            while True:
                w = self._ReadWord(word_mode)
                if w is not None:
                    break

        self.returned_newline = (word_.CommandId(w) == Id.Op_Newline)
        return w

    def ReadArithWord(self):
        # type: () -> word_t
        while True:
            w = self._ReadArithWord()
            if w is not None:
                break
        return w

    def ReadHereDocBody(self, parts):
        # type: (List[word_part_t]) -> None
        """
        A here doc is like a double quoted context, except " isn't special.
        """
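        # Example: inside <<EOF ... EOF, $x and $(hostname) are substituted,
        # but "double quotes" are literal characters.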
        self._ReadLikeDQ(None, False, parts)
        # Returns nothing

    def ReadForPlugin(self):
        # type: () -> CompoundWord
        """For $PS1, $PS4, etc.

        This is just like reading a here doc line.  "\n" is allowed, as
        well as the typical substitutions ${x} $(echo hi) $((1 + 2)).
        """
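        # E.g. a prompt string like PS1='\u@\h $(date)' is parsed here, so the
        # command sub is re-evaluated when the prompt is rendered, while the
        # backslash escapes pass through for the plugin to interpret.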
        w = CompoundWord([])
        self._ReadLikeDQ(None, False, w.parts)
        return w

    def EmitDocToken(self, b):
        # type: (bool) -> None
        self.emit_doc_token = b

    def Multiline(self, b):
        # type: (bool) -> None
        self.multiline = b


if 0:
    import collections
    WORD_HIST = collections.Counter()