OILS / osh / word_parse.py

2108 lines, 1123 significant
# Copyright 2016 Andy Chu. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
"""
word_parse.py - Parse the shell word language.

Hairy example:

    hi$((1 + 2))"$(echo hi)"${var:-__"$(echo default)"__}

Substitutions can be nested, but which inner subs are allowed depends on the
outer sub.  Notes:

lex_mode_e.ShCommand (_ReadUnquotedLeftParts)
  All subs and quotes are allowed:
    $v ${v}   $() ``   $(())   '' ""   $'' $""   <() >()

lex_mode_e.DQ (_ReadDoubleQuotedLeftParts)
  Var, Command, Arith, but no quotes.
    $v ${v}   $() ``   $(())
  No process substitution.

lex_mode_e.Arith
  Similar to DQ: Var, Command, and Arith sub, but no process sub.  bash
  doesn't allow quotes, but OSH does.  We allow ALL FOUR kinds of quotes,
  because we need those for associative array indexing.

lex_mode_e.VSub_ArgUnquoted
  Like ShCommand, everything is allowed (even process substitutions), but we
  stop at }, and space is SIGNIFICANT.

  Example: ${a:- b }

  ${X:-$v}  ${X:-${v}}  ${X:-$(echo hi)}  ${X:-`echo hi`}  ${X:-$((1+2))}
  ${X:-'single'}  ${X:-"double"}  ${X:-$'\n'}  ${X:-<(echo hi)}

lex_mode_e.VSub_ArgDQ
  In contrast to DQ, VSub_ArgDQ accepts nested "" and $'' and $"", e.g.
  "${x:-"default"}".

  In contrast, VSub_ArgUnquoted respects single quotes and process
  substitution.

  It's weird that double quotes are allowed.  Space is also significant here,
  e.g. "${x:-a "b"}".
"""

from _devbuild.gen import grammar_nt
from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str, Kind
from _devbuild.gen.types_asdl import (lex_mode_t, lex_mode_e)
from _devbuild.gen.syntax_asdl import (
    BoolParamBox,
    Token,
    NameTok,
    loc,
    source,
    DoubleQuoted,
    SingleQuoted,
    BracedVarSub,
    CommandSub,
    ShArrayLiteral,
    AssocPair,
    bracket_op,
    bracket_op_t,
    suffix_op,
    suffix_op_t,
    rhs_word,
    rhs_word_e,
    rhs_word_t,
    word_e,
    word_t,
    CompoundWord,
    word_part,
    word_part_t,
    y_lhs_e,
    arith_expr_t,
    command,
    expr,
    expr_e,
    expr_t,
    pat_t,
    ArgList,
    Proc,
    Func,
    Subscript,
    Attribute,
)
from core import alloc
from core.error import p_die
from mycpp.mylib import log
from core import pyutil
from core import ui
from frontend import consts
from frontend import lexer
from frontend import reader
from osh import tdop
from osh import arith_parse
from osh import braces
from osh import word_
from osh import word_compile
from mycpp.mylib import tagswitch

from typing import List, Optional, Tuple, cast
from typing import TYPE_CHECKING
if TYPE_CHECKING:
    from frontend.lexer import Lexer
    from frontend.parse_lib import ParseContext
    from frontend.reader import _Reader
    from osh.cmd_parse import VarChecker

unused1 = log
unused2 = Id_str

KINDS_THAT_END_WORDS = [Kind.Eof, Kind.WS, Kind.Op, Kind.Right]


class WordEmitter(object):
    """Common interface for [ and [["""

    def __init__(self):
        # type: () -> None
        """Empty constructor for mycpp."""
        pass

    def ReadWord(self, lex_mode):
        # type: (lex_mode_t) -> word_t
        raise NotImplementedError()


class WordParser(WordEmitter):

    def __init__(self, parse_ctx, lexer, line_reader):
        # type: (ParseContext, Lexer, _Reader) -> None
        self.parse_ctx = parse_ctx
        self.lexer = lexer
        self.line_reader = line_reader
        self.arena = line_reader.arena

        self.parse_opts = parse_ctx.parse_opts
        self.a_parser = tdop.TdopParser(arith_parse.Spec(), self,
                                        self.parse_opts)
        self.Reset()

    def Init(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Used to parse arithmetic, see ParseContext."""
        self.next_lex_mode = lex_mode

    def Reset(self):
        # type: () -> None
        """Called by interactive loop."""
        # For _GetToken()
        self.cur_token = None  # type: Token
        self.token_kind = Kind.Undefined
        self.token_type = Id.Undefined_Tok

        self.next_lex_mode = lex_mode_e.ShCommand

        # Boolean mutated by CommandParser via word_.ctx_EmitDocToken.  For
        # ### doc comments
        self.emit_doc_token = False
        # Boolean mutated by CommandParser via word_.ctx_Multiline.  '...'
        # starts multiline mode.
        self.multiline = False

        # For detecting invalid \n\n in multiline mode.  Counts what we got
        # directly from the lexer.
        self.newline_state = 0
        # For consolidating \n\n -> \n for the CALLER.  This simplifies the
        # parsers that consume words.
        self.returned_newline = False

        # For integration with pgen2
        self.buffered_word = None  # type: word_t

    def _GetToken(self):
        # type: () -> None
        """Call this when you need to make a decision based on any of:

        self.token_type
        self.token_kind
        self.cur_token      # contents
        """
        if self.next_lex_mode != lex_mode_e.Undefined:
            self.cur_token = self.lexer.Read(self.next_lex_mode)
            self.token_type = self.cur_token.id
            self.token_kind = consts.GetKind(self.token_type)

            # number of consecutive newlines, ignoring whitespace
            if self.token_type == Id.Op_Newline:
                self.newline_state += 1
            elif self.token_kind != Kind.WS:
                self.newline_state = 0

            self.parse_ctx.trail.AppendToken(self.cur_token)  # For completion
            self.next_lex_mode = lex_mode_e.Undefined

    def _SetNext(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Set the next lex state, but don't actually read a token.

        We need this for proper interactive parsing.
        """
        self.next_lex_mode = lex_mode

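    # Illustrative sketch (not from the original file): _SetNext() only
    # records the mode; the token is actually consumed by the next
    # _GetToken() call.  A caller that wants to skip one token in arithmetic
    # mode would do:
    #
    #     self._SetNext(lex_mode_e.Arith)  # decide how to lex the next token
    #     self._GetToken()                 # now cur_token/token_type are valid
    #
    # Deferring the Read() this way is what makes interactive parsing work:
    # no new line is pulled from the reader until a decision requires it.
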
    def _ReadVarOpArg(self, arg_lex_mode):
        # type: (lex_mode_t) -> rhs_word_t

        # NOTE: Operators like | and < are not treated as special, so
        # ${a:- | >} is valid, even when unquoted.
        self._SetNext(arg_lex_mode)
        self._GetToken()

        w = self._ReadVarOpArg2(arg_lex_mode, Id.Undefined_Tok,
                                True)  # empty_ok

        # If the Compound has no parts, and we're in a double-quoted VarSub
        # arg, and empty_ok, then return Empty.  This is so it can evaluate to
        # the empty string and not get elided.
        #
        # Examples:
        # - "${s:-}", "${s/%pat/}"
        # It's similar to LooksLikeShAssignment where we turn x= into x=''.
        # And it has the same potential problem of not having Token location
        # info.
        #
        # NOTE: empty_ok is False only for the PatSub pattern, which means
        # we'll return a Compound with no parts, which is explicitly checked
        # with a custom error message.
        if len(w.parts) == 0 and arg_lex_mode == lex_mode_e.VSub_ArgDQ:
            return rhs_word.Empty

        return w

    def _ReadVarOpArg2(self, arg_lex_mode, eof_type, empty_ok):
        # type: (lex_mode_t, Id_t, bool) -> CompoundWord
        """Return a CompoundWord.

        Helper function for _ReadVarOpArg and used directly by
        _ReadPatSubVarOp.
        """
        w = self._ReadCompoundWord3(arg_lex_mode, eof_type, empty_ok)
        #log('w %s', w)
        tilde = word_.TildeDetect(w)
        if tilde:
            w = tilde
        return w

    def _ReadSliceVarOp(self):
        # type: () -> suffix_op.Slice
        """VarOf ':' ArithExpr (':' ArithExpr )?"""
        self._SetNext(lex_mode_e.Arith)
        self._GetToken()
        cur_id = self.token_type  # e.g. Id.Arith_Colon

        if self.token_type == Id.Arith_Colon:  # A pun for Id.VOp2_Colon
            # no beginning specified
            begin = None  # type: Optional[arith_expr_t]
        else:
            begin = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()

        if cur_id == Id.Arith_RBrace:
            no_length = None  # type: Optional[arith_expr_t]  # No length specified
            return suffix_op.Slice(begin, no_length)

        # Id.Arith_Colon is a pun for Id.VOp2_Colon
        if cur_id == Id.Arith_Colon:
            self._SetNext(lex_mode_e.Arith)
            length = self._ReadArithExpr(Id.Arith_RBrace)
            return suffix_op.Slice(begin, length)

        p_die("Expected : or } in slice", self.cur_token)
        raise AssertionError()  # for MyPy

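    # Illustrative examples (not from the original file) of what
    # _ReadSliceVarOp produces:
    #
    #     ${s:1:2}  -> suffix_op.Slice(begin=1,    length=2)
    #     ${s:1}    -> suffix_op.Slice(begin=1,    length=None)
    #     ${s::2}   -> suffix_op.Slice(begin=None, length=2)
    #
    # The begin/length fields hold arith_expr_t nodes, so ${s:x+1:y*2} works
    # the same way.
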
    def _ReadPatSubVarOp(self):
        # type: () -> suffix_op.PatSub
        """Looking at the first '/' after VarOf:

        VarSub = ...
               | VarOf '/' Match ( '/' WORD? )?
        Match  = '/' WORD   # can't be empty
               | '#' WORD?  # may be empty
               | '%' WORD?
        """
        slash_tok = self.cur_token  # location info
        replace_mode = Id.Undefined_Tok  # bizarre syntax / # %

        self._SetNext(lex_mode_e.VSub_ArgUnquoted)  # advance past /

        self._GetToken()
        if self.token_type == Id.Right_DollarBrace:
            pat = CompoundWord([])
            return suffix_op.PatSub(pat, rhs_word.Empty, replace_mode,
                                    slash_tok)

        if self.token_type in (Id.Lit_Slash, Id.Lit_Pound, Id.Lit_Percent):
            replace_mode = self.token_type
            self._SetNext(lex_mode_e.VSub_ArgUnquoted)

        # Bash quirk:
        # echo ${x/#/replace} has an empty pattern
        # echo ${x////replace} is non-empty; it means echo ${x//'/'/replace}
        empty_ok = replace_mode != Id.Lit_Slash
        pat = self._ReadVarOpArg2(lex_mode_e.VSub_ArgUnquoted, Id.Lit_Slash,
                                  empty_ok)
        #log('pat 1 %r', pat)

        if self.token_type == Id.Lit_Slash:
            # read until }
            replace = self._ReadVarOpArg(
                lex_mode_e.VSub_ArgUnquoted)  # type: rhs_word_t
            #log('r 1 %r', replace)
        else:
            # e.g. ${v/a} is the same as ${v/a/} -- empty replacement string
            replace = rhs_word.Empty

        self._GetToken()
        if self.token_type != Id.Right_DollarBrace:
            # This happens on invalid code
            p_die(
                "Expected } after replacement string, got %s" %
                ui.PrettyId(self.token_type), self.cur_token)

        return suffix_op.PatSub(pat, replace, replace_mode, slash_tok)

    def _ReadSubscript(self):
        # type: () -> bracket_op_t
        """Subscript = '[' ('@' | '*' | ArithExpr) ']'"""
        # Lookahead to see if we get @ or *.  Otherwise read a full arithmetic
        # expression.
        next_id = self.lexer.LookPastSpace(lex_mode_e.Arith)
        if next_id in (Id.Lit_At, Id.Arith_Star):
            op = bracket_op.WholeArray(next_id)  # type: bracket_op_t

            self._SetNext(lex_mode_e.Arith)  # skip past [
            self._GetToken()
            self._SetNext(lex_mode_e.Arith)  # skip past @
            self._GetToken()
        else:
            self._SetNext(lex_mode_e.Arith)  # skip past [
            anode = self._ReadArithExpr(Id.Arith_RBracket)
            op = bracket_op.ArrayIndex(anode)

        if self.token_type != Id.Arith_RBracket:  # Should be looking at ]
            p_die('Expected ] to close subscript', self.cur_token)

        self._SetNext(lex_mode_e.VSub_2)  # skip past ]
        self._GetToken()  # Needed to be in the same spot as no subscript

        return op

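    # Illustrative examples (not from the original file):
    #
    #     ${a[i+1]}  -> bracket_op.ArrayIndex(arith node for i+1)
    #     ${a[@]}    -> bracket_op.WholeArray(Id.Lit_At)
    #     ${a[*]}    -> bracket_op.WholeArray(Id.Arith_Star)
    #
    # Note that @ and * take the LookPastSpace fast path above, while any
    # other subscript goes through the full arithmetic parser.
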
    def _ParseVarOf(self):
        # type: () -> BracedVarSub
        """
        VarOf = NAME Subscript?
              | NUMBER      # no subscript allowed, none of these are arrays
                            # ${@[1]} doesn't work, even though slicing does
              | VarSymbol
        """
        self._GetToken()
        name_token = self.cur_token
        self._SetNext(lex_mode_e.VSub_2)

        self._GetToken()  # Check for []
        if self.token_type == Id.VOp2_LBracket:
            bracket_op = self._ReadSubscript()
        else:
            bracket_op = None

        part = BracedVarSub.CreateNull()
        part.token = name_token
        part.var_name = lexer.TokenVal(name_token)
        part.bracket_op = bracket_op
        return part

    def _ParseVarExpr(self, arg_lex_mode, allow_query=False):
        # type: (lex_mode_t, bool) -> BracedVarSub
        """Start parsing at the op -- we already skipped past the name."""
        part = self._ParseVarOf()

        self._GetToken()
        if self.token_type == Id.Right_DollarBrace:
            return part  # no ops

        op_kind = self.token_kind

        if op_kind == Kind.VTest:
            tok = self.cur_token
            arg_word = self._ReadVarOpArg(arg_lex_mode)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            part.suffix_op = suffix_op.Unary(tok, arg_word)

        elif op_kind == Kind.VOpOil:
            tok = self.cur_token
            arg_word = self._ReadVarOpArg(arg_lex_mode)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            UP_arg_word = arg_word
            with tagswitch(arg_word) as case:
                if case(rhs_word_e.Empty):
                    pass
                elif case(rhs_word_e.Compound):
                    arg_word = cast(CompoundWord, UP_arg_word)
                    # This handles ${x|html} and ${x %.3f} now
                    # However I think ${x %.3f} should be statically parsed?
                    # It can enter the printf lexer modes.
                    ok, arg, quoted = word_.StaticEval(arg_word)
                    if not ok or quoted:
                        p_die('Expected a constant argument',
                              loc.Word(arg_word))

            part.suffix_op = suffix_op.Static(tok, arg)

        elif op_kind == Kind.VOp0:
            part.suffix_op = self.cur_token  # Nullary
            self._SetNext(lex_mode_e.VSub_2)  # Expecting }
            self._GetToken()

        elif op_kind == Kind.VOp1:  # % %% # ## etc.
            tok = self.cur_token
            # Weird exception that all shells have: these operators take a
            # glob pattern, so they're lexed as VSub_ArgUnquoted, not
            # VSub_ArgDQ
            arg_word = self._ReadVarOpArg(lex_mode_e.VSub_ArgUnquoted)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            part.suffix_op = suffix_op.Unary(tok, arg_word)

        elif op_kind == Kind.VOp2:  # / : [ ]
            if self.token_type == Id.VOp2_Slash:
                patsub_op = self._ReadPatSubVarOp()  # type: suffix_op_t
                part.suffix_op = patsub_op

                # Checked by the method above
                assert self.token_type == Id.Right_DollarBrace, self.cur_token

            elif self.token_type == Id.VOp2_Colon:
                part.suffix_op = self._ReadSliceVarOp()
                # NOTE: } in arithmetic mode.
                if self.token_type != Id.Arith_RBrace:
                    # Token seems off; doesn't point to X in ${a:1:2 X
                    p_die('Expected } to close ${', self.cur_token)

            else:
                # TODO: Does this ever happen?
                p_die('Unexpected token in ${} (%s)' % 'VOp2', self.cur_token)

        elif op_kind == Kind.VOp3:  # ${prefix@} etc.
            if allow_query:
                part.suffix_op = self.cur_token  # Nullary
                self._SetNext(lex_mode_e.VSub_2)  # Expecting }
                self._GetToken()
            else:
                p_die("Unexpected token in ${} (%s)" % 'VOp3', self.cur_token)

        # NOTE: Arith_RBrace is for slicing, because it reads } in arithmetic
        # mode.  It's redundantly checked above.
        if self.token_type not in (Id.Right_DollarBrace, Id.Arith_RBrace):
            # ${a.} or ${!a.}
            p_die('Expected } to close ${', self.cur_token)

        # Now look for ops
        return part

    def ReadBracedVarSub(self, left_token):
        # type: (Token) -> Tuple[BracedVarSub, Token]
        """For YSH expressions like var x = ${x:-"default"}."""
        part = self._ReadBracedVarSub(left_token, d_quoted=False)
        last_token = self.cur_token
        return part, last_token

    def _ReadBracedVarSub(self, left_token, d_quoted):
        # type: (Token, bool) -> BracedVarSub
        """For the ${} expression language.

        NAME        = [a-zA-Z_][a-zA-Z0-9_]*
        NUMBER      = [0-9]+                    # ${10}, ${11}, ...

        Subscript   = '[' ('@' | '*' | ArithExpr) ']'
        VarSymbol   = '!' | '@' | '#' | ...
        VarOf       = NAME Subscript?
                    | NUMBER   # no subscript allowed, none of these are
                               # arrays; ${@[1]} doesn't work, even though
                               # slicing does
                    | VarSymbol

        NULLARY_OP  = '@Q' | '@E' | '@P' | '@A' | '@a'  # VOp0

        TEST_OP     = '-' | ':-' | '=' | ':=' | '+' | ':+' | '?' | ':?'
        STRIP_OP    = '#' | '##' | '%' | '%%'
        CASE_OP     = ',' | ',,' | '^' | '^^'
        UnaryOp     = TEST_OP | STRIP_OP | CASE_OP

        YSH_UNARY   = '|' | ' '               # ${x|html} and ${x %.3f};
                                              # SPACE is the operator, not %
        Match       = ('/' | '#' | '%') WORD  # match all / prefix / suffix
        VarExpr     = VarOf
                    | VarOf NULLARY_OP
                    | VarOf UnaryOp WORD
                    | VarOf YSH_UNARY STATIC_WORD
                    | VarOf ':' ArithExpr (':' ArithExpr )?
                    | VarOf '/' Match '/' WORD

        LengthExpr  = '#' VarOf    # can't apply operators after length

        RefOrKeys   = '!' VarExpr  # CAN apply operators after a named ref
                                   # ${!ref[0]} vs ${!keys[@]} resolved later

        PrefixQuery = '!' NAME ('*' | '@')  # list variable names with a
                                            # prefix

        BuiltinSub  = '.' WORD+    # ${.myproc 'builtin' $sub}

        VarSub      = LengthExpr
                    | RefOrKeys
                    | PrefixQuery
                    | VarExpr
                    | BuiltinSub

        NOTES:
        - Arithmetic expressions are used twice, inside subscripts ${a[x+1]}
          and slicing ${a:x+1:y+2}
        - ${#} and ${!} need LL(2) lookahead (considering how my tokenizer
          works)
        - @ and * are technically arithmetic expressions in this
          implementation
        - We don't account for bash 4.4: ${param@operator} -- Q E P A a.
          Note that it's also vectorized.

        Strictness over bash:
        - echo ${a[0][0]} doesn't do anything useful, so we disallow it from
          the grammar
        - ! and # prefixes can't be composed, even though named refs can be
          composed with other operators
        - '#' means 4 different things: length prefix, VarSymbol, UnaryOp to
          strip a prefix, and it can also be a literal part of WORD.

        From the parser's point of view, the prefix # can't be combined with
        UnaryOp/slicing/matching, and the ! can.  However

        - ${a[@]:1:2} is not allowed
        - ${#a[@]:1:2} is allowed, but gives the wrong answer
        """
        if d_quoted:
            arg_lex_mode = lex_mode_e.VSub_ArgDQ
        else:
            arg_lex_mode = lex_mode_e.VSub_ArgUnquoted

        self._SetNext(lex_mode_e.VSub_1)
        self._GetToken()

        ty = self.token_type
        first_tok = self.cur_token

        if ty == Id.VSub_Pound:
            # Disambiguate
            next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
            if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
                # e.g. a name, '#' is the prefix
                self._SetNext(lex_mode_e.VSub_1)
                part = self._ParseVarOf()

                self._GetToken()
                if self.token_type != Id.Right_DollarBrace:
                    p_die('Expected } after length expression',
                          self.cur_token)

                part.prefix_op = first_tok

            else:  # not a prefix, '#' is the variable
                part = self._ParseVarExpr(arg_lex_mode)

        elif ty == Id.VSub_Bang:
            next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
            if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
                # e.g. a name, '!' is the prefix
                # ${!a} -- this is a ref
                # ${!3} -- this is a ref
                # ${!a[1]} -- this is a ref
                # ${!a[@]} -- this is a keys
                # No lookahead -- do it in a second step, or at runtime
                self._SetNext(lex_mode_e.VSub_1)
                part = self._ParseVarExpr(arg_lex_mode, allow_query=True)

                part.prefix_op = first_tok

            else:  # not a prefix, '!' is the variable
                part = self._ParseVarExpr(arg_lex_mode)

        elif ty == Id.VSub_Dot:
            # Note: this will become a new builtin_sub type, so this method
            # must return word_part_t rather than BracedVarSub.  I don't
            # think that should cause problems.
            p_die('TODO: ${.myproc builtin sub}', self.cur_token)

        # VS_NAME, VS_NUMBER, symbol that isn't # or !
        elif self.token_kind == Kind.VSub:
            part = self._ParseVarExpr(arg_lex_mode)

        else:
            # e.g. ${^}
            p_die('Unexpected token in ${}', self.cur_token)

        part.left = left_token  # attach the argument
        part.right = self.cur_token
        return part

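    # Illustrative examples (not from the original file) of how the prefix
    # disambiguation above plays out:
    #
    #     ${#s}   -> VSub_Pound, then a name follows: '#' is a length
    #                prefix_op
    #     ${#}    -> VSub_Pound, but LookPastSpace sees }: '#' is the
    #                variable $#
    #     ${!ref} -> VSub_Bang as a nameref prefix_op
    #     ${!}    -> VSub_Bang as the variable $!
    #
    # This is the LL(2) lookahead mentioned in the grammar notes above.
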
    def _ReadSingleQuoted(self, left_token, lex_mode):
        # type: (Token, lex_mode_t) -> SingleQuoted
        """Internal method to read a word_part."""
        tokens = []  # type: List[Token]
        # In command mode, we never disallow backslashes like '\'
        self.ReadSingleQuoted(lex_mode, left_token, tokens, False)
        right_quote = self.cur_token
        node = SingleQuoted(left_token, tokens, right_quote)
        return node

    def ReadSingleQuoted(self, lex_mode, left_token, tokens, is_ysh_expr):
        # type: (lex_mode_t, Token, List[Token], bool) -> Token
        """Appends to tokens

        Used by expr_parse.py
        """

        # echo '\' is allowed, but x = '\' is invalid, in favor of x = r'\'
        no_backslashes = is_ysh_expr and left_token.id == Id.Left_SingleQuote

        expected_end_tokens = 3 if left_token.id in (
            Id.Left_TSingleQuote, Id.Left_RTSingleQuote,
            Id.Left_UTSingleQuote, Id.Left_BTSingleQuote) else 1
        num_end_tokens = 0

        while num_end_tokens < expected_end_tokens:
            self._SetNext(lex_mode)
            self._GetToken()

            # Kind.Char emitted in DOLLAR_SQ state
            if self.token_kind in (Kind.Lit, Kind.Char):
                tok = self.cur_token
                # Happens in lex_mode_e.SQ: 'one\two' is ambiguous, should be
                # r'one\two' or c'one\\two'
                if no_backslashes and '\\' in tok.tval:
                    p_die(
                        r"Strings with backslashes should look like r'\n' or u'\n' or b'\n'",
                        tok)

                if is_ysh_expr:
                    # Disallow var x = $'\001'.  Arguably we don't need these
                    # checks because u'\u{1}' is the way to write it.
                    if self.token_type == Id.Char_Octal3:
                        p_die(
                            r"Use \xhh or \u{...} instead of octal escapes in YSH strings",
                            tok)

                    if self.token_type == Id.Char_Hex and self.cur_token.length != 4:
                        # disallow \xH
                        p_die(
                            r'Invalid hex escape in YSH string (must be \xHH)',
                            tok)

                tokens.append(tok)

            elif self.token_kind == Kind.Unknown:
                tok = self.cur_token
                assert tok.id == Id.Unknown_Backslash, tok

                # x = $'\z' is disallowed; ditto for echo $'\z' if
                # shopt -u parse_backslash
                if is_ysh_expr or not self.parse_opts.parse_backslash():
                    p_die(
                        "Invalid char escape in C-style string literal (OILS-ERR-11)",
                        tok)

                tokens.append(tok)

            elif self.token_kind == Kind.Eof:
                p_die('Unexpected EOF in single-quoted string that began here',
                      left_token)

            elif self.token_kind == Kind.Right:
                # assume Id.Right_SingleQuote
                num_end_tokens += 1
                tokens.append(self.cur_token)

            else:
                raise AssertionError(self.cur_token)

            if self.token_kind != Kind.Right:
                num_end_tokens = 0  # we need three in a ROW

        if expected_end_tokens == 1:
            tokens.pop()
        elif expected_end_tokens == 3:  # Get rid of spurious end tokens
            tokens.pop()
            tokens.pop()
            tokens.pop()

        # Remove space from ''' r''' $''' in both expression mode and command
        # mode
        if left_token.id in (Id.Left_TSingleQuote, Id.Left_RTSingleQuote,
                             Id.Left_UTSingleQuote, Id.Left_BTSingleQuote):
            word_compile.RemoveLeadingSpaceSQ(tokens)

        # Validation after lexing - same 2 checks in j8.LexerDecoder
        is_u_string = left_token.id in (Id.Left_USingleQuote,
                                        Id.Left_UTSingleQuote)

        for tok in tokens:
            # u'\yff' is not valid, but b'\yff' is
            if is_u_string and tok.id == Id.Char_YHex:
                p_die(
                    r"%s escapes not allowed in u'' strings" %
                    lexer.TokenVal(tok), tok)
            # \u{dc00} isn't valid
            if tok.id == Id.Char_UBraced:
                h = lexer.TokenSlice(tok, 3, -1)  # \u{123456}
                i = int(h, 16)
                if 0xD800 <= i and i < 0xE000:
                    p_die(
                        r"%s escape is illegal because it's in the surrogate range"
                        % lexer.TokenVal(tok), tok)

        return self.cur_token

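    # Illustrative trace (not from the original file) of the end-token
    # counting above.  After the opening ''' of '''a''', the loop sees:
    #
    #     token    num_end_tokens
    #     a        0
    #     '        1
    #     '        2
    #     '        3   -> loop exits; the 3 quote tokens are then popped
    #
    # A lone ' inside the string resets the count, which is why the closing
    # quotes must be CONSECUTIVE.
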
    def _ReadDoubleQuotedLeftParts(self):
        # type: () -> word_part_t
        """Read substitution parts in a double quoted context."""
        if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick):
            return self._ReadCommandSub(self.token_type, d_quoted=True)

        if self.token_type == Id.Left_DollarBrace:
            return self._ReadBracedVarSub(self.cur_token, d_quoted=True)

        if self.token_type == Id.Left_DollarDParen:
            return self._ReadArithSub()

        if self.token_type == Id.Left_DollarBracket:
            return self._ReadExprSub(lex_mode_e.DQ)

        raise AssertionError(self.cur_token)

    def _ReadYshSingleQuoted(self, left_id):
        # type: (Id_t) -> CompoundWord
        """Read YSH style strings

        r'' u'' b''
        r''' ''' u''' ''' b''' '''
        """
        #log('BEF self.cur_token %s', self.cur_token)
        if left_id == Id.Left_RSingleQuote:
            lexer_mode = lex_mode_e.SQ_Raw
            triple_left_id = Id.Left_RTSingleQuote
        elif left_id == Id.Left_USingleQuote:
            lexer_mode = lex_mode_e.J8_Str
            triple_left_id = Id.Left_UTSingleQuote
        elif left_id == Id.Left_BSingleQuote:
            lexer_mode = lex_mode_e.J8_Str
            triple_left_id = Id.Left_BTSingleQuote
        else:
            raise AssertionError(left_id)

        # Needed for syntax checks
        left_tok = self.cur_token
        left_tok.id = left_id

        sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)

        if (len(sq_part.tokens) == 0 and self.lexer.ByteLookAhead() == "'"):
            self._SetNext(lex_mode_e.ShCommand)
            self._GetToken()

            assert self.token_type == Id.Left_SingleQuote
            # HACK: magically transform the third ' in u''' to
            # Id.Left_UTSingleQuote, so that ''' is the terminator
            left_tok = self.cur_token
            left_tok.id = triple_left_id

            # Handles stripping leading whitespace
            sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)

        # Advance and validate
        self._SetNext(lex_mode_e.ShCommand)

        self._GetToken()
        if self.token_kind not in KINDS_THAT_END_WORDS:
            p_die('Unexpected token after YSH single-quoted string',
                  self.cur_token)

        return CompoundWord([sq_part])

    def _ReadUnquotedLeftParts(self, triple_out):
        # type: (Optional[BoolParamBox]) -> word_part_t
        """Read substitutions and quoted strings (for lex_mode_e.ShCommand).

        If triple_out is set, then we try parsing triple quoted strings,
        and set its value to True if we got one.
        """
        if self.token_type in (Id.Left_DoubleQuote, Id.Left_DollarDoubleQuote):
            # Note: $"" is a synonym for "".  It might make sense if it added
            # \n \0 \x00 \u{123} etc.  But that's not what bash does!
            dq_part = self._ReadDoubleQuoted(self.cur_token)
            # Got empty word "" and there's a " after
            if (triple_out and len(dq_part.parts) == 0 and
                    self.lexer.ByteLookAhead() == '"'):

                self._SetNext(lex_mode_e.ShCommand)
                self._GetToken()
                # HACK: magically transform the third " in """ to
                # Id.Left_TDoubleQuote, so that """ is the terminator
                left_dq_token = self.cur_token
                left_dq_token.id = Id.Left_TDoubleQuote
                triple_out.b = True  # let caller know we got it
                return self._ReadDoubleQuoted(left_dq_token)

            return dq_part

        if self.token_type in (Id.Left_SingleQuote, Id.Left_RSingleQuote,
                               Id.Left_DollarSingleQuote):
            if self.token_type == Id.Left_SingleQuote:
                lexer_mode = lex_mode_e.SQ_Raw
                triple_left_id = Id.Left_TSingleQuote
            elif self.token_type == Id.Left_RSingleQuote:
                lexer_mode = lex_mode_e.SQ_Raw
                triple_left_id = Id.Left_RTSingleQuote
            else:
                lexer_mode = lex_mode_e.SQ_C
                # there is no such thing as $'''
                triple_left_id = Id.Undefined_Tok

            sq_part = self._ReadSingleQuoted(self.cur_token, lexer_mode)

            # Got empty '' or r'' and there's a ' after
            # u'' and b'' are handled in _ReadYshSingleQuoted
            if (triple_left_id != Id.Undefined_Tok and
                    triple_out is not None and len(sq_part.tokens) == 0 and
                    self.lexer.ByteLookAhead() == "'"):

                self._SetNext(lex_mode_e.ShCommand)
                self._GetToken()

                # HACK: magically transform the third ' in ''' to
                # Id.Left_TSingleQuote, so that ''' is the terminator
                left_sq_token = self.cur_token
                left_sq_token.id = triple_left_id

                triple_out.b = True  # let caller know we got it
                return self._ReadSingleQuoted(left_sq_token, lexer_mode)

            return sq_part

        if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick,
                               Id.Left_ProcSubIn, Id.Left_ProcSubOut):
            return self._ReadCommandSub(self.token_type, d_quoted=False)

        if self.token_type == Id.Left_DollarBrace:
            return self._ReadBracedVarSub(self.cur_token, d_quoted=False)

        if self.token_type == Id.Left_DollarDParen:
            return self._ReadArithSub()

        if self.token_type == Id.Left_DollarBracket:
            return self._ReadExprSub(lex_mode_e.ShCommand)

        raise AssertionError(self.cur_token)

    def _ReadExtGlob(self):
        # type: () -> word_part.ExtGlob
        """
        Grammar:
          Item    = CompoundWord | EPSILON  # important: @(foo|) is allowed
          LEFT    = '@(' | '*(' | '+(' | '?(' | '!('
          RIGHT   = ')'
          ExtGlob = LEFT (Item '|')* Item RIGHT  # ITEM may be empty
          Compound includes ExtGlob
        """
        left_token = self.cur_token
        right_token = None  # type: Token
        arms = []  # type: List[CompoundWord]

        self.lexer.PushHint(Id.Op_RParen, Id.Right_ExtGlob)
        self._SetNext(lex_mode_e.ExtGlob)  # advance past LEFT

        read_word = False  # did we just read a word?  To handle @(||).

        while True:
            self._GetToken()

            if self.token_type == Id.Right_ExtGlob:
                if not read_word:
                    arms.append(CompoundWord([]))
                right_token = self.cur_token
                break

            elif self.token_type == Id.Op_Pipe:
                if not read_word:
                    arms.append(CompoundWord([]))
                read_word = False
                self._SetNext(lex_mode_e.ExtGlob)

            # lex mode EXTGLOB should only produce these 4 kinds of tokens
            elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub,
                                     Kind.ExtGlob):
                w = self._ReadCompoundWord(lex_mode_e.ExtGlob)
                arms.append(w)
                read_word = True

            elif self.token_kind == Kind.Eof:
                p_die('Unexpected EOF reading extended glob that began here',
                      left_token)

            else:
                raise AssertionError(self.cur_token)

        return word_part.ExtGlob(left_token, arms, right_token)

    def _ReadLikeDQ(self, left_token, is_ysh_expr, out_parts):
        # type: (Optional[Token], bool, List[word_part_t]) -> None
        """
        Args:
          left_token: A token if we are reading a double quoted part, or None
            if we're reading a here doc.
          is_ysh_expr: Whether to disallow backticks and invalid char escapes
          out_parts: list of word_part to append to
        """
        if left_token:
            expected_end_tokens = 3 if left_token.id == Id.Left_TDoubleQuote else 1
        else:
            expected_end_tokens = 1000  # here doc will break

        num_end_tokens = 0
        while num_end_tokens < expected_end_tokens:
            self._SetNext(lex_mode_e.DQ)
            self._GetToken()

            if self.token_kind == Kind.Lit:
                if self.token_type == Id.Lit_EscapedChar:
                    tok = self.cur_token
                    ch = lexer.TokenSliceLeft(tok, 1)
                    part = word_part.EscapedLiteral(tok,
                                                    ch)  # type: word_part_t
                else:
                    if self.token_type == Id.Lit_BadBackslash:
                        # echo "\z" is OK in shell, but 'x = "\z" is a syntax
                        # error in YSH.
                        # Slight hole: We don't catch 'x = ${undef:-"\z"}
                        # because of the recursion (unless parse_backslash)
                        if (is_ysh_expr or
                                not self.parse_opts.parse_backslash()):
                            p_die(
                                "Invalid char escape in double quoted string (OILS-ERR-12)",
                                self.cur_token)
                    elif self.token_type == Id.Lit_Dollar:
                        if is_ysh_expr or not self.parse_opts.parse_dollar():
                            p_die("Literal $ should be quoted like \$",
                                  self.cur_token)

                    part = self.cur_token
                out_parts.append(part)

            elif self.token_kind == Kind.Left:
                if self.token_type == Id.Left_Backtick and is_ysh_expr:
                    p_die("Invalid backtick: use $(cmd) or \\` in YSH strings",
                          self.cur_token)

                part = self._ReadDoubleQuotedLeftParts()
                out_parts.append(part)

            elif self.token_kind == Kind.VSub:
                tok = self.cur_token
                part = NameTok(tok, lexer.TokenSliceLeft(tok, 1))
                out_parts.append(part)
                # NOTE: parsing "$f(x)" would BREAK CODE.  Could add a mode
                # for it later.

            elif self.token_kind == Kind.Right:
                assert self.token_type == Id.Right_DoubleQuote, self.token_type
                if left_token:
                    num_end_tokens += 1

                # In a here doc, the right quote is literal!
                out_parts.append(self.cur_token)

            elif self.token_kind == Kind.Eof:
                if left_token:
                    p_die(
                        'Unexpected EOF reading double-quoted string that began here',
                        left_token)
                else:  # here docs will have an EOF in their token stream
                    break

            else:
                raise AssertionError(self.cur_token)

            if self.token_kind != Kind.Right:
                num_end_tokens = 0  # """ must be CONSECUTIVE

        if expected_end_tokens == 1:
            out_parts.pop()
        elif expected_end_tokens == 3:
            out_parts.pop()
            out_parts.pop()
            out_parts.pop()

        # Remove space from """ in both expression mode and command mode
        if left_token and left_token.id == Id.Left_TDoubleQuote:
            word_compile.RemoveLeadingSpaceDQ(out_parts)

        # Return nothing, since we appended to 'out_parts'

    def _ReadDoubleQuoted(self, left_token):
        # type: (Token) -> DoubleQuoted
        """Helper function for "hello $name".

        Also ${foo%%a b c}  # treat this as double quoted until you hit }
        """
        parts = []  # type: List[word_part_t]
        self._ReadLikeDQ(left_token, False, parts)

        right_quote = self.cur_token
        return DoubleQuoted(left_token, parts, right_quote)

    def ReadDoubleQuoted(self, left_token, parts):
        # type: (Token, List[word_part_t]) -> Token
        """For expression mode.

        Read var x = "${dir:-}/$name"; etc.
        """
        self._ReadLikeDQ(left_token, True, parts)
        return self.cur_token

    def _ReadCommandSub(self, left_id, d_quoted=False):
        # type: (Id_t, bool) -> CommandSub
        """
        NOTE: This is not in the grammar, because word parts aren't in the
        grammar!

        command_sub = '$(' command_list ')'
                    | '@(' command_list ')'
                    | '<(' command_list ')'
                    | '>(' command_list ')'
                    | ` command_list `
        """
        left_token = self.cur_token

        # Set the lexer in a state so ) becomes the EOF token.
        if left_id in (Id.Left_DollarParen, Id.Left_AtParen,
                       Id.Left_ProcSubIn, Id.Left_ProcSubOut):
            self._SetNext(lex_mode_e.ShCommand)  # advance past $( etc.

            right_id = Id.Eof_RParen
            self.lexer.PushHint(Id.Op_RParen, right_id)
            c_parser = self.parse_ctx.MakeParserForCommandSub(
                self.line_reader, self.lexer, right_id)
            # NOTE: This doesn't use something like main_loop because we
            # don't want to interleave parsing and execution!  Unlike
            # 'source' and 'eval'.
            node = c_parser.ParseCommandSub()

            right_token = c_parser.w_parser.cur_token

        elif left_id == Id.Left_Backtick and self.parse_ctx.do_lossless:
            # NOTE: This is an APPROXIMATE solution for translation ONLY.
            # See test/osh2oil.

            right_id = Id.Eof_Backtick
            self.lexer.PushHint(Id.Left_Backtick, right_id)
            c_parser = self.parse_ctx.MakeParserForCommandSub(
                self.line_reader, self.lexer, right_id)
            node = c_parser.ParseCommandSub()
            right_token = c_parser.w_parser.cur_token

        elif left_id == Id.Left_Backtick:
            if not self.parse_opts.parse_backticks():
                p_die('Use $(cmd) instead of backticks (parse_backticks)',
                      left_token)

            self._SetNext(lex_mode_e.Backtick)  # advance past `

            parts = []  # type: List[str]
            while True:
                self._GetToken()
                #log("TOK %s", self.cur_token)

                if self.token_type == Id.Backtick_Quoted:
                    # Remove leading \
                    parts.append(lexer.TokenSliceLeft(self.cur_token, 1))

                elif self.token_type == Id.Backtick_DoubleQuote:
                    # Compatibility: If backticks are double quoted, then
                    # double quotes within them have to be \"
                    # Shells aren't smart enough to match nested " and `
                    # quotes (but OSH is)
                    if d_quoted:
                        # Remove leading \
                        parts.append(lexer.TokenSliceLeft(self.cur_token, 1))
                    else:
                        parts.append(lexer.TokenVal(self.cur_token))

                elif self.token_type == Id.Backtick_Other:
                    parts.append(lexer.TokenVal(self.cur_token))

                elif self.token_type == Id.Backtick_Right:
                    break

                elif self.token_type == Id.Eof_Real:
                    # Note: this parse error is in the ORIGINAL context.  No
                    # code_str yet.
                    p_die('Unexpected EOF while looking for closing backtick',
                          left_token)

                else:
                    raise AssertionError(self.cur_token)

                self._SetNext(lex_mode_e.Backtick)

            # Calculate right SPID on CommandSub BEFORE re-parsing.
            right_token = self.cur_token

            code_str = ''.join(parts)
            #log('code %r', code_str)

            # NOTE: This is similar to how we parse aliases in
            # osh/cmd_parse.py.  It won't have the same location info as
            # MakeParserForCommandSub(), because the lexer is different.
            arena = self.parse_ctx.arena
            #arena = alloc.Arena()
            line_reader = reader.StringLineReader(code_str, arena)
            c_parser = self.parse_ctx.MakeOshParser(line_reader)
            src = source.Reparsed('backticks', left_token, right_token)
            with alloc.ctx_SourceCode(arena, src):
                node = c_parser.ParseCommandSub()

        else:
            raise AssertionError(left_id)

        return CommandSub(left_token, node, right_token)

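    # Illustrative example (not from the original file) of the backtick
    # compatibility path above.  Given:
    #
    #     echo `echo \`hostname\``
    #
    # the Backtick_Quoted tokens have their leading \ stripped, so code_str
    # becomes 'echo `hostname`', which is then re-parsed with a fresh
    # StringLineReader.  That's why the error locations are only approximate
    # (source.Reparsed), unlike the $(...) path, which reuses the real lexer.
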
    def _ReadExprSub(self, lex_mode):
        # type: (lex_mode_t) -> word_part.ExprSub
        """$[d->key] $[obj.method()] etc."""
        left_token = self.cur_token

        self._SetNext(lex_mode_e.Expr)
        enode, right_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.ysh_expr_sub)

        self._SetNext(lex_mode)  # Move past ]
        return word_part.ExprSub(left_token, enode, right_token)

    def ParseVarDecl(self, kw_token):
        # type: (Token) -> command.VarDecl
        """
        oil_var_decl: name_type_list '=' testlist end_stmt

        Note that assignments must end with \n ; } or EOF.  Unlike shell
        assignments, we disallow:

        var x = 42 | wc -l
        var x = 42 && echo hi
        """
        self._SetNext(lex_mode_e.Expr)
        enode, last_token = self.parse_ctx.ParseVarDecl(kw_token, self.lexer)
        # Hack to move } from what the Expr lexer mode gives to what
        # CommandParser wants
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace

        # Let the CommandParser see the Op_Semi or Op_Newline.
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)  # always back to this
        return enode

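    # Illustrative sketch (not from the original file): after a YSH
    # sub-parser runs, the word parser hands the final token back to
    # CommandParser through self.buffered_word.  E.g. for
    #
    #     var x = 42; echo hi
    #
    # the expression parser consumes through ';' (Op_Semi), which is buffered
    # here so the command parser still sees the statement terminator.
    # Retagging Op_RBrace as Lit_RBrace plays the same role when the
    # declaration ends a brace group.
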
    def ParseMutation(self, kw_token, var_checker):
        # type: (Token, VarChecker) -> command.Mutation
        """
        setvar i = 42
        setvar i += 1
        setvar a[i] = 42
        setvar a[i] += 1
        setvar d.key = 42
        setvar d.key += 1
        """
        self._SetNext(lex_mode_e.Expr)
        enode, last_token = self.parse_ctx.ParseMutation(kw_token, self.lexer)
        # Hack to move } from what the Expr lexer mode gives to what
        # CommandParser wants
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace

        for lhs in enode.lhs:
            UP_lhs = lhs
            with tagswitch(lhs) as case:
                if case(y_lhs_e.Var):
                    lhs = cast(NameTok, UP_lhs)
                    var_checker.Check(kw_token.id, lhs.var_name, lhs.left)

                # Note: this does not cover cases like
                #     setvar (a[0])[1] = v
                #     setvar (d.key).other = v
                # This leaks into catching all typos statically, which may be
                # possible if 'use' makes all names explicit.
                elif case(y_lhs_e.Subscript):
                    lhs = cast(Subscript, UP_lhs)
                    if lhs.obj.tag() == expr_e.Var:
                        v = cast(expr.Var, lhs.obj)
                        var_checker.Check(kw_token.id, v.name, v.left)

                elif case(y_lhs_e.Attribute):
                    lhs = cast(Attribute, UP_lhs)
                    if lhs.obj.tag() == expr_e.Var:
                        v = cast(expr.Var, lhs.obj)
                        var_checker.Check(kw_token.id, v.name, v.left)

        # Let the CommandParser see the Op_Semi or Op_Newline.
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)  # always back to this
        return enode

    def ParseBareDecl(self):
        # type: () -> expr_t
        """
        x = {name: val}
        """
        self._SetNext(lex_mode_e.Expr)
        self._GetToken()
        enode, last_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.command_expr)
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)
        return enode

    def ParseYshExprForCommand(self):
        # type: () -> expr_t

        # Fudge for this case
        #     for x in(y) {
        # versus
        #     for x in (y) {
        #
        # In the former case, ReadWord on 'in' puts the lexer past (.
        # Also see LookPastSpace in CommandParser.
        # A simpler solution would be nicer.

        if self.token_type == Id.Op_LParen:
            self.lexer.MaybeUnreadOne()

        enode, _ = self.parse_ctx.ParseYshExpr(self.lexer,
                                               grammar_nt.ysh_expr)

        self._SetNext(lex_mode_e.ShCommand)
        return enode

    def ParseCommandExpr(self):
        # type: () -> expr_t
        """
        = 1+2
        """
        enode, last_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.command_expr)

        # In some cases, such as the case statement, we expect *the lexer* to
        # be pointing at the token right after the expression.  But the
        # expression parser must have read to the `last_token`.  Unreading
        # places the lexer back in the expected state.  I.e.:
        #
        #     case (x) {                      case (x) {
        #       (else) { = x }                  (else) { = x }
        #              ^ The lexer is here            ^ Unread to here
        #     }                               }
        assert last_token.id in (Id.Op_Newline, Id.Eof_Real, Id.Op_Semi,
                                 Id.Op_RBrace), last_token
        if last_token.id != Id.Eof_Real:
            # Eof_Real is the only token we cannot unread
            self.lexer.MaybeUnreadOne()

        return enode

    def ParseProc(self, node):
        # type: (Proc) -> None

        # proc name-with-hyphens() must be accepted
        self._SetNext(lex_mode_e.ShCommand)
        self._GetToken()
        # example: 'proc f[' gets you Lit_ArrayLhsOpen
        if self.token_type != Id.Lit_Chars:
            p_die('Invalid proc name %s' % ui.PrettyToken(self.cur_token),
                  self.cur_token)

        # TODO: validate this more.  Disallow proc 123 { }, which isn't
        # disallowed for shell functions.  Similar to IsValidVarName().
        node.name = self.cur_token

        last_token = self.parse_ctx.ParseProc(self.lexer, node)

        # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
        assert last_token.id == Id.Op_LBrace
        last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        self._SetNext(lex_mode_e.ShCommand)

    def ParseFunc(self, node):
        # type: (Func) -> None
        last_token = self.parse_ctx.ParseFunc(self.lexer, node)

        # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
        assert last_token.id == Id.Op_LBrace
        last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        self._SetNext(lex_mode_e.ShCommand)

    def ParseYshCasePattern(self):
        # type: () -> Tuple[pat_t, Token]
        pat, left_tok, last_token = self.parse_ctx.ParseYshCasePattern(
            self.lexer)

        if last_token.id == Id.Op_LBrace:
            last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        return pat, left_tok

    def NewlineOkForYshCase(self):
        # type: () -> Id_t
        """Check for optional newline and consume it.

        This is a special case of `_NewlineOk` which fixes some "off-by-one"
        issues that crop up while parsing YSH case arms.  For more details,
        see #oil-dev > Progress On YSH Case Grammar on Zulip.

        Returns a token id, which distinguishes between:

            word   { echo word }
            (3)    { echo expr }
            /e/    { echo eggex }
            }      # right brace
        """
        while True:
            next_id = self.lexer.LookAheadOne(lex_mode_e.Expr)

            # Cannot lookahead past lines
            if next_id == Id.Unknown_Tok:
                self.lexer.MoveToNextLine()
                continue

            next_kind = consts.GetKind(next_id)
            if next_id != Id.Op_Newline and next_kind != Kind.Ignored:
                break

            self.lexer.Read(lex_mode_e.Expr)

        if next_id in (Id.Op_RBrace, Id.Op_LParen, Id.Arith_Slash):
            self._SetNext(lex_mode_e.Expr)  # Continue in expression mode
        else:
            # Consume the trailing Op_Newline
            self._SetNext(lex_mode_e.ShCommand)
            self._GetToken()

        return next_id

    def _ReadArithExpr(self, end_id):
        # type: (Id_t) -> arith_expr_t
        """Read and parse an arithmetic expression in various contexts.

        $(( 1+2 ))
        (( a=1+2 ))
        ${a[ 1+2 ]}
        ${a : 1+2 : 1+2}

        See tests/arith-context.test.sh for ambiguous cases.

        ${a[a[0]]} is valid  # VS_RBRACKET vs Id.Arith_RBracket

        ${s : a<b?0:1 : 1}   # VS_COLON vs Id.Arith_Colon

        See the assertion in ArithParser.Parse() -- unexpected extra input.
        """
        # calls self.ReadWord(lex_mode_e.Arith)
        anode = self.a_parser.Parse()
        cur_id = self.a_parser.CurrentId()
        if end_id != Id.Undefined_Tok and cur_id != end_id:
            p_die(
                'Unexpected token after arithmetic expression (%s != %s)' %
                (ui.PrettyId(cur_id), ui.PrettyId(end_id)),
                loc.Word(self.a_parser.cur_word))
        return anode

    def _ReadArithSub(self):
        # type: () -> word_part.ArithSub
        """Read an arith substitution, which contains an arith expression,
        e.g. $((a + 1)).
        """
        left_tok = self.cur_token

        # The second ) needs to be disambiguated in stuff like:
        #     $(echo $(( 1+2 )) )
        self.lexer.PushHint(Id.Op_RParen, Id.Right_DollarDParen)

        # NOTE: To disambiguate $(( as arith sub vs. command sub and
        # subshell, we could save the lexer/reader state here, and retry if
        # the arithmetic parse fails.  But we can almost always catch this at
        # parse time.  There could be some exceptions like:
        #     $((echo * foo))  # looks like multiplication
        #     $((echo / foo))  # looks like division

        self._SetNext(lex_mode_e.Arith)
        anode = self._ReadArithExpr(Id.Arith_RParen)

        # TODO: This could be DQ or Arith too
        self._SetNext(lex_mode_e.ShCommand)

        # PROBLEM: $(echo $(( 1 + 2 )) )
        # Two right parens break the Id.Eof_RParen scheme
        self._GetToken()
        if self.token_type != Id.Right_DollarDParen:
            p_die('Expected second ) to end arith sub', self.cur_token)

        right_tok = self.cur_token
        return word_part.ArithSub(left_tok, anode, right_tok)

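    # Illustrative example (not from the original file) of the PushHint
    # above, for the input:
    #
    #     $(echo $(( 1 + 2 )) )
    #
    # The arithmetic parser consumes the first ')' as Id.Arith_RParen; the
    # pushed hint then makes the second ')' lex as Id.Right_DollarDParen
    # instead of Id.Op_RParen, so it isn't mistaken for the Id.Eof_RParen
    # that ends the enclosing command sub.
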
    def ReadDParen(self):
        # type: () -> Tuple[arith_expr_t, Token]
        """Read ((1+ 2)) -- command context.

        We're using the word parser because it's very similar to
        _ReadArithExpr above.

        This also returns the terminating `Op_DRightParen` token for use as
        location tracking.
        """
        # The second ) needs to be disambiguated, as in _ReadArithSub above.
        # TODO: Be consistent with ReadForExpression below and use
        # lex_mode_e.Arith?  Then you can get rid of this.
        self.lexer.PushHint(Id.Op_RParen, Id.Op_DRightParen)

        self._SetNext(lex_mode_e.Arith)
        anode = self._ReadArithExpr(Id.Arith_RParen)

        self._SetNext(lex_mode_e.ShCommand)

        # PROBLEM: $(echo $(( 1 + 2 )) )
        self._GetToken()
        right = self.cur_token
        if self.token_type != Id.Op_DRightParen:
            p_die('Expected second ) to end arith statement', self.cur_token)

        self._SetNext(lex_mode_e.ShCommand)

        return anode, right

    def _SetNextNonSpace(self):
        # type: () -> None
        """Same logic as _ReadWord, but for ReadForExpression."""
        while True:
            self._SetNext(lex_mode_e.Arith)
            self._GetToken()
            if self.token_kind not in (Kind.Ignored, Kind.WS):
                break

    def ReadForExpression(self):
        # type: () -> command.ForExpr
        """Read ((i=0; i<5; ++i)) -- part of command context."""
        self._SetNextNonSpace()  # skip over ((

        self._GetToken()
        cur_id = self.token_type  # for end of arith expressions

        if cur_id == Id.Arith_Semi:  # for (( ; i < 10; i++ ))
            init_node = None  # type: Optional[arith_expr_t]
        else:
            init_node = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()
        self._SetNextNonSpace()

        # It's odd to keep track of both cur_id and self.token_type in this
        # function, but it works, and is tested in 'test/parse_error.sh
        # arith-integration'
        if cur_id != Id.Arith_Semi:  # for (( x=0 b; ... ))
            p_die("Expected ; here", loc.Word(self.a_parser.cur_word))

        self._GetToken()
        cur_id = self.token_type

        if cur_id == Id.Arith_Semi:  # for (( ; ; i++ ))
            cond_node = None  # type: Optional[arith_expr_t]
        else:
            cond_node = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()
        self._SetNextNonSpace()

        if cur_id != Id.Arith_Semi:  # for (( x=0; x<5 b ))
            p_die("Expected ; here", loc.Word(self.a_parser.cur_word))

        self._GetToken()
        cur_id = self.token_type

        if cur_id == Id.Arith_RParen:  # for (( ; ; ))
            update_node = None  # type: Optional[arith_expr_t]
        else:
            update_node = self._ReadArithExpr(Id.Arith_RParen)
        self._SetNextNonSpace()

        self._GetToken()
        if self.token_type != Id.Arith_RParen:
            p_die('Expected ) to end for loop expression', self.cur_token)
        self._SetNext(lex_mode_e.ShCommand)

        # redirects is None, will be assigned in CommandEvaluator
        node = command.ForExpr.CreateNull()
        node.init = init_node
        node.cond = cond_node
        node.update = update_node
        return node

    def _ReadArrayLiteral(self):
        # type: () -> word_part_t
        """a=(1 2 3)

        TODO: See osh/cmd_parse.py:164 for Id.Lit_ArrayLhsOpen, for a[x++]=1

        We want:

        A=(['x']=1 ["x"]=2 [$x$y]=3)

        Maybe allow this as a literal string?  Because I think I've seen it
        before?  Or maybe force people to patch to learn the rule.

        A=([x]=4)

        Starts with Lit_Other '[', and then it has Lit_ArrayLhsClose
        Maybe enforce that ALL have keys or NONE of them have keys.
        """
        self._SetNext(lex_mode_e.ShCommand)  # advance past (
        self._GetToken()
        if self.cur_token.id != Id.Op_LParen:
            p_die('Expected ( after =', self.cur_token)
        left_token = self.cur_token
        right_token = None  # type: Token

        # MUST use a new word parser (with same lexer).
        w_parser = self.parse_ctx.MakeWordParser(self.lexer, self.line_reader)
        words = []  # type: List[CompoundWord]
        done = False
        while not done:
            w = w_parser.ReadWord(lex_mode_e.ShCommand)
            with tagswitch(w) as case:
                if case(word_e.Operator):
                    tok = cast(Token, w)
                    if tok.id == Id.Right_ShArrayLiteral:
                        right_token = tok
                        done = True  # can't use break here
                    # Unlike command parsing, array parsing allows embedded
                    # \n.
                    elif tok.id == Id.Op_Newline:
                        continue
                    else:
                        p_die('Unexpected token in array literal',
                              loc.Word(w))

                elif case(word_e.Compound):
                    words.append(cast(CompoundWord, w))

                else:
                    raise AssertionError()

        if len(words) == 0:  # a=() is empty indexed array
            # Needed for type safety, doh
            no_words = []  # type: List[word_t]
            node = ShArrayLiteral(left_token, no_words, right_token)
            return node

        pairs = []  # type: List[AssocPair]
        # If the first one is a key/value pair, then the rest are assumed to
        # be.
        pair = word_.DetectAssocPair(words[0])
        if pair:
            pairs.append(pair)

            n = len(words)
            for i in xrange(1, n):
                w2 = words[i]
                pair = word_.DetectAssocPair(w2)
                if not pair:
                    p_die("Expected associative array pair", loc.Word(w2))

                pairs.append(pair)

            # invariant List?
            return word_part.BashAssocLiteral(left_token, pairs, right_token)

        # Brace detection for arrays but NOT associative arrays
        words2 = braces.BraceDetectAll(words)
        words3 = word_.TildeDetectAll(words2)
        return ShArrayLiteral(left_token, words3, right_token)

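    # Illustrative examples (not from the original file):
    #
    #     a=(1 2 3)        -> ShArrayLiteral, after brace/tilde detection
    #     a=()             -> ShArrayLiteral with no words
    #     A=([k]=v [j]=w)  -> word_part.BashAssocLiteral with 2 AssocPairs
    #
    # Detection is driven entirely by the FIRST word: if it parses as a
    # key/value pair, every remaining word must too, or it's a parse error.
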
1594 def ParseProcCallArgs(self, start_symbol):
1595 # type: (int) -> ArgList
1596 """ json write (x) """
1597 self.lexer.MaybeUnreadOne()
1598
1599 arg_list = ArgList.CreateNull(alloc_lists=True)
1600 arg_list.left = self.cur_token
1601 self.parse_ctx.ParseYshArgList(self.lexer, arg_list, start_symbol)
1602 return arg_list
1603
1604 def _MaybeReadWordPart(self, is_first, lex_mode, parts):
1605 # type: (bool, lex_mode_t, List[word_part_t]) -> bool
1606 """Helper for _ReadCompoundWord3."""
1607 done = False
1608
1609 if self.token_type == Id.Lit_EscapedChar:
1610 tok = self.cur_token
1611 assert tok.length == 2
1612 ch = lexer.TokenSliceLeft(tok, 1)
1613 if not self.parse_opts.parse_backslash():
1614 if not pyutil.IsValidCharEscape(ch):
1615 p_die('Invalid char escape in unquoted word (OILS-ERR-13)',
1616 self.cur_token)
1617
1618 part = word_part.EscapedLiteral(self.cur_token,
1619 ch) # type: word_part_t
1620 else:
1621 part = self.cur_token
1622
1623 if is_first and self.token_type == Id.Lit_VarLike: # foo=
1624 parts.append(part)
1625 # Unfortunately it's awkward to pull the check for a=(1 2) up to
1626 # _ReadWord.
1627 next_id = self.lexer.LookPastSpace(lex_mode)
1628 if next_id == Id.Op_LParen:
1629 self.lexer.PushHint(Id.Op_RParen, Id.Right_ShArrayLiteral)
1630 part2 = self._ReadArrayLiteral()
1631 parts.append(part2)
1632
1633 # Array literal must be the last part of the word.
1634 self._SetNext(lex_mode)
1635 self._GetToken()
1636 # EOF, whitespace, newline, Right_Subshell
1637 if self.token_kind not in KINDS_THAT_END_WORDS:
1638 p_die('Unexpected token after array literal',
1639 self.cur_token)
1640 done = True
1641
1642 elif (is_first and self.parse_opts.parse_at() and
1643 self.token_type == Id.Lit_Splice):
1644
1645 splice_tok = self.cur_token
1646 part2 = word_part.Splice(splice_tok,
1647 lexer.TokenSliceLeft(splice_tok, 1))
1648
1649 parts.append(part2)
1650
1651 # @words must be the last part of the word
1652 self._SetNext(lex_mode)
1653 self._GetToken()
1654 # EOF, whitespace, newline, Right_Subshell
1655 if self.token_kind not in KINDS_THAT_END_WORDS:
1656 p_die('Unexpected token after array splice', self.cur_token)
1657 done = True
1658
1659 elif (is_first and self.parse_opts.parse_at() and
1660 self.token_type == Id.Lit_AtLBracket): # @[split(x)]
1661 part2 = self._ReadExprSub(lex_mode_e.DQ)
1662 parts.append(part2)
1663
1664 # @[split(x)]
1665 self._SetNext(lex_mode)
1666 self._GetToken()
1667 # EOF, whitespace, newline, Right_Subshell
1668 if self.token_kind not in KINDS_THAT_END_WORDS:
1669 p_die('Unexpected token after Expr splice', self.cur_token)
1670 done = True
1671
1672 elif (is_first and self.parse_opts.parse_at() and
1673 self.token_type == Id.Lit_AtLBraceDot):
1674 p_die('TODO: @{.myproc builtin sub}', self.cur_token)
1675
1676 elif (is_first and self.parse_opts.parse_at_all() and
1677 self.token_type == Id.Lit_At):
1678 # Because $[x] ${x} and perhaps $/x/ are reserved, it makes sense for @
1679 # at the beginning of a word to be reserved.
1680
1681 # Although should we relax 'echo @' ? I'm tempted to have a shortcut for
1682 # @_argv and
1683 p_die('Literal @ starting a word must be quoted (parse_at_all)',
1684 self.cur_token)
1685
1686 else:
1687 # not a literal with lookahead; append it
1688 parts.append(part)
1689
1690 return done
1691
1692 def _ReadCompoundWord(self, lex_mode):
1693 # type: (lex_mode_t) -> CompoundWord
1694 return self._ReadCompoundWord3(lex_mode, Id.Undefined_Tok, True)
1695
1696 def _ReadCompoundWord3(self, lex_mode, eof_type, empty_ok):
1697 # type: (lex_mode_t, Id_t, bool) -> CompoundWord
1698 """
1699 Precondition: Looking at the first token of the first word part
1700 Postcondition: Looking at the token after, e.g. space or operator
1701
1702 NOTE: eof_type is necessary because / is a literal, i.e. Lit_Slash, but it
1703 could be an operator delimiting a compound word. Can we change lexer modes
1704 and remove this special case?
1705 """
1706 w = CompoundWord([])
1707 num_parts = 0
1708 brace_count = 0
1709 done = False
1710 is_triple_quoted = None # type: Optional[BoolParamBox]
1711
1712 while not done:
1713 self._GetToken()
1714
1715 allow_done = empty_ok or num_parts != 0
1716 if allow_done and self.token_type == eof_type:
1717 done = True # e.g. for ${foo//pat/replace}
1718
1719 # Keywords like "for" are treated like literals
1720 elif self.token_kind in (Kind.Lit, Kind.History, Kind.KW,
1721 Kind.ControlFlow, Kind.BoolUnary,
1722 Kind.BoolBinary):
1723
1724 # Syntax error for { and }
1725 if self.token_type == Id.Lit_LBrace:
1726 brace_count += 1
1727 elif self.token_type == Id.Lit_RBrace:
1728 brace_count -= 1
1729 elif self.token_type == Id.Lit_Dollar:
1730 if not self.parse_opts.parse_dollar():
1731 if num_parts == 0 and lex_mode == lex_mode_e.ShCommand:
1732 next_byte = self.lexer.ByteLookAhead()
1733 # TODO: switch lexer modes and parse $/d+/. But not ${a:-$/d+/}
1734 if next_byte == '/':
1735 #log('next_byte %r', next_byte)
1736 pass
1737
1738 p_die('Literal $ should be quoted like \$',
1739 self.cur_token)
1740
1741 done = self._MaybeReadWordPart(num_parts == 0, lex_mode,
1742 w.parts)
1743
1744 elif self.token_kind == Kind.VSub:
1745 vsub_token = self.cur_token
1746
1747 part = NameTok(vsub_token,
1748 lexer.TokenSliceLeft(vsub_token,
1749 1)) # type: word_part_t
1750 w.parts.append(part)
1751
1752 elif self.token_kind == Kind.ExtGlob:
1753 # If parse_at, we can take over @( to start @(seq 3)
1754                # Users can also use ,(*.py|*.sh)
1755 if (self.parse_opts.parse_at() and
1756 self.token_type == Id.ExtGlob_At and num_parts == 0):
1757 cs_part = self._ReadCommandSub(Id.Left_AtParen,
1758 d_quoted=False)
1759 # RARE mutation of tok.id!
1760 cs_part.left_token.id = Id.Left_AtParen
1761 part = cs_part # for type safety
1762
1763 # Same check as _MaybeReadWordPart. @(seq 3)x is illegal, just like
1764 # a=(one two)x and @arrayfunc(3)x.
1765 self._GetToken()
1766 if self.token_kind not in KINDS_THAT_END_WORDS:
1767 p_die('Unexpected token after @()', self.cur_token)
1768 done = True
1769
1770 else:
1771 part = self._ReadExtGlob()
1772 w.parts.append(part)
1773
1774 elif self.token_kind == Kind.Left:
1775 try_triple_quote = (self.parse_opts.parse_triple_quote() and
1776 lex_mode == lex_mode_e.ShCommand and
1777 num_parts == 0)
1778
1779 # Save allocation
1780 if try_triple_quote:
1781 is_triple_quoted = BoolParamBox(False)
1782
1783 part = self._ReadUnquotedLeftParts(is_triple_quoted)
1784 w.parts.append(part)
1785
1786 # NOT done yet, will advance below
1787 elif self.token_kind == Kind.Right:
1788 # Still part of the word; will be done on the next iter.
1789 if self.token_type == Id.Right_DoubleQuote:
1790 pass
1791 # Never happens, no PushHint for this case.
1792 #elif self.token_type == Id.Right_DollarParen:
1793 # pass
1794 elif self.token_type == Id.Right_Subshell:
1795 # LEXER HACK for (case x in x) ;; esac )
1796 # Rewind before it's used
1797 assert self.next_lex_mode == lex_mode_e.Undefined
1798 if self.lexer.MaybeUnreadOne():
1799 self.lexer.PushHint(Id.Op_RParen, Id.Right_Subshell)
1800 self._SetNext(lex_mode)
1801 done = True
1802 else:
1803 done = True
1804
1805 elif self.token_kind == Kind.Ignored:
1806 done = True
1807
1808 else:
1809                # LEXER HACK for an unbalanced case clause. 'case foo in esac' is valid,
1810                # so while testing for ESAC we may read ) before getting a chance to
1811                # PushHint(Id.Op_RParen, Id.Right_CasePat). So here we unread one
1812                # token and translate it again.
1813
1814 # We get Id.Op_RParen at top level: case x in x) ;; esac
1815 # We get Id.Eof_RParen inside ComSub: $(case x in x) ;; esac )
1816 if self.token_type in (Id.Op_RParen, Id.Eof_RParen):
1817 # Rewind before it's used
1818 assert self.next_lex_mode == lex_mode_e.Undefined
1819 if self.lexer.MaybeUnreadOne():
1820 if self.token_type == Id.Eof_RParen:
1821 # Redo translation
1822 self.lexer.PushHint(Id.Op_RParen, Id.Eof_RParen)
1823 self._SetNext(lex_mode)
1824
1825 done = True # anything we don't recognize means we're done
1826
1827 if not done:
1828 self._SetNext(lex_mode)
1829 num_parts += 1
1830
1831 if (self.parse_opts.parse_brace() and num_parts > 1 and
1832 brace_count != 0):
1833 # accept { and }, but not foo{
1834 p_die(
1835 'Word has unbalanced { }. Maybe add a space or quote it like \{',
1836 loc.Word(w))
1837
1838 if is_triple_quoted and is_triple_quoted.b and num_parts > 1:
1839 p_die('Unexpected parts after triple quoted string',
1840 loc.WordPart(w.parts[-1]))
1841
1842 return w
1843
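    # A standalone sketch (assuming nothing from this module) of the brace
    # balance rule above: literal { and } within a single multi-part word must
    # balance, so '{' alone (one part) is fine, but 'foo{' is rejected.

    def _DemoBraceBalance(parts):
        # type: (List[str]) -> int
        count = 0
        for p in parts:
            if p == '{':
                count += 1
            elif p == '}':
                count -= 1
        return count

    assert _DemoBraceBalance(['{', 'a', '}']) == 0  # balanced: no error
    assert _DemoBraceBalance(['foo', '{']) == 1     # unbalanced: p_die above
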
1844 def _ReadArithWord(self):
1845 # type: () -> Optional[word_t]
1846 """ Helper for ReadArithWord() """
1847 self._GetToken()
1848
1849 if self.token_kind == Kind.Unknown:
1850 # e.g. happened during dynamic parsing of unset 'a[$foo]' in gherkin
1851 p_die(
1852 'Unexpected token while parsing arithmetic: %r' %
1853 lexer.TokenVal(self.cur_token), self.cur_token)
1854
1855 elif self.token_kind == Kind.Eof:
1856 return self.cur_token
1857
1858 elif self.token_kind == Kind.Ignored:
1859 # Space should be ignored.
1860 self._SetNext(lex_mode_e.Arith)
1861 return None
1862
1863 elif self.token_kind in (Kind.Arith, Kind.Right):
1864 # Id.Right_DollarDParen IS just a normal token, handled by ArithParser
1865 self._SetNext(lex_mode_e.Arith)
1866 return self.cur_token
1867
1868 elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub):
1869 return self._ReadCompoundWord(lex_mode_e.Arith)
1870
1871 else:
1872 raise AssertionError(self.cur_token)
1873
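    # A runnable summary of the dispatch above, with strings standing in for
    # the Kind enum -- a sketch of the decision table, not the real control
    # flow:
    #
    #   Unknown           -> p_die
    #   Eof               -> return the token, no advance
    #   Ignored           -> None, i.e. skip it and ask again
    #   Arith / Right     -> advance and return the token
    #   Lit / Left / VSub -> read a whole compound word

    def _DemoArithAction(kind):
        # type: (str) -> str
        if kind == 'Unknown':
            return 'p_die'
        if kind == 'Eof':
            return 'token'
        if kind == 'Ignored':
            return 'retry'
        if kind in ('Arith', 'Right'):
            return 'token'
        if kind in ('Lit', 'Left', 'VSub'):
            return 'compound-word'
        raise AssertionError(kind)

    assert _DemoArithAction('Ignored') == 'retry'
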
1874 def _ReadWord(self, word_mode):
1875 # type: (lex_mode_t) -> Optional[word_t]
1876 """Helper function for ReadWord()."""
1877
1878 # Change the pseudo lexer mode to a real lexer mode
1879 if word_mode == lex_mode_e.ShCommandBrack:
1880 lex_mode = lex_mode_e.ShCommand
1881 else:
1882 lex_mode = word_mode
1883
1884 self._GetToken()
1885
1886 if self.token_kind == Kind.Eof:
1887 # No advance
1888 return self.cur_token
1889
1890 # Allow Arith for ) at end of for loop?
1891 elif self.token_kind in (Kind.Op, Kind.Redir, Kind.Arith):
1892 self._SetNext(lex_mode)
1893
1894 # Newlines are complicated. See 3x2 matrix in the comment about
1895 # self.multiline and self.newline_state above.
1896 if self.token_type == Id.Op_Newline:
1897 if self.multiline:
1898 if self.newline_state > 1:
1899 # This points at a blank line, but at least it gives the line number
1900 p_die('Invalid blank line in multiline mode',
1901 self.cur_token)
1902 return None
1903
1904 if self.returned_newline: # skip
1905 return None
1906
1907 return self.cur_token
1908
1909 elif self.token_kind == Kind.Right:
1910 if self.token_type not in (Id.Right_Subshell, Id.Right_ShFunction,
1911 Id.Right_CasePat,
1912 Id.Right_ShArrayLiteral):
1913 raise AssertionError(self.cur_token)
1914
1915 self._SetNext(lex_mode)
1916 return self.cur_token
1917
1918 elif self.token_kind in (Kind.Ignored, Kind.WS):
1919 self._SetNext(lex_mode)
1920 return None
1921
1922 else:
1923 assert self.token_kind in (Kind.VSub, Kind.Lit, Kind.History,
1924 Kind.Left, Kind.KW, Kind.ControlFlow,
1925 Kind.BoolUnary, Kind.BoolBinary,
1926 Kind.ExtGlob), 'Unhandled token kind'
1927
1928 if (word_mode == lex_mode_e.ShCommandBrack and
1929 self.parse_opts.parse_bracket() and
1930 self.token_type == Id.Lit_LBracket):
1931 # Change [ from Kind.Lit -> Kind.Op
1932 # So CommandParser can treat
1933 # assert [42 === x]
1934 # like
1935 # json write (x)
1936 bracket_word = self.cur_token
1937 bracket_word.id = Id.Op_LBracket
1938
1939 self._SetNext(lex_mode)
1940 return bracket_word
1941
1942 # We're beginning a word. If we see Id.Lit_Pound, change to
1943 # lex_mode_e.Comment and read until end of line.
1944 if self.token_type == Id.Lit_Pound:
1945 self._SetNext(lex_mode_e.Comment)
1946 self._GetToken()
1947
1948 # NOTE: The # could be the last character in the file. It can't be
1949 # Eof_{RParen,Backtick} because #) and #` are comments.
1950 assert self.token_type in (Id.Ignored_Comment, Id.Eof_Real), \
1951 self.cur_token
1952
1953 # The next iteration will go into Kind.Ignored and set lex state to
1954 # lex_mode_e.ShCommand/etc.
1955 return None # tell ReadWord() to try again after comment
1956
1957 elif self.token_type == Id.Lit_TPound: ### doc comment
1958 self._SetNext(lex_mode_e.Comment)
1959 self._GetToken()
1960
1961 if self.token_type == Id.Ignored_Comment and self.emit_doc_token:
1962 return self.cur_token
1963
1964 return None # tell ReadWord() to try again after comment
1965
1966 else:
1967 # r'' u'' b''
1968 if (self.token_type == Id.Lit_Chars and
1969 self.lexer.LookAheadOne(
1970 lex_mode_e.ShCommand) == Id.Left_SingleQuote):
1971
1972                    # When shopt -s parse_ysh_string (checked below):
1973 # echo r'hi' is like echo 'hi'
1974 #
1975 # echo u'\u{3bc}' b'\yff' works
1976
1977 if (self.parse_opts.parse_ysh_string() and
1978 self.cur_token.tval in ('r', 'u', 'b')):
1979
1980 if self.cur_token.tval == 'r':
1981 left_id = Id.Left_RSingleQuote
1982 elif self.cur_token.tval == 'u':
1983 left_id = Id.Left_USingleQuote
1984 else:
1985 left_id = Id.Left_BSingleQuote
1986
1987 # skip the r, and then 'foo' will be read as normal
1988 self._SetNext(lex_mode_e.ShCommand)
1989
1990 self._GetToken()
1991 assert self.token_type == Id.Left_SingleQuote, self.token_type
1992
1993 # Read the word in a different lexer mode
1994 return self._ReadYshSingleQuoted(left_id)
1995
1996 return self._ReadCompoundWord(lex_mode)
1997
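    # Self-contained sketch of the prefix detection above: 'r' alone is an
    # ordinary word (e.g. a command name), so deciding that r'...' starts a
    # YSH string takes exactly one token of lookahead.  Plain string peeking
    # stands in for lexer.LookAheadOne().

    def _DemoYshStringPrefix(s):
        # type: (str) -> Optional[str]
        if len(s) >= 2 and s[0] in ('r', 'u', 'b') and s[1] == "'":
            return s[0]  # which single-quote flavor to read
        return None

    assert _DemoYshStringPrefix("r'hi'") == 'r'
    assert _DemoYshStringPrefix("rm") is None  # 'r' here is just a command
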
1998 def ParseVarRef(self):
1999 # type: () -> BracedVarSub
2000 """DYNAMIC parsing of what's inside ${!ref}
2001
2002 # Same as VarOf production
2003 VarRefExpr = VarOf EOF
2004 """
2005 self._SetNext(lex_mode_e.VSub_1)
2006
2007 self._GetToken()
2008 if self.token_kind != Kind.VSub:
2009 p_die('Expected var name', self.cur_token)
2010
2011 part = self._ParseVarOf()
2012 # NOTE: no ${ } means no part.left and part.right
2013 part.left = part.token # cheat to make test pass
2014 part.right = part.token
2015
2016 self._GetToken()
2017 if self.token_type != Id.Eof_Real:
2018 p_die('Expected end of var ref expression', self.cur_token)
2019 return part
2020
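    # Shell-level example of the dynamic parsing above:
    #
    #   x=PATH
    #   echo ${!x}    # the VALUE of x, the string 'PATH', is re-parsed as a
    #                 # var ref, then dereferenced
    #
    # Since the grammar is just VarOf EOF, a value like 'PATH junk' fails
    # with 'Expected end of var ref expression'.
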
2021 def LookPastSpace(self):
2022 # type: () -> Id_t
2023 """Look ahead to the next token.
2024
2025 For the CommandParser to recognize
2026 array= (1 2 3)
2027 YSH for ( versus bash for ((
2028 YSH if ( versus if test
2029 YSH while ( versus while test
2030 YSH bare assignment 'grep =' versus 'grep foo'
2031 """
2032 assert self.token_type != Id.Undefined_Tok
2033 if self.cur_token.id == Id.WS_Space:
2034 id_ = self.lexer.LookPastSpace(lex_mode_e.ShCommand)
2035 else:
2036 id_ = self.cur_token.id
2037 return id_
2038
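    # A minimal sketch of the lookahead contract (the list-based "lexer"
    # below is hypothetical): one token is already buffered, so we either
    # inspect it directly, or, if it's a space, ask what comes after the
    # spaces without consuming anything.

    def _DemoLookPastSpace(buffered, rest):
        # type: (str, List[str]) -> str
        if buffered == ' ':
            for tok in rest:
                if tok != ' ':
                    return tok
            return 'Eof'
        return buffered

    assert _DemoLookPastSpace('(', ['1', ')']) == '('  # already buffered
    assert _DemoLookPastSpace(' ', [' ', '(']) == '('  # skip spaces first
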
2039 def LookAheadFuncParens(self):
2040 # type: () -> bool
2041 """Special lookahead for f( ) { echo hi; } to check for ( )"""
2042 assert self.token_type != Id.Undefined_Tok
2043
2044 # We have to handle 2 cases because we buffer a token
2045 if self.cur_token.id == Id.Op_LParen: # saw funcname(
2046 return self.lexer.LookAheadFuncParens(1) # go back one char
2047
2048 elif self.cur_token.id == Id.WS_Space: # saw funcname WHITESPACE
2049 return self.lexer.LookAheadFuncParens(0)
2050
2051 else:
2052 return False
2053
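    # The two buffering cases above, concretely:
    #
    #   f() { echo hi; }      cur_token is '(', so back up one char first
    #   f () { echo hi; }     cur_token is the space; no backup is needed
    #
    # Either way the question is the same: is the next non-space text '()'
    # or '( )'?
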
2054 def ReadWord(self, word_mode):
2055 # type: (lex_mode_t) -> word_t
2056 """Read the next word, using the given lexer mode.
2057
2058 This is a stateful wrapper for the stateless _ReadWord function.
2059 """
2060 assert word_mode in (lex_mode_e.ShCommand, lex_mode_e.ShCommandBrack,
2061 lex_mode_e.DBracket, lex_mode_e.BashRegex)
2062
2063 if self.buffered_word: # For integration with pgen2
2064 w = self.buffered_word
2065 self.buffered_word = None
2066 else:
2067 while True:
2068 w = self._ReadWord(word_mode)
2069 if w is not None:
2070 break
2071
2072 self.returned_newline = (word_.CommandId(w) == Id.Op_Newline)
2073 return w
2074
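    # Sketch of the push-back protocol used for the pgen2 integration (the
    # class below is hypothetical, not part of this module): a caller that
    # reads one word too many can stash it in buffered_word and receive the
    # same word again on the next call.

    class _DemoWordSource(object):

        def __init__(self, words):
            # type: (List[str]) -> None
            self.words = words
            self.buffered = None  # type: Optional[str]

        def Read(self):
            # type: () -> str
            if self.buffered is not None:
                w = self.buffered
                self.buffered = None
                return w
            return self.words.pop(0)

    src = _DemoWordSource(['ls', '|', 'wc'])
    w = src.Read()             # 'ls'
    src.buffered = w           # push it back
    assert src.Read() == 'ls'  # same word again, before '|'
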
2075 def ReadArithWord(self):
2076 # type: () -> word_t
2077 while True:
2078 w = self._ReadArithWord()
2079 if w is not None:
2080 break
2081 return w
2082
2083 def ReadHereDocBody(self, parts):
2084 # type: (List[word_part_t]) -> None
2085 """
2086 A here doc is like a double quoted context, except " isn't special.
2087 """
2088 self._ReadLikeDQ(None, False, parts)
2089 # Returns nothing
2090
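    # Example: in
    #
    #   cat <<EOF
    #   double "quotes" are literal, but $x and $(date) are substituted
    #   EOF
    #
    # the body is parsed like a double-quoted string in which '"' has no
    # special meaning.
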
2091 def ReadForPlugin(self):
2092 # type: () -> CompoundWord
2093 """For $PS1, $PS4, etc.
2094
2095 This is just like reading a here doc line. "\n" is allowed, as
2096 well as the typical substitutions ${x} $(echo hi) $((1 + 2)).
2097 """
2098 w = CompoundWord([])
2099 self._ReadLikeDQ(None, False, w.parts)
2100 return w
2101
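    # Example plugin input: PS1='\u@\h $(date +%H:%M) $'.  The $(date ...)
    # command sub is parsed here, while backslash escapes like \u stay as
    # plain word parts, presumably expanded later by the prompt evaluator.
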
2102 def EmitDocToken(self, b):
2103 # type: (bool) -> None
2104 self.emit_doc_token = b
2105
2106 def Multiline(self, b):
2107 # type: (bool) -> None
2108 self.multiline = b