OILS / osh / word_parse.py

1# Copyright 2016 Andy Chu. All rights reserved.
2# Licensed under the Apache License, Version 2.0 (the "License");
3# you may not use this file except in compliance with the License.
4# You may obtain a copy of the License at
5#
6# http://www.apache.org/licenses/LICENSE-2.0
7"""
8word_parse.py - Parse the shell word language.
9
10Hairy example:
11
12 hi$((1 + 2))"$(echo hi)"${var:-__"$(echo default)"__}
13
14Substitutions can be nested, but which inner subs are allowed depends on the
15outer sub. Notes:
16
17lex_mode_e.ShCommand (_ReadUnquotedLeftParts)
18 All subs and quotes are allowed:
19 $v ${v} $() `` $(()) '' "" $'' $"" <() >()
20
21lex_mode_e.DQ (_ReadDoubleQuotedLeftParts)
22 Var, Command, Arith, but no quotes.
23 $v ${v} $() `` $(())
24 No process substitution.
25
26lex_mode_e.Arith
27 Similar to DQ: Var, Command, and Arith sub, but no process sub. bash doesn't
28 allow quotes, but OSH does. We allow ALL FOUR kinds of quotes, because we
29 need those for associative array indexing.
30
31lex_mode_e.VSub_ArgUnquoted
32 Like ShCommand, everything is allowed (even process substitutions), but we
33 stop at }, and space is SIGNIFICANT.
34
35 Example: ${a:- b }
36
37 ${X:-$v} ${X:-${v}} ${X:-$(echo hi)} ${X:-`echo hi`} ${X:-$((1+2))}
38 ${X:-'single'} ${X:-"double"} ${X:-$'\n'} ${X:-<(echo hi)}
39
40lex_mode_e.VSub_ArgDQ
41 In contrast to DQ, VSub_ArgDQ accepts nested "" and $'' and $"", e.g.
42 "${x:-"default"}".
43
44 In contrast, VSub_ArgUnquoted respects single quotes and process
45 substitution.
46
47 It's weird that double quotes are allowed. Space is also significant here,
48 e.g. "${x:-a "b"}".
49"""
50
51from _devbuild.gen import grammar_nt
52from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str, Kind
53from _devbuild.gen.types_asdl import (lex_mode_t, lex_mode_e)
54from _devbuild.gen.syntax_asdl import (
55 BoolParamBox,
56 Token,
57 SimpleVarSub,
58 loc,
59 source,
60 DoubleQuoted,
61 SingleQuoted,
62 BracedVarSub,
63 CommandSub,
64 ShArrayLiteral,
65 AssocPair,
66 bracket_op,
67 bracket_op_t,
68 suffix_op,
69 suffix_op_t,
70 rhs_word,
71 rhs_word_e,
72 rhs_word_t,
73 word_e,
74 word_t,
75 CompoundWord,
76 word_part,
77 word_part_t,
78 y_lhs_e,
79 arith_expr_t,
80 command,
81 expr,
82 expr_e,
83 expr_t,
84 pat_t,
85 ArgList,
86 Proc,
87 Func,
88 Subscript,
89 Attribute,
90)
91from core import alloc
92from core.error import p_die
93from mycpp.mylib import log
94from core import pyutil
95from core import ui
96from frontend import consts
97from frontend import lexer
98from frontend import reader
99from osh import tdop
100from osh import arith_parse
101from osh import braces
102from osh import word_
103from osh import word_compile
104from mycpp.mylib import tagswitch
105
106from typing import List, Optional, Tuple, cast
107from typing import TYPE_CHECKING
108if TYPE_CHECKING:
109 from frontend.lexer import Lexer
110 from frontend.parse_lib import ParseContext
111 from frontend.reader import _Reader
112 from osh.cmd_parse import VarChecker
113
114unused1 = log
115unused2 = Id_str
116
117KINDS_THAT_END_WORDS = [Kind.Eof, Kind.WS, Kind.Op, Kind.Right]
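# i.e. a word is terminated by EOF, whitespace, an operator, or a closing
# token like ) or " (Kind.Right)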
118
119
120class WordEmitter(object):
121 """Common interface for [ and [["""
122
123 def __init__(self):
124 # type: () -> None
125 """Empty constructor for mycpp."""
126 pass
127
128 def ReadWord(self, lex_mode):
129 # type: (lex_mode_t) -> word_t
130 raise NotImplementedError()
131
132
133class WordParser(WordEmitter):
134
135 def __init__(self, parse_ctx, lexer, line_reader):
136 # type: (ParseContext, Lexer, _Reader) -> None
137 self.parse_ctx = parse_ctx
138 self.lexer = lexer
139 self.line_reader = line_reader
140 self.arena = line_reader.arena
141
142 self.parse_opts = parse_ctx.parse_opts
143 self.a_parser = tdop.TdopParser(arith_parse.Spec(), self,
144 self.parse_opts)
145 self.Reset()
146
147 def Init(self, lex_mode):
148 # type: (lex_mode_t) -> None
149 """Used to parse arithmetic, see ParseContext."""
150 self.next_lex_mode = lex_mode
151
152 def Reset(self):
153 # type: () -> None
154 """Called by interactive loop."""
155 # For _GetToken()
156 self.cur_token = None # type: Token
157 self.token_kind = Kind.Undefined
158 self.token_type = Id.Undefined_Tok
159
160 self.next_lex_mode = lex_mode_e.ShCommand
161
162 # Boolean mutated by CommandParser via word_.ctx_EmitDocToken. For ### doc
163 # comments
164 self.emit_doc_token = False
165 # Boolean mutated by CommandParser via word_.ctx_Multiline. '...' starts
166 # multiline mode.
167 self.multiline = False
168
169 # For detecting invalid \n\n in multiline mode. Counts what we got
170 # directly from the lexer.
171 self.newline_state = 0
172 # For consolidating \n\n -> \n for the CALLER. This simplifies the parsers
173 # that consume words.
174 self.returned_newline = False
175
176 # For integration with pgen2
177 self.buffered_word = None # type: word_t
178
179 def _GetToken(self):
180 # type: () -> None
181 """Call this when you need to make a decision based on any of:
182
183 self.token_type self.token_kind self.cur_token # contents
184 """
185 if self.next_lex_mode != lex_mode_e.Undefined:
186 self.cur_token = self.lexer.Read(self.next_lex_mode)
187 self.token_type = self.cur_token.id
188 self.token_kind = consts.GetKind(self.token_type)
189
190 # number of consecutive newlines, ignoring whitespace
191 if self.token_type == Id.Op_Newline:
192 self.newline_state += 1
193 elif self.token_kind != Kind.WS:
194 self.newline_state = 0
195
196 self.parse_ctx.trail.AppendToken(self.cur_token) # For completion
197 self.next_lex_mode = lex_mode_e.Undefined
198
199 def _SetNext(self, lex_mode):
200 # type: (lex_mode_t) -> None
201 """Set the next lex state, but don't actually read a token.
202
203 We need this for proper interactive parsing.
204 """
205 self.next_lex_mode = lex_mode
206
207 def _ReadVarOpArg(self, arg_lex_mode):
208 # type: (lex_mode_t) -> rhs_word_t
209
210 # NOTE: Operators like | and < are not treated as special, so ${a:- | >} is
211 # valid, even when unquoted.
212 self._SetNext(arg_lex_mode)
213 self._GetToken()
214
215 w = self._ReadVarOpArg2(arg_lex_mode, Id.Undefined_Tok,
216 True) # empty_ok
217
218 # If the Compound has no parts, and we're in a double-quoted VarSub
219 # arg, and empty_ok, then return Empty. This is so it can evaluate to
220 # the empty string and not get elided.
221 #
222 # Examples:
223 # - "${s:-}", "${s/%pat/}"
224 # It's similar to LooksLikeShAssignment where we turn x= into x=''. And it
225 # has the same potential problem of not having Token location info.
226 #
227 # NOTE: empty_ok is False only for the PatSub pattern, which means we'll
228 # return a Compound with no parts, which is explicitly checked with a
229 # custom error message.
230 if len(w.parts) == 0 and arg_lex_mode == lex_mode_e.VSub_ArgDQ:
231 return rhs_word.Empty
232
233 return w
234
235 def _ReadVarOpArg2(self, arg_lex_mode, eof_type, empty_ok):
236 # type: (lex_mode_t, Id_t, bool) -> CompoundWord
237 """Return a CompoundWord.
238
239 Helper function for _ReadVarOpArg and used directly by
240 _ReadPatSubVarOp.
241 """
242 w = self._ReadCompoundWord3(arg_lex_mode, eof_type, empty_ok)
243 #log('w %s', w)
244 tilde = word_.TildeDetect(w)
245 if tilde:
246 w = tilde
247 return w
248
249 def _ReadSliceVarOp(self):
250 # type: () -> suffix_op.Slice
251 """VarOf ':' ArithExpr (':' ArithExpr )?"""
252 self._SetNext(lex_mode_e.Arith)
253 self._GetToken()
254 cur_id = self.token_type # e.g. Id.Arith_Colon
255
256 if self.token_type == Id.Arith_Colon: # A pun for Id.VOp2_Colon
257 # no beginning specified
258 begin = None # type: Optional[arith_expr_t]
259 else:
260 begin = self.a_parser.Parse()
261 cur_id = self.a_parser.CurrentId()
262
263 if cur_id == Id.Arith_RBrace:
264 no_length = None # type: Optional[arith_expr_t] # No length specified
265 return suffix_op.Slice(begin, no_length)
266
267 # Id.Arith_Colon is a pun for Id.VOp2_Colon
268 if cur_id == Id.Arith_Colon:
269 self._SetNext(lex_mode_e.Arith)
270 length = self._ReadArithExpr(Id.Arith_RBrace)
271 return suffix_op.Slice(begin, length)
272
273 p_die("Expected : or } in slice", self.cur_token)
274 raise AssertionError() # for MyPy
275
276 def _ReadPatSubVarOp(self):
277 # type: () -> suffix_op.PatSub
278 """Looking at the first '/' after VarOf:
279
280 VarSub = ...
281 | VarOf '/' Match ( '/' WORD? )?
282 Match = '/' WORD # can't be empty
283 | '#' WORD? # may be empty
284 | '%' WORD?
285 """
286 slash_tok = self.cur_token # location info
287 replace_mode = Id.Undefined_Tok # bizarre syntax / # %
288
289 self._SetNext(lex_mode_e.VSub_ArgUnquoted) # advance past /
290
291 self._GetToken()
292 if self.token_type == Id.Right_DollarBrace:
293 pat = CompoundWord([])
294 return suffix_op.PatSub(pat, rhs_word.Empty, replace_mode,
295 slash_tok)
296
297 if self.token_type in (Id.Lit_Slash, Id.Lit_Pound, Id.Lit_Percent):
298 replace_mode = self.token_type
299 self._SetNext(lex_mode_e.VSub_ArgUnquoted)
300
301 # Bash quirk:
302 # echo ${x/#/replace} has an empty pattern
303 # echo ${x////replace} is non-empty; it means echo ${x//'/'/replace}
304 empty_ok = replace_mode != Id.Lit_Slash
305 pat = self._ReadVarOpArg2(lex_mode_e.VSub_ArgUnquoted, Id.Lit_Slash,
306 empty_ok)
307 #log('pat 1 %r', pat)
308
309 if self.token_type == Id.Lit_Slash:
310 # read until }
311 replace = self._ReadVarOpArg(
312 lex_mode_e.VSub_ArgUnquoted) # type: rhs_word_t
313 #log('r 1 %r', replace)
314 else:
315 # e.g. ${v/a} is the same as ${v/a/} -- empty replacement string
316 replace = rhs_word.Empty
317
318 self._GetToken()
319 if self.token_type != Id.Right_DollarBrace:
320 # This happens on invalid code
321 p_die(
322 "Expected } after replacement string, got %s" %
323 ui.PrettyId(self.token_type), self.cur_token)
324
325 return suffix_op.PatSub(pat, replace, replace_mode, slash_tok)
326
327 def _ReadSubscript(self):
328 # type: () -> bracket_op_t
329 """ Subscript = '[' ('@' | '*' | ArithExpr) ']' """
330 # Lookahead to see if we get @ or *. Otherwise read a full arithmetic
331 # expression.
332 next_id = self.lexer.LookPastSpace(lex_mode_e.Arith)
333 if next_id in (Id.Lit_At, Id.Arith_Star):
334 op = bracket_op.WholeArray(next_id) # type: bracket_op_t
335
336 self._SetNext(lex_mode_e.Arith) # skip past [
337 self._GetToken()
338 self._SetNext(lex_mode_e.Arith) # skip past @
339 self._GetToken()
340 else:
341 self._SetNext(lex_mode_e.Arith) # skip past [
342 anode = self._ReadArithExpr(Id.Arith_RBracket)
343 op = bracket_op.ArrayIndex(anode)
344
345 if self.token_type != Id.Arith_RBracket: # Should be looking at ]
346 p_die('Expected ] to close subscript', self.cur_token)
347
348 self._SetNext(lex_mode_e.VSub_2) # skip past ]
349 self._GetToken() # Needed to be in the same spot as no subscript
350
351 return op
352
353 def _ParseVarOf(self):
354 # type: () -> BracedVarSub
355 """
356 VarOf = NAME Subscript?
357 | NUMBER # no subscript allowed, none of these are arrays
358 # ${@[1]} doesn't work, even though slicing does
359 | VarSymbol
360 """
361 self._GetToken()
362 name_token = self.cur_token
363 self._SetNext(lex_mode_e.VSub_2)
364
365 self._GetToken() # Check for []
366 if self.token_type == Id.VOp2_LBracket:
367 bracket_op = self._ReadSubscript()
368 else:
369 bracket_op = None
370
371 part = BracedVarSub.CreateNull()
372 part.token = name_token
373 part.var_name = lexer.TokenVal(name_token)
374 part.bracket_op = bracket_op
375 return part
376
377 def _ParseVarExpr(self, arg_lex_mode, allow_query=False):
378 # type: (lex_mode_t, bool) -> BracedVarSub
379 """Start parsing at the op -- we already skipped past the name."""
380 part = self._ParseVarOf()
381
382 self._GetToken()
383 if self.token_type == Id.Right_DollarBrace:
384 return part # no ops
385
386 op_kind = self.token_kind
387
388 if op_kind == Kind.VTest:
389 tok = self.cur_token
390 arg_word = self._ReadVarOpArg(arg_lex_mode)
391 if self.token_type != Id.Right_DollarBrace:
392 p_die('Expected } to close ${', self.cur_token)
393
394 part.suffix_op = suffix_op.Unary(tok, arg_word)
395
396 elif op_kind == Kind.VOpYsh:
397 tok = self.cur_token
398 arg_word = self._ReadVarOpArg(arg_lex_mode)
399 if self.token_type != Id.Right_DollarBrace:
400 p_die('Expected } to close ${', self.cur_token)
401
402 UP_arg_word = arg_word
403 with tagswitch(arg_word) as case:
404 if case(rhs_word_e.Empty):
405 pass
406 elif case(rhs_word_e.Compound):
407 arg_word = cast(CompoundWord, UP_arg_word)
408 # This handles ${x|html} and ${x %.3f} now
409 # However I think ${x %.3f} should be statically parsed? It can enter
410 # the printf lexer modes.
411 ok, arg, quoted = word_.StaticEval(arg_word)
412 if not ok or quoted:
413 p_die('Expected a constant argument',
414 loc.Word(arg_word))
415
416 part.suffix_op = suffix_op.Static(tok, arg)
417
418 elif op_kind == Kind.VOp0:
419 part.suffix_op = self.cur_token # Nullary
420 self._SetNext(lex_mode_e.VSub_2) # Expecting }
421 self._GetToken()
422
423 elif op_kind == Kind.VOp1: # % %% # ## etc.
424 tok = self.cur_token
425 # Weird exception that all shells have: these operators take a glob
426 # pattern, so they're lexed as VSub_ArgUnquoted, not VSub_ArgDQ
427 arg_word = self._ReadVarOpArg(lex_mode_e.VSub_ArgUnquoted)
428 if self.token_type != Id.Right_DollarBrace:
429 p_die('Expected } to close ${', self.cur_token)
430
431 part.suffix_op = suffix_op.Unary(tok, arg_word)
432
433 elif op_kind == Kind.VOp2: # / : [ ]
434 if self.token_type == Id.VOp2_Slash:
435 patsub_op = self._ReadPatSubVarOp() # type: suffix_op_t
436 part.suffix_op = patsub_op
437
438 # Checked by the method above
439 assert self.token_type == Id.Right_DollarBrace, self.cur_token
440
441 elif self.token_type == Id.VOp2_Colon:
442 part.suffix_op = self._ReadSliceVarOp()
443 # NOTE: } in arithmetic mode.
444 if self.token_type != Id.Arith_RBrace:
445 # Token seems off; doesn't point to X in ${a:1:2 X
446 p_die('Expected } to close ${', self.cur_token)
447
448 else:
449 # TODO: Does this ever happen?
450 p_die('Unexpected token in ${} (%s)' % 'VOp2', self.cur_token)
451
452 elif op_kind == Kind.VOp3: # ${prefix@} etc.
453 if allow_query:
454 part.suffix_op = self.cur_token # Nullary
455 self._SetNext(lex_mode_e.VSub_2) # Expecting }
456 self._GetToken()
457 else:
458 p_die("Unexpected token in ${} (%s)" % 'VOp3', self.cur_token)
459
460 # NOTE: Arith_RBrace is for slicing, because it reads } in arithmetic
461 # mode. It's redundantly checked above.
462 if self.token_type not in (Id.Right_DollarBrace, Id.Arith_RBrace):
463 # ${a.} or ${!a.}
464 p_die('Expected } to close ${', self.cur_token)
465
466 # Now look for ops
467 return part
468
469 def ReadBracedVarSub(self, left_token):
470 # type: (Token) -> Tuple[BracedVarSub, Token]
471 """ For YSH expressions like var x = ${x:-"default"}. """
472 part = self._ReadBracedVarSub(left_token, d_quoted=False)
473 last_token = self.cur_token
474 return part, last_token
475
476 def _ReadBracedVarSub(self, left_token, d_quoted):
477 # type: (Token, bool) -> BracedVarSub
478 """For the ${} expression language.
479
480 NAME = [a-zA-Z_][a-zA-Z0-9_]*
481 NUMBER = [0-9]+ # ${10}, ${11}, ...
482
483 Subscript = '[' ('@' | '*' | ArithExpr) ']'
484 VarSymbol = '!' | '@' | '#' | ...
485 VarOf = NAME Subscript?
486 | NUMBER # no subscript allowed, none of these are arrays
487 # ${@[1]} doesn't work, even though slicing does
488 | VarSymbol
489
490 NULLARY_OP = '@Q' | '@E' | '@P' | '@A' | '@a' # VOp0
491
492 TEST_OP = '-' | ':-' | '=' | ':=' | '+' | ':+' | '?' | ':?'
493 STRIP_OP = '#' | '##' | '%' | '%%'
494 CASE_OP = ',' | ',,' | '^' | '^^'
495 UnaryOp = TEST_OP | STRIP_OP | CASE_OP
496
497 YSH_UNARY = '|' | ' ' # ${x|html} and ${x %.3f}.
498 # SPACE is operator not %
499 Match = ('/' | '#' | '%') WORD # match all / prefix / suffix
500 VarExpr = VarOf
501 | VarOf NULLARY_OP
502 | VarOf UnaryOp WORD
503 | VarOf YSH_UNARY STATIC_WORD
504 | VarOf ':' ArithExpr (':' ArithExpr )?
505 | VarOf '/' Match '/' WORD
506
507 LengthExpr = '#' VarOf # can't apply operators after length
508
509 RefOrKeys = '!' VarExpr # CAN apply operators after a named ref
510 # ${!ref[0]} vs ${!keys[@]} resolved later
511
512 PrefixQuery = '!' NAME ('*' | '@') # list variable names with a prefix
513
514 BuiltinSub = '.' WORD+ # ${.myproc 'builtin' $sub}
515
516 VarSub = LengthExpr
517 | RefOrKeys
518 | PrefixQuery
519 | VarExpr
520 | BuiltinSub
521
522 NOTES:
523 - Arithmetic expressions are used twice, inside subscripts ${a[x+1]} and
524 slicing ${a:x+1:y+2}
525 - ${#} and ${!} need LL(2) lookahead (considering how my tokenizer works)
526 - @ and * are technically arithmetic expressions in this implementation
527 - We don't account for bash 4.4: ${param@operator} -- Q E P A a. Note that
528 it's also vectorized.
529
530 Strictness over bash:
531 - echo ${a[0][0]} doesn't do anything useful, so we disallow it from the
532 grammar
533 - ! and # prefixes can't be composed, even though named refs can be
534 composed with other operators
535 - '#' means 4 different things: length prefix, VarSymbol, UnaryOp to strip
536 a prefix, and it can also be a literal part of WORD.
537
538 From the parser's point of view, the prefix # can't be combined with
539 UnaryOp/slicing/matching, and the ! can. However
540
541 - ${a[@]:1:2} is not allowed
542 - ${#a[@]:1:2} is allowed, but gives the wrong answer
543 """
544 if d_quoted:
545 arg_lex_mode = lex_mode_e.VSub_ArgDQ
546 else:
547 arg_lex_mode = lex_mode_e.VSub_ArgUnquoted
548
549 self._SetNext(lex_mode_e.VSub_1)
550 self._GetToken()
551
552 ty = self.token_type
553 first_tok = self.cur_token
554
555 if ty == Id.VSub_Pound:
556 # Disambiguate
557 next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
558 if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
559 # e.g. a name, '#' is the prefix
560 self._SetNext(lex_mode_e.VSub_1)
561 part = self._ParseVarOf()
562
563 self._GetToken()
564 if self.token_type != Id.Right_DollarBrace:
565 p_die('Expected } after length expression', self.cur_token)
566
567 part.prefix_op = first_tok
568
569 else: # not a prefix, '#' is the variable
570 part = self._ParseVarExpr(arg_lex_mode)
571
572 elif ty == Id.VSub_Bang:
573 next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
574 if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
575 # e.g. a name, '!' is the prefix
576 # ${!a} -- this is a ref
577 # ${!3} -- this is ref
578 # ${!a[1]} -- this is a ref
579 # ${!a[@]} -- this is a keys
580 # No lookahead -- do it in a second step, or at runtime
581 self._SetNext(lex_mode_e.VSub_1)
582 part = self._ParseVarExpr(arg_lex_mode, allow_query=True)
583
584 part.prefix_op = first_tok
585
586 else: # not a prefix, '!' is the variable
587 part = self._ParseVarExpr(arg_lex_mode)
588
589 elif ty == Id.VSub_Dot:
590 # Note: this will become a new builtin_sub type, so this method must
591 # return word_part_t rather than BracedVarSub. I don't think that
592 # should cause problems.
593 p_die('TODO: ${.myproc builtin sub}', self.cur_token)
594
595 # VS_NAME, VS_NUMBER, symbol that isn't # or !
596 elif self.token_kind == Kind.VSub:
597 part = self._ParseVarExpr(arg_lex_mode)
598
599 else:
600 # e.g. ${^}
601 p_die('Unexpected token in ${}', self.cur_token)
602
603 part.left = left_token # attach the argument
604 part.right = self.cur_token
605 return part
606
607 def _ReadSingleQuoted(self, left_token, lex_mode):
608 # type: (Token, lex_mode_t) -> SingleQuoted
609 """Internal method to read a word_part."""
610 tokens = [] # type: List[Token]
611 # In command mode, we never disallow backslashes like '\'
612 right_quote = self.ReadSingleQuoted(lex_mode, left_token, tokens,
613 False)
614 sval = word_compile.EvalSingleQuoted2(left_token.id, tokens)
615 node = SingleQuoted(left_token, sval, right_quote)
616 return node
617
618 def ReadSingleQuoted(self, lex_mode, left_token, out_tokens, is_ysh_expr):
619 # type: (lex_mode_t, Token, List[Token], bool) -> Token
620 """Appends to out_tokens; returns last token
621
622 Used by expr_parse.py
623 """
624 # TODO: Remove and use out_tokens
625 tokens = [] # type: List[Token]
626
627 # echo '\' is allowed, but x = '\' is invalid, in favor of x = r'\'
628 no_backslashes = is_ysh_expr and left_token.id == Id.Left_SingleQuote
629
630 expected_end_tokens = 3 if left_token.id in (
631 Id.Left_TSingleQuote, Id.Left_RTSingleQuote, Id.Left_UTSingleQuote,
632 Id.Left_BTSingleQuote) else 1
633 num_end_tokens = 0
634
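# A closing ''' must arrive as three CONSECUTIVE Kind.Right tokens; any
# other token in between resets num_end_tokens below.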
635 while num_end_tokens < expected_end_tokens:
636 self._SetNext(lex_mode)
637 self._GetToken()
638
639 # Kind.Char emitted in lex_mode.SQ_C
640 if self.token_kind in (Kind.Lit, Kind.Char):
641 tok = self.cur_token
642 # Happens in lex_mode_e.SQ: 'one\two' is ambiguous, should be
643 # r'one\two' or c'one\\two'
644 if no_backslashes and lexer.TokenContains(tok, '\\'):
645 p_die(
646 r"Strings with backslashes should look like r'\n' or u'\n' or b'\n'",
647 tok)
648
649 if is_ysh_expr:
650 # Disallow var x = $'\001'. Arguably we don't need these
651 # checks because u'\u{1}' is the way to write it.
652 if self.token_type == Id.Char_Octal3:
653 p_die(
654 r"Use \xhh or \u{...} instead of octal escapes in YSH strings",
655 tok)
656
657 if self.token_type == Id.Char_Hex and self.cur_token.length != 4:
658 # disallow \xH
659 p_die(
660 r'Invalid hex escape in YSH string (must be \xHH)',
661 tok)
662
663 tokens.append(tok)
664
665 elif self.token_kind == Kind.Unknown:
666 tok = self.cur_token
667 assert tok.id == Id.Unknown_Backslash, tok
668
669 # x = $'\z' is disallowed; ditto for echo $'\z' if shopt -u parse_backslash
670 if is_ysh_expr or not self.parse_opts.parse_backslash():
671 p_die(
672 "Invalid char escape in C-style string literal (OILS-ERR-11)",
673 tok)
674
675 tokens.append(tok)
676
677 elif self.token_kind == Kind.Eof:
678 p_die('Unexpected EOF in single-quoted string that began here',
679 left_token)
680
681 elif self.token_kind == Kind.Right:
682 # assume Id.Right_SingleQuote
683 num_end_tokens += 1
684 tokens.append(self.cur_token)
685
686 else:
687 raise AssertionError(self.cur_token)
688
689 if self.token_kind != Kind.Right:
690 num_end_tokens = 0 # we need three in a ROW
691
692 if expected_end_tokens == 1:
693 tokens.pop()
694 elif expected_end_tokens == 3: # Get rid of spurious end tokens
695 tokens.pop()
696 tokens.pop()
697 tokens.pop()
698
699 # Remove space from ''' r''' $''' in both expression mode and command mode
700 if left_token.id in (Id.Left_TSingleQuote, Id.Left_RTSingleQuote,
701 Id.Left_UTSingleQuote, Id.Left_BTSingleQuote):
702 word_compile.RemoveLeadingSpaceSQ(tokens)
703
704 # Validation after lexing - same 2 checks in j8.LexerDecoder
705 is_u_string = left_token.id in (Id.Left_USingleQuote,
706 Id.Left_UTSingleQuote)
707
708 for tok in tokens:
709 # u'\yff' is not valid, but b'\yff' is
710 if is_u_string and tok.id == Id.Char_YHex:
711 p_die(
712 r"%s escapes not allowed in u'' strings" %
713 lexer.TokenVal(tok), tok)
714 # \u{dc00} isn't valid
715 if tok.id == Id.Char_UBraced:
716 h = lexer.TokenSlice(tok, 3, -1) # \u{123456}
717 i = int(h, 16)
718 if 0xD800 <= i and i < 0xE000:
719 p_die(
720 r"%s escape is illegal because it's in the surrogate range"
721 % lexer.TokenVal(tok), tok)
722
723 out_tokens.extend(tokens)
724 return self.cur_token
725
726 def _ReadDoubleQuotedLeftParts(self):
727 # type: () -> word_part_t
728 """Read substitution parts in a double quoted context."""
729 if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick):
730 return self._ReadCommandSub(self.token_type, d_quoted=True)
731
732 if self.token_type == Id.Left_DollarBrace:
733 return self._ReadBracedVarSub(self.cur_token, d_quoted=True)
734
735 if self.token_type == Id.Left_DollarDParen:
736 return self._ReadArithSub()
737
738 if self.token_type == Id.Left_DollarBracket:
739 return self._ReadExprSub(lex_mode_e.DQ)
740
741 raise AssertionError(self.cur_token)
742
743 def _ReadYshSingleQuoted(self, left_id):
744 # type: (Id_t) -> CompoundWord
745 """Read YSH style strings
746
747 r'' u'' b''
748 r''' ''' u''' ''' b''' '''
749 """
750 #log('BEF self.cur_token %s', self.cur_token)
751 if left_id == Id.Left_RSingleQuote:
752 lexer_mode = lex_mode_e.SQ_Raw
753 triple_left_id = Id.Left_RTSingleQuote
754 elif left_id == Id.Left_USingleQuote:
755 lexer_mode = lex_mode_e.J8_Str
756 triple_left_id = Id.Left_UTSingleQuote
757 elif left_id == Id.Left_BSingleQuote:
758 lexer_mode = lex_mode_e.J8_Str
759 triple_left_id = Id.Left_BTSingleQuote
760 else:
761 raise AssertionError(left_id)
762
763 # Needed for syntax checks
764 left_tok = self.cur_token
765 left_tok.id = left_id
766
767 sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)
768
769 if (len(sq_part.sval) == 0 and self.lexer.ByteLookAhead() == "'"):
770 self._SetNext(lex_mode_e.ShCommand)
771 self._GetToken()
772
773 assert self.token_type == Id.Left_SingleQuote
774 # HACK: magically transform the third ' in u''' to
775 # Id.Left_UTSingleQuote, so that ''' is the terminator
776 left_tok = self.cur_token
777 left_tok.id = triple_left_id
778
779 # Handles stripping leading whitespace
780 sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)
781
782 # Advance and validate
783 self._SetNext(lex_mode_e.ShCommand)
784
785 self._GetToken()
786 if self.token_kind not in KINDS_THAT_END_WORDS:
787 p_die('Unexpected token after YSH single-quoted string',
788 self.cur_token)
789
790 return CompoundWord([sq_part])
791
792 def _ReadUnquotedLeftParts(self, triple_out):
793 # type: (Optional[BoolParamBox]) -> word_part_t
794 """Read substitutions and quoted strings (for lex_mode_e.ShCommand).
795
796 If triple_out is set, then we try parsing triple quoted strings,
797 and set its value to True if we got one.
798 """
799 if self.token_type in (Id.Left_DoubleQuote, Id.Left_DollarDoubleQuote):
800 # Note: $"" is a synonym for "". It might make sense if it added
801 # \n \0 \x00 \u{123} etc. But that's not what bash does!
802 dq_part = self._ReadDoubleQuoted(self.cur_token)
803 # Got empty word "" and there's a " after
804 if (triple_out and len(dq_part.parts) == 0 and
805 self.lexer.ByteLookAhead() == '"'):
806
807 self._SetNext(lex_mode_e.ShCommand)
808 self._GetToken()
809 # HACK: magically transform the third " in """ to
810 # Id.Left_TDoubleQuote, so that """ is the terminator
811 left_dq_token = self.cur_token
812 left_dq_token.id = Id.Left_TDoubleQuote
813 triple_out.b = True # let caller know we got it
814 return self._ReadDoubleQuoted(left_dq_token)
815
816 return dq_part
817
818 if self.token_type in (Id.Left_SingleQuote, Id.Left_RSingleQuote,
819 Id.Left_DollarSingleQuote):
820 if self.token_type == Id.Left_SingleQuote:
821 lexer_mode = lex_mode_e.SQ_Raw
822 triple_left_id = Id.Left_TSingleQuote
823 elif self.token_type == Id.Left_RSingleQuote:
824 lexer_mode = lex_mode_e.SQ_Raw
825 triple_left_id = Id.Left_RTSingleQuote
826 else:
827 lexer_mode = lex_mode_e.SQ_C
828 # there is no such thing as $'''
829 triple_left_id = Id.Undefined_Tok
830
831 sq_part = self._ReadSingleQuoted(self.cur_token, lexer_mode)
832
833 # Got empty '' or r'' and there's a ' after
834 # u'' and b'' are handled in _ReadYshSingleQuoted
835 if (triple_left_id != Id.Undefined_Tok and
836 triple_out is not None and len(sq_part.sval) == 0 and
837 self.lexer.ByteLookAhead() == "'"):
838
839 self._SetNext(lex_mode_e.ShCommand)
840 self._GetToken()
841
842 # HACK: magically transform the third ' in ''' to
843 # Id.Left_TSingleQuote, so that ''' is the terminator
844 left_sq_token = self.cur_token
845 left_sq_token.id = triple_left_id
846
847 triple_out.b = True # let caller know we got it
848 return self._ReadSingleQuoted(left_sq_token, lexer_mode)
849
850 return sq_part
851
852 if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick,
853 Id.Left_ProcSubIn, Id.Left_ProcSubOut):
854 return self._ReadCommandSub(self.token_type, d_quoted=False)
855
856 if self.token_type == Id.Left_DollarBrace:
857 return self._ReadBracedVarSub(self.cur_token, d_quoted=False)
858
859 if self.token_type == Id.Left_DollarDParen:
860 return self._ReadArithSub()
861
862 if self.token_type == Id.Left_DollarBracket:
863 return self._ReadExprSub(lex_mode_e.ShCommand)
864
865 raise AssertionError(self.cur_token)
866
867 def _ReadExtGlob(self):
868 # type: () -> word_part.ExtGlob
869 """
870 Grammar:
871 Item = CompoundWord | EPSILON # important: @(foo|) is allowed
872 LEFT = '@(' | '*(' | '+(' | '?(' | '!('
873 RIGHT = ')'
874 ExtGlob = LEFT (Item '|')* Item RIGHT # ITEM may be empty
875 Compound includes ExtGlob
876 """
877 left_token = self.cur_token
878 right_token = None # type: Token
879 arms = [] # type: List[CompoundWord]
880
881 self.lexer.PushHint(Id.Op_RParen, Id.Right_ExtGlob)
882 self._SetNext(lex_mode_e.ExtGlob) # advance past LEFT
883
884 read_word = False # did we just read a word? To handle @(||).
885
886 while True:
887 self._GetToken()
888
889 if self.token_type == Id.Right_ExtGlob:
890 if not read_word:
891 arms.append(CompoundWord([]))
892 right_token = self.cur_token
893 break
894
895 elif self.token_type == Id.Op_Pipe:
896 if not read_word:
897 arms.append(CompoundWord([]))
898 read_word = False
899 self._SetNext(lex_mode_e.ExtGlob)
900
901 # lex mode EXTGLOB should only produce these 4 kinds of tokens
902 elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub,
903 Kind.ExtGlob):
904 w = self._ReadCompoundWord(lex_mode_e.ExtGlob)
905 arms.append(w)
906 read_word = True
907
908 elif self.token_kind == Kind.Eof:
909 p_die('Unexpected EOF reading extended glob that began here',
910 left_token)
911
912 else:
913 raise AssertionError(self.cur_token)
914
915 return word_part.ExtGlob(left_token, arms, right_token)
916
917 def _ReadLikeDQ(self, left_token, is_ysh_expr, out_parts):
918 # type: (Optional[Token], bool, List[word_part_t]) -> None
919 """
920 Args:
921 left_token: A token if we are reading a double quoted part, or None if
922 we're reading a here doc.
923 is_ysh_expr: Whether to disallow backticks and invalid char escapes
924 out_parts: list of word_part to append to
925 """
926 if left_token:
927 expected_end_tokens = 3 if left_token.id == Id.Left_TDoubleQuote else 1
928 else:
929 expected_end_tokens = 1000 # here doc will break
930
931 num_end_tokens = 0
932 while num_end_tokens < expected_end_tokens:
933 self._SetNext(lex_mode_e.DQ)
934 self._GetToken()
935
936 if self.token_kind == Kind.Lit:
937 if self.token_type == Id.Lit_EscapedChar:
938 tok = self.cur_token
939 ch = lexer.TokenSliceLeft(tok, 1)
940 part = word_part.EscapedLiteral(tok,
941 ch) # type: word_part_t
942 else:
943 if self.token_type == Id.Lit_BadBackslash:
944 # echo "\z" is OK in shell, but 'x = "\z" is a syntax error in
945 # YSH.
946 # Slight hole: We don't catch 'x = ${undef:-"\z"} because of the
947 # recursion (unless parse_backslash)
948 if (is_ysh_expr or
949 not self.parse_opts.parse_backslash()):
950 p_die(
951 "Invalid char escape in double quoted string (OILS-ERR-12)",
952 self.cur_token)
953 elif self.token_type == Id.Lit_Dollar:
954 if is_ysh_expr or not self.parse_opts.parse_dollar():
955 p_die("Literal $ should be quoted like \$",
956 self.cur_token)
957
958 part = self.cur_token
959 out_parts.append(part)
960
961 elif self.token_kind == Kind.Left:
962 if self.token_type == Id.Left_Backtick and is_ysh_expr:
963 p_die("Invalid backtick: use $(cmd) or \\` in YSH strings",
964 self.cur_token)
965
966 part = self._ReadDoubleQuotedLeftParts()
967 out_parts.append(part)
968
969 elif self.token_kind == Kind.VSub:
970 tok = self.cur_token
971 part = SimpleVarSub(tok)
972 out_parts.append(part)
973 # NOTE: parsing "$f(x)" would BREAK CODE. Could add a mode for it
974 # later.
975
976 elif self.token_kind == Kind.Right:
977 assert self.token_type == Id.Right_DoubleQuote, self.token_type
978 if left_token:
979 num_end_tokens += 1
980
981 # In a here doc, the right quote is literal!
982 out_parts.append(self.cur_token)
983
984 elif self.token_kind == Kind.Eof:
985 if left_token:
986 p_die(
987 'Unexpected EOF reading double-quoted string that began here',
988 left_token)
989 else: # here docs will have an EOF in their token stream
990 break
991
992 else:
993 raise AssertionError(self.cur_token)
994
995 if self.token_kind != Kind.Right:
996 num_end_tokens = 0 # """ must be CONSECUTIVE
997
998 if expected_end_tokens == 1:
999 out_parts.pop()
1000 elif expected_end_tokens == 3:
1001 out_parts.pop()
1002 out_parts.pop()
1003 out_parts.pop()
1004
1005 # Remove space from """ in both expression mode and command mode
1006 if left_token and left_token.id == Id.Left_TDoubleQuote:
1007 word_compile.RemoveLeadingSpaceDQ(out_parts)
1008
1009 # Return nothing, since we appended to 'out_parts'
1010
1011 def _ReadDoubleQuoted(self, left_token):
1012 # type: (Token) -> DoubleQuoted
1013 """Helper function for "hello $name".
1014
1015 Args:
1016 eof_type: for stopping at }, Id.Lit_RBrace
1017 here_doc: Whether we are reading in a here doc context
1018
1019 Also ${foo%%a b c} # treat this as double quoted. until you hit
1020 """
1021 parts = [] # type: List[word_part_t]
1022 self._ReadLikeDQ(left_token, False, parts)
1023
1024 right_quote = self.cur_token
1025 return DoubleQuoted(left_token, parts, right_quote)
1026
1027 def ReadDoubleQuoted(self, left_token, parts):
1028 # type: (Token, List[word_part_t]) -> Token
1029 """For expression mode.
1030
1031 Read var x = "${dir:-}/$name"; etc.
1032 """
1033 self._ReadLikeDQ(left_token, True, parts)
1034 return self.cur_token
1035
1036 def _ReadCommandSub(self, left_id, d_quoted=False):
1037 # type: (Id_t, bool) -> CommandSub
1038 """
1039 NOTE: This is not in the grammar, because word parts aren't in the grammar!
1040
1041 command_sub = '$(' command_list ')'
1042 | '@(' command_list ')'
1043 | '<(' command_list ')'
1044 | '>(' command_list ')'
1045 | ` command_list `
1046 """
1047 left_token = self.cur_token
1048
1049 # Set the lexer in a state so ) becomes the EOF token.
1050 if left_id in (Id.Left_DollarParen, Id.Left_AtParen, Id.Left_ProcSubIn,
1051 Id.Left_ProcSubOut):
1052 self._SetNext(lex_mode_e.ShCommand) # advance past $( etc.
1053
1054 right_id = Id.Eof_RParen
1055 self.lexer.PushHint(Id.Op_RParen, right_id)
1056 c_parser = self.parse_ctx.MakeParserForCommandSub(
1057 self.line_reader, self.lexer, right_id)
1058 # NOTE: This doesn't use something like main_loop because we don't want
1059 # to interleave parsing and execution! Unlike 'source' and 'eval'.
1060 node = c_parser.ParseCommandSub()
1061
1062 right_token = c_parser.w_parser.cur_token
1063
1064 elif left_id == Id.Left_Backtick and self.parse_ctx.do_lossless:
1065 # NOTE: This is an APPROXIMATE solution for translation ONLY. See
1066 # test/osh2oil.
1067
1068 right_id = Id.Eof_Backtick
1069 self.lexer.PushHint(Id.Left_Backtick, right_id)
1070 c_parser = self.parse_ctx.MakeParserForCommandSub(
1071 self.line_reader, self.lexer, right_id)
1072 node = c_parser.ParseCommandSub()
1073 right_token = c_parser.w_parser.cur_token
1074
1075 elif left_id == Id.Left_Backtick:
1076 if not self.parse_opts.parse_backticks():
1077 p_die('Use $(cmd) instead of backticks (parse_backticks)',
1078 left_token)
1079
1080 self._SetNext(lex_mode_e.Backtick) # advance past `
1081
1082 parts = [] # type: List[str]
1083 while True:
1084 self._GetToken()
1085 #log("TOK %s", self.cur_token)
1086
1087 if self.token_type == Id.Backtick_Quoted:
1088 # Remove leading \
1089 parts.append(lexer.TokenSliceLeft(self.cur_token, 1))
1090
1091 elif self.token_type == Id.Backtick_DoubleQuote:
1092 # Compatibility: If backticks are double quoted, then double quotes
1093 # within them have to be \"
1094 # Shells aren't smart enough to match nested " and ` quotes (but OSH
1095 # is)
1096 if d_quoted:
1097 # Remove leading \
1098 parts.append(lexer.TokenSliceLeft(self.cur_token, 1))
1099 else:
1100 parts.append(lexer.TokenVal(self.cur_token))
1101
1102 elif self.token_type == Id.Backtick_Other:
1103 parts.append(lexer.TokenVal(self.cur_token))
1104
1105 elif self.token_type == Id.Backtick_Right:
1106 break
1107
1108 elif self.token_type == Id.Eof_Real:
1109 # Note: this parse error is in the ORIGINAL context. No code_str yet.
1110 p_die('Unexpected EOF while looking for closing backtick',
1111 left_token)
1112
1113 else:
1114 raise AssertionError(self.cur_token)
1115
1116 self._SetNext(lex_mode_e.Backtick)
1117
1118 # Calculate right SPID on CommandSub BEFORE re-parsing.
1119 right_token = self.cur_token
1120
1121 code_str = ''.join(parts)
1122 #log('code %r', code_str)
1123
1124 # NOTE: This is similar to how we parse aliases in osh/cmd_parse.py. It
1125 # won't have the same location info as MakeParserForCommandSub(), because
1126 # the lexer is different.
1127 arena = self.parse_ctx.arena
1128 #arena = alloc.Arena()
1129 line_reader = reader.StringLineReader(code_str, arena)
1130 c_parser = self.parse_ctx.MakeOshParser(line_reader)
1131 src = source.Reparsed('backticks', left_token, right_token)
1132 with alloc.ctx_SourceCode(arena, src):
1133 node = c_parser.ParseCommandSub()
1134
1135 else:
1136 raise AssertionError(left_id)
1137
1138 return CommandSub(left_token, node, right_token)
1139
1140 def _ReadExprSub(self, lex_mode):
1141 # type: (lex_mode_t) -> word_part.ExprSub
1142 """$[d->key] $[obj.method()] etc."""
1143 left_token = self.cur_token
1144
1145 self._SetNext(lex_mode_e.Expr)
1146 enode, right_token = self.parse_ctx.ParseYshExpr(
1147 self.lexer, grammar_nt.ysh_expr_sub)
1148
1149 self._SetNext(lex_mode) # Move past ]
1150 return word_part.ExprSub(left_token, enode, right_token)
1151
1152 def ParseVarDecl(self, kw_token):
1153 # type: (Token) -> command.VarDecl
1154 """
1155 oil_var_decl: name_type_list '=' testlist end_stmt
1156
1157 Note that assignments must end with \n ; } or EOF. Unlike shell
1158 assignments, we disallow:
1159
1160 var x = 42 | wc -l
1161 var x = 42 && echo hi
1162 """
1163 self._SetNext(lex_mode_e.Expr)
1164 enode, last_token = self.parse_ctx.ParseVarDecl(kw_token, self.lexer)
1165 # Hack to move } from what the Expr lexer modes gives to what CommandParser
1166 # wants
1167 if last_token.id == Id.Op_RBrace:
1168 last_token.id = Id.Lit_RBrace
1169
1170 # Let the CommandParser see the Op_Semi or Op_Newline.
1171 self.buffered_word = last_token
1172 self._SetNext(lex_mode_e.ShCommand) # always back to this
1173 return enode
1174
1175 def ParseMutation(self, kw_token, var_checker):
1176 # type: (Token, VarChecker) -> command.Mutation
1177 """
1178 setvar i = 42
1179 setvar i += 1
1180 setvar a[i] = 42
1181 setvar a[i] += 1
1182 setvar d.key = 42
1183 setvar d.key += 1
1184 """
1185 self._SetNext(lex_mode_e.Expr)
1186 enode, last_token = self.parse_ctx.ParseMutation(kw_token, self.lexer)
1187 # Hack to move } from what the Expr lexer modes gives to what CommandParser
1188 # wants
1189 if last_token.id == Id.Op_RBrace:
1190 last_token.id = Id.Lit_RBrace
1191
1192 for lhs in enode.lhs:
1193 UP_lhs = lhs
1194 with tagswitch(lhs) as case:
1195 if case(y_lhs_e.Var):
1196 lhs = cast(Token, UP_lhs)
1197 var_checker.Check(kw_token.id, lexer.LazyStr(lhs), lhs)
1198
1199 # Note: this does not cover cases like
1200 # setvar (a[0])[1] = v
1201 # setvar (d.key).other = v
1202 # This leaks into catching all typos statically, which may be
1203 # possible if 'use' makes all names explicit.
1204 elif case(y_lhs_e.Subscript):
1205 lhs = cast(Subscript, UP_lhs)
1206 if lhs.obj.tag() == expr_e.Var:
1207 v = cast(expr.Var, lhs.obj)
1208 var_checker.Check(kw_token.id, v.name, v.left)
1209
1210 elif case(y_lhs_e.Attribute):
1211 lhs = cast(Attribute, UP_lhs)
1212 if lhs.obj.tag() == expr_e.Var:
1213 v = cast(expr.Var, lhs.obj)
1214 var_checker.Check(kw_token.id, v.name, v.left)
1215
1216 # Let the CommandParser see the Op_Semi or Op_Newline.
1217 self.buffered_word = last_token
1218 self._SetNext(lex_mode_e.ShCommand) # always back to this
1219 return enode
1220
1221 def ParseBareDecl(self):
1222 # type: () -> expr_t
1223 """
1224 x = {name: val}
1225 """
1226 self._SetNext(lex_mode_e.Expr)
1227 self._GetToken()
1228 enode, last_token = self.parse_ctx.ParseYshExpr(
1229 self.lexer, grammar_nt.command_expr)
1230 if last_token.id == Id.Op_RBrace:
1231 last_token.id = Id.Lit_RBrace
1232 self.buffered_word = last_token
1233 self._SetNext(lex_mode_e.ShCommand)
1234 return enode
1235
1236 def ParseYshExprForCommand(self):
1237 # type: () -> expr_t
1238
1239 # Fudge for this case
1240 # for x in(y) {
1241 # versus
1242 # for x in (y) {
1243 #
1244 # In the former case, ReadWord on 'in' puts the lexer past (.
1245 # Also see LookPastSpace in CommandParser.
1246 # A simpler solution would be nicer.
1247
1248 if self.token_type == Id.Op_LParen:
1249 self.lexer.MaybeUnreadOne()
1250
1251 enode, _ = self.parse_ctx.ParseYshExpr(self.lexer, grammar_nt.ysh_expr)
1252
1253 self._SetNext(lex_mode_e.ShCommand)
1254 return enode
1255
1256 def ParseCommandExpr(self):
1257 # type: () -> expr_t
1258 """
1259 = 1+2
1260 """
1261 enode, last_token = self.parse_ctx.ParseYshExpr(
1262 self.lexer, grammar_nt.command_expr)
1263
1264 # In some cases, such as the case statement, we expect *the lexer* to be
1265 # pointing at the token right after the expression. But the expression
1266 # parser must have read to the `last_token`. Unreading places the lexer
1267 # back in the expected state. Ie:
1268 #
1269 # case (x) { case (x) {
1270 # (else) { = x } (else) { = x }
1271 # ^ The lexer is here ^ Unread to here
1272 # } }
1273 assert last_token.id in (Id.Op_Newline, Id.Eof_Real, Id.Op_Semi,
1274 Id.Op_RBrace), last_token
1275 if last_token.id != Id.Eof_Real:
1276 # Eof_Real is the only token we cannot unread
1277 self.lexer.MaybeUnreadOne()
1278
1279 return enode
1280
1281 def ParseProc(self, node):
1282 # type: (Proc) -> None
1283
1284 # proc name-with-hyphens() must be accepted
1285 self._SetNext(lex_mode_e.ShCommand)
1286 self._GetToken()
1287 # example: 'proc f[' gets you Lit_ArrayLhsOpen
1288 if self.token_type != Id.Lit_Chars:
1289 p_die('Invalid proc name %s' % ui.PrettyToken(self.cur_token),
1290 self.cur_token)
1291
1292 # TODO: validate this more. Disallow proc 123 { }, which isn't disallowed
1293 # for shell functions. Similar to IsValidVarName().
1294 node.name = self.cur_token
1295
1296 last_token = self.parse_ctx.ParseProc(self.lexer, node)
1297
1298 # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
1299 assert last_token.id == Id.Op_LBrace
1300 last_token.id = Id.Lit_LBrace
1301 self.buffered_word = last_token
1302
1303 self._SetNext(lex_mode_e.ShCommand)
1304
1305 def ParseFunc(self, node):
1306 # type: (Func) -> None
1307 last_token = self.parse_ctx.ParseFunc(self.lexer, node)
1308
1309 # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
1310 assert last_token.id == Id.Op_LBrace
1311 last_token.id = Id.Lit_LBrace
1312 self.buffered_word = last_token
1313
1314 self._SetNext(lex_mode_e.ShCommand)
1315
1316 def ParseYshCasePattern(self):
1317 # type: () -> Tuple[pat_t, Token]
1318 pat, left_tok, last_token = self.parse_ctx.ParseYshCasePattern(
1319 self.lexer)
1320
1321 if last_token.id == Id.Op_LBrace:
1322 last_token.id = Id.Lit_LBrace
1323 self.buffered_word = last_token
1324
1325 return pat, left_tok
1326
1327 def NewlineOkForYshCase(self):
1328 # type: () -> Id_t
1329 """Check for optional newline and consume it.
1330
1331 This is a special case of `_NewlineOk` which fixes some "off-by-one" issues
1332 that crop up while parsing YSH case arms. For more details, see
1333 #oil-dev > Progress On YSH Case Grammar on Zulip.
1334
1335 Returns a token id which is filled with the choice of
1336
1337 word { echo word }
1338 (3) { echo expr }
1339 /e/ { echo eggex }
1340 } # right brace
1341 """
1342 while True:
1343 next_id = self.lexer.LookAheadOne(lex_mode_e.Expr)
1344
1345 # Cannot lookahead past lines
1346 if next_id == Id.Unknown_Tok:
1347 self.lexer.MoveToNextLine()
1348 continue
1349
1350 next_kind = consts.GetKind(next_id)
1351 if next_id != Id.Op_Newline and next_kind != Kind.Ignored:
1352 break
1353
1354 self.lexer.Read(lex_mode_e.Expr)
1355
1356 if next_id in (Id.Op_RBrace, Id.Op_LParen, Id.Arith_Slash):
1357 self._SetNext(lex_mode_e.Expr) # Continue in expression mode
1358 else:
1359 # Consume the trailing Op_Newline
1360 self._SetNext(lex_mode_e.ShCommand)
1361 self._GetToken()
1362
1363 return next_id
1364
1365 def _ReadArithExpr(self, end_id):
1366 # type: (Id_t) -> arith_expr_t
1367 """Read and parse an arithmetic expression in various contexts.
1368
1369 $(( 1+2 ))
1370 (( a=1+2 ))
1371 ${a[ 1+2 ]}
1372 ${a : 1+2 : 1+2}
1373
1374 See tests/arith-context.test.sh for ambiguous cases.
1375
1376 ${a[a[0]]} is valid # VS_RBRACKET vs Id.Arith_RBracket
1377
1378 ${s : a<b?0:1 : 1} # VS_COLON vs Id.Arith_Colon
1379
1380 See the assertion in ArithParser.Parse() -- unexpected extra input.
1381 """
1382 # calls self.ReadWord(lex_mode_e.Arith)
1383 anode = self.a_parser.Parse()
1384 cur_id = self.a_parser.CurrentId()
1385 if end_id != Id.Undefined_Tok and cur_id != end_id:
1386 p_die(
1387 'Unexpected token after arithmetic expression (%s != %s)' %
1388 (ui.PrettyId(cur_id), ui.PrettyId(end_id)),
1389 loc.Word(self.a_parser.cur_word))
1390 return anode
1391
1392 def _ReadArithSub(self):
1393 # type: () -> word_part.ArithSub
1394 """Read an arith substitution, which contains an arith expression, e.g.
1395
1396 $((a + 1)).
1397 """
1398 left_tok = self.cur_token
1399
1400 # The second ) needs to be disambiguated in stuff like:
1401 # $(echo $(( 1+2 )) )
1402 self.lexer.PushHint(Id.Op_RParen, Id.Right_DollarDParen)
1403
1404 # NOTE: To disambiguate $(( as arith sub vs. command sub and subshell, we
1405 # could save the lexer/reader state here, and retry if the arithmetic parse
1406 # fails. But we can almost always catch this at parse time. There could
1407 # be some exceptions like:
1408 # $((echo * foo)) # looks like multiplication
1409 # $((echo / foo)) # looks like division
1410
1411 self._SetNext(lex_mode_e.Arith)
1412 anode = self._ReadArithExpr(Id.Arith_RParen)
1413
1414 # TODO: This could be DQ or Arith too
1415 self._SetNext(lex_mode_e.ShCommand)
1416
1417 # PROBLEM: $(echo $(( 1 + 2 )) )
1418 # Two right parens break the Id.Eof_RParen scheme
1419 self._GetToken()
1420 if self.token_type != Id.Right_DollarDParen:
1421 p_die('Expected second ) to end arith sub', self.cur_token)
1422
1423 right_tok = self.cur_token
1424 return word_part.ArithSub(left_tok, anode, right_tok)
1425
1426 def ReadDParen(self):
1427 # type: () -> Tuple[arith_expr_t, Token]
1428 """Read ((1+ 2)) -- command context.
1429
1430 We're using the word parser because it's very similar to _ReadArithExpr
1431 above.
1432
1433 This also returns the terminating `Op_DRightParen` token for use as location
1434 tracking.
1435 """
1436 # The second ) needs to be disambiguated in stuff like:
1437 # TODO: Be consistent with ReadForExpression below and use lex_mode_e.Arith?
1438 # Then you can get rid of this.
1439 self.lexer.PushHint(Id.Op_RParen, Id.Op_DRightParen)
1440
1441 self._SetNext(lex_mode_e.Arith)
1442 anode = self._ReadArithExpr(Id.Arith_RParen)
1443
1444 self._SetNext(lex_mode_e.ShCommand)
1445
1446 # PROBLEM: $(echo $(( 1 + 2 )) )
1447 self._GetToken()
1448 right = self.cur_token
1449 if self.token_type != Id.Op_DRightParen:
1450 p_die('Expected second ) to end arith statement', self.cur_token)
1451
1452 self._SetNext(lex_mode_e.ShCommand)
1453
1454 return anode, right
1455
1456 def _SetNextNonSpace(self):
1457 # type: () -> None
1458 """Same logic as _ReadWord, but for ReadForExpression."""
1459 while True:
1460 self._SetNext(lex_mode_e.Arith)
1461 self._GetToken()
1462 if self.token_kind not in (Kind.Ignored, Kind.WS):
1463 break
1464
1465 def ReadForExpression(self):
1466 # type: () -> command.ForExpr
1467 """Read ((i=0; i<5; ++i)) -- part of command context."""
1468 self._SetNextNonSpace() # skip over ((
1469
1470 self._GetToken()
1471 cur_id = self.token_type # for end of arith expressions
1472
1473 if cur_id == Id.Arith_Semi: # for (( ; i < 10; i++ ))
1474 init_node = None # type: Optional[arith_expr_t]
1475 else:
1476 init_node = self.a_parser.Parse()
1477 cur_id = self.a_parser.CurrentId()
1478 self._SetNextNonSpace()
1479
1480 # It's odd to keep track of both cur_id and self.token_type in this
1481 # function, but it works, and is tested in 'test/parse_error.sh
1482 # arith-integration'
1483 if cur_id != Id.Arith_Semi: # for (( x=0 b; ... ))
1484 p_die("Expected ; here", loc.Word(self.a_parser.cur_word))
1485
1486 self._GetToken()
1487 cur_id = self.token_type
1488
1489 if cur_id == Id.Arith_Semi: # for (( ; ; i++ ))
1490 cond_node = None # type: Optional[arith_expr_t]
1491 else:
1492 cond_node = self.a_parser.Parse()
1493 cur_id = self.a_parser.CurrentId()
1494 self._SetNextNonSpace()
1495
1496 if cur_id != Id.Arith_Semi: # for (( x=0; x<5 b ))
1497 p_die("Expected ; here", loc.Word(self.a_parser.cur_word))
1498
1499 self._GetToken()
1500 cur_id = self.token_type
1501
1502 if cur_id == Id.Arith_RParen: # for (( ; ; ))
1503 update_node = None # type: Optional[arith_expr_t]
1504 else:
1505 update_node = self._ReadArithExpr(Id.Arith_RParen)
1506 self._SetNextNonSpace()
1507
1508 self._GetToken()
1509 if self.token_type != Id.Arith_RParen:
1510 p_die('Expected ) to end for loop expression', self.cur_token)
1511 self._SetNext(lex_mode_e.ShCommand)
1512
1513 # redirects is None, will be assigned in CommandEvaluator
1514 node = command.ForExpr.CreateNull()
1515 node.init = init_node
1516 node.cond = cond_node
1517 node.update = update_node
1518 return node
1519
1520 def _ReadArrayLiteral(self):
1521 # type: () -> word_part_t
1522 """a=(1 2 3)
1523
1524 TODO: See osh/cmd_parse.py:164 for Id.Lit_ArrayLhsOpen, for a[x++]=1
1525
1526 We want:
1527
1528 A=(['x']=1 ["x"]=2 [$x$y]=3)
1529
1530 Maybe allow this as a literal string? Because I think I've seen it before?
1531 Or maybe force people to patch to learn the rule.
1532
1533 A=([x]=4)
1534
1535 Starts with Lit_Other '[', and then it has Lit_ArrayLhsClose
1536 Maybe enforce that ALL have keys or NONE of have keys.
1537 """
1538 self._SetNext(lex_mode_e.ShCommand) # advance past (
1539 self._GetToken()
1540 if self.cur_token.id != Id.Op_LParen:
1541 p_die('Expected ( after =', self.cur_token)
1542 left_token = self.cur_token
1543 right_token = None # type: Token
1544
1545 # MUST use a new word parser (with same lexer).
1546 w_parser = self.parse_ctx.MakeWordParser(self.lexer, self.line_reader)
1547 words = [] # type: List[CompoundWord]
1548 done = False
1549 while not done:
1550 w = w_parser.ReadWord(lex_mode_e.ShCommand)
1551 with tagswitch(w) as case:
1552 if case(word_e.Operator):
1553 tok = cast(Token, w)
1554 if tok.id == Id.Right_ShArrayLiteral:
1555 right_token = tok
1556 done = True # can't use break here
1557 # Unlike command parsing, array parsing allows embedded \n.
1558 elif tok.id == Id.Op_Newline:
1559 continue
1560 else:
1561 p_die('Unexpected token in array literal', loc.Word(w))
1562
1563 elif case(word_e.Compound):
1564 words.append(cast(CompoundWord, w))
1565
1566 else:
1567 raise AssertionError()
1568
1569 if len(words) == 0: # a=() is empty indexed array
1570 # Needed for type safety, doh
1571 no_words = [] # type: List[word_t]
1572 node = ShArrayLiteral(left_token, no_words, right_token)
1573 return node
1574
1575 pairs = [] # type: List[AssocPair]
1576 # If the first one is a key/value pair, then the rest are assumed to be.
1577 pair = word_.DetectAssocPair(words[0])
1578 if pair:
1579 pairs.append(pair)
1580
1581 n = len(words)
1582 for i in xrange(1, n):
1583 w2 = words[i]
1584 pair = word_.DetectAssocPair(w2)
1585 if not pair:
1586 p_die("Expected associative array pair", loc.Word(w2))
1587
1588 pairs.append(pair)
1589
1590 # invariant List?
1591 return word_part.BashAssocLiteral(left_token, pairs, right_token)
1592
1593 # Brace detection for arrays but NOT associative arrays
1594 words2 = braces.BraceDetectAll(words)
1595 words3 = word_.TildeDetectAll(words2)
1596 return ShArrayLiteral(left_token, words3, right_token)
1597
1598 def ParseProcCallArgs(self, start_symbol):
1599 # type: (int) -> ArgList
1600 """ json write (x) """
1601 self.lexer.MaybeUnreadOne()
1602
1603 arg_list = ArgList.CreateNull(alloc_lists=True)
1604 arg_list.left = self.cur_token
1605 self.parse_ctx.ParseYshArgList(self.lexer, arg_list, start_symbol)
1606 return arg_list
1607
1608 def _MaybeReadWordPart(self, is_first, lex_mode, parts):
1609 # type: (bool, lex_mode_t, List[word_part_t]) -> bool
1610 """Helper for _ReadCompoundWord3."""
1611 done = False
1612
1613 if self.token_type == Id.Lit_EscapedChar:
1614 tok = self.cur_token
1615 assert tok.length == 2
1616 ch = lexer.TokenSliceLeft(tok, 1)
1617 if not self.parse_opts.parse_backslash():
1618 if not pyutil.IsValidCharEscape(ch):
1619 p_die('Invalid char escape in unquoted word (OILS-ERR-13)',
1620 self.cur_token)
1621
1622 part = word_part.EscapedLiteral(self.cur_token,
1623 ch) # type: word_part_t
1624 else:
1625 part = self.cur_token
1626
1627 if is_first and self.token_type == Id.Lit_VarLike: # foo=
1628 parts.append(part)
1629 # Unfortunately it's awkward to pull the check for a=(1 2) up to
1630 # _ReadWord.
1631 next_id = self.lexer.LookPastSpace(lex_mode)
1632 if next_id == Id.Op_LParen:
1633 self.lexer.PushHint(Id.Op_RParen, Id.Right_ShArrayLiteral)
1634 part2 = self._ReadArrayLiteral()
1635 parts.append(part2)
1636
1637 # Array literal must be the last part of the word.
1638 self._SetNext(lex_mode)
1639 self._GetToken()
1640 # EOF, whitespace, newline, Right_Subshell
1641 if self.token_kind not in KINDS_THAT_END_WORDS:
1642 p_die('Unexpected token after array literal',
1643 self.cur_token)
1644 done = True
1645
1646 elif (is_first and self.parse_opts.parse_at() and
1647 self.token_type == Id.Lit_Splice):
1648
1649 splice_tok = self.cur_token
1650 part2 = word_part.Splice(splice_tok,
1651 lexer.TokenSliceLeft(splice_tok, 1))
1652
1653 parts.append(part2)
1654
1655 # @words must be the last part of the word
1656 self._SetNext(lex_mode)
1657 self._GetToken()
1658 # EOF, whitespace, newline, Right_Subshell
1659 if self.token_kind not in KINDS_THAT_END_WORDS:
1660 p_die('Unexpected token after array splice', self.cur_token)
1661 done = True
1662
1663 elif (is_first and self.parse_opts.parse_at() and
1664 self.token_type == Id.Lit_AtLBracket): # @[split(x)]
1665 part2 = self._ReadExprSub(lex_mode_e.DQ)
1666 parts.append(part2)
1667
1668 # @[split(x)]
1669 self._SetNext(lex_mode)
1670 self._GetToken()
1671 # EOF, whitespace, newline, Right_Subshell
1672 if self.token_kind not in KINDS_THAT_END_WORDS:
1673 p_die('Unexpected token after Expr splice', self.cur_token)
1674 done = True
1675
1676 elif (is_first and self.parse_opts.parse_at() and
1677 self.token_type == Id.Lit_AtLBraceDot):
1678 p_die('TODO: @{.myproc builtin sub}', self.cur_token)
1679
1680 elif (is_first and self.parse_opts.parse_at_all() and
1681 self.token_type == Id.Lit_At):
1682 # Because $[x] ${x} and perhaps $/x/ are reserved, it makes sense for @
1683 # at the beginning of a word to be reserved.
1684
1685 # Although should we relax 'echo @' ? I'm tempted to have a shortcut for
1686 # @_argv and
1687 p_die('Literal @ starting a word must be quoted (parse_at_all)',
1688 self.cur_token)
1689
1690 else:
1691 # not a literal with lookahead; append it
1692 parts.append(part)
1693
1694 return done
1695
1696 def _ReadCompoundWord(self, lex_mode):
1697 # type: (lex_mode_t) -> CompoundWord
1698 return self._ReadCompoundWord3(lex_mode, Id.Undefined_Tok, True)
1699
1700 def _ReadCompoundWord3(self, lex_mode, eof_type, empty_ok):
1701 # type: (lex_mode_t, Id_t, bool) -> CompoundWord
1702 """
1703 Precondition: Looking at the first token of the first word part
1704 Postcondition: Looking at the token after, e.g. space or operator
1705
1706 NOTE: eof_type is necessary because / is a literal, i.e. Lit_Slash, but it
1707 could be an operator delimiting a compound word. Can we change lexer modes
1708 and remove this special case?
1709 """
        w = CompoundWord([])
        num_parts = 0
        brace_count = 0
        done = False
        is_triple_quoted = None  # type: Optional[BoolParamBox]

        while not done:
            self._GetToken()

            allow_done = empty_ok or num_parts != 0
            if allow_done and self.token_type == eof_type:
                done = True  # e.g. for ${foo//pat/replace}

            # Keywords like "for" are treated like literals
            elif self.token_kind in (Kind.Lit, Kind.History, Kind.KW,
                                     Kind.ControlFlow, Kind.BoolUnary,
                                     Kind.BoolBinary):

                # Count { and } so we can flag unbalanced words like foo{ below
                if self.token_type == Id.Lit_LBrace:
                    brace_count += 1
                elif self.token_type == Id.Lit_RBrace:
                    brace_count -= 1
                elif self.token_type == Id.Lit_Dollar:
                    if not self.parse_opts.parse_dollar():
                        if num_parts == 0 and lex_mode == lex_mode_e.ShCommand:
                            next_byte = self.lexer.ByteLookAhead()
                            # TODO: switch lexer modes and parse $/d+/.  But not ${a:-$/d+/}
                            if next_byte == '/':
                                #log('next_byte %r', next_byte)
                                pass

                        p_die('Literal $ should be quoted like \$',
                              self.cur_token)

                done = self._MaybeReadWordPart(num_parts == 0, lex_mode,
                                               w.parts)

            elif self.token_kind == Kind.VSub:
                vsub_token = self.cur_token

                part = SimpleVarSub(vsub_token)  # type: word_part_t
                w.parts.append(part)

            elif self.token_kind == Kind.ExtGlob:
                # If parse_at, we take over @( to start a command sub like
                # @(seq 3).  Users can still write that extended glob as
                # ,(*.py|*.sh).
                if (self.parse_opts.parse_at() and
                        self.token_type == Id.ExtGlob_At and num_parts == 0):
                    cs_part = self._ReadCommandSub(Id.Left_AtParen,
                                                   d_quoted=False)
                    # RARE mutation of tok.id!
                    cs_part.left_token.id = Id.Left_AtParen
                    part = cs_part  # for type safety

                    # Same check as _MaybeReadWordPart.  @(seq 3)x is illegal,
                    # just like a=(one two)x and @arrayfunc(3)x.
                    self._GetToken()
                    if self.token_kind not in KINDS_THAT_END_WORDS:
                        p_die('Unexpected token after @()', self.cur_token)
                    done = True

                else:
                    part = self._ReadExtGlob()
                w.parts.append(part)

            elif self.token_kind == Kind.Left:
                try_triple_quote = (self.parse_opts.parse_triple_quote() and
                                    lex_mode == lex_mode_e.ShCommand and
                                    num_parts == 0)

                # Save an allocation
                if try_triple_quote:
                    is_triple_quoted = BoolParamBox(False)

                part = self._ReadUnquotedLeftParts(is_triple_quoted)
                w.parts.append(part)

            # NOT done yet, will advance below
            elif self.token_kind == Kind.Right:
                # Still part of the word; will be done on the next iter.
                if self.token_type == Id.Right_DoubleQuote:
                    pass
                # Never happens, no PushHint for this case.
                #elif self.token_type == Id.Right_DollarParen:
                #    pass
                elif self.token_type == Id.Right_Subshell:
                    # LEXER HACK for (case x in x) ;; esac )
                    # Rewind before it's used
                    assert self.next_lex_mode == lex_mode_e.Undefined
                    if self.lexer.MaybeUnreadOne():
                        self.lexer.PushHint(Id.Op_RParen, Id.Right_Subshell)
                        self._SetNext(lex_mode)
                    done = True
                else:
                    done = True

            elif self.token_kind == Kind.Ignored:
                done = True

            else:
                # LEXER HACK for unbalanced case clause.  'case foo in esac' is
                # valid, so to test for ESAC, we can read ) before getting a
                # chance to PushHint(Id.Op_RParen, Id.Right_CasePat).  So here
                # we unread one token and do it again.

                # We get Id.Op_RParen at top level: case x in x) ;; esac
                # We get Id.Eof_RParen inside ComSub: $(case x in x) ;; esac )
                if self.token_type in (Id.Op_RParen, Id.Eof_RParen):
                    # Rewind before it's used
                    assert self.next_lex_mode == lex_mode_e.Undefined
                    if self.lexer.MaybeUnreadOne():
                        if self.token_type == Id.Eof_RParen:
                            # Redo translation
                            self.lexer.PushHint(Id.Op_RParen, Id.Eof_RParen)
                        self._SetNext(lex_mode)

                done = True  # anything we don't recognize means we're done

            if not done:
                self._SetNext(lex_mode)
                num_parts += 1

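        # Example of the brace balance check below (assuming parse_brace):
        # 'echo foo{' dies with the error, while balanced 'echo foo{a,b}' and
        # a lone 'echo {' (num_parts == 1) are accepted.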
        if (self.parse_opts.parse_brace() and num_parts > 1 and
                brace_count != 0):
            # accept { and }, but not foo{
            p_die(
                'Word has unbalanced { }. Maybe add a space or quote it like \{',
                loc.Word(w))

        if is_triple_quoted and is_triple_quoted.b and num_parts > 1:
            p_die('Unexpected parts after triple quoted string',
                  loc.WordPart(w.parts[-1]))

        if 0:
            from _devbuild.gen.syntax_asdl import word_part_str
            word_key = ' '.join(word_part_str(p.tag()) for p in w.parts)
            WORD_HIST[word_key] += 1
        return w

    def _ReadArithWord(self):
        # type: () -> Optional[word_t]
        """Helper for ReadArithWord()."""
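        # E.g. for $(( x + 1 )): Kind.Lit 'x' becomes a compound word, '+' is
        # returned as an Arith token, and the space between them returns None,
        # which the ReadArithWord() loop below skips.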
        self._GetToken()

        if self.token_kind == Kind.Unknown:
            # e.g. happened during dynamic parsing of unset 'a[$foo]' in gherkin
            p_die(
                'Unexpected token while parsing arithmetic: %r' %
                lexer.TokenVal(self.cur_token), self.cur_token)

        elif self.token_kind == Kind.Eof:
            return self.cur_token

        elif self.token_kind == Kind.Ignored:
            # Space should be ignored.
            self._SetNext(lex_mode_e.Arith)
            return None

        elif self.token_kind in (Kind.Arith, Kind.Right):
            # Id.Right_DollarDParen IS just a normal token, handled by ArithParser
            self._SetNext(lex_mode_e.Arith)
            return self.cur_token

        elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub):
            return self._ReadCompoundWord(lex_mode_e.Arith)

        else:
            raise AssertionError(self.cur_token)

    def _ReadWord(self, word_mode):
        # type: (lex_mode_t) -> Optional[word_t]
        """Helper function for ReadWord()."""

        # Change the pseudo lexer mode to a real lexer mode
        if word_mode == lex_mode_e.ShCommandBrack:
            lex_mode = lex_mode_e.ShCommand
        else:
            lex_mode = word_mode

        self._GetToken()

        if self.token_kind == Kind.Eof:
            # No advance
            return self.cur_token

        # Allow Arith for ) at end of for loop?
        elif self.token_kind in (Kind.Op, Kind.Redir, Kind.Arith):
            self._SetNext(lex_mode)

            # Newlines are complicated.  See the 3x2 matrix in the comment
            # about self.multiline and self.newline_state above.
            if self.token_type == Id.Op_Newline:
                if self.multiline:
                    if self.newline_state > 1:
                        # This points at a blank line, but at least it gives
                        # the line number
                        p_die('Invalid blank line in multiline mode',
                              self.cur_token)
                    return None

                if self.returned_newline:  # skip
                    return None

            return self.cur_token

        elif self.token_kind == Kind.Right:
            if self.token_type not in (Id.Right_Subshell, Id.Right_ShFunction,
                                       Id.Right_CasePat,
                                       Id.Right_ShArrayLiteral):
                raise AssertionError(self.cur_token)

            self._SetNext(lex_mode)
            return self.cur_token

        elif self.token_kind in (Kind.Ignored, Kind.WS):
            self._SetNext(lex_mode)
            return None

        else:
            assert self.token_kind in (Kind.VSub, Kind.Lit, Kind.History,
                                       Kind.Left, Kind.KW, Kind.ControlFlow,
                                       Kind.BoolUnary, Kind.BoolBinary,
                                       Kind.ExtGlob), 'Unhandled token kind'

            if (word_mode == lex_mode_e.ShCommandBrack and
                    self.parse_opts.parse_bracket() and
                    self.token_type == Id.Lit_LBracket):
                # Change [ from Kind.Lit -> Kind.Op
                # So CommandParser can treat
                #   assert [42 === x]
                # like
                #   json write (x)
                bracket_word = self.cur_token
                bracket_word.id = Id.Op_LBracket

                self._SetNext(lex_mode)
                return bracket_word

            # We're beginning a word.  If we see Id.Lit_Pound, change to
            # lex_mode_e.Comment and read until end of line.
            if self.token_type == Id.Lit_Pound:
                self._SetNext(lex_mode_e.Comment)
                self._GetToken()

                # NOTE: The # could be the last character in the file.  It
                # can't be Eof_{RParen,Backtick} because #) and #` are
                # comments.
                assert self.token_type in (Id.Ignored_Comment, Id.Eof_Real), \
                    self.cur_token

                # The next iteration will go into Kind.Ignored and set the lex
                # state to lex_mode_e.ShCommand/etc.
                return None  # tell ReadWord() to try again after comment

            elif self.token_type == Id.Lit_TPound:  ### doc comment
                self._SetNext(lex_mode_e.Comment)
                self._GetToken()

                if self.token_type == Id.Ignored_Comment and self.emit_doc_token:
                    return self.cur_token

                return None  # tell ReadWord() to try again after comment

            else:
                # r'' u'' b''
                if (self.token_type == Id.Lit_Chars and
                        self.lexer.LookAheadOne(
                            lex_mode_e.ShCommand) == Id.Left_SingleQuote):

                    # When shopt -s parse_raw_string:
                    #     echo r'hi' is like echo 'hi'
                    #
                    #     echo u'\u{3bc}' b'\yff' works

                    tok = self.cur_token
                    if self.parse_opts.parse_ysh_string():
                        if lexer.TokenEquals(tok, 'r'):
                            left_id = Id.Left_RSingleQuote
                        elif lexer.TokenEquals(tok, 'u'):
                            left_id = Id.Left_USingleQuote
                        elif lexer.TokenEquals(tok, 'b'):
                            left_id = Id.Left_BSingleQuote
                        else:
                            left_id = Id.Undefined_Tok

                        if left_id != Id.Undefined_Tok:
                            # skip the r, and then 'foo' will be read as normal
                            self._SetNext(lex_mode_e.ShCommand)

                            self._GetToken()
                            assert self.token_type == Id.Left_SingleQuote, self.token_type

                            # Read the word in a different lexer mode
                            return self._ReadYshSingleQuoted(left_id)

                return self._ReadCompoundWord(lex_mode)

    def ParseVarRef(self):
        # type: () -> BracedVarSub
        """DYNAMIC parsing of what's inside ${!ref}

        # Same as VarOf production
        VarRefExpr = VarOf EOF
        """
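        # Example: with ref='x', bash-style ${!ref} hands the string 'x' to
        # this method, which parses it into a BracedVarSub as if $x had been
        # written.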
        self._SetNext(lex_mode_e.VSub_1)

        self._GetToken()
        if self.token_kind != Kind.VSub:
            p_die('Expected var name', self.cur_token)

        part = self._ParseVarOf()
        # NOTE: no ${ } means no part.left and part.right
        part.left = part.token  # cheat to make test pass
        part.right = part.token

        self._GetToken()
        if self.token_type != Id.Eof_Real:
            p_die('Expected end of var ref expression', self.cur_token)
        return part

    def LookPastSpace(self):
        # type: () -> Id_t
        """Look ahead to the next token.

        For the CommandParser to recognize
           array= (1 2 3)
           YSH for (    versus  bash for ((
           YSH if (     versus  if test
           YSH while (  versus  while test
           YSH bare assignment 'grep =' versus 'grep foo'
        """
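        # For example, after reading the word 'if', the CommandParser can peek
        # past a single space to see Id.Op_LParen for YSH 'if (x)', without
        # consuming any tokens.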
        assert self.token_type != Id.Undefined_Tok
        if self.cur_token.id == Id.WS_Space:
            id_ = self.lexer.LookPastSpace(lex_mode_e.ShCommand)
        else:
            id_ = self.cur_token.id
        return id_

    def LookAheadFuncParens(self):
        # type: () -> bool
        """Special lookahead for f( ) { echo hi; } to check for ( )"""
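        # Roughly: after 'f', return True for '( )' (possibly with spaces
        # between the parens), and False for anything else, e.g. 'f(x)'.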
        assert self.token_type != Id.Undefined_Tok

        # We have to handle 2 cases because we buffer a token
        if self.cur_token.id == Id.Op_LParen:  # saw funcname(
            return self.lexer.LookAheadFuncParens(1)  # go back one char

        elif self.cur_token.id == Id.WS_Space:  # saw funcname WHITESPACE
            return self.lexer.LookAheadFuncParens(0)

        else:
            return False

    def ReadWord(self, word_mode):
        # type: (lex_mode_t) -> word_t
        """Read the next word, using the given lexer mode.

        This is a stateful wrapper for the stateless _ReadWord function.
        """
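        # _ReadWord() returns None for tokens that don't form a word on their
        # own (spaces, comments, skipped newlines), so we loop until we get a
        # real word or Eof.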
        assert word_mode in (lex_mode_e.ShCommand, lex_mode_e.ShCommandBrack,
                             lex_mode_e.DBracket, lex_mode_e.BashRegex)

        if self.buffered_word:  # For integration with pgen2
            w = self.buffered_word
            self.buffered_word = None
        else:
            while True:
                w = self._ReadWord(word_mode)
                if w is not None:
                    break

        self.returned_newline = (word_.CommandId(w) == Id.Op_Newline)
        return w

    def ReadArithWord(self):
        # type: () -> word_t
        while True:
            w = self._ReadArithWord()
            if w is not None:
                break
        return w

    def ReadHereDocBody(self, parts):
        # type: (List[word_part_t]) -> None
        """
        A here doc is like a double quoted context, except " isn't special.
        """
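        # Example: inside <<EOF ... EOF, $x and $(hostname) are substituted,
        # but "double quotes" are literal characters.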
        self._ReadLikeDQ(None, False, parts)
        # Returns nothing

    def ReadForPlugin(self):
        # type: () -> CompoundWord
        """For $PS1, $PS4, etc.

        This is just like reading a here doc line.  "\n" is allowed, as
        well as the typical substitutions ${x} $(echo hi) $((1 + 2)).
        """
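        # E.g. a prompt string like PS1='\u@\h $(date)' is parsed here, so the
        # command sub is re-evaluated when the prompt is rendered, while the
        # backslash escapes pass through for the plugin to interpret.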
        w = CompoundWord([])
        self._ReadLikeDQ(None, False, w.parts)
        return w

    def EmitDocToken(self, b):
        # type: (bool) -> None
        self.emit_doc_token = b

    def Multiline(self, b):
        # type: (bool) -> None
        self.multiline = b


if 0:
    import collections
    WORD_HIST = collections.Counter()