# Copyright 2016 Andy Chu. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
"""
word_parse.py - Parse the shell word language.

Hairy example:

    hi$((1 + 2))"$(echo hi)"${var:-__"$(echo default)"__}

Substitutions can be nested, but which inner subs are allowed depends on the
outer sub.  Notes:

lex_mode_e.ShCommand (_ReadUnquotedLeftParts)
  All subs and quotes are allowed:
    $v  ${v}  $()  ``  $(())  ''  ""  $''  $""  <()  >()

lex_mode_e.DQ (_ReadDoubleQuotedLeftParts)
  Var, Command, Arith, but no quotes.
    $v  ${v}  $()  ``  $(())
  No process substitution.

lex_mode_e.Arith
  Similar to DQ: Var, Command, and Arith sub, but no process sub.  bash doesn't
  allow quotes, but OSH does.  We allow ALL FOUR kinds of quotes, because we
  need those for associative array indexing.

lex_mode_e.VSub_ArgUnquoted
  Like ShCommand, everything is allowed (even process substitutions), but we
  stop at }, and space is SIGNIFICANT.

  Example: ${a:- b }

    ${X:-$v}  ${X:-${v}}  ${X:-$(echo hi)}  ${X:-`echo hi`}  ${X:-$((1+2))}
    ${X:-'single'}  ${X:-"double"}  ${X:-$'\n'}  ${X:-<(echo hi)}

lex_mode_e.VSub_ArgDQ
  In contrast to DQ, VSub_ArgDQ accepts nested "" and $'' and $"", e.g.
  "${x:-"default"}".

  In contrast, VSub_ArgUnquoted respects single quotes and process
  substitution.

  It's weird that double quotes are allowed.  Space is also significant here,
  e.g. "${x:-a "b"}".
"""

from _devbuild.gen import grammar_nt
from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str, Kind
from _devbuild.gen.types_asdl import (lex_mode_t, lex_mode_e)
from _devbuild.gen.syntax_asdl import (
    BoolParamBox,
    Token,
    SimpleVarSub,
    loc,
    source,
    DoubleQuoted,
    SingleQuoted,
    BracedVarSub,
    CommandSub,
    ShArrayLiteral,
    AssocPair,
    bracket_op,
    bracket_op_t,
    suffix_op,
    suffix_op_t,
    rhs_word,
    rhs_word_e,
    rhs_word_t,
    word_e,
    word_t,
    CompoundWord,
    word_part,
    word_part_t,
    y_lhs_e,
    arith_expr_t,
    command,
    expr,
    expr_e,
    expr_t,
    pat_t,
    ArgList,
    Proc,
    Func,
    Subscript,
    Attribute,
)
from core import alloc
from core.error import p_die
from mycpp.mylib import log
from core import pyutil
from core import ui
from frontend import consts
from frontend import lexer
from frontend import reader
from osh import tdop
from osh import arith_parse
from osh import braces
from osh import word_
from osh import word_compile
from mycpp.mylib import tagswitch

from typing import List, Optional, Tuple, cast
from typing import TYPE_CHECKING
if TYPE_CHECKING:
    from frontend.lexer import Lexer
    from frontend.parse_lib import ParseContext
    from frontend.reader import _Reader
    from osh.cmd_parse import VarChecker

unused1 = log
unused2 = Id_str

KINDS_THAT_END_WORDS = [Kind.Eof, Kind.WS, Kind.Op, Kind.Right]

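# Illustrative examples of the Kinds that end a word, under ordinary shell
# lexing as described in the module docstring (a sketch, not exhaustive):
#
#   echo foo bar    # Kind.WS: the space ends the word 'foo'
#   echo foo; ls    # Kind.Op: ';' ends the word 'foo'
#   $(echo foo)     # Kind.Right: ')' ends the last word of the sub
#   echo foo<EOF>   # Kind.Eof: end of input ends the word

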
class WordEmitter(object):
    """Common interface for [ and [["""

    def __init__(self):
        # type: () -> None
        """Empty constructor for mycpp."""
        pass

    def ReadWord(self, lex_mode):
        # type: (lex_mode_t) -> word_t
        raise NotImplementedError()


class WordParser(WordEmitter):

    def __init__(self, parse_ctx, lexer, line_reader):
        # type: (ParseContext, Lexer, _Reader) -> None
        self.parse_ctx = parse_ctx
        self.lexer = lexer
        self.line_reader = line_reader
        self.arena = line_reader.arena

        self.parse_opts = parse_ctx.parse_opts
        self.a_parser = tdop.TdopParser(arith_parse.Spec(), self,
                                        self.parse_opts)
        self.Reset()

    def Init(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Used to parse arithmetic, see ParseContext."""
        self.next_lex_mode = lex_mode

    def Reset(self):
        # type: () -> None
        """Called by interactive loop."""
        # For _GetToken()
        self.cur_token = None  # type: Token
        self.token_kind = Kind.Undefined
        self.token_type = Id.Undefined_Tok

        self.next_lex_mode = lex_mode_e.ShCommand

        # Boolean mutated by CommandParser via word_.ctx_EmitDocToken.  For ###
        # doc comments
        self.emit_doc_token = False
        # Boolean mutated by CommandParser via word_.ctx_Multiline.  '...' starts
        # multiline mode.
        self.multiline = False

        # For detecting invalid \n\n in multiline mode.  Counts what we got
        # directly from the lexer.
        self.newline_state = 0
        # For consolidating \n\n -> \n for the CALLER.  This simplifies the
        # parsers that consume words.
        self.returned_newline = False

        # For integration with pgen2
        self.buffered_word = None  # type: word_t

    def _GetToken(self):
        # type: () -> None
        """Call this when you need to make a decision based on any of:

        self.token_type
        self.token_kind
        self.cur_token
        """
        if self.next_lex_mode == lex_mode_e.Undefined:
            return  # _SetNext() not called, so do nothing

        is_fake = self.next_lex_mode == lex_mode_e.BashRegexFakeInner
        real_mode = (lex_mode_e.BashRegex if is_fake else self.next_lex_mode)

        self.cur_token = self.lexer.Read(real_mode)

        # MUTATE TOKEN for fake lexer mode.
        # This is for crazy stuff bash allows, like [[ s =~ (< >) ]]
        if (is_fake and self.cur_token.id
                in (Id.WS_Space, Id.BashRegex_AllowedInParens)):
            self.cur_token.id = Id.Lit_Chars

        self.token_type = self.cur_token.id
        self.token_kind = consts.GetKind(self.token_type)

        # number of consecutive newlines, ignoring whitespace
        if self.token_type == Id.Op_Newline:
            self.newline_state += 1
        elif self.token_kind != Kind.WS:
            self.newline_state = 0

        self.parse_ctx.trail.AppendToken(self.cur_token)  # For completion
        self.next_lex_mode = lex_mode_e.Undefined

    def _SetNext(self, lex_mode):
        # type: (lex_mode_t) -> None
        """Set the next lex state, but don't actually read a token.

        We need this for proper interactive parsing.
        """
        self.next_lex_mode = lex_mode

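    # A sketch of the _SetNext() / _GetToken() protocol used throughout this
    # class (illustrative; see the docstrings above):
    #
    #   self._SetNext(lex_mode_e.Arith)  # record the mode only; no input read
    #   self._GetToken()                 # NOW the lexer reads a token in Arith mode
    #   self._GetToken()                 # no-op: next_lex_mode is Undefined again
    #
    # Deferring the read until a decision is needed is what makes interactive
    # parsing work: we don't pull in a new line before the caller asks for it.
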
    def _ReadVarOpArg(self, arg_lex_mode):
        # type: (lex_mode_t) -> rhs_word_t

        # NOTE: Operators like | and < are not treated as special, so ${a:- | >} is
        # valid, even when unquoted.
        self._SetNext(arg_lex_mode)
        self._GetToken()

        w = self._ReadVarOpArg2(arg_lex_mode, Id.Undefined_Tok,
                                True)  # empty_ok

        # If the Compound has no parts, and we're in a double-quoted VarSub
        # arg, and empty_ok, then return Empty.  This is so it can evaluate to
        # the empty string and not get elided.
        #
        # Examples:
        # - "${s:-}", "${s/%pat/}"
        # It's similar to LooksLikeShAssignment where we turn x= into x=''.  And it
        # has the same potential problem of not having Token location info.
        #
        # NOTE: empty_ok is False only for the PatSub pattern, which means we'll
        # return a Compound with no parts, which is explicitly checked with a
        # custom error message.
        if len(w.parts) == 0 and arg_lex_mode == lex_mode_e.VSub_ArgDQ:
            return rhs_word.Empty

        return w

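    # Illustrative cases for the Empty-vs-elided logic above:
    #
    #   "${s:-}"       # double-quoted arg: returns rhs_word.Empty, so it
    #                  # evaluates to '' instead of being elided
    #   "${s/%pat/}"   # same idea for an empty replacement
    #
    # The PatSub pattern is the one caller that passes empty_ok=False; there a
    # Compound with no parts is rejected with a custom error message.
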
    def _ReadVarOpArg2(self, arg_lex_mode, eof_type, empty_ok):
        # type: (lex_mode_t, Id_t, bool) -> CompoundWord
        """Return a CompoundWord.

        Helper function for _ReadVarOpArg and used directly by
        _ReadPatSubVarOp.
        """
        w = self._ReadCompoundWord3(arg_lex_mode, eof_type, empty_ok)
        #log('w %s', w)
        tilde = word_.TildeDetect(w)
        if tilde:
            w = tilde
        return w

    def _ReadSliceVarOp(self):
        # type: () -> suffix_op.Slice
        """VarOf ':' ArithExpr (':' ArithExpr )?"""
        self._SetNext(lex_mode_e.Arith)
        self._GetToken()
        cur_id = self.token_type  # e.g. Id.Arith_Colon

        if self.token_type == Id.Arith_Colon:  # A pun for Id.VOp2_Colon
            # no beginning specified
            begin = None  # type: Optional[arith_expr_t]
        else:
            begin = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()

        if cur_id == Id.Arith_RBrace:
            no_length = None  # type: Optional[arith_expr_t]  # No length specified
            return suffix_op.Slice(begin, no_length)

        # Id.Arith_Colon is a pun for Id.VOp2_Colon
        if cur_id == Id.Arith_Colon:
            self._SetNext(lex_mode_e.Arith)
            length = self._ReadArithExpr(Id.Arith_RBrace)
            return suffix_op.Slice(begin, length)

        p_die("Expected : or } in slice", self.cur_token)
        raise AssertionError()  # for MyPy

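    # Examples for the slice grammar VarOf ':' ArithExpr (':' ArithExpr)?
    # (illustrative):
    #
    #   ${s:1:2}    # begin and length
    #   ${s:1}      # no length: suffix_op.Slice(begin, None)
    #   ${s::2}     # no begin:  suffix_op.Slice(None, length)
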
    def _ReadPatSubVarOp(self):
        # type: () -> suffix_op.PatSub
        """Looking at the first '/' after VarOf:

        VarSub = ...
               | VarOf '/' Match ( '/' WORD? )?
        Match = '/' WORD   # can't be empty
              | '#' WORD?  # may be empty
              | '%' WORD?
        """
        slash_tok = self.cur_token  # location info
        replace_mode = Id.Undefined_Tok  # bizarre syntax / # %

        self._SetNext(lex_mode_e.VSub_ArgUnquoted)  # advance past /

        self._GetToken()
        if self.token_type == Id.Right_DollarBrace:
            pat = CompoundWord([])
            return suffix_op.PatSub(pat, rhs_word.Empty, replace_mode,
                                    slash_tok)

        if self.token_type in (Id.Lit_Slash, Id.Lit_Pound, Id.Lit_Percent):
            replace_mode = self.token_type
            self._SetNext(lex_mode_e.VSub_ArgUnquoted)

        # Bash quirk:
        # echo ${x/#/replace} has an empty pattern
        # echo ${x////replace} is non-empty; it means echo ${x//'/'/replace}
        empty_ok = replace_mode != Id.Lit_Slash
        pat = self._ReadVarOpArg2(lex_mode_e.VSub_ArgUnquoted, Id.Lit_Slash,
                                  empty_ok)
        #log('pat 1 %r', pat)

        if self.token_type == Id.Lit_Slash:
            # read until }
            replace = self._ReadVarOpArg(
                lex_mode_e.VSub_ArgUnquoted)  # type: rhs_word_t
            #log('r 1 %r', replace)
        else:
            # e.g. ${v/a} is the same as ${v/a/} -- empty replacement string
            replace = rhs_word.Empty

        self._GetToken()
        if self.token_type != Id.Right_DollarBrace:
            # This happens on invalid code
            p_die(
                "Expected } after replacement string, got %s" %
                ui.PrettyId(self.token_type), self.cur_token)

        return suffix_op.PatSub(pat, replace, replace_mode, slash_tok)

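    # Examples for the Match grammar above (illustrative; replace_mode is one
    # of Id.Lit_Slash, Id.Lit_Pound, Id.Lit_Percent when present):
    #
    #   ${x/pat/repl}    # replace the first match
    #   ${x//pat/repl}   # '/' -> replace all matches
    #   ${x/#pat/repl}   # '#' -> match must be a prefix
    #   ${x/%pat/repl}   # '%' -> match must be a suffix
    #   ${x/pat}         # omitted replacement is rhs_word.Empty
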
    def _ReadSubscript(self):
        # type: () -> bracket_op_t
        """ Subscript = '[' ('@' | '*' | ArithExpr) ']' """
        # Lookahead to see if we get @ or *.  Otherwise read a full arithmetic
        # expression.
        next_id = self.lexer.LookPastSpace(lex_mode_e.Arith)
        if next_id in (Id.Lit_At, Id.Arith_Star):
            op = bracket_op.WholeArray(next_id)  # type: bracket_op_t

            self._SetNext(lex_mode_e.Arith)  # skip past [
            self._GetToken()
            self._SetNext(lex_mode_e.Arith)  # skip past @
            self._GetToken()
        else:
            self._SetNext(lex_mode_e.Arith)  # skip past [
            anode = self._ReadArithExpr(Id.Arith_RBracket)
            op = bracket_op.ArrayIndex(anode)

        if self.token_type != Id.Arith_RBracket:  # Should be looking at ]
            p_die('Expected ] to close subscript', self.cur_token)

        self._SetNext(lex_mode_e.VSub_2)  # skip past ]
        self._GetToken()  # Needed to be in the same spot as no subscript

        return op

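    # Examples for Subscript = '[' ('@' | '*' | ArithExpr) ']' (illustrative):
    #
    #   ${a[@]}     # bracket_op.WholeArray(Id.Lit_At)
    #   ${a[*]}     # bracket_op.WholeArray(Id.Arith_Star)
    #   ${a[i+1]}   # bracket_op.ArrayIndex with a parsed arith expression
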
    def _ParseVarOf(self):
        # type: () -> BracedVarSub
        """
        VarOf = NAME Subscript?
              | NUMBER      # no subscript allowed, none of these are arrays
                            # ${@[1]} doesn't work, even though slicing does
              | VarSymbol
        """
        self._GetToken()
        name_token = self.cur_token
        self._SetNext(lex_mode_e.VSub_2)

        self._GetToken()  # Check for []
        if self.token_type == Id.VOp2_LBracket:
            bracket_op = self._ReadSubscript()
        else:
            bracket_op = None

        part = BracedVarSub.CreateNull()
        part.token = name_token
        part.var_name = lexer.TokenVal(name_token)
        part.bracket_op = bracket_op
        return part

    def _ParseVarExpr(self, arg_lex_mode, allow_query=False):
        # type: (lex_mode_t, bool) -> BracedVarSub
        """Start parsing at the op -- we already skipped past the name."""
        part = self._ParseVarOf()

        self._GetToken()
        if self.token_type == Id.Right_DollarBrace:
            return part  # no ops

        op_kind = self.token_kind

        if op_kind == Kind.VTest:
            tok = self.cur_token
            arg_word = self._ReadVarOpArg(arg_lex_mode)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            part.suffix_op = suffix_op.Unary(tok, arg_word)

        elif op_kind == Kind.VOpYsh:
            tok = self.cur_token
            arg_word = self._ReadVarOpArg(arg_lex_mode)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            UP_arg_word = arg_word
            with tagswitch(arg_word) as case:
                if case(rhs_word_e.Empty):
                    pass
                elif case(rhs_word_e.Compound):
                    arg_word = cast(CompoundWord, UP_arg_word)
                    # This handles ${x|html} and ${x %.3f} now
                    # However I think ${x %.3f} should be statically parsed?  It can
                    # enter the printf lexer modes.
                    ok, arg, quoted = word_.StaticEval(arg_word)
                    if not ok or quoted:
                        p_die('Expected a constant argument',
                              loc.Word(arg_word))

            part.suffix_op = suffix_op.Static(tok, arg)

        elif op_kind == Kind.VOp0:
            part.suffix_op = self.cur_token  # Nullary
            self._SetNext(lex_mode_e.VSub_2)  # Expecting }
            self._GetToken()

        elif op_kind == Kind.VOp1:  # % %% # ## etc.
            tok = self.cur_token
            # Weird exception that all shells have: these operators take a glob
            # pattern, so they're lexed as VSub_ArgUnquoted, not VSub_ArgDQ
            arg_word = self._ReadVarOpArg(lex_mode_e.VSub_ArgUnquoted)
            if self.token_type != Id.Right_DollarBrace:
                p_die('Expected } to close ${', self.cur_token)

            part.suffix_op = suffix_op.Unary(tok, arg_word)

        elif op_kind == Kind.VOp2:  # / : [ ]
            if self.token_type == Id.VOp2_Slash:
                patsub_op = self._ReadPatSubVarOp()  # type: suffix_op_t
                part.suffix_op = patsub_op

                # Checked by the method above
                assert self.token_type == Id.Right_DollarBrace, self.cur_token

            elif self.token_type == Id.VOp2_Colon:
                part.suffix_op = self._ReadSliceVarOp()
                # NOTE: } in arithmetic mode.
                if self.token_type != Id.Arith_RBrace:
                    # Token seems off; doesn't point to X in # ${a:1:2 X
                    p_die('Expected } to close ${', self.cur_token)

            else:
                # TODO: Does this ever happen?
                p_die('Unexpected token in ${} (%s)' % 'VOp2', self.cur_token)

        elif op_kind == Kind.VOp3:  # ${prefix@} etc.
            if allow_query:
                part.suffix_op = self.cur_token  # Nullary
                self._SetNext(lex_mode_e.VSub_2)  # Expecting }
                self._GetToken()
            else:
                p_die("Unexpected token in ${} (%s)" % 'VOp3', self.cur_token)

        # NOTE: Arith_RBrace is for slicing, because it reads } in arithmetic
        # mode.  It's redundantly checked above.
        if self.token_type not in (Id.Right_DollarBrace, Id.Arith_RBrace):
            # ${a.} or ${!a.}
            p_die('Expected } to close ${', self.cur_token)

        # Now look for ops
        return part

    def _ReadZshVarSub(self, left_token):
        # type: (Token) -> word_part.ZshVarSub

        self._SetNext(lex_mode_e.VSub_Zsh)  # Move past ${(foo)

        # Can be empty
        w = self._ReadCompoundWord3(lex_mode_e.VSub_Zsh, Id.Right_DollarBrace,
                                    True)
        self._GetToken()
        return word_part.ZshVarSub(left_token, w, self.cur_token)

    def ReadBracedVarSub(self, left_token):
        # type: (Token) -> Tuple[BracedVarSub, Token]
        """ For YSH expressions like var x = ${x:-"default"}.  """
        part = self._ReadBracedVarSub(left_token, d_quoted=False)
        last_token = self.cur_token
        return part, last_token

    def _ReadBracedVarSub(self, left_token, d_quoted):
        # type: (Token, bool) -> BracedVarSub
        """For the ${} expression language.

        NAME = [a-zA-Z_][a-zA-Z0-9_]*
        NUMBER = [0-9]+                  # ${10}, ${11}, ...

        Subscript = '[' ('@' | '*' | ArithExpr) ']'
        VarSymbol = '!' | '@' | '#' | ...
        VarOf = NAME Subscript?
              | NUMBER      # no subscript allowed, none of these are arrays
                            # ${@[1]} doesn't work, even though slicing does
              | VarSymbol

        NULLARY_OP = '@Q' | '@E' | '@P' | '@A' | '@a'  # VOp0

        TEST_OP = '-' | ':-' | '=' | ':=' | '+' | ':+' | '?' | ':?'
        STRIP_OP = '#' | '##' | '%' | '%%'
        CASE_OP = ',' | ',,' | '^' | '^^'
        UnaryOp = TEST_OP | STRIP_OP | CASE_OP

        YSH_UNARY = '|' | ' '  # ${x|html} and ${x %.3f}.
                               # SPACE is operator not %
        Match = ('/' | '#' | '%') WORD  # match all / prefix / suffix
        VarExpr = VarOf
                | VarOf NULLARY_OP
                | VarOf UnaryOp WORD
                | VarOf YSH_UNARY STATIC_WORD
                | VarOf ':' ArithExpr (':' ArithExpr )?
                | VarOf '/' Match '/' WORD

        LengthExpr = '#' VarOf  # can't apply operators after length

        RefOrKeys = '!' VarExpr  # CAN apply operators after a named ref
                                 # ${!ref[0]} vs ${!keys[@]} resolved later

        PrefixQuery = '!' NAME ('*' | '@')  # list variable names with a prefix

        BuiltinSub = '.' WORD+  # ${.myproc 'builtin' $sub}

        VarSub = LengthExpr
               | RefOrKeys
               | PrefixQuery
               | VarExpr
               | BuiltinSub

        NOTES:
        - Arithmetic expressions are used twice, inside subscripts ${a[x+1]} and
          slicing ${a:x+1:y+2}
        - ${#} and ${!} need LL(2) lookahead (considering how my tokenizer works)
        - @ and * are technically arithmetic expressions in this implementation
        - We don't account for bash 4.4: ${param@operator} -- Q E P A a.  Note that
          it's also vectorized.

        Strictness over bash:
        - echo ${a[0][0]} doesn't do anything useful, so we disallow it from the
          grammar
        - ! and # prefixes can't be composed, even though named refs can be
          composed with other operators
        - '#' means 4 different things: length prefix, VarSymbol, UnaryOp to strip
          a prefix, and it can also be a literal part of WORD.

        From the parser's point of view, the prefix # can't be combined with
        UnaryOp/slicing/matching, and the ! can.  However

        - ${a[@]:1:2} is not allowed
        - ${#a[@]:1:2} is allowed, but gives the wrong answer
        """
        if d_quoted:
            arg_lex_mode = lex_mode_e.VSub_ArgDQ
        else:
            arg_lex_mode = lex_mode_e.VSub_ArgUnquoted

        self._SetNext(lex_mode_e.VSub_1)
        self._GetToken()

        ty = self.token_type
        first_tok = self.cur_token

        if ty == Id.VSub_Pound:
            # Disambiguate
            next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
            if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
                # e.g. a name, '#' is the prefix
                self._SetNext(lex_mode_e.VSub_1)
                part = self._ParseVarOf()

                self._GetToken()
                if self.token_type != Id.Right_DollarBrace:
                    p_die('Expected } after length expression', self.cur_token)

                part.prefix_op = first_tok

            else:  # not a prefix, '#' is the variable
                part = self._ParseVarExpr(arg_lex_mode)

        elif ty == Id.VSub_Bang:
            next_id = self.lexer.LookPastSpace(lex_mode_e.VSub_1)
            if next_id not in (Id.Unknown_Tok, Id.Right_DollarBrace):
                # e.g. a name, '!' is the prefix
                # ${!a} -- this is a ref
                # ${!3} -- this is ref
                # ${!a[1]} -- this is a ref
                # ${!a[@]} -- this is a keys
                # No lookahead -- do it in a second step, or at runtime
                self._SetNext(lex_mode_e.VSub_1)
                part = self._ParseVarExpr(arg_lex_mode, allow_query=True)

                part.prefix_op = first_tok

            else:  # not a prefix, '!' is the variable
                part = self._ParseVarExpr(arg_lex_mode)

        elif ty == Id.VSub_Dot:
            # Note: this will become a new builtin_sub type, so this method must
            # return word_part_t rather than BracedVarSub.  I don't think that
            # should cause problems.
            p_die('TODO: ${.myproc builtin sub}', self.cur_token)

        # VS_NAME, VS_NUMBER, symbol that isn't # or !
        elif self.token_kind == Kind.VSub:
            part = self._ParseVarExpr(arg_lex_mode)

        else:
            # e.g. ${^}
            p_die('Unexpected token in ${}', self.cur_token)

        part.left = left_token  # attach the argument
        part.right = self.cur_token
        return part

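    # A sketch of the LL(2) disambiguation above, using LookPastSpace
    # (illustrative):
    #
    #   ${#}        # '#' is the variable itself (number of args)
    #   ${#s}       # '#' is the length prefix: prefix_op is set
    #   ${!x}       # '!' prefix: a named ref
    #   ${!a[@]}    # '!' prefix: the keys of 'a' (resolved in a later step)
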
    def _ReadSingleQuoted(self, left_token, lex_mode):
        # type: (Token, lex_mode_t) -> SingleQuoted
        """Internal method to read a word_part."""
        tokens = []  # type: List[Token]
        # In command mode, we never disallow backslashes like '\'
        right_quote = self.ReadSingleQuoted(lex_mode, left_token, tokens,
                                            False)
        sval = word_compile.EvalSingleQuoted2(left_token.id, tokens)
        node = SingleQuoted(left_token, sval, right_quote)
        return node

    def ReadSingleQuoted(self, lex_mode, left_token, out_tokens, is_ysh_expr):
        # type: (lex_mode_t, Token, List[Token], bool) -> Token
        """Appends to out_tokens; returns last token

        Used by expr_parse.py
        """
        # TODO: Remove and use out_tokens
        tokens = []  # type: List[Token]

        # echo '\' is allowed, but x = '\' is invalid, in favor of x = r'\'
        no_backslashes = is_ysh_expr and left_token.id == Id.Left_SingleQuote

        expected_end_tokens = 3 if left_token.id in (
            Id.Left_TSingleQuote, Id.Left_RTSingleQuote, Id.Left_UTSingleQuote,
            Id.Left_BTSingleQuote) else 1
        num_end_tokens = 0

        while num_end_tokens < expected_end_tokens:
            self._SetNext(lex_mode)
            self._GetToken()

            # Kind.Char emitted in lex_mode.SQ_C
            if self.token_kind in (Kind.Lit, Kind.Char):
                tok = self.cur_token
                # Happens in lex_mode_e.SQ: 'one\two' is ambiguous, should be
                # r'one\two' or c'one\\two'
                if no_backslashes and lexer.TokenContains(tok, '\\'):
                    p_die(
                        r"Strings with backslashes should look like r'\n' or u'\n' or b'\n'",
                        tok)

                if is_ysh_expr:
                    # Disallow var x = $'\001'.  Arguably we don't need these
                    # checks because u'\u{1}' is the way to write it.
                    if self.token_type == Id.Char_Octal3:
                        p_die(
                            r"Use \xhh or \u{...} instead of octal escapes in YSH strings",
                            tok)

                    if self.token_type == Id.Char_Hex and self.cur_token.length != 4:
                        # disallow \xH
                        p_die(
                            r'Invalid hex escape in YSH string (must be \xHH)',
                            tok)

                tokens.append(tok)

            elif self.token_kind == Kind.Unknown:
                tok = self.cur_token
                assert tok.id == Id.Unknown_Backslash, tok

                # x = $'\z' is disallowed; ditto for echo $'\z' if shopt -u parse_backslash
                if is_ysh_expr or not self.parse_opts.parse_backslash():
                    p_die(
                        "Invalid char escape in C-style string literal (OILS-ERR-11)",
                        tok)

                tokens.append(tok)

            elif self.token_kind == Kind.Eof:
                p_die('Unexpected EOF in single-quoted string that began here',
                      left_token)

            elif self.token_kind == Kind.Right:
                # assume Id.Right_SingleQuote
                num_end_tokens += 1
                tokens.append(self.cur_token)

            else:
                raise AssertionError(self.cur_token)

            if self.token_kind != Kind.Right:
                num_end_tokens = 0  # we need three in a ROW

        if expected_end_tokens == 1:
            tokens.pop()
        elif expected_end_tokens == 3:  # Get rid of spurious end tokens
            tokens.pop()
            tokens.pop()
            tokens.pop()

        # Remove space from '''  r'''  $''' in both expression mode and command mode
        if left_token.id in (Id.Left_TSingleQuote, Id.Left_RTSingleQuote,
                             Id.Left_UTSingleQuote, Id.Left_BTSingleQuote):
            word_compile.RemoveLeadingSpaceSQ(tokens)

        # Validation after lexing - same 2 checks in j8.LexerDecoder
        is_u_string = left_token.id in (Id.Left_USingleQuote,
                                        Id.Left_UTSingleQuote)

        for tok in tokens:
            # u'\yff' is not valid, but b'\yff' is
            if is_u_string and tok.id == Id.Char_YHex:
                p_die(
                    r"%s escapes not allowed in u'' strings" %
                    lexer.TokenVal(tok), tok)
            # \u{dc00} isn't valid
            if tok.id == Id.Char_UBraced:
                h = lexer.TokenSlice(tok, 3, -1)  # \u{123456}
                i = int(h, 16)
                if 0xD800 <= i and i < 0xE000:
                    p_die(
                        r"%s escape is illegal because it's in the surrogate range"
                        % lexer.TokenVal(tok), tok)

        out_tokens.extend(tokens)
        return self.cur_token

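    # A sketch of the end-token counting above for triple-quoted strings
    # (illustrative):
    #
    #   r'''foo'''
    #          ^^^ three consecutive Kind.Right tokens; num_end_tokens must
    #              reach 3 IN A ROW, then the spurious ones are popped.
    #
    # For plain '...' and r'...', expected_end_tokens is 1.
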
    def _ReadDoubleQuotedLeftParts(self):
        # type: () -> word_part_t
        """Read substitution parts in a double quoted context."""
        if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick):
            return self._ReadCommandSub(self.token_type, d_quoted=True)

        if self.token_type == Id.Left_DollarBrace:
            return self._ReadBracedVarSub(self.cur_token, d_quoted=True)

        if self.token_type == Id.Left_DollarDParen:
            return self._ReadArithSub()

        if self.token_type == Id.Left_DollarBracket:
            return self._ReadExprSub(lex_mode_e.DQ)

        raise AssertionError(self.cur_token)

    def _ReadYshSingleQuoted(self, left_id):
        # type: (Id_t) -> CompoundWord
        """Read YSH style strings

        r''  u''  b''
        r''' '''  u''' '''  b''' '''
        """
        #log('BEF self.cur_token %s', self.cur_token)
        if left_id == Id.Left_RSingleQuote:
            lexer_mode = lex_mode_e.SQ_Raw
            triple_left_id = Id.Left_RTSingleQuote
        elif left_id == Id.Left_USingleQuote:
            lexer_mode = lex_mode_e.J8_Str
            triple_left_id = Id.Left_UTSingleQuote
        elif left_id == Id.Left_BSingleQuote:
            lexer_mode = lex_mode_e.J8_Str
            triple_left_id = Id.Left_BTSingleQuote
        else:
            raise AssertionError(left_id)

        # Needed for syntax checks
        left_tok = self.cur_token
        left_tok.id = left_id

        sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)

        if (len(sq_part.sval) == 0 and self.lexer.ByteLookAhead() == "'"):
            self._SetNext(lex_mode_e.ShCommand)
            self._GetToken()

            assert self.token_type == Id.Left_SingleQuote
            # HACK: magically transform the third ' in u''' to
            # Id.Left_UTSingleQuote, so that ''' is the terminator
            left_tok = self.cur_token
            left_tok.id = triple_left_id

            # Handles stripping leading whitespace
            sq_part = self._ReadSingleQuoted(left_tok, lexer_mode)

        # Advance and validate
        self._SetNext(lex_mode_e.ShCommand)

        self._GetToken()
        if self.token_kind not in KINDS_THAT_END_WORDS:
            p_die('Unexpected token after YSH single-quoted string',
                  self.cur_token)

        return CompoundWord([sq_part])

    def _ReadUnquotedLeftParts(self, triple_out):
        # type: (Optional[BoolParamBox]) -> word_part_t
        """Read substitutions and quoted strings (for lex_mode_e.ShCommand).

        If triple_out is set, then we try parsing triple quoted strings,
        and set its value to True if we got one.
        """
        if self.token_type in (Id.Left_DoubleQuote, Id.Left_DollarDoubleQuote):
            # Note: $"" is a synonym for "".  It might make sense if it added
            # \n \0 \x00 \u{123} etc.  But that's not what bash does!
            dq_part = self._ReadDoubleQuoted(self.cur_token)
            # Got empty word "" and there's a " after
            if (triple_out and len(dq_part.parts) == 0 and
                    self.lexer.ByteLookAhead() == '"'):

                self._SetNext(lex_mode_e.ShCommand)
                self._GetToken()
                # HACK: magically transform the third " in """ to
                # Id.Left_TDoubleQuote, so that """ is the terminator
                left_dq_token = self.cur_token
                left_dq_token.id = Id.Left_TDoubleQuote
                triple_out.b = True  # let caller know we got it
                return self._ReadDoubleQuoted(left_dq_token)

            return dq_part

        if self.token_type in (Id.Left_SingleQuote, Id.Left_RSingleQuote,
                               Id.Left_DollarSingleQuote):
            if self.token_type == Id.Left_SingleQuote:
                lexer_mode = lex_mode_e.SQ_Raw
                triple_left_id = Id.Left_TSingleQuote
            elif self.token_type == Id.Left_RSingleQuote:
                lexer_mode = lex_mode_e.SQ_Raw
                triple_left_id = Id.Left_RTSingleQuote
            else:
                lexer_mode = lex_mode_e.SQ_C
                # there is no such thing as $'''
                triple_left_id = Id.Undefined_Tok

            sq_part = self._ReadSingleQuoted(self.cur_token, lexer_mode)

            # Got empty '' or r'' and there's a ' after
            # u'' and b'' are handled in _ReadYshSingleQuoted
            if (triple_left_id != Id.Undefined_Tok and
                    triple_out is not None and len(sq_part.sval) == 0 and
                    self.lexer.ByteLookAhead() == "'"):

                self._SetNext(lex_mode_e.ShCommand)
                self._GetToken()

                # HACK: magically transform the third ' in ''' to
                # Id.Left_TSingleQuote, so that ''' is the terminator
                left_sq_token = self.cur_token
                left_sq_token.id = triple_left_id

                triple_out.b = True  # let caller know we got it
                return self._ReadSingleQuoted(left_sq_token, lexer_mode)

            return sq_part

        if self.token_type in (Id.Left_DollarParen, Id.Left_Backtick,
                               Id.Left_ProcSubIn, Id.Left_ProcSubOut):
            return self._ReadCommandSub(self.token_type, d_quoted=False)

        if self.token_type == Id.Left_DollarBrace:
            return self._ReadBracedVarSub(self.cur_token, d_quoted=False)

        if self.token_type == Id.Left_DollarDParen:
            return self._ReadArithSub()

        if self.token_type == Id.Left_DollarBracket:
            return self._ReadExprSub(lex_mode_e.ShCommand)

        if self.token_type == Id.Left_DollarBraceZsh:
            return self._ReadZshVarSub(self.cur_token)

        raise AssertionError(self.cur_token)

    def _ReadExtGlob(self):
        # type: () -> word_part.ExtGlob
        """
        Grammar:
          Item = CompoundWord | EPSILON  # important: @(foo|) is allowed
          LEFT = '@(' | '*(' | '+(' | '?(' | '!('
          RIGHT = ')'
          ExtGlob = LEFT (Item '|')* Item RIGHT  # ITEM may be empty
          Compound includes ExtGlob
        """
        left_token = self.cur_token
        right_token = None  # type: Token
        arms = []  # type: List[CompoundWord]

        self.lexer.PushHint(Id.Op_RParen, Id.Right_ExtGlob)
        self._SetNext(lex_mode_e.ExtGlob)  # advance past LEFT

        read_word = False  # did we just read a word?  To handle @(||).

        while True:
            self._GetToken()

            if self.token_type == Id.Right_ExtGlob:
                if not read_word:
                    arms.append(CompoundWord([]))
                right_token = self.cur_token
                break

            elif self.token_type == Id.Op_Pipe:
                if not read_word:
                    arms.append(CompoundWord([]))
                read_word = False
                self._SetNext(lex_mode_e.ExtGlob)

            # lex_mode_e.ExtGlob should only produce these 4 kinds of tokens
            elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub,
                                     Kind.ExtGlob):
                w = self._ReadCompoundWord(lex_mode_e.ExtGlob)
                arms.append(w)
                read_word = True

            elif self.token_kind == Kind.Eof:
                p_die('Unexpected EOF reading extended glob that began here',
                      left_token)

            else:
                raise AssertionError(self.cur_token)

        return word_part.ExtGlob(left_token, arms, right_token)

    def _ReadBashRegexGroup(self):
        # type: () -> word_part.BashRegexGroup
        """
        Grammar:
          BashRegexGroup = '(' WORD? ')'
        """
        left_token = self.cur_token
        assert left_token.id == Id.BashRegex_LParen, left_token

        right_token = None  # type: Token
        arms = []  # type: List[CompoundWord]

        self.lexer.PushHint(Id.Op_RParen, Id.Right_BashRegexGroup)
        self._SetNext(lex_mode_e.BashRegexFakeInner)  # advance past LEFT

        self._GetToken()
        if self.token_type == Id.Right_BashRegexGroup:  # empty ()
            return word_part.BashRegexGroup(left_token, None, self.cur_token)

        # lex_mode_e.BashRegex should only produce these 4 kinds of tokens
        if self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub, Kind.BashRegex):
            # Fake lexer mode that translates Id.WS_Space to Id.Lit_Chars
            # To allow bash style [[ s =~ (a b) ]]
            w = self._ReadCompoundWord(lex_mode_e.BashRegexFakeInner)
            arms.append(w)

            self._GetToken()
            if self.token_type != Id.Right_BashRegexGroup:
                p_die('Expected ) to close bash regex group', self.cur_token)

            return word_part.BashRegexGroup(left_token, w, self.cur_token)

        p_die('Expected word after ( opening bash regex group', self.cur_token)

    def _ReadLikeDQ(self, left_token, is_ysh_expr, out_parts):
        # type: (Optional[Token], bool, List[word_part_t]) -> None
        """
        Args:
          left_token: A token if we are reading a double quoted part, or None if
            we're reading a here doc.
          is_ysh_expr: Whether to disallow backticks and invalid char escapes
          out_parts: list of word_part to append to
        """
        if left_token:
            expected_end_tokens = 3 if left_token.id == Id.Left_TDoubleQuote else 1
        else:
            expected_end_tokens = 1000  # here doc will break

        num_end_tokens = 0
        while num_end_tokens < expected_end_tokens:
            self._SetNext(lex_mode_e.DQ)
            self._GetToken()

            if self.token_kind == Kind.Lit:
                if self.token_type == Id.Lit_EscapedChar:
                    tok = self.cur_token
                    ch = lexer.TokenSliceLeft(tok, 1)
                    part = word_part.EscapedLiteral(tok,
                                                    ch)  # type: word_part_t
                else:
                    if self.token_type == Id.Lit_BadBackslash:
                        # echo "\z" is OK in shell, but 'x = "\z" is a syntax error in
                        # YSH.
                        # Slight hole: We don't catch 'x = ${undef:-"\z"} because of the
                        # recursion (unless parse_backslash)
                        if (is_ysh_expr or
                                not self.parse_opts.parse_backslash()):
                            p_die(
                                "Invalid char escape in double quoted string (OILS-ERR-12)",
                                self.cur_token)
                    elif self.token_type == Id.Lit_Dollar:
                        if is_ysh_expr or not self.parse_opts.parse_dollar():
                            p_die("Literal $ should be quoted like \$",
                                  self.cur_token)

                    part = self.cur_token
                out_parts.append(part)

            elif self.token_kind == Kind.Left:
                if self.token_type == Id.Left_Backtick and is_ysh_expr:
                    p_die("Invalid backtick: use $(cmd) or \\` in YSH strings",
                          self.cur_token)

                part = self._ReadDoubleQuotedLeftParts()
                out_parts.append(part)

            elif self.token_kind == Kind.VSub:
                tok = self.cur_token
                part = SimpleVarSub(tok)
                out_parts.append(part)
                # NOTE: parsing "$f(x)" would BREAK CODE.  Could add a mode for
                # it later.

            elif self.token_kind == Kind.Right:
                assert self.token_type == Id.Right_DoubleQuote, self.token_type
                if left_token:
                    num_end_tokens += 1

                # In a here doc, the right quote is literal!
                out_parts.append(self.cur_token)

            elif self.token_kind == Kind.Eof:
                if left_token:
                    p_die(
                        'Unexpected EOF reading double-quoted string that began here',
                        left_token)
                else:  # here docs will have an EOF in their token stream
                    break

            else:
                raise AssertionError(self.cur_token)

            if self.token_kind != Kind.Right:
                num_end_tokens = 0  # """ must be CONSECUTIVE

        if expected_end_tokens == 1:
            out_parts.pop()
        elif expected_end_tokens == 3:
            out_parts.pop()
            out_parts.pop()
            out_parts.pop()

        # Remove space from """ in both expression mode and command mode
        if left_token and left_token.id == Id.Left_TDoubleQuote:
            word_compile.RemoveLeadingSpaceDQ(out_parts)

        # Return nothing, since we appended to 'out_parts'

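    # The left_token=None case above is how here docs share this loop
    # (illustrative):
    #
    #   cat <<EOF
    #   hello $name       # parsed by _ReadLikeDQ(None, False, parts)
    #   a "quoted" part   # Kind.Right: the " is appended as a literal
    #   EOF               # Kind.Eof breaks out of the loop
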
    def _ReadDoubleQuoted(self, left_token):
        # type: (Token) -> DoubleQuoted
        """Helper function for "hello $name".

        Args:
          left_token: the opening " token.  Stopping at } and the here doc
            context are handled by _ReadLikeDQ.

        Also ${foo%%a b c}  # treat this as double quoted, until you hit }
        """
        parts = []  # type: List[word_part_t]
        self._ReadLikeDQ(left_token, False, parts)

        right_quote = self.cur_token
        return DoubleQuoted(left_token, parts, right_quote)

    def ReadDoubleQuoted(self, left_token, parts):
        # type: (Token, List[word_part_t]) -> Token
        """For expression mode.

        Read var x = "${dir:-}/$name"; etc.
        """
        self._ReadLikeDQ(left_token, True, parts)
        return self.cur_token

    def _ReadCommandSub(self, left_id, d_quoted=False):
        # type: (Id_t, bool) -> CommandSub
        """
        NOTE: This is not in the grammar, because word parts aren't in the grammar!

        command_sub = '$(' command_list ')'
                    | '@(' command_list ')'
                    | '<(' command_list ')'
                    | '>(' command_list ')'
                    | ` command_list `
        """
        left_token = self.cur_token

        # Set the lexer in a state so ) becomes the EOF token.
        if left_id in (Id.Left_DollarParen, Id.Left_AtParen, Id.Left_ProcSubIn,
                       Id.Left_ProcSubOut):
            self._SetNext(lex_mode_e.ShCommand)  # advance past $( etc.

            right_id = Id.Eof_RParen
            self.lexer.PushHint(Id.Op_RParen, right_id)
            c_parser = self.parse_ctx.MakeParserForCommandSub(
                self.line_reader, self.lexer, right_id)
            # NOTE: This doesn't use something like main_loop because we don't want
            # to interleave parsing and execution!  Unlike 'source' and 'eval'.
            node = c_parser.ParseCommandSub()

            right_token = c_parser.w_parser.cur_token

        elif left_id == Id.Left_Backtick and self.parse_ctx.do_lossless:
            # NOTE: This is an APPROXIMATE solution for translation ONLY.  See
            # test/osh2oil.

            right_id = Id.Eof_Backtick
            self.lexer.PushHint(Id.Left_Backtick, right_id)
            c_parser = self.parse_ctx.MakeParserForCommandSub(
                self.line_reader, self.lexer, right_id)
            node = c_parser.ParseCommandSub()
            right_token = c_parser.w_parser.cur_token

        elif left_id == Id.Left_Backtick:
            if not self.parse_opts.parse_backticks():
                p_die('Use $(cmd) instead of backticks (parse_backticks)',
                      left_token)

            self._SetNext(lex_mode_e.Backtick)  # advance past `

            parts = []  # type: List[str]
            while True:
                self._GetToken()
                #log("TOK %s", self.cur_token)

                if self.token_type == Id.Backtick_Quoted:
                    # Remove leading \
                    parts.append(lexer.TokenSliceLeft(self.cur_token, 1))

                elif self.token_type == Id.Backtick_DoubleQuote:
                    # Compatibility: If backticks are double quoted, then double quotes
                    # within them have to be \"
                    # Shells aren't smart enough to match nested " and ` quotes (but OSH
                    # is)
                    if d_quoted:
                        # Remove leading \
                        parts.append(lexer.TokenSliceLeft(self.cur_token, 1))
                    else:
                        parts.append(lexer.TokenVal(self.cur_token))

                elif self.token_type == Id.Backtick_Other:
                    parts.append(lexer.TokenVal(self.cur_token))

                elif self.token_type == Id.Backtick_Right:
                    break

                elif self.token_type == Id.Eof_Real:
                    # Note: this parse error is in the ORIGINAL context.  No code_str yet.
                    p_die('Unexpected EOF while looking for closing backtick',
                          left_token)

                else:
                    raise AssertionError(self.cur_token)

                self._SetNext(lex_mode_e.Backtick)

            # Calculate right SPID on CommandSub BEFORE re-parsing.
            right_token = self.cur_token

            code_str = ''.join(parts)
            #log('code %r', code_str)

            # NOTE: This is similar to how we parse aliases in osh/cmd_parse.py.  It
            # won't have the same location info as MakeParserForCommandSub(), because
            # the lexer is different.
            arena = self.parse_ctx.arena
            #arena = alloc.Arena()
            line_reader = reader.StringLineReader(code_str, arena)
            c_parser = self.parse_ctx.MakeOshParser(line_reader)
            src = source.Reparsed('backticks', left_token, right_token)
            with alloc.ctx_SourceCode(arena, src):
                node = c_parser.ParseCommandSub()

        else:
            raise AssertionError(left_id)

        return CommandSub(left_token, node, right_token)

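    # Examples for the three cases above (illustrative):
    #
    #   $(echo hi)        # parsed in place; PushHint turns ) into Id.Eof_RParen
    #   `echo hi`         # tokens are collected, \` and \" unescaped, and the
    #                     # resulting string is re-parsed as a command
    #   "`echo \"x\"`"    # d_quoted=True: the inner \" is unescaped to "
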
    def _ReadExprSub(self, lex_mode):
        # type: (lex_mode_t) -> word_part.ExprSub
        """$[d->key]  $[obj.method()]  etc."""
        left_token = self.cur_token

        self._SetNext(lex_mode_e.Expr)
        enode, right_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.ysh_expr_sub)

        self._SetNext(lex_mode)  # Move past ]
        return word_part.ExprSub(left_token, enode, right_token)

    def ParseVarDecl(self, kw_token):
        # type: (Token) -> command.VarDecl
        """
        oil_var_decl: name_type_list '=' testlist end_stmt

        Note that assignments must end with \n  ;  }  or EOF.  Unlike shell
        assignments, we disallow:

            var x = 42 | wc -l
            var x = 42 && echo hi
        """
        self._SetNext(lex_mode_e.Expr)
        enode, last_token = self.parse_ctx.ParseVarDecl(kw_token, self.lexer)
        # Hack to move } from what the Expr lexer mode gives to what
        # CommandParser wants
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace

        # Let the CommandParser see the Op_Semi or Op_Newline.
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)  # always back to this
        return enode

    def ParseMutation(self, kw_token, var_checker):
        # type: (Token, VarChecker) -> command.Mutation
        """
        setvar i = 42
        setvar i += 1
        setvar a[i] = 42
        setvar a[i] += 1
        setvar d.key = 42
        setvar d.key += 1
        """
        self._SetNext(lex_mode_e.Expr)
        enode, last_token = self.parse_ctx.ParseMutation(kw_token, self.lexer)
        # Hack to move } from what the Expr lexer mode gives to what
        # CommandParser wants
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace

        for lhs in enode.lhs:
            UP_lhs = lhs
            with tagswitch(lhs) as case:
                if case(y_lhs_e.Var):
                    lhs = cast(Token, UP_lhs)
                    var_checker.Check(kw_token.id, lexer.LazyStr(lhs), lhs)

                # Note: this does not cover cases like
                #   setvar (a[0])[1] = v
                #   setvar (d.key).other = v
                # This leaks into catching all typos statically, which may be
                # possible if 'use' makes all names explicit.
                elif case(y_lhs_e.Subscript):
                    lhs = cast(Subscript, UP_lhs)
                    if lhs.obj.tag() == expr_e.Var:
                        v = cast(expr.Var, lhs.obj)
                        var_checker.Check(kw_token.id, v.name, v.left)

                elif case(y_lhs_e.Attribute):
                    lhs = cast(Attribute, UP_lhs)
                    if lhs.obj.tag() == expr_e.Var:
                        v = cast(expr.Var, lhs.obj)
                        var_checker.Check(kw_token.id, v.name, v.left)

        # Let the CommandParser see the Op_Semi or Op_Newline.
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)  # always back to this
        return enode

    def ParseBareDecl(self):
        # type: () -> expr_t
        """
        x = {name: val}
        """
        self._SetNext(lex_mode_e.Expr)
        self._GetToken()
        enode, last_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.command_expr)
        if last_token.id == Id.Op_RBrace:
            last_token.id = Id.Lit_RBrace
        self.buffered_word = last_token
        self._SetNext(lex_mode_e.ShCommand)
        return enode

    def ParseYshExprForCommand(self):
        # type: () -> expr_t

        # Fudge for this case
        #  for x in(y) {
        # versus
        #  for x in (y) {
        #
        # In the former case, ReadWord on 'in' puts the lexer past (.
        # Also see LookPastSpace in CommandParser.
        # A simpler solution would be nicer.

        if self.token_type == Id.Op_LParen:
            self.lexer.MaybeUnreadOne()

        enode, _ = self.parse_ctx.ParseYshExpr(self.lexer, grammar_nt.ysh_expr)

        self._SetNext(lex_mode_e.ShCommand)
        return enode

    def ParseCommandExpr(self):
        # type: () -> expr_t
        """
        = 1+2
        """
        enode, last_token = self.parse_ctx.ParseYshExpr(
            self.lexer, grammar_nt.command_expr)

        # In some cases, such as the case statement, we expect *the lexer* to be
        # pointing at the token right after the expression.  But the expression
        # parser must have read to the `last_token`.  Unreading places the lexer
        # back in the expected state.  I.e.:
        #
        #   case (x) {                 case (x) {
        #     (else) { = x }             (else) { = x }
        #            ^ The lexer                ^ Unread to here
        #              is here
        #   }                          }
        assert last_token.id in (Id.Op_Newline, Id.Eof_Real, Id.Op_Semi,
                                 Id.Op_RBrace), last_token
        if last_token.id != Id.Eof_Real:
            # Eof_Real is the only token we cannot unread
            self.lexer.MaybeUnreadOne()

        return enode

    def ParseProc(self, node):
        # type: (Proc) -> None

        # proc name-with-hyphens() must be accepted
        self._SetNext(lex_mode_e.ShCommand)
        self._GetToken()
        # example: 'proc f[' gets you Lit_ArrayLhsOpen
        if self.token_type != Id.Lit_Chars:
            p_die('Invalid proc name %s' % ui.PrettyToken(self.cur_token),
                  self.cur_token)

        # TODO: validate this more.  Disallow proc 123 { }, which isn't disallowed
        # for shell functions.  Similar to IsValidVarName().
        node.name = self.cur_token

        last_token = self.parse_ctx.ParseProc(self.lexer, node)

        # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
        assert last_token.id == Id.Op_LBrace
        last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        self._SetNext(lex_mode_e.ShCommand)

    def ParseFunc(self, node):
        # type: (Func) -> None
        last_token = self.parse_ctx.ParseFunc(self.lexer, node)

        # Translate from lex_mode_e.{Expr => ShCommand}, for CommandParser
        assert last_token.id == Id.Op_LBrace
        last_token.id = Id.Lit_LBrace
        self.buffered_word = last_token

        self._SetNext(lex_mode_e.ShCommand)

    def ParseYshCasePattern(self):
        # type: () -> Tuple[pat_t, Token]
        pat, left_tok, last_token = self.parse_ctx.ParseYshCasePattern(
            self.lexer)

        if last_token.id == Id.Op_LBrace:
            last_token.id = Id.Lit_LBrace
            self.buffered_word = last_token

        return pat, left_tok

    def NewlineOkForYshCase(self):
        # type: () -> Id_t
        """Check for optional newline and consume it.

        This is a special case of `_NewlineOk` which fixes some "off-by-one"
        issues which crop up while parsing YSH case arms.  For more details, see
        #oil-dev > Progress On YSH Case Grammar on zulip.

        Returns a token id which is filled with the choice of

          word           { echo word }
          (3)            { echo expr }
          /e/            { echo eggex }
          }              # right brace
        """
        while True:
            next_id = self.lexer.LookAheadOne(lex_mode_e.Expr)

            # Cannot lookahead past lines
            if next_id == Id.Unknown_Tok:
                self.lexer.MoveToNextLine()
                continue

            next_kind = consts.GetKind(next_id)
            if next_id != Id.Op_Newline and next_kind != Kind.Ignored:
                break

            self.lexer.Read(lex_mode_e.Expr)

        if next_id in (Id.Op_RBrace, Id.Op_LParen, Id.Arith_Slash):
            self._SetNext(lex_mode_e.Expr)  # Continue in expression mode
        else:
            # Consume the trailing Op_Newline
            self._SetNext(lex_mode_e.ShCommand)
            self._GetToken()

        return next_id

    def _ReadArithExpr(self, end_id):
        # type: (Id_t) -> arith_expr_t
        """Read and parse an arithmetic expression in various contexts.

        $(( 1+2 ))
        (( a=1+2 ))
        ${a[ 1+2 ]}
        ${a : 1+2 : 1+2}

        See tests/arith-context.test.sh for ambiguous cases.

        ${a[a[0]]} is valid  # VS_RBRACKET vs Id.Arith_RBracket

        ${s : a<b?0:1 : 1}  # VS_COLON vs Id.Arith_Colon

        See the assertion in ArithParser.Parse() -- unexpected extra input.
        """
        # calls self.ReadWord(lex_mode_e.Arith)
        anode = self.a_parser.Parse()
        cur_id = self.a_parser.CurrentId()
        if end_id != Id.Undefined_Tok and cur_id != end_id:
            p_die(
                'Unexpected token after arithmetic expression (%s != %s)' %
                (ui.PrettyId(cur_id), ui.PrettyId(end_id)),
                loc.Word(self.a_parser.cur_word))
        return anode

    def _ReadArithSub(self):
        # type: () -> word_part.ArithSub
        """Read an arith substitution, which contains an arith expression, e.g.

        $((a + 1)).
        """
        left_tok = self.cur_token

        # The second one needs to be disambiguated in stuff like:
        #   $(echo $(( 1+2 )) )
        self.lexer.PushHint(Id.Op_RParen, Id.Right_DollarDParen)

        # NOTE: To disambiguate $(( as arith sub vs. command sub and subshell, we
        # could save the lexer/reader state here, and retry if the arithmetic parse
        # fails.  But we can almost always catch this at parse time.  There could
        # be some exceptions like:
        #   $((echo * foo))  # looks like multiplication
        #   $((echo / foo))  # looks like division

        self._SetNext(lex_mode_e.Arith)
        anode = self._ReadArithExpr(Id.Arith_RParen)

        # TODO: This could be DQ or Arith too
        self._SetNext(lex_mode_e.ShCommand)

        # PROBLEM: $(echo $(( 1 + 2 )) )
        # Two right parens break the Id.Eof_RParen scheme
        self._GetToken()
        if self.token_type != Id.Right_DollarDParen:
            p_die('Expected second ) to end arith sub', self.cur_token)

        right_tok = self.cur_token
        return word_part.ArithSub(left_tok, anode, right_tok)

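    # Example of the PushHint disambiguation above (illustrative):
    #
    #   $(echo $(( 1 + 2 )) )
    #                    ^^ the inner )) must close the arith sub, not the
    #                       command sub, so Op_RParen is re-mapped to
    #                       Id.Right_DollarDParen
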
    def ReadDParen(self):
        # type: () -> Tuple[arith_expr_t, Token]
        """Read ((1+ 2))  -- command context.

        We're using the word parser because it's very similar to _ReadArithExpr
        above.

        This also returns the terminating `Op_DRightParen` token for use as location
        tracking.
        """
        # The second ) needs to be disambiguated, as in _ReadArithSub above.
        # TODO: Be consistent with ReadForExpression below and use lex_mode_e.Arith?
        # Then you can get rid of this.
        self.lexer.PushHint(Id.Op_RParen, Id.Op_DRightParen)

        self._SetNext(lex_mode_e.Arith)
        anode = self._ReadArithExpr(Id.Arith_RParen)

        self._SetNext(lex_mode_e.ShCommand)

        # PROBLEM: $(echo $(( 1 + 2 )) )
        self._GetToken()
        right = self.cur_token
        if self.token_type != Id.Op_DRightParen:
            p_die('Expected second ) to end arith statement', self.cur_token)

        self._SetNext(lex_mode_e.ShCommand)

        return anode, right

    def _SetNextNonSpace(self):
        # type: () -> None
        """Same logic as _ReadWord, but for ReadForExpression."""
        while True:
            self._SetNext(lex_mode_e.Arith)
            self._GetToken()
            if self.token_kind not in (Kind.Ignored, Kind.WS):
                break

    def ReadForExpression(self):
        # type: () -> command.ForExpr
        """Read ((i=0; i<5; ++i)) -- part of command context."""
        self._SetNextNonSpace()  # skip over ((

        self._GetToken()
        cur_id = self.token_type  # for end of arith expressions

        if cur_id == Id.Arith_Semi:  # for (( ; i < 10; i++ ))
            init_node = None  # type: Optional[arith_expr_t]
        else:
            init_node = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()
        self._SetNextNonSpace()

        # It's odd to keep track of both cur_id and self.token_type in this
        # function, but it works, and is tested in 'test/parse_error.sh
        # arith-integration'
        if cur_id != Id.Arith_Semi:  # for (( x=0 b; ... ))
            p_die("Expected ; here", loc.Word(self.a_parser.cur_word))

        self._GetToken()
        cur_id = self.token_type

        if cur_id == Id.Arith_Semi:  # for (( ; ; i++ ))
            cond_node = None  # type: Optional[arith_expr_t]
        else:
            cond_node = self.a_parser.Parse()
            cur_id = self.a_parser.CurrentId()
        self._SetNextNonSpace()

        if cur_id != Id.Arith_Semi:  # for (( x=0; x<5 b ))
            p_die("Expected ; here", loc.Word(self.a_parser.cur_word))

        self._GetToken()
        cur_id = self.token_type

        if cur_id == Id.Arith_RParen:  # for (( ; ; ))
            update_node = None  # type: Optional[arith_expr_t]
        else:
            update_node = self._ReadArithExpr(Id.Arith_RParen)
        self._SetNextNonSpace()

        self._GetToken()
        if self.token_type != Id.Arith_RParen:
            p_die('Expected ) to end for loop expression', self.cur_token)
        self._SetNext(lex_mode_e.ShCommand)

        # redirects is None, will be assigned in CommandEvaluator
        node = command.ForExpr.CreateNull()
        node.init = init_node
        node.cond = cond_node
        node.update = update_node
        return node

    def _ReadArrayLiteral(self):
        # type: () -> word_part_t
        """a=(1 2 3)

        TODO: See osh/cmd_parse.py:164 for Id.Lit_ArrayLhsOpen, for a[x++]=1

        We want:

            A=(['x']=1 ["x"]=2 [$x$y]=3)

        Maybe allow this as a literal string?  Because I think I've seen it before?
        Or maybe force people to patch to learn the rule.

            A=([x]=4)

        Starts with Lit_Other '[', and then it has Lit_ArrayLhsClose
        Maybe enforce that ALL have keys or NONE have keys.
        """
        self._SetNext(lex_mode_e.ShCommand)  # advance past (
        self._GetToken()
        if self.cur_token.id != Id.Op_LParen:
            p_die('Expected ( after =', self.cur_token)
        left_token = self.cur_token
        right_token = None  # type: Token

        # MUST use a new word parser (with same lexer).
        w_parser = self.parse_ctx.MakeWordParser(self.lexer, self.line_reader)
        words = []  # type: List[CompoundWord]
        done = False
        while not done:
            w = w_parser.ReadWord(lex_mode_e.ShCommand)
            with tagswitch(w) as case:
                if case(word_e.Operator):
                    tok = cast(Token, w)
                    if tok.id == Id.Right_ShArrayLiteral:
                        right_token = tok
                        done = True  # can't use break here
                    # Unlike command parsing, array parsing allows embedded \n.
                    elif tok.id == Id.Op_Newline:
                        continue
                    else:
                        p_die('Unexpected token in array literal', loc.Word(w))

                elif case(word_e.Compound):
                    words.append(cast(CompoundWord, w))

                else:
                    raise AssertionError()

        if len(words) == 0:  # a=() is empty indexed array
            # Needed for type safety, doh
            no_words = []  # type: List[word_t]
            node = ShArrayLiteral(left_token, no_words, right_token)
            return node

        pairs = []  # type: List[AssocPair]
        # If the first one is a key/value pair, then the rest are assumed to be.
        pair = word_.DetectAssocPair(words[0])
        if pair:
            pairs.append(pair)

            n = len(words)
            for i in xrange(1, n):
                w2 = words[i]
                pair = word_.DetectAssocPair(w2)
                if not pair:
                    p_die("Expected associative array pair", loc.Word(w2))

                pairs.append(pair)

            # invariant List?
            return word_part.BashAssocLiteral(left_token, pairs, right_token)

        # Brace detection for arrays but NOT associative arrays
        words2 = braces.BraceDetectAll(words)
        words3 = word_.TildeDetectAll(words2)
        return ShArrayLiteral(left_token, words3, right_token)

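    # Examples for the literals handled above (illustrative):
    #
    #   a=(1 2 3)          # ShArrayLiteral, with brace and tilde detection
    #   a=()               # empty indexed array
    #   A=([k]=v [j]=w)    # BashAssocLiteral; the first word decides whether
    #                      # the whole literal is associative
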
    def ParseProcCallArgs(self, start_symbol):
        # type: (int) -> ArgList
        """ json write (x) """
        self.lexer.MaybeUnreadOne()

        arg_list = ArgList.CreateNull(alloc_lists=True)
        arg_list.left = self.cur_token
        self.parse_ctx.ParseProcCallArgs(self.lexer, arg_list, start_symbol)
        return arg_list

1670 def _MaybeReadWordPart(self, is_first, lex_mode, parts):
1671 # type: (bool, lex_mode_t, List[word_part_t]) -> bool
1672 """Helper for _ReadCompoundWord3."""
1673 done = False
1674
1675 if self.token_type == Id.Lit_EscapedChar:
1676 tok = self.cur_token
1677 assert tok.length == 2
1678 ch = lexer.TokenSliceLeft(tok, 1)
1679 if not self.parse_opts.parse_backslash():
1680 if not pyutil.IsValidCharEscape(ch):
1681 p_die('Invalid char escape in unquoted word (OILS-ERR-13)',
1682 self.cur_token)
1683
1684 part = word_part.EscapedLiteral(self.cur_token,
1685 ch) # type: word_part_t
1686 else:
1687 part = self.cur_token
1688
1689 if is_first and self.token_type == Id.Lit_VarLike: # foo=
1690 parts.append(part)
1691 # Unfortunately it's awkward to pull the check for a=(1 2) up to
1692 # _ReadWord.
1693 next_id = self.lexer.LookPastSpace(lex_mode)
1694 if next_id == Id.Op_LParen:
1695 self.lexer.PushHint(Id.Op_RParen, Id.Right_ShArrayLiteral)
1696 part2 = self._ReadArrayLiteral()
1697 parts.append(part2)
1698
1699 # Array literal must be the last part of the word.
1700 self._SetNext(lex_mode)
1701 self._GetToken()
1702 # EOF, whitespace, newline, Right_Subshell
1703 if self.token_kind not in KINDS_THAT_END_WORDS:
1704 p_die('Unexpected token after array literal',
1705 self.cur_token)
1706 done = True
1707
1708 elif (is_first and self.parse_opts.parse_at() and
1709 self.token_type == Id.Lit_Splice):
1710
1711 splice_tok = self.cur_token
1712 part2 = word_part.Splice(splice_tok,
1713 lexer.TokenSliceLeft(splice_tok, 1))
1714
1715 parts.append(part2)
1716
1717 # @words must be the last part of the word
1718 self._SetNext(lex_mode)
1719 self._GetToken()
1720 # EOF, whitespace, newline, Right_Subshell
1721 if self.token_kind not in KINDS_THAT_END_WORDS:
1722 p_die('Unexpected token after array splice', self.cur_token)
1723 done = True
1724
        elif (is_first and self.parse_opts.parse_at() and
              self.token_type == Id.Lit_AtLBracket):  # @[split(x)]
            part2 = self._ReadExprSub(lex_mode_e.DQ)
            parts.append(part2)

            # @[split(x)] must be the last part of the word
            self._SetNext(lex_mode)
            self._GetToken()
            # EOF, whitespace, newline, Right_Subshell
            if self.token_kind not in KINDS_THAT_END_WORDS:
                p_die('Unexpected token after Expr splice', self.cur_token)
            done = True

        elif (is_first and self.parse_opts.parse_at() and
              self.token_type == Id.Lit_AtLBraceDot):
            p_die('TODO: @{.myproc builtin sub}', self.cur_token)

        elif (is_first and self.parse_opts.parse_at_all() and
              self.token_type == Id.Lit_At):
            # Because $[x] ${x} and perhaps $/x/ are reserved, it makes sense for @
            # at the beginning of a word to be reserved.

            # Although, should we relax 'echo @'?  I'm tempted to have a
            # shortcut for @_argv.
            p_die('Literal @ starting a word must be quoted (parse_at_all)',
                  self.cur_token)

        else:
            # not a literal with lookahead; append it
            parts.append(part)

        return done

    def _ReadCompoundWord(self, lex_mode):
        # type: (lex_mode_t) -> CompoundWord
        return self._ReadCompoundWord3(lex_mode, Id.Undefined_Tok, True)

    def _ReadCompoundWord3(self, lex_mode, eof_type, empty_ok):
        # type: (lex_mode_t, Id_t, bool) -> CompoundWord
        """
        Precondition: Looking at the first token of the first word part
        Postcondition: Looking at the token after, e.g. space or operator

        NOTE: eof_type is necessary because / is a literal, i.e. Lit_Slash, but it
        could be an operator delimiting a compound word.  Can we change lexer
        modes and remove this special case?
        """
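        # Illustrative example of eof_type: while parsing the pattern in
        #     ${foo//pat/replace}
        # this is called with eof_type == Id.Lit_Slash, so the / before
        # 'replace' ends the word instead of being a literal.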
        w = CompoundWord([])
        num_parts = 0
        brace_count = 0
        done = False
        is_triple_quoted = None  # type: Optional[BoolParamBox]

        while not done:
            self._GetToken()

            allow_done = empty_ok or num_parts != 0
            if allow_done and self.token_type == eof_type:
                done = True  # e.g. for ${foo//pat/replace}

            # Keywords like "for" are treated like literals
            elif self.token_kind in (Kind.Lit, Kind.History, Kind.KW,
                                     Kind.ControlFlow, Kind.BoolUnary,
                                     Kind.BoolBinary):

                # Count { and } so unbalanced braces are a syntax error below
                if self.token_type == Id.Lit_LBrace:
                    brace_count += 1
                elif self.token_type == Id.Lit_RBrace:
                    brace_count -= 1
                elif self.token_type == Id.Lit_Dollar:
                    if not self.parse_opts.parse_dollar():
                        if num_parts == 0 and lex_mode == lex_mode_e.ShCommand:
                            next_byte = self.lexer.ByteLookAhead()
                            # TODO: switch lexer modes and parse $/d+/.  But not ${a:-$/d+/}
                            if next_byte == '/':
                                #log('next_byte %r', next_byte)
                                pass

                        p_die('Literal $ should be quoted like \$',
                              self.cur_token)

                done = self._MaybeReadWordPart(num_parts == 0, lex_mode,
                                               w.parts)

            elif self.token_kind == Kind.VSub:
                vsub_token = self.cur_token

                part = SimpleVarSub(vsub_token)  # type: word_part_t
                w.parts.append(part)

            elif self.token_kind == Kind.ExtGlob:
                # If parse_at, we can take over @( to start @(seq 3)
                # Users can also write ,(*.py|*.sh)
                if (self.parse_opts.parse_at() and
                        self.token_type == Id.ExtGlob_At and num_parts == 0):
                    cs_part = self._ReadCommandSub(Id.Left_AtParen,
                                                   d_quoted=False)
                    # RARE mutation of tok.id!
                    cs_part.left_token.id = Id.Left_AtParen
                    part = cs_part  # for type safety

                    # Same check as _MaybeReadWordPart.  @(seq 3)x is illegal, just like
                    # a=(one two)x and @arrayfunc(3)x.
                    self._GetToken()
                    if self.token_kind not in KINDS_THAT_END_WORDS:
                        p_die('Unexpected token after @()', self.cur_token)
                    done = True

                else:
                    part = self._ReadExtGlob()
                w.parts.append(part)

            elif self.token_kind == Kind.BashRegex:
                if self.token_type == Id.BashRegex_LParen:  # Opening (
                    part = self._ReadBashRegexGroup()
                    w.parts.append(part)
                else:
                    assert self.token_type == Id.BashRegex_AllowedInParens
                    p_die('Invalid token in bash regex', self.cur_token)

            elif self.token_kind == Kind.Left:
                try_triple_quote = (self.parse_opts.parse_triple_quote() and
                                    lex_mode == lex_mode_e.ShCommand and
                                    num_parts == 0)

                # Save allocation
                if try_triple_quote:
                    is_triple_quoted = BoolParamBox(False)

                part = self._ReadUnquotedLeftParts(is_triple_quoted)
                w.parts.append(part)

            # NOT done yet, will advance below
            elif self.token_kind == Kind.Right:
                # Still part of the word; will be done on the next iter.
                if self.token_type == Id.Right_DoubleQuote:
                    pass
                # Never happens, no PushHint for this case.
                #elif self.token_type == Id.Right_DollarParen:
                #  pass
                elif self.token_type == Id.Right_Subshell:
                    # LEXER HACK for (case x in x) ;; esac )
                    # Rewind before it's used
                    assert self.next_lex_mode == lex_mode_e.Undefined
                    if self.lexer.MaybeUnreadOne():
                        self.lexer.PushHint(Id.Op_RParen, Id.Right_Subshell)
                        self._SetNext(lex_mode)
                    done = True
                else:
                    done = True

            elif self.token_kind == Kind.Ignored:
                done = True

            else:
                # LEXER HACK for unbalanced case clause.  'case foo in esac' is valid,
                # so to test for ESAC, we can read ) before getting a chance to
                # PushHint(Id.Op_RParen, Id.Right_CasePat).  So here we unread one
                # token and do it again.

                # We get Id.Op_RParen at top level:    case x in x) ;; esac
                # We get Id.Eof_RParen inside ComSub:  $(case x in x) ;; esac )
                if self.token_type in (Id.Op_RParen, Id.Eof_RParen):
                    # Rewind before it's used
                    assert self.next_lex_mode == lex_mode_e.Undefined
                    if self.lexer.MaybeUnreadOne():
                        if self.token_type == Id.Eof_RParen:
                            # Redo translation
                            self.lexer.PushHint(Id.Op_RParen, Id.Eof_RParen)
                        self._SetNext(lex_mode)

                done = True  # anything we don't recognize means we're done

            if not done:
                self._SetNext(lex_mode)
                num_parts += 1

        if (self.parse_opts.parse_brace() and num_parts > 1 and
                brace_count != 0):
            # accept { and }, but not foo{
            p_die(
                'Word has unbalanced { }.  Maybe add a space or quote it like \{',
                loc.Word(w))
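        # Illustrative example of the check above: with parse_brace,
        #     echo foo{a
        # is a multi-part word with unbalanced braces, which dies above, while
        # a balanced word like {a,b} is accepted.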

        if is_triple_quoted and is_triple_quoted.b and num_parts > 1:
            p_die('Unexpected parts after triple quoted string',
                  loc.WordPart(w.parts[-1]))
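        # Illustrative example of the check above:
        #     echo '''x'''suffix
        # is a triple quoted string with parts after it, which is rejected.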

        if 0:
            from _devbuild.gen.syntax_asdl import word_part_str
            word_key = ' '.join(word_part_str(p.tag()) for p in w.parts)
            WORD_HIST[word_key] += 1
        return w

    def _ReadArithWord(self):
        # type: () -> Optional[word_t]
        """Helper for ReadArithWord()."""
        self._GetToken()

        if self.token_kind == Kind.Unknown:
            # e.g. happened during dynamic parsing of unset 'a[$foo]' in gherkin
            p_die(
                'Unexpected token while parsing arithmetic: %r' %
                lexer.TokenVal(self.cur_token), self.cur_token)

        elif self.token_kind == Kind.Eof:
            return self.cur_token

        elif self.token_kind == Kind.Ignored:
            # Space should be ignored.
            self._SetNext(lex_mode_e.Arith)
            return None

        elif self.token_kind in (Kind.Arith, Kind.Right):
            # Id.Right_DollarDParen IS just a normal token, handled by ArithParser
            self._SetNext(lex_mode_e.Arith)
            return self.cur_token

        elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub):
            return self._ReadCompoundWord(lex_mode_e.Arith)

        else:
            raise AssertionError(self.cur_token)

    def _ReadWord(self, word_mode):
        # type: (lex_mode_t) -> Optional[word_t]
        """Helper function for ReadWord()."""

        # Change the pseudo lexer mode to a real lexer mode
        if word_mode == lex_mode_e.ShCommandFakeBrack:
            lex_mode = lex_mode_e.ShCommand
        else:
            lex_mode = word_mode

        self._GetToken()

        if self.token_kind == Kind.Eof:
            # No advance
            return self.cur_token

        # Allow Arith for ) at end of for loop?
        elif self.token_kind in (Kind.Op, Kind.Redir, Kind.Arith):
            self._SetNext(lex_mode)

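            # Illustrative example: in multiline mode (e.g. a YSH '...' command),
            # a newline merely separates words, but a blank line, i.e.
            # newline_state > 1, is an error, caught below.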
            # Newlines are complicated.  See 3x2 matrix in the comment about
            # self.multiline and self.newline_state above.
            if self.token_type == Id.Op_Newline:
                if self.multiline:
                    if self.newline_state > 1:
                        # This points at a blank line, but at least it gives the line number
                        p_die('Invalid blank line in multiline mode',
                              self.cur_token)
                    return None

                if self.returned_newline:  # skip
                    return None

            return self.cur_token

        elif self.token_kind == Kind.Right:
            if self.token_type not in (Id.Right_Subshell, Id.Right_ShFunction,
                                       Id.Right_CasePat,
                                       Id.Right_ShArrayLiteral):
                raise AssertionError(self.cur_token)

            self._SetNext(lex_mode)
            return self.cur_token

        elif self.token_kind in (Kind.Ignored, Kind.WS):
            self._SetNext(lex_mode)
            return None

        else:
            assert self.token_kind in (Kind.VSub, Kind.Lit, Kind.History,
                                       Kind.Left, Kind.KW, Kind.ControlFlow,
                                       Kind.BoolUnary, Kind.BoolBinary,
                                       Kind.ExtGlob,
                                       Kind.BashRegex), 'Unhandled token kind'

            if (word_mode == lex_mode_e.ShCommandFakeBrack and
                    self.parse_opts.parse_bracket() and
                    self.token_type == Id.Lit_LBracket):
                # Change [ from Kind.Lit -> Kind.Op
                # So CommandParser can treat
                #     assert [42 === x]
                # like
                #     json write (x)
                bracket_word = self.cur_token
                bracket_word.id = Id.Op_LBracket

                self._SetNext(lex_mode)
                return bracket_word

            # We're beginning a word.  If we see Id.Lit_Pound, change to
            # lex_mode_e.Comment and read until end of line.
            if self.token_type == Id.Lit_Pound:
                self._SetNext(lex_mode_e.Comment)
                self._GetToken()

                # NOTE: The # could be the last character in the file.  It can't be
                # Eof_{RParen,Backtick} because #) and #` are comments.
                assert self.token_type in (Id.Ignored_Comment, Id.Eof_Real), \
                    self.cur_token

                # The next iteration will go into Kind.Ignored and set lex state to
                # lex_mode_e.ShCommand/etc.
                return None  # tell ReadWord() to try again after comment

            elif self.token_type == Id.Lit_TPound:  ### doc comment
                self._SetNext(lex_mode_e.Comment)
                self._GetToken()

                if self.token_type == Id.Ignored_Comment and self.emit_doc_token:
                    return self.cur_token

                return None  # tell ReadWord() to try again after comment

            else:
                # r'' u'' b''
                if (self.token_type == Id.Lit_Chars and
                        self.lexer.LookAheadOne(
                            lex_mode_e.ShCommand) == Id.Left_SingleQuote):

                    # When shopt -s parse_raw_string:
                    #     echo r'hi' is like echo 'hi'
                    #
                    #     echo u'\u{3bc}' b'\yff' works

                    tok = self.cur_token
                    if self.parse_opts.parse_ysh_string():
                        if lexer.TokenEquals(tok, 'r'):
                            left_id = Id.Left_RSingleQuote
                        elif lexer.TokenEquals(tok, 'u'):
                            left_id = Id.Left_USingleQuote
                        elif lexer.TokenEquals(tok, 'b'):
                            left_id = Id.Left_BSingleQuote
                        else:
                            left_id = Id.Undefined_Tok

                        if left_id != Id.Undefined_Tok:
                            # skip the r, and then 'foo' will be read as normal
                            self._SetNext(lex_mode_e.ShCommand)

                            self._GetToken()
                            assert self.token_type == Id.Left_SingleQuote, \
                                self.token_type

                            # Read the word in a different lexer mode
                            return self._ReadYshSingleQuoted(left_id)

                return self._ReadCompoundWord(lex_mode)

    def ParseVarRef(self):
        # type: () -> BracedVarSub
        """DYNAMIC parsing of what's inside ${!ref}

        # Same as VarOf production
        VarRefExpr = VarOf EOF
        """
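        # Illustrative example of the dynamic case: given
        #     x=1; ref=x; echo ${!ref}
        # the *value* of ref, the string 'x', is re-parsed here at runtime as
        # a VarOf production.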
        self._SetNext(lex_mode_e.VSub_1)

        self._GetToken()
        if self.token_kind != Kind.VSub:
            p_die('Expected var name', self.cur_token)

        part = self._ParseVarOf()
        # NOTE: no ${ } means no part.left and part.right
        part.left = part.token  # cheat to make test pass
        part.right = part.token

        self._GetToken()
        if self.token_type != Id.Eof_Real:
            p_die('Expected end of var ref expression', self.cur_token)
        return part

    def LookPastSpace(self):
        # type: () -> Id_t
        """Look ahead to the next token.

        For the CommandParser to recognize
            array= (1 2 3)
            YSH for ( versus bash for ((
            YSH if ( versus if test
            YSH while ( versus while test
            YSH bare assignment 'grep =' versus 'grep foo'
        """
        assert self.token_type != Id.Undefined_Tok
        if self.cur_token.id == Id.WS_Space:
            id_ = self.lexer.LookPastSpace(lex_mode_e.ShCommand)
        else:
            id_ = self.cur_token.id
        return id_

    def LookAheadFuncParens(self):
        # type: () -> bool
        """Special lookahead for f( ) { echo hi; } to check for ( )"""
        assert self.token_type != Id.Undefined_Tok

        # We have to handle 2 cases because we buffer a token
        if self.cur_token.id == Id.Op_LParen:  # saw funcname(
            return self.lexer.LookAheadFuncParens(1)  # go back one char

        elif self.cur_token.id == Id.WS_Space:  # saw funcname WHITESPACE
            return self.lexer.LookAheadFuncParens(0)

        else:
            return False

    def ReadWord(self, word_mode):
        # type: (lex_mode_t) -> word_t
        """Read the next word, using the given lexer mode.

        This is a stateful wrapper for the stateless _ReadWord function.
        """
        assert word_mode in (lex_mode_e.ShCommand,
                             lex_mode_e.ShCommandFakeBrack,
                             lex_mode_e.DBracket, lex_mode_e.BashRegex)

        if self.buffered_word:  # For integration with pgen2
            w = self.buffered_word
            self.buffered_word = None
        else:
            while True:
                w = self._ReadWord(word_mode)
                if w is not None:
                    break

        self.returned_newline = (word_.CommandId(w) == Id.Op_Newline)
        return w

    def ReadArithWord(self):
        # type: () -> word_t
        while True:
            w = self._ReadArithWord()
            if w is not None:
                break
        return w

    def ReadHereDocBody(self, parts):
        # type: (List[word_part_t]) -> None
        """
        A here doc is like a double quoted context, except " isn't special.
        """
        self._ReadLikeDQ(None, False, parts)
        # Returns nothing

    def ReadForPlugin(self):
        # type: () -> CompoundWord
        """For $PS1, $PS4, etc.

        This is just like reading a here doc line.  "\n" is allowed, as
        well as the typical substitutions ${x} $(echo hi) $((1 + 2)).
        """
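        # Illustrative example: a prompt value like
        #     PS1='$(date) $ '
        # is read with this method, so the $(date) command sub is parsed just
        # as it would be in a here doc line.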
        w = CompoundWord([])
        self._ReadLikeDQ(None, False, w.parts)
        return w

    def EmitDocToken(self, b):
        # type: (bool) -> None
        self.emit_doc_token = b

    def Multiline(self, b):
        # type: (bool) -> None
        self.multiline = b


if 0:
    import collections
    WORD_HIST = collections.Counter()

# vim: sw=4