#!/usr/bin/env python
# Copyright 2016 Andy Chu. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
"""
word_parse.py - Parse the shell word language.
"""

from asdl import const

from osh.meta import Id, Kind, LookupKind
from core import braces
from core import word
from core import tdop
from core import util

from osh import arith_parse
from osh.meta import ast, types

word_part_e = ast.word_part_e
word_e = ast.word_e
lex_mode_e = types.lex_mode_e

p_die = util.p_die
log = util.log

# Substitutions can be nested, but which inner subs are allowed depends on the
# outer sub.  See _ReadLeftParts vs. _ReadDoubleQuotedLeftParts.

# lex_mode_e.OUTER
#   All subs and quotes are allowed --
#   $v ${v} $() `` $(()) '' "" $'' $"" <() >()
#
# lex_mode_e.DQ
#   Var, Command, Arith, but no quotes
#   $v ${v} $() `` $(())
#   No process substitution.
#
# lex_mode_e.ARITH
#   Similar to DQ: Var, Command, and Arith sub, but no process sub.  bash has
#   no quotes, but we are changing this in oil.  We are adding ALL FOUR kinds
#   of quotes, because we need them for associative array indexing.
#
# lex_mode_e.VS_ARG_UNQ
#   Like UNQUOTED, except we stop at }.  Everything is allowed, even process
#   substitution.
#
#   ${X:-$v} ${X:-${v}} ${X:-$(echo hi)} ${X:-`echo hi`} ${X:-$((1+2))}
#   ${X:-'single'} ${X:-"double"} ${X:-$'\n'} ${X:-<(echo hi)}
#
#   But space is SIGNIFICANT.  ${a:- b }
#   So you should NOT just read a bunch of words after :-, unless you also
#   preserve the space tokens between them.
#   In other words, like DS_VS_ARG, except SINGLE quotes are allowed?
#
# lex_mode_e.VS_ARG_DQ
#   Can't be lex_mode_e.DQ because here we respect $' and $" tokens, while <(
#   token is not respected.
#
#   Like VS_ARG_UNQ, but single quotes are NOT respected (they appear
#   literally), and process substitution is not respected (ditto).
#
#   "" and $'' and $"" are respected, but not ''.  I need a matrix for this.
#
#   Like DQ, except nested "" and $'' and $"" are RESPECTED.
#
#   It's weird that double quotes are allowed.  Not sure why that would be.
#   Unquoted is also allowed, so " a "b" c " $'' and $"" are lame, because they
#   don't appear in the DQ context.  I think I should parse those but DISALLOW.
#   You should always make $'' and $"" a separate var!
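#
# For illustration (not exhaustive; inferred from the notes above):
#
#   echo "${x:-$(echo hi)}"   # command sub nested in a default (VS_ARG_DQ)
#   echo ${x:-'single'}       # single quotes are respected in VS_ARG_UNQ
#   echo "${x:-'single'}"     # quote characters are literal in VS_ARG_DQ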

class WordParser(object):

  def __init__(self, lexer, line_reader, lex_mode=lex_mode_e.OUTER):
    self.lexer = lexer
    self.line_reader = line_reader
    self.Reset(lex_mode=lex_mode)

  def _Peek(self):
    """Helper method."""
    if self.next_lex_mode is not None:
      self.prev_token = self.cur_token  # for completion
      self.cur_token = self.lexer.Read(self.next_lex_mode)
      self.token_kind = LookupKind(self.cur_token.id)
      self.token_type = self.cur_token.id

      self.next_lex_mode = None
    return self.cur_token

  def _Next(self, lex_mode):
    """Set the next lex state, but don't actually read a token.

    We need this for proper interactive parsing.
    """
    self.next_lex_mode = lex_mode

  def Reset(self, lex_mode=lex_mode_e.OUTER):
    """Called by interactive loop."""
    # For _Peek()
    self.prev_token = None  # for completion
    self.cur_token = None
    self.token_kind = Kind.Undefined
    self.token_type = Id.Undefined_Tok

    self.next_lex_mode = lex_mode

    # For newline.  TODO: I think we can do this iteratively, without member
    # state.
    self.cursor = None
    self.cursor_was_newline = False

    self.error_stack = []

  def AddErrorContext(self, msg, *args, **kwargs):
    err = util.ParseError(msg, *args, **kwargs)
    self.error_stack.append(err)

  def Error(self):
    return self.error_stack

  def _BadToken(self, msg, token):
    """
    Args:
      msg: format string with a single %s token
      token: Token
    """
    self.AddErrorContext(msg, token, token=token)

  def PrevToken(self):
    """Inspect state.  Used by completion.

    cur_token is usually Id.Op_Newline \n, so we need the previous one.
    """
    return self.prev_token

  def _ReadVarOpArg(self, arg_lex_mode, eof_type=Id.Undefined_Tok,
                    empty_ok=True):
    # NOTE: Operators like | and < are not treated as special, so ${a:- | >}
    # is valid, even when unquoted.
    self._Next(arg_lex_mode)
    self._Peek()

    w = self._ReadCompoundWord(
        lex_mode=arg_lex_mode, eof_type=eof_type, empty_ok=empty_ok)
    # This is for "${s:-}", ${s/a//}, etc.  It is analogous to
    # LooksLikeAssignment where we turn x= into x=''.  It has the same
    # potential problem of not having spids.
    #
    # NOTE: empty_ok is False only for the PatSub pattern, which means we'll
    # return a CompoundWord with no parts, which is explicitly checked with a
    # custom error message.
    if not w.parts and arg_lex_mode == lex_mode_e.VS_ARG_DQ and empty_ok:
      w.parts.append(ast.EmptyPart())
    return w

  def _ReadSliceArg(self):
    """Read an arithmetic expression for either part of ${a : i+1 : i+2}."""
    anode = self._ReadArithExpr(do_next=False)
    return anode

  def _ReadSliceVarOp(self):
    """ VarOf ':' ArithExpr (':' ArithExpr )? """
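    # Examples handled below (illustrative):
    #   ${a:1:2}  begin and length
    #   ${a:1}    no length
    #   ${a::2}   no begin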
    self._Next(lex_mode_e.ARITH)
    self._Peek()
    if self.token_type == Id.Arith_Colon:  # A pun for Id.VOp2_Colon
      begin = None  # no beginning specified
    else:
      begin = self._ReadSliceArg()
      if not begin: return None
      #print('BEGIN', begin)
      #print('BVS2', self.cur_token)

    if self.token_type == Id.Arith_RBrace:
      return ast.Slice(begin, None)  # No length specified

    # Id.Arith_Colon is a pun for Id.VOp2_Colon
    elif self.token_type == Id.Arith_Colon:
      self._Next(lex_mode_e.ARITH)
      length = self._ReadSliceArg()
      if not length: return None

      #print('after colon', self.cur_token)
      return ast.Slice(begin, length)

    else:
      self.AddErrorContext("Unexpected token in slice: %s", self.cur_token)
      return None

  def _ReadPatSubVarOp(self, lex_mode):
    """
    Match  = ('/' | '#' | '%') WORD
    VarSub = ...
           | VarOf '/' Match '/' WORD
    """
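    # Examples (illustrative, matching the flags set below):
    #   ${v/x/y}   replace the first match
    #   ${v//x/y}  replace all matches (do_all)
    #   ${v/#x/y}  match only at the beginning (do_prefix)
    #   ${v/%x/y}  match only at the end (do_suffix)
    #   ${v/x}     empty replacement string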
    do_all = False
    do_prefix = False
    do_suffix = False

    pat = self._ReadVarOpArg(lex_mode, eof_type=Id.Lit_Slash, empty_ok=False)
    if not pat: return None

    if len(pat.parts) == 1:
      ok, s, quoted = word.StaticEval(pat)
      if ok and s == '/' and not quoted:  # Looks like ${a////c}, read again
        self._Next(lex_mode)
        self._Peek()
        p = ast.LiteralPart(self.cur_token)
        pat.parts.append(p)

    if len(pat.parts) == 0:
      self._BadToken("Pattern must not be empty: %r", token=self.cur_token)
      return None
    else:
      first_part = pat.parts[0]
      if first_part.tag == word_part_e.LiteralPart:
        lit_id = first_part.token.id
        if lit_id == Id.Lit_Slash:
          do_all = True
          pat.parts.pop(0)
        elif lit_id == Id.Lit_Pound:
          do_prefix = True
          pat.parts.pop(0)
        elif lit_id == Id.Lit_Percent:
          do_suffix = True
          pat.parts.pop(0)

    #self._Peek()
    if self.token_type == Id.Right_VarSub:
      # e.g. ${v/a} is the same as ${v/a/} -- empty replacement string
      return ast.PatSub(pat, None, do_all, do_prefix, do_suffix)

    elif self.token_type == Id.Lit_Slash:
      replace = self._ReadVarOpArg(lex_mode)  # do not stop at /
      if not replace: return None

      self._Peek()
      if self.token_type == Id.Right_VarSub:
        return ast.PatSub(pat, replace, do_all, do_prefix, do_suffix)

      else:
        self._BadToken("Expected } after pat sub, got %s", self.cur_token)
        return None

    else:
      self._BadToken("Expected } after pat sub, got %s", self.cur_token)
      return None

  def _ReadSubscript(self):
    """ Subscript = '[' ('@' | '*' | ArithExpr) ']'
    """
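    # Examples: ${a[@]} and ${a[*]} (whole array); ${a[i+1]} (arith index).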
    # Lookahead to see if we get @ or *.  Otherwise read a full arithmetic
    # expression.
    t2 = self.lexer.LookAhead(lex_mode_e.ARITH)
    if t2.id in (Id.Lit_At, Id.Arith_Star):
      op = ast.WholeArray(t2.id)

      self._Next(lex_mode_e.ARITH)  # skip past [
      self._Peek()
      self._Next(lex_mode_e.ARITH)  # skip past @
      self._Peek()
    else:
      anode = self._ReadArithExpr()
      if not anode:
        return None
      op = ast.ArrayIndex(anode)

    #self._Peek()  # Can't do this here.  Should the test go elsewhere?
    if self.token_type != Id.Arith_RBracket:  # Should be looking at ]
      self._BadToken('Expected ] after subscript, got %s', self.cur_token)
      return None

    self._Next(lex_mode_e.VS_2)  # skip past ]
    self._Peek()  # Needed to be in the same spot as no subscript

    return op

  def _ParseVarOf(self):
    """
    VarOf = NAME Subscript?
          | NUMBER     # no subscript allowed, none of these are arrays
                       # ${@[1]} doesn't work, even though slicing does
          | VarSymbol
    """
    self._Peek()
    name_token = self.cur_token
    self._Next(lex_mode_e.VS_2)

    self._Peek()  # Check for []
    if self.token_type == Id.VOp2_LBracket:
      bracket_op = self._ReadSubscript()
      if not bracket_op: return None
    else:
      bracket_op = None

    part = ast.BracedVarSub(name_token)
    part.bracket_op = bracket_op
    return part

  def _ParseVarExpr(self, arg_lex_mode):
    """
    Start parsing at the op -- we already skipped past the name.
    """
    part = self._ParseVarOf()
    if not part: return None

    self._Peek()
    if self.token_type == Id.Right_VarSub:
      return part  # no ops

    # Or maybe this is a VarOpKind

    op_kind = self.token_kind

    if op_kind == Kind.VTest:
      op_id = self.token_type
      arg_word = self._ReadVarOpArg(arg_lex_mode)
      if self.token_type != Id.Right_VarSub:
        self._BadToken('Unexpected token after test arg: %s', self.cur_token)
        return None

      part.suffix_op = ast.StringUnary(op_id, arg_word)

    elif op_kind == Kind.VOp1:
      op_id = self.token_type
      arg_word = self._ReadVarOpArg(arg_lex_mode)
      if self.token_type != Id.Right_VarSub:
        self._BadToken('Unexpected token after unary op: %s', self.cur_token)
        return None

      op = ast.StringUnary(op_id, arg_word)
      part.suffix_op = op

    elif op_kind == Kind.VOp2:
      if self.token_type == Id.VOp2_Slash:
        op = self._ReadPatSubVarOp(arg_lex_mode)
        if not op: return None
        # Checked by the method above
        assert self.token_type == Id.Right_VarSub, self.cur_token

      elif self.token_type == Id.VOp2_Colon:
        op = self._ReadSliceVarOp()
        if not op: return None
        if self.token_type != Id.Arith_RBrace:
          self._BadToken('Unexpected token after slice: %s', self.cur_token)
          return None

      else:
        p_die('Unexpected token %s', self.cur_token, token=self.cur_token)

      part.suffix_op = op

    # NOTE: Arith_RBrace is for slicing, because it reads } in arithmetic
    # mode.  It's redundantly checked above.
    if self.token_type not in (Id.Right_VarSub, Id.Arith_RBrace):
      self._BadToken('Unexpected token after var sub: %s', self.cur_token)
      return None

    # Now look for ops
    return part

  def _ReadBracedBracedVarSub(self, d_quoted=False):
    """For the ${} expression language.

    NAME   = [a-zA-Z_][a-zA-Z0-9_]*
    NUMBER = [0-9]+                  # ${10}, ${11}, ...

    Subscript = '[' ('@' | '*' | ArithExpr) ']'
    VarSymbol = '!' | '@' | '#' | ...
    VarOf     = NAME Subscript?
              | NUMBER     # no subscript allowed, none of these are arrays
                           # ${@[1]} doesn't work, even though slicing does
              | VarSymbol

    TEST_OP  = '-' | ':-' | '=' | ':=' | '+' | ':+' | '?' | ':?'
    STRIP_OP = '#' | '##' | '%' | '%%'
    CASE_OP  = ',' | ',,' | '^' | '^^'

    UnaryOp = TEST_OP | STRIP_OP | CASE_OP | ...
    Match   = ('/' | '#' | '%') WORD   # match all / prefix / suffix
    VarExpr = VarOf
            | VarOf UnaryOp WORD
            | VarOf ':' ArithExpr (':' ArithExpr )?
            | VarOf '/' Match '/' WORD

    LengthExpr = '#' VarOf   # can't apply operators after length

    RefOrKeys = '!' VarExpr  # CAN apply operators after a named ref
                             # ${!ref[0]} vs ${!keys[@]} resolved later

    PrefixQuery = '!' NAME ('*' | '@')  # list variable names with a prefix

    VarSub = LengthExpr
           | RefOrKeys
           | PrefixQuery
           | VarExpr

    NOTES:
    - Arithmetic expressions are used twice, inside subscripts ${a[x+1]} and
      slicing ${a:x+1:y+2}
    - ${#} and ${!} need LL(2) lookahead (considering how my tokenizer works)
    - @ and * are technically arithmetic expressions in this implementation
    - We don't account for bash 4.4: ${param@operator} -- Q E P A a.  Note
      that it's also vectorized.

    Strictness over bash:
    - echo ${a[0][0]} doesn't do anything useful, so we disallow it from the
      grammar
    - ! and # prefixes can't be composed, even though named refs can be
      composed with other operators
    - '#' means 4 different things: length prefix, VarSymbol, UnaryOp to strip
      a prefix, and it can also be a literal part of WORD.

    From the parser's point of view, the prefix # can't be combined with
    UnaryOp/slicing/matching, and the ! can.  However:

      ${a[@]:1:2} is not allowed
      ${#a[@]:1:2} is allowed, but gives the wrong answer
    """
    left_spid = self.cur_token.span_id

    if d_quoted:
      arg_lex_mode = lex_mode_e.VS_ARG_DQ
    else:
      arg_lex_mode = lex_mode_e.VS_ARG_UNQ

    self._Next(lex_mode_e.VS_1)
    self._Peek()

    ty = self.token_type

    if ty == Id.VSub_Pound:
      # Disambiguate
      t = self.lexer.LookAhead(lex_mode_e.VS_1)
      #print("\t# LOOKAHEAD", t)
      if t.id not in (Id.Unknown_Tok, Id.Right_VarSub):
        # e.g. a name; '#' is the prefix
        self._Next(lex_mode_e.VS_1)
        part = self._ParseVarOf()

        self._Peek()
        if self.token_type != Id.Right_VarSub:
          self._BadToken("Expected } after length expression, got %r",
                         self.cur_token)
          return None

        part.prefix_op = Id.VSub_Pound  # length

      else:  # not a prefix; '#' is the variable
        part = self._ParseVarExpr(arg_lex_mode)
        if not part: return None

    elif ty == Id.VSub_Bang:
      t = self.lexer.LookAhead(lex_mode_e.VS_1)
      #print("\t! LOOKAHEAD", t)
      if t.id not in (Id.Unknown_Tok, Id.Right_VarSub):
        # e.g. a name; '!' is the prefix
        #   ${!a}    -- this is a ref
        #   ${!3}    -- this is a ref
        #   ${!a[1]} -- this is a ref
        #   ${!a[@]} -- this is a keys query
        # No lookahead -- do it in a second step, or at runtime
        self._Next(lex_mode_e.VS_1)
        part = self._ParseVarExpr(arg_lex_mode)
        if not part: return None

        part.prefix_op = Id.VSub_Bang

      else:  # not a prefix; '!' is the variable
        part = self._ParseVarExpr(arg_lex_mode)
        if not part: return None

    # VS_NAME, VS_NUMBER, symbol that isn't # or !
    elif self.token_kind == Kind.VSub:
      part = self._ParseVarExpr(arg_lex_mode)
      if not part: return None

    else:
      # e.g. ${^}
      p_die('Unexpected token %s', self.cur_token, token=self.cur_token)

    part.spids.append(left_spid)

    # Does this work?
    right_spid = self.cur_token.span_id
    part.spids.append(right_spid)

    return part

  def _ReadSingleQuotedPart(self, lex_mode):
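    # Reads 'foo' in lex_mode_e.SQ, or $'foo\n' in lex_mode_e.DOLLAR_SQ
    # (illustrative; the caller, _ReadLeftParts, picks the mode).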
    left = self.cur_token
    tokens = []

    done = False
    while not done:
      self._Next(lex_mode)
      self._Peek()

      # Kind.Char is emitted in the DOLLAR_SQ state
      if self.token_kind in (Kind.Lit, Kind.Char):
        tokens.append(self.cur_token)

      elif self.token_kind == Kind.Eof:
        self.AddErrorContext('Unexpected EOF in single-quoted string')
        return False

      elif self.token_kind == Kind.Right:
        done = True  # assume Id.Right_SingleQuote

      else:
        raise AssertionError(
            'Unhandled token in single-quoted part %s (%d)' %
            (self.cur_token, self.token_kind))

    return ast.SingleQuotedPart(left, tokens)

  def _ReadDoubleQuotedLeftParts(self):
    """Read substitution parts in a double quoted context."""
    if self.token_type in (Id.Left_CommandSub, Id.Left_Backtick):
      return self._ReadCommandSubPart(self.token_type)

    if self.token_type == Id.Left_VarSub:
      return self._ReadBracedBracedVarSub(d_quoted=True)

    if self.token_type == Id.Left_ArithSub:
      return self._ReadArithSubPart()

    if self.token_type == Id.Left_ArithSub2:
      return self._ReadArithSub2Part()

    raise AssertionError(self.cur_token)

  def _ReadLeftParts(self):
    """Read substitutions and quoted strings."""

    if self.token_type == Id.Left_DoubleQuote:
      return self._ReadDoubleQuotedPart()

    if self.token_type == Id.Left_DollarDoubleQuote:
      # NOTE: $"" is treated as "" for now.  Does it make sense to add the
      # token to the part?
      return self._ReadDoubleQuotedPart()

    if self.token_type == Id.Left_SingleQuote:
      return self._ReadSingleQuotedPart(lex_mode_e.SQ)

    if self.token_type == Id.Left_DollarSingleQuote:
      return self._ReadSingleQuotedPart(lex_mode_e.DOLLAR_SQ)

    if self.token_type in (
        Id.Left_CommandSub, Id.Left_Backtick, Id.Left_ProcSubIn,
        Id.Left_ProcSubOut):
      return self._ReadCommandSubPart(self.token_type)

    if self.token_type == Id.Left_VarSub:
      return self._ReadBracedBracedVarSub(d_quoted=False)

    if self.token_type == Id.Left_ArithSub:
      return self._ReadArithSubPart()

    if self.token_type == Id.Left_ArithSub2:
      return self._ReadArithSub2Part()

    raise AssertionError('%s not handled' % self.cur_token)

  def _ReadExtGlobPart(self):
    """
    Grammar:
      Item    = CompoundWord | EPSILON  # important: @(foo|) is allowed
      LEFT    = '@(' | '*(' | '+(' | '?(' | '!('
      RIGHT   = ')'
      ExtGlob = LEFT (Item '|')* Item RIGHT  # ITEM may be empty
      CompoundWord includes ExtGlobPart
    """
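    # Examples: @(foo|bar), !(*.sh), and @(foo|), whose second arm is empty.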
    left_token = self.cur_token
    arms = []
    part = ast.ExtGlobPart(left_token, arms)  # return value
    part.spids.append(left_token.span_id)

    self.lexer.PushHint(Id.Op_RParen, Id.Right_ExtGlob)
    self._Next(lex_mode_e.EXTGLOB)  # advance past LEFT

    read_word = False  # did we just read a word?  To handle @(||).

    while True:
      self._Peek()
      #log('t %r', self.cur_token)

      if self.token_type == Id.Right_ExtGlob:
        if not read_word:
          arms.append(ast.CompoundWord())
        part.spids.append(self.cur_token.span_id)
        break

      elif self.token_type == Id.Op_Pipe:
        if not read_word:
          arms.append(ast.CompoundWord())
        read_word = False
        self._Next(lex_mode_e.EXTGLOB)

      # lex mode EXTGLOB should only produce these 4 kinds of tokens
      elif self.token_kind in (Kind.Lit, Kind.Left, Kind.VSub, Kind.ExtGlob):
        w = self._ReadCompoundWord(lex_mode=lex_mode_e.EXTGLOB)
        arms.append(w)
        read_word = True

      elif self.token_kind == Kind.Eof:
        self.AddErrorContext(
            'Unexpected EOF reading extended glob that began here',
            token=left_token)
        return None

      else:
        raise AssertionError('Unexpected token %r' % self.cur_token)

    return part

  def _ReadDoubleQuotedPart(self, eof_type=Id.Undefined_Tok, here_doc=False):
    """
    Args:
      eof_type: for stopping at }, Id.Lit_RBrace
      here_doc: Whether we are reading in a here doc context

    Also handles ${foo%%a b c} -- the argument is treated as double quoted
    until you hit the closing }.
    """
    quoted_part = ast.DoubleQuotedPart()
    left_spid = const.NO_INTEGER
    right_spid = const.NO_INTEGER  # gets set later

    if self.cur_token is not None:  # None in the here doc case
      left_spid = self.cur_token.span_id

    done = False
    while not done:
      self._Next(lex_mode_e.DQ)
      self._Peek()
      #print(self.cur_token)

      if self.token_type == eof_type:  # e.g. stop at }
        done = True
        continue

      elif self.token_kind == Kind.Lit:
        if self.token_type == Id.Lit_EscapedChar:
          part = ast.EscapedLiteralPart(self.cur_token)
        else:
          part = ast.LiteralPart(self.cur_token)
        quoted_part.parts.append(part)

      elif self.token_kind == Kind.Left:
        part = self._ReadDoubleQuotedLeftParts()
        if not part:
          return None
        quoted_part.parts.append(part)

      elif self.token_kind == Kind.VSub:
        part = ast.SimpleVarSub(self.cur_token)
        quoted_part.parts.append(part)

      elif self.token_kind == Kind.Right:
        assert self.token_type == Id.Right_DoubleQuote
        if here_doc:
          # Turn Id.Right_DoubleQuote into a literal part
          quoted_part.parts.append(ast.LiteralPart(self.cur_token))
        else:
          done = True  # assume Id.Right_DoubleQuote
          right_spid = self.cur_token.span_id

      elif self.token_kind == Kind.Eof:
        if here_doc:  # here docs will have an EOF in their token stream
          done = True
        else:
          self.AddErrorContext(
              'Unexpected EOF reading double-quoted string that began here',
              span_id=left_spid)
          return False

      else:
        raise AssertionError(self.cur_token)

    quoted_part.spids.extend((left_spid, right_spid))
    return quoted_part

  def _ReadCommandSubPart(self, token_type):
    """
    NOTE: This is not in the grammar, because word parts aren't in the
    grammar!

    command_sub = '$(' command_list ')'
    """
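    # Also reached for `...` backticks and the process subs <(cmd) and >(cmd),
    # which share this code path (see the Id.Left_ProcSub* cases below).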
    left_token = self.cur_token
    left_spid = left_token.span_id

    #print('_ReadCommandSubPart', self.cur_token)
    self._Next(lex_mode_e.OUTER)  # advance past $( or `

    # Set the lexer in a state so ) becomes the EOF token.
    #print('_ReadCommandSubPart lexer.PushHint ) -> EOF')
    if token_type in (
        Id.Left_CommandSub, Id.Left_ProcSubIn, Id.Left_ProcSubOut):
      self.lexer.PushHint(Id.Op_RParen, Id.Eof_RParen)
    elif token_type == Id.Left_Backtick:
      self.lexer.PushHint(Id.Left_Backtick, Id.Eof_Backtick)
    else:
      raise AssertionError(self.token_type)

    from osh import parse_lib
    c_parser = parse_lib.MakeParserForCommandSub(self.line_reader, self.lexer)

    node = c_parser.ParseWholeFile()  # `` and $() allowed
    if not node:
      # Examples of parse errors:
      #   echo $(cat |)  OR
      #   echo `cat |`
      error_stack = c_parser.Error()
      self.error_stack.extend(error_stack)
      print(self.error_stack)
      self.AddErrorContext('Error parsing command list in command sub')
      return None

    # Hm this creates its own word parser, which is thrown away?
    #print('X', self.cur_token)
    right_spid = c_parser.w_parser.cur_token.span_id

    cs_part = ast.CommandSubPart(node, left_token)
    cs_part.spids.append(left_spid)
    cs_part.spids.append(right_spid)
    return cs_part

  def _ReadArithExpr(self, do_next=True):
    """Read and parse an arithmetic expression in various contexts.

    $(( 1+2 ))
    (( a=1+2 ))
    ${a[ 1+2 ]}
    ${a : 1+2 : 1+2}

    See tests/arith-context.test.sh for ambiguous cases.

    ${a[a[0]]} is valid  # VS_RBRACKET vs Id.Arith_RBracket

    ${s : a<b?0:1 : 1}   # VS_COLON vs Id.Arith_Colon

    TODO: Instead of having an eof_type, I think we should just run the arith
    parser until it's done.  That will take care of both : and ].  We switch
    the state back.

    See the assertion in ArithParser.Parse() -- unexpected extra input.
    """
    if do_next:
      self._Next(lex_mode_e.ARITH)
    # calls self.ReadWord(lex_mode_e.ARITH)
    a_parser = tdop.TdopParser(arith_parse.SPEC, self)
    anode = a_parser.Parse()
    if not anode:
      error_stack = a_parser.Error()
      self.error_stack.extend(error_stack)
    return anode  # could be None

  def _ReadArithSubPart(self):
    """
    Read an arith substitution, which contains an arith expression, e.g.
    $((a + 1)).
    """
    left_span_id = self.cur_token.span_id

    # The second ) needs to be disambiguated in cases like:
    #   $(echo $(( 1+2 )) )
    self.lexer.PushHint(Id.Op_RParen, Id.Right_ArithSub)

    # NOTE: To disambiguate $(( as arith sub vs. command sub and subshell, we
    # could save the lexer/reader state here, and retry if the arithmetic
    # parse fails.  But we can almost always catch this at parse time.  There
    # could be some exceptions like:
    #   $((echo * foo))  # looks like multiplication
    #   $((echo / foo))  # looks like division

    anode = self._ReadArithExpr()
    if not anode:
      self.AddErrorContext("Error parsing arith sub part")
      return None

    if self.token_type != Id.Arith_RParen:
      self._BadToken('Expected first paren to end arith sub, got %s',
                     self.cur_token)
      return None

    self._Next(lex_mode_e.OUTER)  # TODO: This could be DQ or ARITH too

    # PROBLEM: $(echo $(( 1 + 2 )) )
    # Two right parens break the Id.Eof_RParen scheme
    self._Peek()
    if self.token_type != Id.Right_ArithSub:
      self._BadToken('Expected second paren to end arith sub, got %s',
                     self.cur_token)
      return None
    right_span_id = self.cur_token.span_id

    node = ast.ArithSubPart(anode)
    node.spids.append(left_span_id)
    node.spids.append(right_span_id)
    return node

  def _ReadArithSub2Part(self):
    """Non-standard arith sub $[a + 1]."""
    left_span_id = self.cur_token.span_id

    anode = self._ReadArithExpr()
    if not anode:
      self.AddErrorContext("Error parsing arith sub part")
      return None

    if self.token_type != Id.Arith_RBracket:
      self.AddErrorContext("Expected ], got %s", self.cur_token)
      return None
    right_span_id = self.cur_token.span_id

    node = ast.ArithSubPart(anode)
    node.spids.append(left_span_id)
    node.spids.append(right_span_id)
    return node

  def ReadDParen(self):
    """Read ((1+ 2)) -- command context.

    We're using the word parser because it's very similar to _ReadArithExpr
    above.
    """
    # The second ) needs to be disambiguated, as in _ReadArithSubPart above.
    # TODO: Be consistent with ReadForExpression below and use
    # lex_mode_e.ARITH?  Then you can get rid of this.
    self.lexer.PushHint(Id.Op_RParen, Id.Op_DRightParen)

    anode = self._ReadArithExpr()
    if not anode:
      self.AddErrorContext("Error parsing dparen statement")
      return None

    #print('xx ((', self.cur_token)
    if self.token_type != Id.Arith_RParen:
      self._BadToken('Expected first paren to end arith sub, got %s',
                     self.cur_token)
      return None
    self._Next(lex_mode_e.OUTER)

    # PROBLEM: $(echo $(( 1 + 2 )) )
    self._Peek()
    if self.token_type != Id.Op_DRightParen:
      self._BadToken('Expected second paren to end arith sub, got %s',
                     self.cur_token)
      return None
    self._Next(lex_mode_e.OUTER)

    return anode

  def ReadForExpression(self):
    """Read ((i=0; i<5; ++i)) -- part of command context."""
    # No PushHint because we're in arith state.
    #self.lexer.PushHint(Id.Op_RParen, Id.Op_DRightParen)

    self._Next(lex_mode_e.ARITH)  # skip over ((

    self._Peek()
    if self.token_type == Id.Arith_Semi:
      #print('Got empty init')
      init_node = None
    else:
      init_node = self._ReadArithExpr(do_next=False)
      if not init_node:
        self.AddErrorContext("Error parsing for init")
        return None
    self._Next(lex_mode_e.ARITH)
    #print('INIT', init_node)

    self._Peek()
    if self.token_type == Id.Arith_Semi:
      #print('Got empty condition')
      cond_node = None
    else:
      cond_node = self._ReadArithExpr(do_next=False)
      if not cond_node:
        self.AddErrorContext("Error parsing for cond")
        return None
    self._Next(lex_mode_e.ARITH)
    #print('COND', cond_node)

    self._Peek()
    if self.token_type == Id.Arith_RParen:
      #print('Got empty update')
      update_node = None
    else:
      update_node = self._ReadArithExpr(do_next=False)
      if not update_node:
        self.AddErrorContext("Error parsing for update")
        return None
    self._Next(lex_mode_e.ARITH)
    #print('UPDATE', update_node)

    #print('TT', self.cur_token)
    # Second paren
    self._Peek()
    if self.token_type != Id.Arith_RParen:
      self._BadToken('Expected right paren to end for loop expression, got %s',
                     self.cur_token)
      return None
    self._Next(lex_mode_e.OUTER)

    return ast.ForExpr(init_node, cond_node, update_node)

  def _ReadArrayLiteralPart(self):
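    # Reads the parenthesized word list of an array literal, e.g. the (1 2 3)
    # in a=(1 2 3).  Unlike command parsing, embedded newlines are allowed.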
    self._Next(lex_mode_e.OUTER)  # advance past (
    self._Peek()
    if self.cur_token.id != Id.Op_LParen:
      self.AddErrorContext('Expected ( after =', token=self.cur_token)
      return None

    # MUST use a new word parser (with the same lexer).
    w_parser = WordParser(self.lexer, self.line_reader)
    words = []
    while True:
      w = w_parser.ReadWord(lex_mode_e.OUTER)
      if not w:
        self.error_stack.extend(w_parser.Error())
        return None

      if w.tag == word_e.TokenWord:
        word_id = word.CommandId(w)
        if word_id == Id.Right_ArrayLiteral:
          break
        # Unlike command parsing, array parsing allows embedded \n.
        elif word_id == Id.Op_Newline:
          continue
        else:
          self.AddErrorContext(
              'Unexpected word in array literal: %s', w, word=w)
          return None

      words.append(w)

    words2 = braces.BraceDetectAll(words)
    words3 = word.TildeDetectAll(words2)

    return ast.ArrayLiteralPart(words3)

  def _ReadCompoundWord(self, eof_type=Id.Undefined_Tok,
                        lex_mode=lex_mode_e.OUTER, empty_ok=True):
    """
    Precondition: Looking at the first token of the first word part
    Postcondition: Looking at the token after, e.g. space or operator

    NOTE: eof_type is necessary because / is a literal, i.e. Lit_Slash, but it
    could be an operator delimiting a compound word.  Can we change lexer
    modes and remove this special case?
    """
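    # For example (illustrative), foo$bar"baz" is one CompoundWord with three
    # parts: a LiteralPart, a SimpleVarSub, and a DoubleQuotedPart.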
    #print('_ReadCompoundWord', lex_mode)
    word = ast.CompoundWord()

    num_parts = 0
    done = False
    while not done:
      allow_done = empty_ok or num_parts != 0
      self._Peek()
      #print('CW', self.cur_token)
      if allow_done and self.token_type == eof_type:
        done = True  # e.g. for ${foo//pat/replace}

      # Keywords like "for" are treated like literals
      elif self.token_kind in (
          Kind.Lit, Kind.KW, Kind.Assign, Kind.ControlFlow, Kind.BoolUnary,
          Kind.BoolBinary):
        if self.token_type == Id.Lit_EscapedChar:
          part = ast.EscapedLiteralPart(self.cur_token)
        else:
          part = ast.LiteralPart(self.cur_token)
          #part.xspans.append(self.cur_token.span_id)

        word.parts.append(part)

        if self.token_type == Id.Lit_VarLike:
          #print('@', self.cursor)
          #print('@', self.cur_token)

          t = self.lexer.LookAhead(lex_mode_e.OUTER)
          if t.id == Id.Op_LParen:
            self.lexer.PushHint(Id.Op_RParen, Id.Right_ArrayLiteral)
            part2 = self._ReadArrayLiteralPart()
            if not part2:
              self.AddErrorContext('_ReadArrayLiteralPart failed')
              return False
            word.parts.append(part2)

      elif self.token_kind == Kind.VSub:
        part = ast.SimpleVarSub(self.cur_token)
        word.parts.append(part)

      elif self.token_kind == Kind.ExtGlob:
        part = self._ReadExtGlobPart()
        if not part:
          return None
        word.parts.append(part)

      elif self.token_kind == Kind.Left:
        #print('_ReadLeftParts')
        part = self._ReadLeftParts()
        if not part:
          return None
        word.parts.append(part)

      # NOT done yet, will advance below
      elif self.token_kind == Kind.Right:
        # Still part of the word; will be done on the next iter.
        if self.token_type == Id.Right_DoubleQuote:
          pass
        elif self.token_type == Id.Right_CommandSub:
          pass
        elif self.token_type == Id.Right_Subshell:
          # LEXER HACK for (case x in x) ;; esac )
          assert self.next_lex_mode is None  # Rewind before it's used
          if self.lexer.MaybeUnreadOne():
            self.lexer.PushHint(Id.Op_RParen, Id.Right_Subshell)
            self._Next(lex_mode)
          done = True
        else:
          done = True

      elif self.token_kind == Kind.Ignored:
        done = True

      else:
        # LEXER HACK for unbalanced case clause.  'case foo in esac' is valid,
        # so to test for ESAC, we can read ) before getting a chance to
        # PushHint(Id.Op_RParen, Id.Right_CasePat).  So here we unread one
        # token and do it again.

        # We get Id.Op_RParen at top level:     case x in x) ;; esac
        # We get Id.Eof_RParen inside ComSub:   $(case x in x) ;; esac )
        if self.token_type in (Id.Op_RParen, Id.Eof_RParen):
          assert self.next_lex_mode is None  # Rewind before it's used
          if self.lexer.MaybeUnreadOne():
            if self.token_type == Id.Eof_RParen:
              # Redo translation
              self.lexer.PushHint(Id.Op_RParen, Id.Eof_RParen)
            self._Next(lex_mode)

        done = True  # anything we don't recognize means we're done

      if not done:
        self._Next(lex_mode)
        num_parts += 1
    return word

  def _ReadArithWord(self):
    """Helper function for ReadWord() in lex_mode_e.ARITH."""
    #assert self.token_type != Id.Undefined_Tok
    self._Peek()
    #print('_ReadArithWord', self.cur_token)

    if self.token_kind == Kind.Unknown:
      self.AddErrorContext("Unknown token in arith context: %s",
                           self.cur_token, token=self.cur_token)
      return None, False

    elif self.token_kind == Kind.Eof:
      # Just return the EOF token
      w = ast.TokenWord(self.cur_token)
      return w, False
      #self.AddErrorContext("Unexpected EOF in arith context: %s",
      #                     self.cur_token, token=self.cur_token)
      #return None, False

    elif self.token_kind == Kind.Ignored:
      # Space should be ignored.  TODO: change this to SPACE_SPACE and
      # SPACE_NEWLINE?  or SPACE_TOK.
      self._Next(lex_mode_e.ARITH)
      return None, True  # Tell wrapper to try again

    elif self.token_kind in (Kind.Arith, Kind.Right):
      # Id.Right_ArithSub IS just a normal token, handled by ArithParser
      self._Next(lex_mode_e.ARITH)
      w = ast.TokenWord(self.cur_token)
      return w, False

    elif self.token_kind in (Kind.Lit, Kind.Left):
      w = self._ReadCompoundWord(lex_mode=lex_mode_e.ARITH)
      if not w:
        return None, True
      return w, False

    elif self.token_kind == Kind.VSub:
      part = ast.SimpleVarSub(self.cur_token)
      self._Next(lex_mode_e.ARITH)
      w = ast.CompoundWord([part])
      return w, False

    else:
      self._BadToken("Unexpected token parsing arith sub: %s", self.cur_token)
      return None, False

    raise AssertionError("Shouldn't get here")

  def _ReadWord(self, lex_mode):
    """Helper function for ReadWord().

    Returns:
      2-tuple (word, need_more)
        word: Word, or None if there was an error, or need_more is set
        need_more: True if the caller should call us again
    """
    #print('_Read', lex_mode, self.cur_token)
    self._Peek()

    if self.token_kind == Kind.Eof:
      # No advance
      return ast.TokenWord(self.cur_token), False

    # Allow Arith for ) at end of for loop?
    elif self.token_kind in (Kind.Op, Kind.Redir, Kind.Arith):
      self._Next(lex_mode)
      if self.token_type == Id.Op_Newline:
        if self.cursor_was_newline:
          #print('SKIP(nl)', self.cur_token)
          return None, True

      return ast.TokenWord(self.cur_token), False

    elif self.token_kind == Kind.Right:
      #print('WordParser.Read: Kind.Right', self.cur_token)
      if self.token_type not in (
          Id.Right_Subshell, Id.Right_FuncDef, Id.Right_CasePat,
          Id.Right_ArrayLiteral):
        raise AssertionError(self.cur_token)

      self._Next(lex_mode)
      return ast.TokenWord(self.cur_token), False

    elif self.token_kind in (Kind.Ignored, Kind.WS):
      self._Next(lex_mode)
      return None, True  # tell ReadWord() to try again

    elif self.token_kind in (
        Kind.VSub, Kind.Lit, Kind.Left, Kind.KW, Kind.Assign,
        Kind.ControlFlow, Kind.BoolUnary, Kind.BoolBinary, Kind.ExtGlob):
      # We're beginning a word.  If we see Id.Lit_Pound, change to
      # lex_mode_e.COMMENT and read until the end of the line.  (TODO: How to
      # add comments to the AST?)

      # TODO: Can we do the same thing for Tilde here?  Enter a state where
      # we look for / too.
      if self.token_type == Id.Lit_Pound:
        self._Next(lex_mode_e.COMMENT)
        self._Peek()

        # NOTE: The # could be the last character in the file.  It can't be
        # Eof_{RParen,Backtick} because #) and #` are comments.
        assert self.token_type in (Id.Ignored_Comment, Id.Eof_Real), \
            self.cur_token

        # The next iteration will go into Kind.Ignored and set the lex state
        # to lex_mode_e.OUTER/etc.
        return None, True  # tell ReadWord() to try again after the comment

      else:
        w = self._ReadCompoundWord(lex_mode=lex_mode)
        if not w:
          self.AddErrorContext(
              'Error reading command word', token=self.cur_token)
          return None, False
        return w, False

    else:
      raise AssertionError(
          'Unhandled: %s (%s)' % (self.cur_token, self.token_kind))

    raise AssertionError("Shouldn't get here")

  def LookAhead(self):
    """Look ahead to the next token.

    For the command parser to recognize func () { } and array= (1 2 3).  And
    probably coprocesses.
    """
    assert self.token_type != Id.Undefined_Tok
    if self.cur_token.id == Id.WS_Space:
      t = self.lexer.LookAhead(lex_mode_e.OUTER)
    else:
      t = self.cur_token
    return t.id

  def ReadWord(self, lex_mode):
    """Read the next Word.

    Returns:
      Word, or None if there was an error
    """
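    # Typical usage (illustrative): the command parser calls
    # ReadWord(lex_mode_e.OUTER) repeatedly; each call returns the next
    # CompoundWord or TokenWord, or None on a parse error.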
    # Implementation note: This is a stateful/iterative function that calls
    # the stateless _ReadWord function.
    while True:
      if lex_mode == lex_mode_e.ARITH:
        # TODO: Can this be unified?
        w, need_more = self._ReadArithWord()
      elif lex_mode in (
          lex_mode_e.OUTER, lex_mode_e.DBRACKET, lex_mode_e.BASH_REGEX):
        w, need_more = self._ReadWord(lex_mode)
      else:
        raise AssertionError('Invalid lex state %s' % lex_mode)
      if not need_more:
        break

    if not w:  # Assumes AddErrorContext was already called
      return None

    self.cursor = w

    # TODO: Do consolidation of newlines in the lexer?
    # Note that there can be an infinite (Id.Ignored_Comment Id.Op_Newline
    # Id.Ignored_Comment Id.Op_Newline) sequence, so we have to keep track of
    # the last non-ignored token.
    self.cursor_was_newline = (word.CommandId(self.cursor) == Id.Op_Newline)
    return self.cursor

  def ReadHereDocBody(self):
    """
    Sort of like ReadWord(), except we're in a double quoted context, but not
    using double quotes.

    Returns:
      CompoundWord.  NOTE: We could also just use a DoubleQuotedPart for both
      cases?
    """
    w = ast.CompoundWord()
    dq = self._ReadDoubleQuotedPart(here_doc=True)
    if not dq:
      self.AddErrorContext('Error parsing here doc body')
      return False
    w.parts.append(dq)
    return w
|