# Copyright 2016 Andy Chu. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
"""
lexer.py - Library for lexing.
"""

from _devbuild.gen.syntax_asdl import Token, SourceLine
from _devbuild.gen.types_asdl import lex_mode_t, lex_mode_e
from _devbuild.gen.id_kind_asdl import Id_t, Id, Id_str
from mycpp.mylib import log
from frontend import match

unused = log, Id_str

from typing import List, Tuple, Counter, TYPE_CHECKING
if TYPE_CHECKING:
    from core.alloc import Arena
    from frontend.reader import _Reader

#
# Optimized Token functions that use str.find(substr, start, end) to avoid
# allocating a temporary slice
#


def IsPlusEquals(tok):
    # type: (Token) -> bool
    """Common pattern to test if we got foo= or foo+=."""
    i = tok.col + tok.length - 2  # 'foo+='[-2] is '+'
    return tok.line.content.find('+', i, i + 1) != -1
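
# Illustrative example (added comment, hypothetical token): for a token
# spanning 'foo+=' with col=0 and length=5, i = 0 + 5 - 2 = 3, and
# 'foo+='.find('+', 3, 4) == 3, so IsPlusEquals() returns True.  For 'foo='
# (length 4), the byte at i=2 is 'o', find() returns -1, and the function
# returns False -- all without allocating a slice.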


def TokenContains(tok, substr):
    # type: (Token, str) -> bool
    return tok.line.content.find(substr, tok.col, tok.col + tok.length) != -1


def TokenEquals(tok, s):
    # type: (Token, str) -> bool
    if len(s) != tok.length:
        return False
    return TokenContains(tok, s)


def TokenStartsWith(tok, s):
    # type: (Token, str) -> bool
    return tok.line.content.find(s, tok.col, tok.col + len(s)) != -1


def TokenEndsWith(tok, s):
    # type: (Token, str) -> bool
    end = tok.col + tok.length
    return tok.line.content.find(s, end - len(s), end) != -1
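
# Example of the predicates above (added comment, hypothetical token): if tok
# spans '--verbose' on its line, then TokenStartsWith(tok, '--'),
# TokenEndsWith(tok, 'verbose'), and TokenEquals(tok, '--verbose') are all
# True, and each check runs against tok.line.content in place, with no
# intermediate string allocated.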


# Also: IsWhitespace, IsLeadingSpace


def TokenVal(tok):
    # type: (Token) -> str
    """Compute string value on demand."""
    return tok.line.content[tok.col:tok.col + tok.length]


def TokenSliceLeft(tok, left_index):
    # type: (Token, int) -> str
    """Slice token directly, without creating intermediate string."""
    assert left_index > 0
    start = tok.col + left_index
    return tok.line.content[start:tok.col + tok.length]


def TokenSliceRight(tok, right_index):
    # type: (Token, int) -> str
    """Slice token directly, without creating intermediate string."""
    assert right_index < 0
    end = tok.col + tok.length + right_index
    return tok.line.content[tok.col:end]


def TokenSlice(tok, left, right):
    # type: (Token, int, int) -> str
    """Slice token directly, without creating intermediate string."""
    assert left > 0, left
    assert right < 0, right
    start = tok.col + left
    end = tok.col + tok.length + right
    return tok.line.content[start:end]
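
# Worked example (added comment, hypothetical token): if tok spans '${x}'
# with col=5 and length=4, then TokenSliceLeft(tok, 2) returns 'x}',
# TokenSliceRight(tok, -1) returns '${x', and TokenSlice(tok, 2, -1)
# returns 'x' -- each computed as a single slice of tok.line.content.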


def LazyStr(tok):
    # type: (Token) -> str
    """Materialize the tval on demand, with a special case for $myvar.

    Most tokens do NOT need strings. We avoid allocating them in the lexer.

    Note: SingleQuoted could have a lazy sval, NOT at the token level.
    """
    if 0:
        LAZY_ID_HIST[tok.id] += 1

    if tok.tval is None:
        if tok.id in (Id.VSub_DollarName, Id.VSub_Number):  # $x or $2
            # Special case for SimpleVarSub - completion also relies on this
            tok.tval = TokenSliceLeft(tok, 1)
        else:
            tok.tval = TokenVal(tok)

    return tok.tval
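
# Example (added comment): for a hypothetical Id.VSub_DollarName token
# spanning '$myvar', LazyStr() strips the leading '$' and caches 'myvar' in
# tok.tval; a second call returns the cached value without slicing again.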


def DummyToken(id_, val):
    # type: (int, str) -> Token

    col = -1
    length = -1
    return Token(id_, length, col, None, val)


class LineLexer(object):

    def __init__(self, arena):
        # type: (Arena) -> None
        self.arena = arena
        self.replace_last_token = False  # For MaybeUnreadOne

        # Singleton instance because we don't allow globals.
        # 2023-09: I tried LineLexer::Read() returning None, but that is
        # subtly incorrect, e.g. in Lexer::Read() with NUL bytes.
        self.eol_tok = DummyToken(Id.Eol_Tok, '')

        self.Reset(None, 0)  # Invalid src_line to start

    def __repr__(self):
        # type: () -> str
        return '<LineLexer at pos %d of line %r>' % (self.line_pos,
                                                     self.src_line)

    def Reset(self, src_line, line_pos):
        # type: (SourceLine, int) -> None
        self.src_line = src_line
        self.line_pos = line_pos

    def MaybeUnreadOne(self):
        # type: () -> bool
        """Return True if we can unread one character, or False otherwise.

        NOTE: Only call this when you know the last token was exactly one
        character!
        """
        if self.line_pos == 0:
            return False
        else:
            self.line_pos -= 1
            self.replace_last_token = True  # don't add the next token to the arena
            return True

    def GetEofToken(self, id_):
        # type: (int) -> Token
        """Create a new span ID for syntax errors involving the EOF token."""
        if self.src_line is None:
            # There are ZERO lines now.  Add a dummy line 0 so the Token has
            # a source line to display errors against.
            src_line = self.arena.AddLine('', 0)
        else:
            src_line = self.src_line

        return self.arena.NewToken(id_, self.line_pos, 0, src_line)

    def LookAheadOne(self, lex_mode):
        # type: (lex_mode_t) -> Id_t
        """Look ahead exactly one token in the given lexer mode."""
        pos = self.line_pos
        line_str = self.src_line.content
        n = len(line_str)
        if pos == n:
            return Id.Unknown_Tok
        else:
            tok_type, _ = match.OneToken(lex_mode, line_str, pos)
            return tok_type

    def AssertAtEndOfLine(self):
        # type: () -> None
        assert self.line_pos == len(self.src_line.content), \
            '%d %s' % (self.line_pos, self.src_line.content)

    def LookPastSpace(self, lex_mode):
        # type: (lex_mode_t) -> Id_t
        """Look ahead in the current line for a non-space token, using the
        given lexer mode.

        Does NOT advance self.line_pos.

        Called with at least the following modes:
          lex_mode_e.Arith -- for ${a[@]} vs ${a[1+2]}
          lex_mode_e.VSub_1
          lex_mode_e.ShCommand

        Note: Only ShCommand emits Id.WS_Space; other lexer modes don't.
        """
        pos = self.line_pos
        line_str = self.src_line.content
        n = len(line_str)
        #print('Look ahead from pos %d, line %r' % (pos, self.line))
        while True:
            if pos == n:
                # We don't allow lookahead while already at the end of the
                # line, because it would involve interacting with the line
                # reader, and we never need it.  In lex_mode_e.ShCommand
                # there is an explicit newline token, but lex_mode_e.Arith
                # doesn't have one.
                return Id.Unknown_Tok

            tok_type, end_pos = match.OneToken(lex_mode, line_str, pos)

            # NOTE: Instead of hard-coding these tokens, we could pass them in:
            #   LookPastSpace(lex_mode, past_token_type)
            # - Id.WS_Space is only emitted in lex_mode_e.ShCommand
            # - Id.Ignored_Space is emitted in lex_mode_e.Expr
            if tok_type != Id.WS_Space and tok_type != Id.Ignored_Space:
                break
            pos = end_pos

        return tok_type

    def LookAheadFuncParens(self, unread):
        # type: (int) -> bool
        """For finding the () in 'f ( ) { echo hi; }'.

        Args:
          unread: either 0 or 1, the number of characters to go back

        The lookahead is limited to the current line, which sacrifices a rare
        corner case.  This is NOT recognized as a function:

            foo\
            () {}

        whereas this is:

            foo()
            {}
        """
        pos = self.line_pos - unread
        assert pos > 0, pos
        tok_type, _ = match.OneToken(lex_mode_e.FuncParens,
                                     self.src_line.content, pos)
        return tok_type == Id.LookAhead_FuncParens

    def ByteLookAhead(self):
        # type: () -> str
        """Look ahead a single byte.

        Useful when you know the token is one char.
        """
        pos = self.line_pos
        if pos == len(self.src_line.content):
            return ''
        else:
            return self.src_line.content[pos]

    def ByteLookBack(self):
        # type: () -> int
        """A little hack for stricter proc arg list syntax.

        There has to be a space before the paren.

        Yes: json write (x)
        No:  json write(x)
        """
        pos = self.line_pos - 2
        if pos < 0:
            return -1
        else:
            return ord(self.src_line.content[pos])
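    # Added note (my reading of the code, not from the original): when this
    # is called, line_pos is just past a one-byte token like '(', so
    # line_pos - 2 is the byte *before* that token -- e.g. in
    # 'json write (x)' it is the space, and ord(' ') == 32 is returned.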

    def Read(self, lex_mode):
        # type: (lex_mode_t) -> Token

        # Inner loop optimization
        if self.src_line:
            line_str = self.src_line.content
        else:
            line_str = ''
        line_pos = self.line_pos

        tok_type, end_pos = match.OneToken(lex_mode, line_str, line_pos)
        if tok_type == Id.Eol_Tok:  # Do NOT add a span for this sentinel!
            # LineLexer tells Lexer to read a new line.
            return self.eol_tok

        # NOTE: We're putting the arena hook in LineLexer and not Lexer
        # because we want it to be "low level".  The only thing fabricated
        # here is a newline added at the last line, so we don't end with \0.
        if self.replace_last_token:  # make another token from the last span
            self.arena.UnreadOne()
            self.replace_last_token = False

        tok_len = end_pos - line_pos
        t = self.arena.NewToken(tok_type, line_pos, tok_len, self.src_line)

        self.line_pos = end_pos
        return t


class Lexer(object):
    """Read lines from the line_reader, split them into tokens with
    line_lexer, and return them in a stream."""

    def __init__(self, line_lexer, line_reader):
        # type: (LineLexer, _Reader) -> None
        """
        Args:
          line_lexer: Underlying object to get tokens from
          line_reader: get new lines from here
        """
        self.line_lexer = line_lexer
        self.line_reader = line_reader

        self.line_id = -1  # Invalid one
        self.translation_stack = []  # type: List[Tuple[Id_t, Id_t]]
        self.emit_comp_dummy = False

    def ResetInputObjects(self):
        # type: () -> None
        self.line_lexer.Reset(None, 0)

    def MaybeUnreadOne(self):
        # type: () -> bool
        return self.line_lexer.MaybeUnreadOne()

    def LookAheadOne(self, lex_mode):
        # type: (lex_mode_t) -> Id_t
        return self.line_lexer.LookAheadOne(lex_mode)

    def LookPastSpace(self, lex_mode):
        # type: (lex_mode_t) -> Id_t
        return self.line_lexer.LookPastSpace(lex_mode)

    def LookAheadFuncParens(self, unread):
        # type: (int) -> bool
        return self.line_lexer.LookAheadFuncParens(unread)

    def ByteLookAhead(self):
        # type: () -> str
        return self.line_lexer.ByteLookAhead()

    def ByteLookBack(self):
        # type: () -> int
        return self.line_lexer.ByteLookBack()

    def EmitCompDummy(self):
        # type: () -> None
        """Emit Id.Lit_CompDummy right before EOF, for completion."""
        self.emit_comp_dummy = True

    def PushHint(self, old_id, new_id):
        # type: (Id_t, Id_t) -> None
        """Push a hint to translate one token ID into another.

        Use cases:
          Id.Op_RParen -> Id.Right_Subshell -- disambiguate
          Id.Op_RParen -> Id.Eof_RParen

        Problems for $() nesting.

        - posix:
          - case foo) and case (foo)
          - func() {}
          - subshell ( )
        - bash extensions:
          - precedence in [[, e.g. [[ (1 == 2) && (2 == 3) ]]
          - arrays: a=(1 2 3), a+=(4 5)
        """
        #log(' PushHint %s ==> %s', Id_str(old_id), Id_str(new_id))
        self.translation_stack.append((old_id, new_id))
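
    # Example of the hint mechanism (added comment): when the parser enters a
    # subshell after seeing '(', it can call
    #
    #     lexer.PushHint(Id.Op_RParen, Id.Right_Subshell)
    #
    # so that the next ')' returned by _Read() pops the top of
    # translation_stack and is relabeled Id.Right_Subshell instead of the
    # generic Id.Op_RParen.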

    def MoveToNextLine(self):
        # type: () -> bool
        """For lookahead on the next line.

        This is required by `ParseYshCase` and is used in
        `_NewlineOkForYshCase`.

        We use this because otherwise calling `LookPastSpace` would return
        `Id.Unknown_Tok` when the lexer has reached the end of the line.
        For example, take this case:

          case (x) {
                   ^--- We are here

            (else) {
            ^--- We want lookahead to here

              echo test
            }
          }

        But without `MoveToNextLine`, it is impossible to peek the '(' without
        consuming it.  And consuming it would be a problem once we want to
        hand off pattern parsing to the expression parser.
        """
        # Only call this when you've seen \n
        self.line_lexer.AssertAtEndOfLine()

        src_line, line_pos = self.line_reader.GetLine()
        if src_line is None:
            return False  # EOF, so we failed to move to the next line

        self.line_lexer.Reset(src_line, line_pos)  # fill with a new line
        return True

    def _Read(self, lex_mode):
        # type: (lex_mode_t) -> Token
        """Read from the normal line buffer, not an alias."""
        t = self.line_lexer.Read(lex_mode)
        if t.id == Id.Eol_Tok:  # We hit \0 aka Eol_Tok, read a new line
            src_line, line_pos = self.line_reader.GetLine()

            if src_line is None:  # no more lines
                if self.emit_comp_dummy:
                    id_ = Id.Lit_CompDummy
                    self.emit_comp_dummy = False  # emit EOF the next time
                else:
                    id_ = Id.Eof_Real
                return self.line_lexer.GetEofToken(id_)

            self.line_lexer.Reset(src_line, line_pos)  # fill with a new line
            t = self.line_lexer.Read(lex_mode)

        # e.g. translate ) or ` into EOF
        if len(self.translation_stack):
            old_id, new_id = self.translation_stack[-1]  # top
            if t.id == old_id:
                #log('==> TRANSLATING %s ==> %s', Id_str(t.id), Id_str(new_id))
                self.translation_stack.pop()
                t.id = new_id

        return t

    def Read(self, lex_mode):
        # type: (lex_mode_t) -> Token
        while True:
            t = self._Read(lex_mode)
            if t.id != Id.Ignored_LineCont:
                break

        if 0:
            ID_HIST[t.id] += 1
        return t


if 0:  # mylib.PYTHON: note: breaks tarball build
    import collections
    ID_HIST = collections.Counter()  # type: Counter[Id_t]
    LAZY_ID_HIST = collections.Counter()  # type: Counter[Id_t]
|