# Copyright 2016 Andy Chu. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
"""
lexer.py - Library for lexing.
"""

from _devbuild.gen.syntax_asdl import Token, SourceLine
from _devbuild.gen.types_asdl import lex_mode_t, lex_mode_e
from _devbuild.gen.id_kind_asdl import Id_t, Id, Id_str, Kind
from asdl import runtime
from mycpp.mylib import log
from frontend import consts
from frontend import match

unused = log, Id_str

from typing import List, Tuple, Optional, Counter, TYPE_CHECKING
if TYPE_CHECKING:
    from core.alloc import Arena
    from frontend.reader import _Reader


def IsPlusEquals(tok):
    # type: (Token) -> bool
    """Common pattern to test if we got foo= or foo+=

    Note: this can be replaced by tok.line.content.find('+', index, index + 1),
    which avoids the string allocation.
    """
    index = tok.col + tok.length - 2
    return tok.line.content[index] == '+'
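
# A worked example of the index arithmetic above (hypothetical values): for a
# token spanning 'foo+=' with col=0 and length=5, index = 0 + 5 - 2 = 3, and
# line.content[3] is '+', so this returns True.  For 'foo=' (length 4),
# index = 2 points at 'o', so it returns False.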

# Also: IsWhitespace, IsLeadingSpace


def TokenEquals(tok, s):
    # type: (Token, str) -> bool

    # TODO: Use tok.line.content.find(s, start, end)

    raise NotImplementedError()


def TokenContains(tok, substr):
    # type: (Token, str) -> bool

    # TODO: Use tok.line.content.find(substr, start, end)

    raise NotImplementedError()


def TokenStartsWith(tok, s):
    # type: (Token, str) -> bool

    # TODO: Use tok.line.content.startswith(s, start, end)

    raise NotImplementedError()


def TokenEndsWith(tok, s):
    # type: (Token, str) -> bool

    # TODO: Use tok.line.content.endswith(s, start, end)

    raise NotImplementedError()


def TokenVal(tok):
    # type: (Token) -> str
    """Compute the string value of a token on demand."""
    return tok.line.content[tok.col:tok.col + tok.length]
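
# For example (hypothetical values): if tok.line.content is 'echo hi' and the
# token is 'hi' (col=5, length=2), TokenVal(tok) returns content[5:7] == 'hi',
# so the value doesn't have to be materialized when the token is created.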


def TokenSliceLeft(tok, left_index):
    # type: (Token, int) -> str
    """Slice the token directly, without creating an intermediate string."""
    assert left_index > 0
    left = tok.col + left_index
    return tok.line.content[left:tok.col + tok.length]


def TokenSliceRight(tok, right_index):
    # type: (Token, int) -> str
    """Slice the token directly, without creating an intermediate string."""
    assert right_index < 0
    right = tok.col + tok.length + right_index
    return tok.line.content[tok.col:right]


def TokenSlice(tok, left, right):
    # type: (Token, int, int) -> str
    """Slice the token directly, without creating an intermediate string."""
    assert left > 0
    assert right < 0
    start = tok.col + left
    end = tok.col + tok.length + right
    return tok.line.content[start:end]
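
# Sketch of the index arithmetic (hypothetical values): for a token spanning
# '${foo}' with col=3 and length=6:
#
#   TokenSliceLeft(tok, 2)    => content[5:9] == 'foo}'
#   TokenSliceRight(tok, -1)  => content[3:8] == '${foo'
#   TokenSlice(tok, 2, -1)    => content[5:8] == 'foo'
#
# i.e. the same results as TokenVal(tok)[2:], [:-1], and [2:-1], with one
# fewer string allocation each.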


def DummyToken(id_, val):
    # type: (int, str) -> Token

    col = -1
    length = -1
    return Token(id_, col, length, runtime.NO_SPID, None, val)


class LineLexer(object):

    def __init__(self, arena):
        # type: (Arena) -> None
        self.arena = arena
        self.replace_last_token = False  # For MaybeUnreadOne

        # Singleton instance because we don't allow globals.
        # 2023-09: I tried LineLexer::Read() returning None, but that is subtly
        # incorrect, e.g. in Lexer::Read() with NUL bytes.
        self.eol_tok = DummyToken(Id.Eol_Tok, '')

        self.Reset(None, 0)  # Invalid src_line to start

    def __repr__(self):
        # type: () -> str
        return '<LineLexer at pos %d of line %r>' % (self.line_pos,
                                                     self.src_line)

    def Reset(self, src_line, line_pos):
        # type: (SourceLine, int) -> None
        #assert line, repr(line)  # can't be empty or None
        self.src_line = src_line
        self.line_pos = line_pos

    def MaybeUnreadOne(self):
        # type: () -> bool
        """Return True if we can unread one character, or False otherwise.

        NOTE: Only call this when you know the last token was exactly one
        character!
        """
        if self.line_pos == 0:
            return False
        else:
            self.line_pos -= 1
            self.replace_last_token = True  # don't add the next token to the arena
            return True

    def GetEofToken(self, id_):
        # type: (int) -> Token
        """Create a new span ID for syntax errors involving the EOF token."""
        if self.src_line is None:
            # There are ZERO lines now.  Add a dummy line 0 so the Token has
            # a source line to display errors on.
            src_line = self.arena.AddLine('', 0)
        else:
            src_line = self.src_line

        return self.arena.NewToken(id_, self.line_pos, 0, src_line, '')

    def LookAheadOne(self, lex_mode):
        # type: (lex_mode_t) -> Id_t
        """Look ahead exactly one token in the given lexer mode."""
        pos = self.line_pos
        line_str = self.src_line.content
        n = len(line_str)
        if pos == n:
            return Id.Unknown_Tok
        else:
            tok_type, _ = match.OneToken(lex_mode, line_str, pos)
            return tok_type

    def AssertAtEndOfLine(self):
        # type: () -> None
        assert self.line_pos == len(self.src_line.content), \
            '%d %s' % (self.line_pos, self.src_line.content)

    def LookPastSpace(self, lex_mode):
        # type: (lex_mode_t) -> Id_t
        """Look ahead in the current line for a non-space token, using the
        given lexer mode.

        Does NOT advance self.line_pos.

        Called with at least the following modes:
          lex_mode_e.Arith -- for ${a[@]} vs ${a[1+2]}
          lex_mode_e.VSub_1
          lex_mode_e.ShCommand

        Note: Only ShCommand emits Id.WS_Space; the other lexer modes don't.
        """
        pos = self.line_pos
        line_str = self.src_line.content
        n = len(line_str)
        #print('Look ahead from pos %d, line %r' % (pos, self.line))
        while True:
            if pos == n:
                # We don't allow lookahead while already at the end of the
                # line, because it would involve interacting with the line
                # reader, and we never need it.  In lex_mode_e.ShCommand,
                # there is an explicit newline token, but lex_mode_e.Arith
                # doesn't have one.
                return Id.Unknown_Tok

            tok_type, end_pos = match.OneToken(lex_mode, line_str, pos)

            # NOTE: Instead of hard-coding these token IDs, we could pass
            # them in: LookPastSpace(lex_mode, past_token_type)
            # - WS_Space is only emitted in lex_mode_e.ShCommand
            # - Id.Ignored_Space is emitted in lex_mode_e.Expr
            if tok_type != Id.WS_Space and tok_type != Id.Ignored_Space:
                break
            pos = end_pos

        return tok_type

    def LookAheadFuncParens(self, unread):
        # type: (int) -> bool
        """For finding the () in 'f ( ) { echo hi; }'.

        Args:
          unread: either 0 or 1, the number of characters to go back

        The lookahead is limited to the current line, which sacrifices a rare
        corner case.  This is NOT recognized as a function:

        foo\
        () {}

        whereas this is:

        foo()
        {}
        """
        pos = self.line_pos - unread
        assert pos > 0
        tok_type, _ = match.OneToken(lex_mode_e.FuncParens,
                                     self.src_line.content, pos)
        return tok_type == Id.LookAhead_FuncParens
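
    # A hypothetical walkthrough of LookAheadFuncParens: while parsing
    # 'f () { echo hi; }', the lexer has just consumed the one-char name 'f',
    # so line_pos is 1 and unread is 0.  The re-scan from position 1 in
    # lex_mode_e.FuncParens is expected to match the ' ()' that follows and
    # yield Id.LookAhead_FuncParens.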

    def ByteLookAhead(self):
        # type: () -> str
        """Look ahead a single byte.

        Useful when you know the token is one char.
        """
        pos = self.line_pos
        if pos == len(self.src_line.content):
            return ''
        else:
            return self.src_line.content[pos]

    def ByteLookBack(self):
        # type: () -> int
        """A little hack for stricter proc arg list syntax.

        There has to be a space before the paren.

        Yes: json write (x)
        No:  json write(x)
        """
        pos = self.line_pos - 2
        if pos < 0:
            return -1
        else:
            return ord(self.src_line.content[pos])
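
    # Why line_pos - 2: after the lexer reads the one-byte '(' token,
    # line_pos points just past it, so line_pos - 1 is the '(' itself and
    # line_pos - 2 is the byte before it.  E.g. for the hypothetical input
    # 'json write (x)', that byte is ' ' (0x20), while for 'json write(x)'
    # it is ord('e').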

    def Read(self, lex_mode):
        # type: (lex_mode_t) -> Token

        # Inner loop optimization
        if self.src_line:
            line_str = self.src_line.content
        else:
            line_str = ''
        line_pos = self.line_pos

        tok_type, end_pos = match.OneToken(lex_mode, line_str, line_pos)
        if tok_type == Id.Eol_Tok:  # Do NOT add a span for this sentinel!
            # LineLexer tells Lexer to read a new line.
            return self.eol_tok

        # TODO: we can inline this function with a formula on the 16-bit Id.
        kind = consts.GetKind(tok_type)

        # Save on allocations!  We often don't look at the token value.
        # Whitelist doesn't work well?  Use a blacklist for now:
        # - Kind.KW is sometimes a literal in a word
        # - Kind.Right is for " in here docs.  Lexer isn't involved.
        # - Got an error with Kind.Left too that I don't understand
        # - Kind.ControlFlow doesn't work because we call word_.StaticEval()
        # if kind in (Kind.Lit, Kind.VSub, Kind.Redir, Kind.Char, Kind.Backtick, Kind.KW, Kind.Right):
        if kind in (Kind.Arith, Kind.Op, Kind.VTest, Kind.VOp0, Kind.VOp2,
                    Kind.VOp3, Kind.WS, Kind.Ignored, Kind.Eof):
            tok_val = None  # type: Optional[str]
        else:
            tok_val = line_str[line_pos:end_pos]

        # NOTE: We put the arena hook in LineLexer and not Lexer because we
        # want it to be "low level".  The only thing fabricated here is the
        # newline added at the last line, so we don't end with \0.
        if self.replace_last_token:  # make another token from the last span
            self.arena.UnreadOne()
            self.replace_last_token = False

        tok_len = end_pos - line_pos
        t = self.arena.NewToken(tok_type, line_pos, tok_len, self.src_line,
                                tok_val)

        self.line_pos = end_pos
        return t
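
    # Note on the tok_val optimization above: for these operator-like Kinds,
    # the value can still be recovered later with TokenVal(), since the Token
    # records its line, col, and length.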


class Lexer(object):
    """Read lines from the line_reader and split them into tokens with the
    line_lexer, returning them in a stream."""

    def __init__(self, line_lexer, line_reader):
        # type: (LineLexer, _Reader) -> None
        """
        Args:
          line_lexer: Underlying object to get tokens from
          line_reader: get new lines from here
        """
        self.line_lexer = line_lexer
        self.line_reader = line_reader

        self.line_id = -1  # Invalid one
        self.translation_stack = []  # type: List[Tuple[Id_t, Id_t]]
        self.emit_comp_dummy = False

    def ResetInputObjects(self):
        # type: () -> None
        self.line_lexer.Reset(None, 0)

    def MaybeUnreadOne(self):
        # type: () -> bool
        return self.line_lexer.MaybeUnreadOne()

    def LookAheadOne(self, lex_mode):
        # type: (lex_mode_t) -> Id_t
        return self.line_lexer.LookAheadOne(lex_mode)

    def LookPastSpace(self, lex_mode):
        # type: (lex_mode_t) -> Id_t
        return self.line_lexer.LookPastSpace(lex_mode)

    def LookAheadFuncParens(self, unread):
        # type: (int) -> bool
        return self.line_lexer.LookAheadFuncParens(unread)

    def ByteLookAhead(self):
        # type: () -> str
        return self.line_lexer.ByteLookAhead()

    def ByteLookBack(self):
        # type: () -> int
        return self.line_lexer.ByteLookBack()

    def EmitCompDummy(self):
        # type: () -> None
        """Emit Id.Lit_CompDummy right before EOF, for completion."""
        self.emit_comp_dummy = True
    def PushHint(self, old_id, new_id):
        # type: (Id_t, Id_t) -> None
        """Use cases:

        Id.Op_RParen -> Id.Right_Subshell -- disambiguate
        Id.Op_RParen -> Id.Eof_RParen

        Problems for $() nesting:

        - posix:
          - case foo) and case (foo)
          - func() {}
          - subshell ( )
        - bash extensions:
          - precedence in [[, e.g. [[ (1 == 2) && (2 == 3) ]]
          - arrays: a=(1 2 3), a+=(4 5)
        """
        #log('  PushHint %s ==> %s', Id_str(old_id), Id_str(new_id))
        self.translation_stack.append((old_id, new_id))
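
    # Hypothetical usage: on entering a subshell after '(', the parser can
    # call PushHint(Id.Op_RParen, Id.Right_Subshell), so the next ')' read
    # comes back as Id.Right_Subshell instead of a plain Id.Op_RParen.  The
    # hint is consumed in _Read(), which pops the stack on a match.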

    def MoveToNextLine(self):
        # type: () -> None
        """For lookahead on the next line.

        This is required by `ParseYshCase` and is used in
        `_NewlineOkForYshCase`.

        We use this because otherwise calling `LookPastSpace` would return
        `Id.Unknown_Tok` when the lexer has reached the end of the line.  For
        example, take this case:

          case (x) {
                    ^--- We are here

            (else) {
            ^--- We want lookahead to here

              echo test
            }
          }

        But without `MoveToNextLine`, it is impossible to peek the '(' without
        consuming it.  And consuming it would be a problem once we want to
        hand off pattern parsing to the expression parser.
        """
        # Only call this when you've seen \n
        self.line_lexer.AssertAtEndOfLine()

        src_line, line_pos = self.line_reader.GetLine()
        self.line_lexer.Reset(src_line, line_pos)  # fill with a new line

    def _Read(self, lex_mode):
        # type: (lex_mode_t) -> Token
        """Read from the normal line buffer, not an alias."""
        t = self.line_lexer.Read(lex_mode)
        if t.id == Id.Eol_Tok:  # We hit \0 aka Eol_Tok, read a new line
            src_line, line_pos = self.line_reader.GetLine()

            if src_line is None:  # no more lines
                if self.emit_comp_dummy:
                    id_ = Id.Lit_CompDummy
                    self.emit_comp_dummy = False  # emit EOF the next time
                else:
                    id_ = Id.Eof_Real
                return self.line_lexer.GetEofToken(id_)

            self.line_lexer.Reset(src_line, line_pos)  # fill with a new line
            t = self.line_lexer.Read(lex_mode)

        # e.g. translate ) or ` into EOF
        if len(self.translation_stack):
            old_id, new_id = self.translation_stack[-1]  # top
            if t.id == old_id:
                #log('==> TRANSLATING %s ==> %s', Id_str(t.id), Id_str(new_id))
                self.translation_stack.pop()
                t.id = new_id

        return t

    def Read(self, lex_mode):
        # type: (lex_mode_t) -> Token
        while True:
            t = self._Read(lex_mode)
            # TODO: Change this to ALL IGNORED types, once you have SPACE_TOK.
            # This means we don't have to handle them in the
            # VSub_1/VSub_2/etc. states.
            if t.id != Id.Ignored_LineCont:
                break

        #ID_HIST[t.id] += 1
        #log('> Read() Returning %s', t)
        return t


if 0:  # would be guarded by mylib.PYTHON, but that breaks the tarball build
    import collections
    ID_HIST = collections.Counter()  # type: Counter[Id_t]
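
# A minimal sketch of how these classes are meant to fit together (the arena
# and line_reader construction is elided and hypothetical):
#
#   line_lexer = LineLexer(arena)
#   lexer = Lexer(line_lexer, line_reader)
#   while True:
#       tok = lexer.Read(lex_mode_e.ShCommand)
#       if tok.id == Id.Eof_Real:
#           break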
|