# Copyright 2016 Andy Chu. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
"""
lexer.py - Library for lexing.
"""

from _devbuild.gen.syntax_asdl import Token, WideToken, SourceLine
from _devbuild.gen.types_asdl import lex_mode_t, lex_mode_e
from _devbuild.gen.id_kind_asdl import Id_t, Id, Id_str
from mycpp.mylib import log
from frontend import match

unused = log, Id_str

from typing import List, Tuple, Counter, TYPE_CHECKING
if TYPE_CHECKING:
    from core.alloc import Arena
    from frontend.reader import _Reader

#
# Optimized Token functions that use str.find(substr, start, end) to avoid
# allocating a temporary slice
#
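# Example of the pattern (added, illustrative): on a line whose content is
# 'foo+=bar', a token with col=0 and length=5 covers 'foo+='.  Then
#
#     tok.line.content[0:5].find('+') != -1     # allocates the slice 'foo+='
#     tok.line.content.find('+', 0, 5) != -1    # same answer, no allocation
#
# str.find(sub, start, end) only matches sub lying entirely within
# [start, end), so the two forms always agree.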


def IsPlusEquals(tok):
    # type: (Token) -> bool
    """Common pattern to test if we got foo= or foo+=."""
    i = tok.col + tok.length - 2  # 'foo+='[-2] is '+'
    return tok.line.content.find('+', i, i + 1) != -1


def TokenContains(tok, substr):
    # type: (Token, str) -> bool
    return tok.line.content.find(substr, tok.col, tok.col + tok.length) != -1


def TokenEquals(tok, s):
    # type: (Token, str) -> bool
    if len(s) != tok.length:
        return False
    return TokenContains(tok, s)


def TokenStartsWith(tok, s):
    # type: (Token, str) -> bool
    return tok.line.content.find(s, tok.col, tok.col + len(s)) != -1


def TokenEndsWith(tok, s):
    # type: (Token, str) -> bool
    end = tok.col + tok.length
    return tok.line.content.find(s, end - len(s), end) != -1


# Also: IsWhitespace, IsLeadingSpace


def TokenVal(tok):
    # type: (Token) -> str
    """Compute the string value on demand."""
    return tok.line.content[tok.col:tok.col + tok.length]
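# Example (added, illustrative): on a line whose content is 'echo hi', a
# token with col=5 and length=2 covers 'hi'.  TokenVal(tok) == 'hi', and
# TokenStartsWith(tok, 'h'), TokenEndsWith(tok, 'i'), and
# TokenEquals(tok, 'hi') are all True -- none of them allocates a slice
# except TokenVal() itself.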


def TokenSliceLeft(tok, left_index):
    # type: (Token, int) -> str
    """Slice a token directly, without creating an intermediate string."""
    assert left_index > 0
    start = tok.col + left_index
    return tok.line.content[start:tok.col + tok.length]


def TokenSliceRight(tok, right_index):
    # type: (Token, int) -> str
    """Slice a token directly, without creating an intermediate string."""
    assert right_index < 0
    end = tok.col + tok.length + right_index
    return tok.line.content[tok.col:end]


def TokenSlice(tok, left, right):
    # type: (Token, int, int) -> str
    """Slice a token directly, without creating an intermediate string."""
    assert left > 0, left
    assert right < 0, right
    start = tok.col + left
    end = tok.col + tok.length + right
    return tok.line.content[start:end]
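# Example (added, illustrative): for a token covering the 5 characters
# '"abc"', TokenSliceLeft(tok, 1) == 'abc"', TokenSliceRight(tok, -1) ==
# '"abc', and TokenSlice(tok, 1, -1) == 'abc' -- equivalent to tval[1:-1],
# but without materializing tval first.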


def LazyStr2(tok):
    # type: (WideToken) -> str
    """
    TODO: Remove .tval from Token; use WideToken instead
    """
    return ''


def LazyStr(tok):
    # type: (Token) -> str
    """Materialize the tval on demand, with a special case for $myvar.

    Most tokens do NOT need strings.  We avoid allocating them in the lexer.

    Note: SingleQuoted could have a lazy sval, but NOT at the token level.
    """
    if 0:
        LAZY_ID_HIST[tok.id] += 1

    if tok.tval is None:
        if tok.id in (Id.VSub_DollarName, Id.VSub_Number):  # $x or $2
            # Special case for SimpleVarSub - completion also relies on this
            tok.tval = TokenSliceLeft(tok, 1)
        else:
            tok.tval = TokenVal(tok)

    return tok.tval
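# Example (added, illustrative): for an Id.VSub_DollarName token covering
# '$myvar', LazyStr() strips the leading '$' via TokenSliceLeft(tok, 1) and
# caches 'myvar' in tok.tval; later calls return the cached value.  For
# other token kinds it caches the full TokenVal().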


def DummyToken(id_, val):
    # type: (int, str) -> Token

    col = -1
    length = -1
    return Token(id_, length, col, None, val)


class LineLexer(object):

    def __init__(self, arena):
        # type: (Arena) -> None
        self.arena = arena
        self.replace_last_token = False  # For MaybeUnreadOne

        # Singleton instance because we don't allow globals.
        # 2023-09: I tried LineLexer::Read() returning None, but that is
        # subtly incorrect, e.g. in Lexer::Read() with NUL bytes.
        self.eol_tok = DummyToken(Id.Eol_Tok, '')

        self.Reset(None, 0)  # Invalid src_line to start

    def __repr__(self):
        # type: () -> str
        return '<LineLexer at pos %d of line %r>' % (self.line_pos,
                                                     self.src_line)

    def Reset(self, src_line, line_pos):
        # type: (SourceLine, int) -> None
        self.src_line = src_line
        self.line_pos = line_pos

    def MaybeUnreadOne(self):
        # type: () -> bool
        """Return True if we can unread one character, or False otherwise.

        NOTE: Only call this when you know the last token was exactly one
        character!
        """
        if self.line_pos == 0:
            return False
        else:
            self.line_pos -= 1
            self.replace_last_token = True  # don't add the next token to the arena
            return True
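    # Note on the mechanism (added comment): setting replace_last_token makes
    # the next Read() call self.arena.UnreadOne() before allocating the
    # re-read token, so backing up one character doesn't leave a duplicate
    # token in the arena.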

    def GetEofToken(self, id_):
        # type: (int) -> Token
        """Create a new span ID for syntax errors involving the EOF token."""
        if self.src_line is None:
            # There are ZERO lines now.  Add a dummy line 0 so the Token has
            # a source to display errors.
            src_line = self.arena.AddLine('', 0)
        else:
            src_line = self.src_line

        return self.arena.NewToken(id_, self.line_pos, 0, src_line)

    def LookAheadOne(self, lex_mode):
        # type: (lex_mode_t) -> Id_t
        """Look ahead exactly one token in the given lexer mode."""
        pos = self.line_pos
        line_str = self.src_line.content
        n = len(line_str)
        if pos == n:
            return Id.Unknown_Tok
        else:
            tok_type, _ = match.OneToken(lex_mode, line_str, pos)
            return tok_type

    def AssertAtEndOfLine(self):
        # type: () -> None
        assert self.line_pos == len(self.src_line.content), \
            '%d %s' % (self.line_pos, self.src_line.content)

    def LookPastSpace(self, lex_mode):
        # type: (lex_mode_t) -> Id_t
        """Look ahead in the current line for a non-space token, using the
        given lexer mode.

        Does NOT advance self.line_pos.

        Called with at least the following modes:
          lex_mode_e.Arith -- for ${a[@]} vs ${a[1+2]}
          lex_mode_e.VSub_1
          lex_mode_e.ShCommand

        Note: Only ShCommand emits Id.WS_Space; the other lexer modes don't.
        """
        pos = self.line_pos
        line_str = self.src_line.content
        n = len(line_str)
        #print('Look ahead from pos %d, line %r' % (pos, self.line))
        while True:
            if pos == n:
                # We don't allow lookahead while already at the end of the
                # line, because it would involve interacting with the line
                # reader, and we never need it.  lex_mode_e.ShCommand has an
                # explicit newline token, but lex_mode_e.Arith doesn't.
                return Id.Unknown_Tok

            tok_type, end_pos = match.OneToken(lex_mode, line_str, pos)

            # NOTE: Instead of hard-coding these token IDs, we could pass
            # them in: LookPastSpace(lex_mode, past_token_type)
            # - Id.WS_Space is emitted only in lex_mode_e.ShCommand
            # - Id.Ignored_Space is emitted in lex_mode_e.Expr
            if tok_type != Id.WS_Space and tok_type != Id.Ignored_Space:
                break
            pos = end_pos

        return tok_type
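    # Example (added, illustrative): while parsing '${a[' the parser calls
    # LookPastSpace(lex_mode_e.Arith) to check whether '@' or an arithmetic
    # expression such as '1+2' comes next, skipping any spaces but leaving
    # line_pos untouched.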

    def LookAheadFuncParens(self, unread):
        # type: (int) -> bool
        """For finding the () in 'f ( ) { echo hi; }'.

        Args:
          unread: either 0 or 1, the number of characters to go back

        The lookahead is limited to the current line, which sacrifices a rare
        corner case.  This is NOT recognized as a function:

            foo\
            () {}

        whereas this is:

            foo()
            {}
        """
        pos = self.line_pos - unread
        assert pos > 0, pos
        tok_type, _ = match.OneToken(lex_mode_e.FuncParens,
                                     self.src_line.content, pos)
        return tok_type == Id.LookAhead_FuncParens
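    # Example (added, illustrative): after lexing the word 'f' in
    # 'f ( ) { echo hi; }', the parser calls LookAheadFuncParens(0), which
    # scans the rest of the line in lex_mode_e.FuncParens.  It returns True
    # because '( )' follows, so 'f' is treated as a function definition.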

    def ByteLookAhead(self):
        # type: () -> str
        """Look ahead a single byte.

        Useful when you know the token is one char.
        """
        pos = self.line_pos
        if pos == len(self.src_line.content):
            return ''
        else:
            return self.src_line.content[pos]

    def ByteLookBack(self):
        # type: () -> int
        """A little hack for stricter proc arg list syntax.

        There has to be a space before the paren.

        Yes: json write (x)
        No:  json write(x)
        """
        pos = self.line_pos - 2
        if pos < 0:
            return -1
        else:
            return ord(self.src_line.content[pos])
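    # Example (added, illustrative): after the lexer reads the '(' in
    # 'json write (x)', line_pos is one past the paren, so line_pos - 2
    # indexes the character before it; ByteLookBack() returns ord(' ').
    # For 'json write(x)' it returns ord('e'), so the parser can reject the
    # call for the missing space.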

    def Read(self, lex_mode):
        # type: (lex_mode_t) -> Token

        # Inner loop optimization
        if self.src_line:
            line_str = self.src_line.content
        else:
            line_str = ''
        line_pos = self.line_pos

        tok_type, end_pos = match.OneToken(lex_mode, line_str, line_pos)
        if tok_type == Id.Eol_Tok:  # Do NOT add a span for this sentinel!
            # LineLexer tells Lexer to read a new line.
            return self.eol_tok

        # NOTE: We put the arena hook in LineLexer and not Lexer because we
        # want it to be "low level".  The only thing fabricated here is a
        # newline added at the last line, so we don't end with \0.
        if self.replace_last_token:  # make another token from the last span
            self.arena.UnreadOne()
            self.replace_last_token = False

        tok_len = end_pos - line_pos
        t = self.arena.NewToken(tok_type, line_pos, tok_len, self.src_line)

        self.line_pos = end_pos
        return t


class Lexer(object):
    """Read lines from the line_reader, split them into tokens with
    line_lexer, and return them in a stream."""

    def __init__(self, line_lexer, line_reader):
        # type: (LineLexer, _Reader) -> None
        """
        Args:
          line_lexer: Underlying object to get tokens from
          line_reader: get new lines from here
        """
        self.line_lexer = line_lexer
        self.line_reader = line_reader

        self.line_id = -1  # Invalid one
        self.translation_stack = []  # type: List[Tuple[Id_t, Id_t]]
        self.emit_comp_dummy = False

    def ResetInputObjects(self):
        # type: () -> None
        self.line_lexer.Reset(None, 0)

    def MaybeUnreadOne(self):
        # type: () -> bool
        return self.line_lexer.MaybeUnreadOne()

    def LookAheadOne(self, lex_mode):
        # type: (lex_mode_t) -> Id_t
        return self.line_lexer.LookAheadOne(lex_mode)

    def LookPastSpace(self, lex_mode):
        # type: (lex_mode_t) -> Id_t
        return self.line_lexer.LookPastSpace(lex_mode)

    def LookAheadFuncParens(self, unread):
        # type: (int) -> bool
        return self.line_lexer.LookAheadFuncParens(unread)

    def ByteLookAhead(self):
        # type: () -> str
        return self.line_lexer.ByteLookAhead()

    def ByteLookBack(self):
        # type: () -> int
        return self.line_lexer.ByteLookBack()

    def EmitCompDummy(self):
        # type: () -> None
        """Emit Id.Lit_CompDummy right before EOF, for completion."""
        self.emit_comp_dummy = True

    def PushHint(self, old_id, new_id):
        # type: (Id_t, Id_t) -> None
        """Push a translation hint, e.g. to disambiguate the next Id.Op_RParen
        as Id.Right_Subshell or Id.Eof_RParen.

        Problems for $() nesting:

        - posix:
          - case foo) and case (foo)
          - func() {}
          - subshell ( )
        - bash extensions:
          - precedence in [[, e.g. [[ (1 == 2) && (2 == 3) ]]
          - arrays: a=(1 2 3), a+=(4 5)
        """
        #log('  PushHint %s ==> %s', Id_str(old_id), Id_str(new_id))
        self.translation_stack.append((old_id, new_id))
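    # Example (added, illustrative): on entering a command sub '$(', the
    # parser can call PushHint(Id.Op_RParen, Id.Eof_RParen) so that the
    # matching ')' comes back as an EOF for the sub-parser rather than an
    # ordinary right paren; _Read() pops the hint when the translation fires.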

    def MoveToNextLine(self):
        # type: () -> None
        """For lookahead on the next line.

        This is required by `ParseYshCase` and is used in
        `_NewlineOkForYshCase`.

        We use this because otherwise calling `LookPastSpace` would return
        `Id.Unknown_Tok` when the lexer has reached the end of the line.
        For example, take this case:

          case (x) {
                     ^--- We are here

            (else) {
            ^--- We want to look ahead to here

              echo test
            }
          }

        But without `MoveToNextLine`, it is impossible to peek at the '('
        without consuming it.  And consuming it would be a problem once we
        want to hand off pattern parsing to the expression parser.
        """
        # Only call this when you've seen \n
        self.line_lexer.AssertAtEndOfLine()

        src_line, line_pos = self.line_reader.GetLine()
        self.line_lexer.Reset(src_line, line_pos)  # fill with a new line

    def _Read(self, lex_mode):
        # type: (lex_mode_t) -> Token
        """Read from the normal line buffer, not an alias."""
        t = self.line_lexer.Read(lex_mode)
        if t.id == Id.Eol_Tok:  # We hit \0 aka Eol_Tok, so read a new line
            src_line, line_pos = self.line_reader.GetLine()

            if src_line is None:  # no more lines
                if self.emit_comp_dummy:
                    id_ = Id.Lit_CompDummy
                    self.emit_comp_dummy = False  # emit EOF the next time
                else:
                    id_ = Id.Eof_Real
                return self.line_lexer.GetEofToken(id_)

            self.line_lexer.Reset(src_line, line_pos)  # fill with a new line
            t = self.line_lexer.Read(lex_mode)

        # e.g. translate ) or ` into EOF
        if len(self.translation_stack):
            old_id, new_id = self.translation_stack[-1]  # top
            if t.id == old_id:
                #log('==> TRANSLATING %s ==> %s', Id_str(t.id), Id_str(new_id))
                self.translation_stack.pop()
                t.id = new_id

        return t

    def Read(self, lex_mode):
        # type: (lex_mode_t) -> Token
        while True:
            t = self._Read(lex_mode)
            if t.id != Id.Ignored_LineCont:
                break

        if 0:
            ID_HIST[t.id] += 1
        return t
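    # Example (added, illustrative): Read() hides line continuations.  Given
    #
    #   echo foo \
    #   bar
    #
    # the backslash-newline lexes as Id.Ignored_LineCont, which the loop
    # above skips, so the parser sees the tokens for 'echo', 'foo', and
    # 'bar' as if they were written on one line.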


if 0:  # mylib.PYTHON: note: breaks the tarball build
    import collections
    ID_HIST = collections.Counter()  # type: Counter[Id_t]
    LAZY_ID_HIST = collections.Counter()  # type: Counter[Id_t]