OILS / frontend / lexer.py
# Copyright 2016 Andy Chu. All rights reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#   http://www.apache.org/licenses/LICENSE-2.0
"""
lexer.py - Library for lexing.
"""

from _devbuild.gen.syntax_asdl import Token, SourceLine
from _devbuild.gen.types_asdl import lex_mode_t, lex_mode_e
from _devbuild.gen.id_kind_asdl import Id_t, Id, Id_str, Kind
from asdl import runtime
from mycpp.mylib import log
from frontend import consts
from frontend import match

unused = log, Id_str

from typing import List, Tuple, Optional, Counter, TYPE_CHECKING
if TYPE_CHECKING:
    from core.alloc import Arena
    from frontend.reader import _Reader


def IsPlusEquals(tok):
    # type: (Token) -> bool
    """Common pattern to test if we got foo= or foo+=

    Note: can be replaced by s.find('+', index, index+1), which avoids
    allocation.
    """
    index = tok.col + tok.length - 2
    return tok.line.content[index] == '+'
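
# A minimal sketch of the index arithmetic above (added for exposition; the
# string and token fields are hypothetical, not from this repo):
#
#   content = 'foo+=bar'
#   col, length = 0, 5             # a token spanning 'foo+='
#   index = col + length - 2       # second-to-last char of the token
#   assert content[index] == '+'   # a 'foo=' token has length 4, yielding 'o'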


# Also: IsWhitespace, IsLeadingSpace


def TokenEquals(tok, s):
    # type: (Token, str) -> bool

    # TODO: Use tok.line.content.find(substr, start, end)

    raise NotImplementedError()


def TokenContains(tok, substr):
    # type: (Token, str) -> bool

    # TODO: Use tok.line.content.find(substr, start, end)

    raise NotImplementedError()


def TokenStartsWith(tok, s):
    # type: (Token, str) -> bool

    # TODO: Use tok.line.content.startswith(substr, start, end)

    raise NotImplementedError()


def TokenEndsWith(tok, s):
    # type: (Token, str) -> bool

    # TODO: Use tok.line.content.endswith(substr, start, end)

    raise NotImplementedError()


def TokenVal(tok):
    # type: (Token) -> str
    """Compute the string value on demand."""
    return tok.line.content[tok.col:tok.col + tok.length]


def TokenSliceLeft(tok, left_index):
    # type: (Token, int) -> str
    """Slice the token directly, without creating an intermediate string."""
    assert left_index > 0
    left = tok.col + left_index
    return tok.line.content[left:tok.col + tok.length]


def TokenSliceRight(tok, right_index):
    # type: (Token, int) -> str
    """Slice the token directly, without creating an intermediate string."""
    assert right_index < 0
    right = tok.col + tok.length + right_index
    return tok.line.content[tok.col:right]


def TokenSlice(tok, left, right):
    # type: (Token, int, int) -> str
    """Slice the token directly, without creating an intermediate string."""
    assert left > 0
    start = tok.col + left
    end = tok.col + tok.length + right
    return tok.line.content[start:end]
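
# A standalone sketch of the slice arithmetic above (exposition only; the
# string and token fields are hypothetical):
#
#   content = 'x=1; foo+=(a)'
#   col, length = 5, 5                                # a token spanning 'foo+='
#   assert content[col:col + length] == 'foo+='       # TokenVal
#   assert content[col + 3:col + length] == '+='      # TokenSliceLeft(tok, 3)
#   assert content[col:col + length - 2] == 'foo'     # TokenSliceRight(tok, -2)
#   assert content[col + 1:col + length - 2] == 'oo'  # TokenSlice(tok, 1, -2)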


def DummyToken(id_, val):
    # type: (int, str) -> Token

    col = -1
    length = -1
    return Token(id_, col, length, runtime.NO_SPID, None, val)


class LineLexer(object):

    def __init__(self, arena):
        # type: (Arena) -> None
        self.arena = arena
        self.replace_last_token = False  # For MaybeUnreadOne

        # Singleton instance because we don't allow globals.
        # 2023-09: I tried LineLexer::Read() returning None, but that is subtly
        # incorrect, e.g. in Lexer::Read() with NUL bytes.
        self.eol_tok = DummyToken(Id.Eol_Tok, '')

        self.Reset(None, 0)  # Invalid src_line to start

    def __repr__(self):
        # type: () -> str
        return '<LineLexer at pos %d of line %r>' % (self.line_pos,
                                                     self.src_line)

    def Reset(self, src_line, line_pos):
        # type: (SourceLine, int) -> None
        #assert line, repr(line)  # can't be empty or None
        self.src_line = src_line
        self.line_pos = line_pos

    def MaybeUnreadOne(self):
        # type: () -> bool
        """Return True if we can unread one character, or False otherwise.

        NOTE: Only call this when you know the last token was exactly one
        character!
        """
        if self.line_pos == 0:
            return False
        else:
            self.line_pos -= 1
            self.replace_last_token = True  # don't add the next token to the arena
            return True
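
    # Sketch of the one-character unread pattern (exposition only; the caller
    # logic and mode names here are hypothetical):
    #
    #   tok = line_lexer.Read(some_mode)       # token known to be ONE char
    #   if not expected(tok):
    #       line_lexer.MaybeUnreadOne()        # back up over that one byte...
    #       tok = line_lexer.Read(other_mode)  # ...and re-lex it; the arena
    #                                          # entry is replaced, not added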

    def GetEofToken(self, id_):
        # type: (int) -> Token
        """Create a new span ID for syntax errors involving the EOF token."""
        if self.src_line is None:
            # There are ZERO lines now.  Add a dummy line 0 so the Token has a
            # source to display errors.
            src_line = self.arena.AddLine('', 0)
        else:
            src_line = self.src_line

        return self.arena.NewToken(id_, self.line_pos, 0, src_line, '')

    def LookAheadOne(self, lex_mode):
        # type: (lex_mode_t) -> Id_t
        """Look ahead exactly one token in the given lexer mode."""
        pos = self.line_pos
        line_str = self.src_line.content
        n = len(line_str)
        if pos == n:
            return Id.Unknown_Tok
        else:
            tok_type, _ = match.OneToken(lex_mode, line_str, pos)
            return tok_type

    def AssertAtEndOfLine(self):
        # type: () -> None
        assert self.line_pos == len(self.src_line.content), \
            '%d %s' % (self.line_pos, self.src_line.content)

    def LookPastSpace(self, lex_mode):
        # type: (lex_mode_t) -> Id_t
        """Look ahead in the current line for a non-space token, using the
        given lexer mode.

        Does NOT advance self.line_pos.

        Called with at least the following modes:
          lex_mode_e.Arith -- for ${a[@]} vs ${a[1+2]}
          lex_mode_e.VSub_1
          lex_mode_e.ShCommand

        Note: Only ShCommand emits Id.WS_Space; other lexer modes don't.
        """
        pos = self.line_pos
        line_str = self.src_line.content
        n = len(line_str)
        #print('Look ahead from pos %d, line %r' % (pos, self.line))
        while True:
            if pos == n:
                # We don't allow lookahead while already at the end of the
                # line, because it would involve interacting with the line
                # reader, and we never need it.  In lex_mode_e.ShCommand there
                # is an explicit newline token, but lex_mode_e.Arith doesn't
                # have one.
                return Id.Unknown_Tok

            tok_type, end_pos = match.OneToken(lex_mode, line_str, pos)

            # NOTE: Instead of hard-coding these tokens, we could pass them in:
            #   LookPastSpace(lex_mode, past_token_type)
            # - Id.WS_Space is only emitted in lex_mode_e.ShCommand
            # - Id.Ignored_Space is emitted in lex_mode_e.Expr
            if tok_type != Id.WS_Space and tok_type != Id.Ignored_Space:
                break
            pos = end_pos

        return tok_type
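
    # A standalone sketch of the skip-ahead loop above, using plain strings in
    # place of match.OneToken() (exposition only):
    #
    #   def look_past_space(s, pos):
    #       n = len(s)
    #       while pos < n and s[pos] == ' ':    # stand-in for WS/Ignored tokens
    #           pos += 1
    #       return s[pos] if pos < n else None  # None ~ Id.Unknown_Tok
    #
    #   assert look_past_space('a[  @]', 2) == '@'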

    def LookAheadFuncParens(self, unread):
        # type: (int) -> bool
        """For finding the () in 'f ( ) { echo hi; }'.

        Args:
          unread: either 0 or 1, for the number of characters to go back

        The lookahead is limited to the current line, which sacrifices a rare
        corner case.  This is NOT recognized as a function:

            foo\
            () {}

        whereas this is:

            foo()
            {}
        """
        pos = self.line_pos - unread
        assert pos > 0
        tok_type, _ = match.OneToken(lex_mode_e.FuncParens,
                                     self.src_line.content, pos)
        return tok_type == Id.LookAhead_FuncParens
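
    # Hypothetical caller (exposition only): after lexing the word 'f' in
    # 'f ( ) { echo hi; }', a parser can ask whether '( )' follows, without
    # consuming anything:
    #
    #   if line_lexer.LookAheadFuncParens(0):
    #       ...  # treat 'f' as the name of a shell function definition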

    def ByteLookAhead(self):
        # type: () -> str
        """Look ahead a single byte.

        Useful when you know the token is one char.
        """
        pos = self.line_pos
        if pos == len(self.src_line.content):
            return ''
        else:
            return self.src_line.content[pos]

    def ByteLookBack(self):
        # type: () -> int
        """A little hack for stricter proc arg list syntax.

        There has to be a space before the paren.

        Yes: json write (x)
        No:  json write(x)
        """
        pos = self.line_pos - 2
        if pos < 0:
            return -1
        else:
            return ord(self.src_line.content[pos])
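
    # Sketch of the offset arithmetic (exposition only): after Read() returns
    # the '(' token, self.line_pos is one PAST it, so line_pos - 2 is the byte
    # before the paren.
    #
    #   content = 'json write (x)'
    #   line_pos = 12                        # hypothetical: just past the '('
    #   assert content[line_pos - 2] == ' '  # space before the paren: allowed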

    def Read(self, lex_mode):
        # type: (lex_mode_t) -> Token

        # Inner loop optimization
        if self.src_line:
            line_str = self.src_line.content
        else:
            line_str = ''
        line_pos = self.line_pos

        tok_type, end_pos = match.OneToken(lex_mode, line_str, line_pos)
        if tok_type == Id.Eol_Tok:  # Do NOT add a span for this sentinel!
            # LineLexer tells Lexer to read a new line.
            return self.eol_tok

        # TODO: can inline this function with a formula on the 16-bit Id.
        kind = consts.GetKind(tok_type)

        # Save on allocations!  We often don't look at the token value.
        # A whitelist doesn't work well?  Use a blacklist for now.
        # - Kind.KW is sometimes a literal in a word
        # - Kind.Right is for " in here docs.  The lexer isn't involved.
        # - Got an error with Kind.Left too that I don't understand
        # - Kind.ControlFlow doesn't work because we call word_.StaticEval()
        #if kind in (Kind.Lit, Kind.VSub, Kind.Redir, Kind.Char, Kind.Backtick, Kind.KW, Kind.Right):
        if kind in (Kind.Arith, Kind.Op, Kind.VTest, Kind.VOp0, Kind.VOp2,
                    Kind.VOp3, Kind.WS, Kind.Ignored, Kind.Eof):
            tok_val = None  # type: Optional[str]
        else:
            tok_val = line_str[line_pos:end_pos]

        # NOTE: We're putting the arena hook in LineLexer and not Lexer because
        # we want it to be "low level".  The only thing fabricated here is a
        # newline added at the last line, so we don't end with \0.
        if self.replace_last_token:  # make another token from the last span
            self.arena.UnreadOne()
            self.replace_last_token = False

        tok_len = end_pos - line_pos
        t = self.arena.NewToken(tok_type, line_pos, tok_len, self.src_line,
                                tok_val)

        self.line_pos = end_pos
        return t
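
    # Sketch of the lazy-value scheme above (exposition only): tokens of the
    # blacklisted kinds carry tok_val = None, and TokenVal() at the top of
    # this file recomputes the text from (line, col, length) only on demand.
    #
    #   content = 'x && y'
    #   col, length = 2, 2                        # hypothetical '&&' token
    #   assert content[col:col + length] == '&&'  # what TokenVal() recomputes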


class Lexer(object):
    """Read lines from the line_reader, split them into tokens with
    line_lexer, and return them in a stream."""

    def __init__(self, line_lexer, line_reader):
        # type: (LineLexer, _Reader) -> None
        """
        Args:
          line_lexer: underlying object to get tokens from
          line_reader: get new lines from here
        """
        self.line_lexer = line_lexer
        self.line_reader = line_reader

        self.line_id = -1  # Invalid one
        self.translation_stack = []  # type: List[Tuple[Id_t, Id_t]]
        self.emit_comp_dummy = False

    def ResetInputObjects(self):
        # type: () -> None
        self.line_lexer.Reset(None, 0)

    def MaybeUnreadOne(self):
        # type: () -> bool
        return self.line_lexer.MaybeUnreadOne()

    def LookAheadOne(self, lex_mode):
        # type: (lex_mode_t) -> Id_t
        return self.line_lexer.LookAheadOne(lex_mode)

    def LookPastSpace(self, lex_mode):
        # type: (lex_mode_t) -> Id_t
        return self.line_lexer.LookPastSpace(lex_mode)

    def LookAheadFuncParens(self, unread):
        # type: (int) -> bool
        return self.line_lexer.LookAheadFuncParens(unread)

    def ByteLookAhead(self):
        # type: () -> str
        return self.line_lexer.ByteLookAhead()

    def ByteLookBack(self):
        # type: () -> int
        return self.line_lexer.ByteLookBack()

    def EmitCompDummy(self):
        # type: () -> None
        """Emit Id.Lit_CompDummy right before EOF, for completion."""
        self.emit_comp_dummy = True

    def PushHint(self, old_id, new_id):
        # type: (Id_t, Id_t) -> None
        """Use cases:
          Id.Op_RParen -> Id.Right_Subshell -- disambiguate
          Id.Op_RParen -> Id.Eof_RParen

        Problems for $() nesting.

        - posix:
          - case foo) and case (foo)
          - func() {}
          - subshell ( )
        - bash extensions:
          - precedence in [[, e.g. [[ (1 == 2) && (2 == 3) ]]
          - arrays: a=(1 2 3), a+=(4 5)
        """
        #log('  PushHint %s ==> %s', Id_str(old_id), Id_str(new_id))
        self.translation_stack.append((old_id, new_id))
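
    # Sketch of a caller (exposition only; based on the use case named in the
    # docstring): a command-sub parser asks for the ')' that closes $(...) to
    # come back as an EOF-like token, so the inner parser stops there.
    #
    #   lexer.PushHint(Id.Op_RParen, Id.Eof_RParen)
    #   ...  # parse the inner command; the next Id.Op_RParen that _Read()
    #        # produces is rewritten to Id.Eof_RParen and pops the stack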

    def MoveToNextLine(self):
        # type: () -> None
        """For lookahead on the next line.

        This is required by `ParseYshCase` and is used in
        `_NewlineOkForYshCase`.

        We use this because otherwise calling `LookPastSpace` would return
        `Id.Unknown_Tok` when the lexer has reached the end of the line.  For
        example, take this case:

          case (x) {
                   ^--- We are here

            (else) {
            ^--- We want lookahead to here

              echo test
            }
          }

        But without `MoveToNextLine`, it is impossible to peek at the '('
        without consuming it.  And consuming it would be a problem once we
        want to hand off pattern parsing to the expression parser.
        """
        # Only call this when you've seen \n
        self.line_lexer.AssertAtEndOfLine()

        src_line, line_pos = self.line_reader.GetLine()
        self.line_lexer.Reset(src_line, line_pos)  # fill with a new line

    def _Read(self, lex_mode):
        # type: (lex_mode_t) -> Token
        """Read from the normal line buffer, not an alias."""
        t = self.line_lexer.Read(lex_mode)
        if t.id == Id.Eol_Tok:  # We hit \0 aka Eol_Tok, so read a new line
            src_line, line_pos = self.line_reader.GetLine()

            if src_line is None:  # no more lines
                if self.emit_comp_dummy:
                    id_ = Id.Lit_CompDummy
                    self.emit_comp_dummy = False  # emit EOF the next time
                else:
                    id_ = Id.Eof_Real
                return self.line_lexer.GetEofToken(id_)

            self.line_lexer.Reset(src_line, line_pos)  # fill with a new line
            t = self.line_lexer.Read(lex_mode)

        # e.g. translate ) or ` into EOF
        if len(self.translation_stack):
            old_id, new_id = self.translation_stack[-1]  # top
            if t.id == old_id:
                #log('==> TRANSLATING %s ==> %s', Id_str(t.id), Id_str(new_id))
                self.translation_stack.pop()
                t.id = new_id

        return t

    def Read(self, lex_mode):
        # type: (lex_mode_t) -> Token
        while True:
            t = self._Read(lex_mode)
            # TODO: Change to ALL IGNORED types, once you have SPACE_TOK.  This
            # means we don't have to handle them in the VSub_1/VSub_2/etc.
            # states.
            if t.id != Id.Ignored_LineCont:
                break

        #ID_HIST[t.id] += 1
        #log('> Read() Returning %s', t)
        return t
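
    # Exposition note: the loop above is why a backslash-newline pair is
    # invisible to the parser -- its Id.Ignored_LineCont token is consumed
    # here, so a command continued with a trailing backslash lexes as if it
    # were written on a single line.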


if 0:  # mylib.PYTHON: not: breaks tarball build
    import collections
    ID_HIST = collections.Counter()  # type: Counter[Id_t]