frontend/lexer

OILS / frontend / lexer_def.py View on Github | oilshell.org

1056 lines, 537 significant

1	"""
2	lexer_def.py -- A lexer for both OSH and YSH.
3
4	It consists of a series of lexer modes, each with a regex -> Id mapping.
5
6	After changing this file, run:
7
8	build/dev.sh all
9
10	or at least:
11
12	build/dev.sh fastlex
13
14	Input Handling
15	--------------
16
17	Every line is NUL terminated:
18
19	'one\n\0' 'last line\0'
20
21	which means that no regexes below should match \0. The core/lexer_gen.py code
22	generator adds an extra rule for \0.
23
24	For example, use [^'\0]+ instead of [^']+ .
25
26	If this rule isn't followed, we would read uninitialized memory past the
27	sentinel. Python's regex engine knows where the end of the input string is, so
28	it doesn't need a sentinel like \0.
29	"""
30
31	from _devbuild.gen.id_kind_asdl import Id, Id_t, Kind
32	from _devbuild.gen.types_asdl import lex_mode_e
33
34	from frontend import id_kind_def
35
36	from typing import Tuple
37
38	# Initialize spec that the lexer depends on.
39	ID_SPEC = id_kind_def.IdSpec({}, {})
40
41	id_kind_def.AddKinds(ID_SPEC)
42	id_kind_def.AddBoolKinds(ID_SPEC) # must come second
43	id_kind_def.SetupTestBuiltin(ID_SPEC, {}, {}, {})
44
45
def C(pat, tok_type):
    # type: (str, Id_t) -> Tuple[bool, str, Id_t]
    """Lexer rule with a constant string, e.g. C('$*', VSub_Star).

    Returns the tuple (False, pat, tok_type).  R() builds the same shape with
    True in the first slot for patterns that are regexes.
    """
    return (False, pat, tok_type)
50
51
def R(pat, tok_type):
    # type: (str, Id_t) -> Tuple[bool, str, Id_t]
    r"""Lexer rule with a regex string, e.g. R('\$[0-9]', VSub_Number).

    Returns the tuple (True, pat, tok_type).  C() builds the same shape with
    False in the first slot for constant-string patterns.
    """
    # The docstring is a raw string because it contains a backslash that is
    # not a valid Python escape sequence.
    return (True, pat, tok_type)
56
57
# Matches a shebang line whose interpreter path ends in 'sh' followed by
# whitespace, e.g. '#!/bin/sh\n' or '#!/usr/bin/env bash\n'.
#
# See unit tests in frontend/match_test.py.
# We need the [^\0]* because the re2c translation assumes it's anchored like $.
SHOULD_HIJACK_RE = r'#![^\0]*sh[ \t\r\n][^\0]*'
61
62	_SIGNIFICANT_SPACE = R(r'[ \t]+', Id.WS_Space)
63
64	_BACKSLASH = [
65	# To be conservative, we could deny a set of chars similar to
66	# _LITERAL_WHITELIST_REGEX, rather than allowing all the operator characters
67	# like \( and \;.
68	#
69	# strict_backslash makes this stricter.
70	R(r'\\[^\n\0]', Id.Lit_EscapedChar),
71	C('\\\n', Id.Ignored_LineCont),
72	]
73
74	# Only 4 characters are backslash escaped inside "".
75	# https://www.gnu.org/software/bash/manual/bash.html#Double-Quotes
76	_DQ_BACKSLASH = [
77	R(r'\\[$`"\\]', Id.Lit_EscapedChar),
78	C('\\', Id.Lit_BadBackslash), # syntax error in YSH, but NOT in OSH
79	]
80
81	VAR_NAME_RE = r'[a-zA-Z_][a-zA-Z0-9_]*'
82
83	# All Kind.VSub
84	_VARS = [
85	# Unbraced variables
86	R(r'\$' + VAR_NAME_RE, Id.VSub_DollarName),
87	R(r'\$[0-9]', Id.VSub_Number),
88	C(r'$!', Id.VSub_Bang),
89	C(r'$@', Id.VSub_At),
90	C(r'$#', Id.VSub_Pound),
91	C(r'$$', Id.VSub_Dollar),
92	C(r'$*', Id.VSub_Star),
93	C(r'$-', Id.VSub_Hyphen),
94	C(r'$?', Id.VSub_QMark),
95	]
96
97	# Kind.Left that are valid in double-quoted modes.
98
99	_LEFT_SUBS = [
100	C('`', Id.Left_Backtick),
101	C('$(', Id.Left_DollarParen),
102	C('${', Id.Left_DollarBrace),
103	C('$((', Id.Left_DollarDParen),
104	C('$[', Id.Left_DollarBracket),
105	]
106
107	# Additional Kind.Left that are valid in unquoted modes.
108	_LEFT_UNQUOTED = [
109	C('"', Id.Left_DoubleQuote),
110	C("'", Id.Left_SingleQuote),
111	C('$"', Id.Left_DollarDoubleQuote),
112	C("$'", Id.Left_DollarSingleQuote),
113	]
114
115	_LEFT_PROCSUB = [
116	C('<(', Id.Left_ProcSubIn),
117	C('>(', Id.Left_ProcSubOut),
118	]
119
120	# The regexes below are in Python syntax, but are translated to re2c syntax by
121	# frontend/lexer_gen.py.
122	#
123	# http://re2c.org/manual/syntax/syntax.html
124	# https://docs.python.org/2/library/re.html
125	#
126	# We use a limited set of constructs:
127	# - + and * for repetition
128	# - Character classes [] with simple ranges and negation
129	# - Escapes like \n \0
130
131	LEXER_DEF = {} # TODO: Should be a list so we enforce order.
132
133	# Anything until the end of the line is a comment. Does not match the newline
134	# itself. We want to switch modes and possibly process Op_Newline for here
135	# docs, etc.
136	LEXER_DEF[lex_mode_e.Comment] = [R(r'[^\n\0]*', Id.Ignored_Comment)]
137
138	# A whitelist for efficiency. The shell language says that "anything else" is
139	# a literal character. In other words, a single $ \ or ! is a literal, not a
140	# syntax error. It's defined negatively, but let's define positive runs here.
141	# TODO: Add + here because it's never special? It's different for YSH though.
142
143	# The range \x80-\xff makes sure that UTF-8 sequences are a single token.
144	_LITERAL_WHITELIST_REGEX = r'[\x80-\xffa-zA-Z0-9_.\-]+'
145
# Rules shared by the unquoted modes (this list is appended to the ShCommand
# and DBracket definitions): backslash escapes, substitutions, quotes, process
# subs and variables, followed by literals and operators.
_UNQUOTED = _BACKSLASH + _LEFT_SUBS + _LEFT_UNQUOTED + _LEFT_PROCSUB + _VARS + [
    # NOTE: We could add anything 128 and above to this character class?  So
    # utf-8 characters don't get split?
    R(_LITERAL_WHITELIST_REGEX, Id.Lit_Chars),
    C('~', Id.Lit_Tilde),  # for tilde sub
    C('/', Id.Lit_Slash),  # also for tilde sub
    C(':', Id.Lit_Colon),  # for special PATH=a:~foo tilde detection
    C('$', Id.Lit_Dollar),  # shopt -u parse_dollar
    C('#', Id.Lit_Pound),  # For comments
    _SIGNIFICANT_SPACE,
    C('\n', Id.Op_Newline),
    C('&', Id.Op_Amp),
    # These are constant strings (not regexes), so the pipe is NOT escaped.
    C('|', Id.Op_Pipe),
    C('|&', Id.Op_PipeAmp),
    C('&&', Id.Op_DAmp),
    C('||', Id.Op_DPipe),
    C(';', Id.Op_Semi),
    C(';;', Id.Op_DSemi),
    C('(', Id.Op_LParen),
    C(')', Id.Op_RParen),
    R(r'[^\0]', Id.Lit_Other),  # any other single char is a literal
]
168
169	# In ShCommand and DBracket states.
170	_EXTGLOB_BEGIN = [
171	C(',(', Id.ExtGlob_Comma), # YSH synonym for @(...)
172	C('@(', Id.ExtGlob_At),
173	C('*(', Id.ExtGlob_Star),
174	C('+(', Id.ExtGlob_Plus),
175	C('?(', Id.ExtGlob_QMark),
176	C('!(', Id.ExtGlob_Bang),
177	]
178
179	KEYWORDS = [
180	# NOTE: { is matched elsewhere
181	C('[[', Id.KW_DLeftBracket),
182	C('!', Id.KW_Bang),
183	C('for', Id.KW_For),
184	C('while', Id.KW_While),
185	C('until', Id.KW_Until),
186	C('do', Id.KW_Do),
187	C('done', Id.KW_Done),
188	C('in', Id.KW_In),
189	C('case', Id.KW_Case),
190	C('esac', Id.KW_Esac),
191	C('if', Id.KW_If),
192	C('fi', Id.KW_Fi),
193	C('then', Id.KW_Then),
194	C('else', Id.KW_Else),
195	C('elif', Id.KW_Elif),
196	C('function', Id.KW_Function),
197	C('time', Id.KW_Time),
198
199	# YSH
200	C('const', Id.KW_Const), # maybe remove this
201	C('var', Id.KW_Var),
202	C('setvar', Id.KW_SetVar),
203	C('setglobal', Id.KW_SetGlobal),
204	C('call', Id.KW_Call),
205	C('proc', Id.KW_Proc),
206	C('func', Id.KW_Func),
207	]
208
209	# These are treated like builtins in bash, but keywords in OSH. However, we
210	# maintain compatibility with bash for the 'type' builtin.
211	CONTROL_FLOW = [
212	C('break', Id.ControlFlow_Break),
213	C('continue', Id.ControlFlow_Continue),
214	C('return', Id.ControlFlow_Return),
215	C('exit', Id.ControlFlow_Exit),
216	]
217
218	# Used by ysh/grammar_gen.py too
219	EXPR_WORDS = [
220	C('null', Id.Expr_Null),
221	C('true', Id.Expr_True),
222	C('false', Id.Expr_False),
223	C('and', Id.Expr_And),
224	C('or', Id.Expr_Or),
225	C('not', Id.Expr_Not),
226	C('for', Id.Expr_For),
227	C('while', Id.Expr_While),
228	C('is', Id.Expr_Is),
229	C('in', Id.Expr_In),
230	C('if', Id.Expr_If),
231	C('else', Id.Expr_Else),
232
233	# for function literals
234	C('func', Id.Expr_Func),
235
236	# / <capture d+/
237	C('capture', Id.Expr_Capture),
238	# / <capture d+ as date> /
239	C('as', Id.Expr_As),
240
241	# Tea Control Flow Operators
242	C('break', Id.Expr_Break),
243	C('continue', Id.Expr_Continue),
244	C('return', Id.Expr_Return),
245	]
246
247	FD_VAR_NAME = r'\{' + VAR_NAME_RE + r'\}'
248
249	# file descriptors can only have two digits, like mksh
250	# dash/zsh/etc. can have one
251	FD_NUM = r'[0-9]?[0-9]?'
252
# These two must be recognized in the ShCommand state, but can't be nested
# within [[.
# Keywords have to be checked before _UNQUOTED so we get <KW_If "if"> instead
# of <Lit_Chars "if">.
LEXER_DEF[lex_mode_e.ShCommand] = [
    # These four are not allowed within [[, so they are in ShCommand but not
    # _UNQUOTED.

    # e.g. beginning of NAME=val, which will always be longer than
    # _LITERAL_WHITELIST_REGEX.
    R(VAR_NAME_RE + r'\+?=', Id.Lit_VarLike),
    R(VAR_NAME_RE + r'\[', Id.Lit_ArrayLhsOpen),
    R(r'\]\+?=', Id.Lit_ArrayLhsClose),
    C('((', Id.Op_DLeftParen),

    # For static globbing, and [] for array literals
    C('[', Id.Lit_LBracket),  # e.g. A=(['x']=1)
    C(']', Id.Lit_RBracket),  # e.g. *.[ch]
    # NOTE: Glob_Star and Glob_QMark are for dynamic parsing
    C('*', Id.Lit_Star),
    C('?', Id.Lit_QMark),
    C('###', Id.Lit_TPound),  # like Lit_Pound, for doc comments
    C('...', Id.Lit_TDot),  # ... for multiline commands

    # For brace expansion {a,b}
    C('{', Id.Lit_LBrace),
    C('}', Id.Lit_RBrace),  # Also for var sub ${a}
    C(',', Id.Lit_Comma),
    C('=', Id.Lit_Equals),  # for = f(x) and x = 1+2*3
    C('@', Id.Lit_At),  # for detecting @[, @' etc. shopt -s parse_at_all

    # @array and @func(1, c)
    R('@' + VAR_NAME_RE, Id.Lit_Splice),  # for YSH splicing
    C('@[', Id.Lit_AtLBracket),  # @[split(x)]
    C('@{.', Id.Lit_AtLBraceDot),  # for split builtin sub @{.myproc arg1}

    # Redirects with an optional numeric descriptor, e.g. 2> or 10<<
    R(FD_NUM + r'<', Id.Redir_Less),
    R(FD_NUM + r'>', Id.Redir_Great),
    R(FD_NUM + r'<<', Id.Redir_DLess),
    R(FD_NUM + r'<<<', Id.Redir_TLess),
    R(FD_NUM + r'>>', Id.Redir_DGreat),
    R(FD_NUM + r'<<-', Id.Redir_DLessDash),
    R(FD_NUM + r'>&', Id.Redir_GreatAnd),
    R(FD_NUM + r'<&', Id.Redir_LessAnd),
    R(FD_NUM + r'<>', Id.Redir_LessGreat),
    # This is a regex, so the pipe in >| must be escaped exactly once.
    R(FD_NUM + r'>\|', Id.Redir_Clobber),

    # Redirects with a named descriptor, e.g. {fd}>
    R(FD_VAR_NAME + r'<', Id.Redir_Less),
    R(FD_VAR_NAME + r'>', Id.Redir_Great),
    R(FD_VAR_NAME + r'<<', Id.Redir_DLess),
    R(FD_VAR_NAME + r'<<<', Id.Redir_TLess),
    R(FD_VAR_NAME + r'>>', Id.Redir_DGreat),
    R(FD_VAR_NAME + r'<<-', Id.Redir_DLessDash),
    R(FD_VAR_NAME + r'>&', Id.Redir_GreatAnd),
    R(FD_VAR_NAME + r'<&', Id.Redir_LessAnd),
    R(FD_VAR_NAME + r'<>', Id.Redir_LessGreat),
    R(FD_VAR_NAME + r'>\|', Id.Redir_Clobber),

    # No leading descriptor (2 is implied)
    C(r'&>', Id.Redir_AndGreat),
    C(r'&>>', Id.Redir_AndDGreat),
] + KEYWORDS + CONTROL_FLOW + _UNQUOTED + _EXTGLOB_BEGIN
313
314	# Preprocessing before ShCommand
315	LEXER_DEF[lex_mode_e.Backtick] = [
316	C(r'`', Id.Backtick_Right),
317	# A backslash, and then $ or ` or \
318	R(r'\\[$`\\]', Id.Backtick_Quoted),
319	# \" treated specially, depending on whether backticks are double-quoted!
320	R(r'\\"', Id.Backtick_DoubleQuote),
321	R(r'[^`\\\0]+', Id.Backtick_Other), # contiguous run of literals
322	R(r'[^\0]', Id.Backtick_Other), # anything else
323	]
324
325	# DBRACKET: can be like ShCommand, except:
326	# - Don't really need redirects either... Redir_Less could be Op_Less
327	# - Id.Op_DLeftParen can't be nested inside.
328	LEXER_DEF[lex_mode_e.DBracket] = [
329	C(']]', Id.Lit_DRightBracket),
330	# Must be KW and not Op, because we can have stuff like [[ $foo == !* ]]
331	# in addition to [[ ! a && b ]]
332	C('!', Id.KW_Bang),
333	C('<', Id.Op_Less),
334	C('>', Id.Op_Great),
335	] + ID_SPEC.LexerPairs(Kind.BoolUnary) + \
336	ID_SPEC.LexerPairs(Kind.BoolBinary) + \
337	_UNQUOTED + _EXTGLOB_BEGIN
338
# Inside an extended glob, most characters are literals, including spaces and
# punctuation.  We also accept \, $var, ${var}, "", etc.  They can also be
# nested, so _EXTGLOB_BEGIN appears here.
#
# Example: echo @(<> <>|&&|'foo'|$bar)
LEXER_DEF[lex_mode_e.ExtGlob] = \
    _BACKSLASH + _LEFT_SUBS + _LEFT_UNQUOTED + _VARS + _EXTGLOB_BEGIN + [
        # A run of chars that can't start any of the other rules in this mode.
        R(r'[^\\$`"\'|)@*+!?\0]+', Id.Lit_Chars),
        C('|', Id.Op_Pipe),  # constant string, so the pipe is not escaped
        C(')', Id.Op_RParen),  # maybe be translated to Id.ExtGlob_RParen
        R(r'[^\0]', Id.Lit_Other),  # everything else is literal
    ]
351
352	# Notes on BASH_REGEX states
353	#
354	# From bash manual:
355	#
356	# - Any part of the pattern may be quoted to force the quoted portion to be
357	# matched as a string.
358	# - Bracket expressions in regular expressions must be treated carefully, since
359	# normal quoting characters lose their meanings between brackets.
360	# - If the pattern is stored in a shell variable, quoting the variable
361	# expansion forces the entire pattern to be matched as a string.
362	#
363	# Is there a re.escape function? It's just like EscapeGlob and UnescapeGlob.
364	#
365	# TODO: For testing, write a script to extract and save regexes... and compile
366	# them with regcomp. I've only seen constant regexes.
367	#
368	# From code: ( | ) are treated special.
369
370	LEXER_DEF[lex_mode_e.BashRegex] = _LEFT_SUBS + _LEFT_UNQUOTED + _VARS + [
371
372	# NOTE: bash accounts for spaces and non-word punctuation like ; inside ()
373	# and []. We will avoid that and ask the user to extract a variable?
374	R(r'[a-zA-Z0-9_-]+', Id.Lit_Chars), # not including period
375
376	# Tokens for Tilde sub. bash weirdness: RHS of [[ x =~ ~ ]] is expanded
377	C('~', Id.Lit_Tilde),
378	C('/', Id.Lit_Slash),
379	_SIGNIFICANT_SPACE,
380
381	# Normally, \x evaluates to x. But quoted regex metacharacters like \* should
382	# evaluate to \*. Compare with ( | ).
383	R(r'\\[*+?.^$\[\]]', Id.Lit_RegexMeta),
384
385	# NOTE: ( | and ) aren't operators!
386	R(r'[^\0]', Id.Lit_Other), # Everything else is a literal
387	] + _BACKSLASH # These have to come after RegexMeta
388
389	LEXER_DEF[lex_mode_e.DQ] = _DQ_BACKSLASH + [
390	C('\\\n', Id.Ignored_LineCont),
391	] + _LEFT_SUBS + _VARS + [
392	R(r'[^$`"\0\\]+', Id.Lit_Chars), # matches a line at most
393	C('$', Id.Lit_Dollar), # completion of var names relies on this
394	# NOTE: When parsing here doc line, this token doesn't end it.
395	C('"', Id.Right_DoubleQuote),
396	]
397
# Rules shared by both VSub_Arg modes (unquoted and double-quoted).
_VS_ARG_COMMON = [
    C('/', Id.Lit_Slash),  # for patsub (not Id.VOp2_Slash)
    C('#', Id.Lit_Pound),  # for patsub prefix (not Id.VOp1_Pound)
    C('%', Id.Lit_Percent),  # for patsub suffix (not Id.VOp1_Percent)
    C('}', Id.Right_DollarBrace),  # For var sub "${a}"
    C('$', Id.Lit_Dollar),  # completion of var names relies on this
]
405
406	# Kind.{LIT,IGNORED,VS,LEFT,RIGHT,Eof}
407	LEXER_DEF[lex_mode_e.VSub_ArgUnquoted] = \
408	_BACKSLASH + _VS_ARG_COMMON + _LEFT_SUBS + _LEFT_UNQUOTED + _LEFT_PROCSUB + \
409	_VARS + _EXTGLOB_BEGIN + [
410
411	# Token for Tilde sub
412	C('~', Id.Lit_Tilde),
413
414	# - doesn't match ~ for tilde sub
415	# - doesn't match < and > so it doesn't eat <()
416	# - doesn't match @ ! ? + * so it doesn't eat _EXTGLOB_BEGIN -- ( alone is
417	# not enough
418	R(r'[^$`~/}"\'\0\\#%<>@!?+*]+', Id.Lit_Chars),
419	R(r'[^\0]', Id.Lit_Other), # e.g. "$", must be last
420	]
421
422	# Kind.{LIT,IGNORED,VS,LEFT,RIGHT,Eof}
423	LEXER_DEF[lex_mode_e.VSub_ArgDQ] = \
424	_DQ_BACKSLASH + _VS_ARG_COMMON + _LEFT_SUBS + _VARS + [
425
426	C(r'\}', Id.Lit_EscapedChar), # For "${var-\}}"
427
428	R(r'[^$`/}"\0\\#%]+', Id.Lit_Chars), # matches a line at most
429
430	# Weird wart: even in double quoted state, double quotes are allowed
431	C('"', Id.Left_DoubleQuote),
432
433	# Another weird wart of bash/mksh: $'' is recognized but NOT ''!
434	C("$'", Id.Left_DollarSingleQuote),
435	]
436
437	# NOTE: Id.Ignored_LineCont is NOT supported in SQ state, as opposed to DQ
438	# state.
439	LEXER_DEF[lex_mode_e.SQ_Raw] = [
440	R(r"[^'\0]+", Id.Lit_Chars), # matches a line at most
441	C("'", Id.Right_SingleQuote),
442	]
443
444	# The main purpose for EXPR_CHARS is in regex literals, e.g. [a-z \t \n].
445	#
446	# In YSH expressions, Chars are code point integers, so \u{1234} is the same as
447	# 0x1234. And \0 is 0x0.
448
449	# In Python:
450	# chr(0x00012345) == u'\U00012345'
451	#
452	# In YSH:
453	# 0x00012345 == \u{12345}
454	# chr(0x00012345) == chr(\u{12345}) == $'\u{012345}'
455
456	_U_BRACED_CHAR = R(r'\\[uU]\{[0-9a-fA-F]{1,6}\}', Id.Char_UBraced)
457
458	_X_CHAR_LOOSE = R(r'\\x[0-9a-fA-F]{1,2}', Id.Char_Hex) # bash
459	_X_CHAR_STRICT = R(r'\\x[0-9a-fA-F]{2}', Id.Char_Hex) # YSH
460
461	_U4_CHAR_LOOSE = R(r'\\u[0-9a-fA-F]{1,4}', Id.Char_Unicode4) # bash
462
463	_U4_CHAR_STRICT = R(r'\\u[0-9a-fA-F]{4}', Id.Char_Unicode4) # JSON-only
464
465	EXPR_CHARS = [
466	# This is like Rust. We don't have the legacy C escapes like \b.
467
468	# NOTE: \' and \" are more readable versions of '"' and "'" in regexs
469	R(r'\\[0rtn\\"%s]' % "'", Id.Char_OneChar),
470	_X_CHAR_STRICT,
471
472	# Because 'a' is a string, we use the syntax #'a' for char literals.
473	# We explicitly leave out #''' because it's confusing.
474	# Note: we're not doing utf-8 validation here.
475	R(r"#'[^'\0]'", Id.Char_Pound),
476	_U_BRACED_CHAR,
477	]
478
479	# Shared between echo -e and $''.
480	_C_STRING_COMMON = [
481
482	# \x6 is valid in bash
483	_X_CHAR_LOOSE,
484	_U4_CHAR_LOOSE,
485	R(r'\\U[0-9a-fA-F]{1,8}', Id.Char_Unicode8),
486	R(r'\\[0abeEfrtnv\\]', Id.Char_OneChar),
487
488	# e.g. \A is not an escape, and \x doesn't match a hex escape. We allow it,
489	# but a lint tool could warn about it.
490	C('\\', Id.Unknown_Backslash),
491	]
492
493	ECHO_E_DEF = _C_STRING_COMMON + [
494	# Note: tokens above \0377 can either be truncated or be flagged a syntax
495	# error in strict mode.
496	R(r'\\0[0-7]{1,3}', Id.Char_Octal4),
497	C(r'\c', Id.Char_Stop),
498
499	# e.g. 'foo', anything that's not a backslash escape
500	R(r'[^\\\0]+', Id.Char_Literals),
501	]
502
503	# https://json.org/
504
# Building blocks for JSON number regexes.
#
# Note that [0-9] has to come second, because Python chooses the first match.
# The | is a regex alternation and must NOT be backslash-escaped, otherwise a
# lone '0' would never match.
_JSON_INT = r'([1-9][0-9]*|[0-9])'  # Numbers can't start with leading 0
_JSON_FRACTION = r'(\.[0-9]+)?'  # optional .digits
_JSON_EXP = r'([eE][-+]?[0-9]+)?'  # optional exponent, e.g. e-10
509
510	# R5RS extended alphabetic characters
511	# https://groups.csail.mit.edu/mac/ftpdir/scheme-reports/r5rs-html/r5rs_4.html
512	#
513	# ! $ % & * + - . / : < = > ? @ ^ _ ~
514
515	# Description from Guile Scheme - https://www.gnu.org/software/guile/manual/html_node/Symbol-Read-Syntax.html
516	#
517	# "The read syntax for a symbol is a sequence of letters, digits, and extended
518	# alphabetic characters, beginning with a character that cannot begin a
519	# number. In addition, the special cases of +, -, and ... are read as symbols
520	# even though numbers can begin with +, - or ."
521	#
522	# (They should have used regular languages!)
523
524	# We take out $ and @ for our splicing syntax, i.e. $unquote and
525	# @unquote-splicing. And : for now because we use it for name:value.
526
527	# Also note Scheme allows |a b| for symbols with funny chars, and Guile scheme
528	# allows #{a b}#. We could use `a b` or (symbol "a b").
529
530	J8_SYMBOL_CHARS = r'!%&*+./<=>?^_~-' # - is last for regex char class
531
532	# yapf: disable
533	J8_SYMBOL_RE = (
534	r'[a-zA-Z' + J8_SYMBOL_CHARS + ']' +
535	r'[a-zA-Z0-9' + J8_SYMBOL_CHARS + ']*')
536	# yapf: enable
537
# Token rules for J8 data (JSON, JSON8 and NIL8), outside of strings.
J8_DEF = [
    C('"', Id.Left_DoubleQuote),  # JSON string
    # Three left quotes that are J8 only
    C("u'", Id.Left_USingleQuote),  # unicode string
    C("'", Id.Left_USingleQuote),  # '' is alias for u'' in data, not in code
    C("b'", Id.Left_BSingleQuote),  # byte string
    C('[', Id.J8_LBracket),
    C(']', Id.J8_RBracket),
    C('{', Id.J8_LBrace),
    C('}', Id.J8_RBrace),
    C('(', Id.J8_LParen),  # NIL8 only
    C(')', Id.J8_RParen),  # NIL8 only
    C(',', Id.J8_Comma),
    C(':', Id.J8_Colon),
    C('null', Id.J8_Null),
    C('true', Id.J8_Bool),
    C('false', Id.J8_Bool),
    R(_JSON_INT, Id.J8_Int),
    R(_JSON_INT + _JSON_FRACTION + _JSON_EXP, Id.J8_Float),

    # Identifier names come AFTER null true false.
    # - Happens to be the same as shell identifier names.
    # - Note that JS allows $ as an identifier, but we don't.
    # - Used for dict keys / NIL8 field names.
    R(VAR_NAME_RE, Id.J8_Identifier),

    # Symbol is a SUPERSET of Identifier.  The first word in NIL8 can be
    # either Symbol or plain Identifier, but field names can only be
    # Identifier.  JSON8 only has Identifier.
    #R(J8_SYMBOL_RE, Id.J8_Symbol),  # NIL8 only
    # Inside a character class | is literal and needs no backslash.
    R(r'[~!@$%^&*+=|:;./<>?-]+', Id.J8_Operator),  # NIL8 only

    # TODO: emit Id.Ignored_Newline to count lines for error messages?
    R(r'[ \r\n\t]+', Id.Ignored_Space),
    # comment is # until end of line
    # // comments are JavaScript style, but right now we might want them as
    # symbols?
    R(r'#[^\n\0]*', Id.Ignored_Comment),  # J8 only (JSON8, NIL8)

    # This will reject ASCII control chars
    R(r'[^\0]', Id.Unknown_Tok),
]
580
581	# Exclude control characters 0x00-0x1f, aka 0-31 in J8 data
582	# But \n has to be allowed in multi-line strings
583	_ASCII_CONTROL = R(r'[\x01-\x1F]', Id.Char_AsciiControl)
584
585	# https://json.org list of chars, plus '
586	_JSON_ONE_CHAR = R(r'\\[\\"/bfnrt]', Id.Char_OneChar)
587
588	# Union of escapes that "" u"" b"" accept. Validation is separate.
589	J8_STR_DEF = [
590	C("'", Id.Right_SingleQuote), # end for J8
591	_JSON_ONE_CHAR,
592	C("\\'", Id.Char_OneChar),
593
594	# osh/word_parse.py relies on this. It has to match $'', which uses _C_STRING_COMMON
595	C('\\', Id.Unknown_Backslash),
596	R(r'\\y[0-9a-fA-F]{2}', Id.Char_YHex), # \yff - J8 only
597	_U_BRACED_CHAR, # \u{123456} - J8 only
598	_ASCII_CONTROL,
599
600	# Note: This will match INVALID UTF-8. UTF-8 validation is another step.
601	R(r'''[^\\'\0]+''', Id.Char_Literals),
602	]
603
604	# For "JSON strings \" \u1234"
605	JSON_STR_DEF = [
606	C('"', Id.Right_DoubleQuote), # end for JSON
607	_JSON_ONE_CHAR,
608	_U4_CHAR_STRICT, # \u1234 - JSON only
609
610	# High surrogate [\uD800, \uDC00)
611	# Low surrogate [\uDC00, \uE000)
612	# This pattern makes it easier to decode. Unpaired surrogates are lexed as Id.Char_Unicode4.
613	R(
614	r'\\u[dD][89aAbB][0-9a-fA-F][0-9a-fA-F]\\u[dD][cCdDeEfF][0-9a-fA-F][0-9a-fA-F]',
615	Id.Char_SurrogatePair),
616	_ASCII_CONTROL,
617
618	# Note: This will match INVALID UTF-8. UTF-8 validation is another step.
619	R(r'[^\\"\0]+', Id.Char_Literals),
620	R(r'[^\0]', Id.Unknown_Tok),
621	]
622
623	LEXER_DEF[lex_mode_e.J8_Str] = J8_STR_DEF
624
625	OCTAL3_RE = r'\\[0-7]{1,3}'
626
627	# https://www.gnu.org/software/bash/manual/html_node/Controlling-the-PromptEvaluator.html#Controlling-the-PromptEvaluator
628	PS1_DEF = [
629	R(OCTAL3_RE, Id.PS_Octal3),
630	R(r'\\[adehHjlnrstT@AuvVwW!#$\\]', Id.PS_Subst),
631	# \D{%H:%M} strftime format
632	R(r'\\D\{[^}\0]*\}', Id.PS_Subst),
633	C(r'\[', Id.PS_LBrace), # non-printing
634	C(r'\]', Id.PS_RBrace),
635	R(r'[^\\\0]+', Id.PS_Literals),
636	# e.g. \x is not a valid escape.
637	C('\\', Id.PS_BadBackslash),
638	]
639
640	# NOTE: Id.Ignored_LineCont is also not supported here, even though the whole
641	# point of it is that supports other backslash escapes like \n! It just
642	# becomes a regular backslash.
643	LEXER_DEF[lex_mode_e.SQ_C] = _C_STRING_COMMON + [
644	# Weird special case matching bash: backslash that ends a line. We emit
645	# this token literally in OSH, but disable it in YSH.
646	C('\\\n', Id.Unknown_Backslash),
647
648	# Silly difference! In echo -e, the syntax is \0377, but here it's $'\377',
649	# with no leading 0.
650	R(OCTAL3_RE, Id.Char_Octal3),
651
652	# ' and " are escaped in $'' mode, but not echo -e.
653	C(r"\'", Id.Char_OneChar),
654	C(r'\"', Id.Char_OneChar),
655
656	# e.g. 'foo', anything that's not a backslash escape or '
657	R(r"[^\\'\0]+", Id.Char_Literals),
658	C("'", Id.Right_SingleQuote),
659	]
660
661	LEXER_DEF[lex_mode_e.PrintfOuter] = _C_STRING_COMMON + [
662	R(OCTAL3_RE, Id.Char_Octal3),
663	R(r"[^%\\\0]+", Id.Char_Literals),
664	C('%%', Id.Format_EscapedPercent),
665	C('%', Id.Format_Percent),
666	]
667
# Rules for the part of a printf format that follows a '%'.
#
# Maybe: bash also supports %(strftime)T
LEXER_DEF[lex_mode_e.PrintfPercent] = [
    # Flags
    R('[- +#]', Id.Format_Flag),
    C('0', Id.Format_Zero),
    R('[1-9][0-9]*', Id.Format_Num),
    C('*', Id.Format_Star),
    C('.', Id.Format_Dot),
    # We support dsq.  The others we parse to display an error message.
    R('[disqbcouxXeEfFgG]', Id.Format_Type),
    # (strftime-format)T: literal parens around a run of non-paren chars.
    R(r'\([^()\0]*\)T', Id.Format_Time),
    R(r'[^\0]', Id.Unknown_Tok),  # any other char
]
681
682	LEXER_DEF[lex_mode_e.VSub_1] = [
683	R(VAR_NAME_RE, Id.VSub_Name),
684	# ${11} is valid, compared to $11 which is $1 and then literal 1.
685	R(r'[0-9]+', Id.VSub_Number),
686	C('!', Id.VSub_Bang),
687	C('@', Id.VSub_At),
688	C('#', Id.VSub_Pound),
689	C('$', Id.VSub_Dollar),
690	C('*', Id.VSub_Star),
691	C('-', Id.VSub_Hyphen),
692	C('?', Id.VSub_QMark),
693	C('.', Id.VSub_Dot), # ${.myproc builtin sub}
694	C('}', Id.Right_DollarBrace),
695	C('\\\n', Id.Ignored_LineCont),
696	C('\n', Id.Unknown_Tok), # newline not allowed inside ${}
697	R(r'[^\0]', Id.Unknown_Tok), # any char except newline
698	]
699
700	LEXER_DEF[lex_mode_e.VSub_2] = \
701	ID_SPEC.LexerPairs(Kind.VTest) + \
702	ID_SPEC.LexerPairs(Kind.VOp0) + \
703	ID_SPEC.LexerPairs(Kind.VOpOil) + \
704	ID_SPEC.LexerPairs(Kind.VOp1) + \
705	ID_SPEC.LexerPairs(Kind.VOp2) + \
706	ID_SPEC.LexerPairs(Kind.VOp3) + [
707	C('}', Id.Right_DollarBrace),
708
709	C('\\\n', Id.Ignored_LineCont),
710	C('\n', Id.Unknown_Tok), # newline not allowed inside ${}
711	R(r'[^\0]', Id.Unknown_Tok), # any char except newline
712	]
713
714	_EXPR_ARITH_SHARED = [
715	C('\\\n', Id.Ignored_LineCont),
716	R(r'[^\0]', Id.Unknown_Tok) # any char. This should be a syntax error.
717	]
718
719	# https://www.gnu.org/software/bash/manual/html_node/Shell-Arithmetic.html#Shell-Arithmetic
720	LEXER_DEF[lex_mode_e.Arith] = \
721	_LEFT_SUBS + _VARS + _LEFT_UNQUOTED + [
722
723	# Arithmetic expressions can cross newlines.
724	R(r'[ \t\r\n]+', Id.Ignored_Space),
725
726	# Examples of arith constants:
727	# 64#azAZ
728	# 0xabc 0xABC
729	# 0123
730	# A separate digits token makes this easier to parse STATICALLY. But this
731	# doesn't help with DYNAMIC parsing.
732	R(VAR_NAME_RE, Id.Lit_ArithVarLike), # for variable names or 64#_
733	R(r'[0-9]+', Id.Lit_Digits),
734	C('@', Id.Lit_At), # for 64#@ or ${a[@]}
735	C('#', Id.Lit_Pound), # for 64#a
736
737	# TODO: 64#@ interferes with VS_AT. Hm.
738	] + ID_SPEC.LexerPairs(Kind.Arith) + _EXPR_ARITH_SHARED
739
740	# A lexer for the parser that converts globs to extended regexes. Since we're
741	# only parsing character classes ([^[:space:][:alpha:]]) as opaque blobs, we
742	# don't need lexer modes here.
743	GLOB_DEF = [
744	# These could be operators in the glob, or just literals in a char class,
745	# e.g. touch '?'; echo [?].
746	C('*', Id.Glob_Star),
747	C('?', Id.Glob_QMark),
748
749	# For negation. Treated as operators inside [], but literals outside.
750	C('!', Id.Glob_Bang),
751	C('^', Id.Glob_Caret),
752
753	# Character classes.
754	C('[', Id.Glob_LBracket),
755	C(']', Id.Glob_RBracket),
756
757	# There is no whitelist of characters; backslashes are unconditionally
758	# removed. With libc.fnmatch(), the pattern r'\f' matches 'f' but not '\\f'.
759	# See libc_test.py.
760	R(r'\\[^\0]', Id.Glob_EscapedChar),
761	C('\\', Id.Glob_BadBackslash), # Trailing single backslash
762
763	# For efficiency, combine other characters into a single token, e.g. 'py' in
764	# '*.py' or 'alpha' in '[[:alpha:]]'.
765	R(r'[a-zA-Z0-9_]+', Id.Glob_CleanLiterals), # no regex escaping
766	R(r'[^\0]', Id.Glob_OtherLiteral), # anything else -- examine the char
767	]
768
769	# History expansion. We're doing this as "pre-lexing" since that's what bash
770	# and zsh seem to do. Example:
771	#
772	# $ foo=x
773	# $ echo $
774	# $ !!foo # expands to echo $foo and prints x
775	#
776	# We can also reuse this in the RootCompleter to expand history interactively.
777	#
778	# bash note: handled in lib/readline/histexpand.c. Quite messy and handles
779	# quotes AGAIN.
780	#
781	# Note: \! gets expanded to literal \! for the real lexer, but no history
782	# expansion occurs.
783
784	HISTORY_DEF = [
785	# Common operators.
786	R(r'![!*^$]', Id.History_Op),
787
788	# By command number.
789	R(r'!-?[0-9]+', Id.History_Num),
790
791	# Search by prefix or substring (optional '?').
792	# NOTE: there are no numbers allowed here! Bash doesn't seem to support it.
793	# No hyphen since it conflicts with $-1 too.
794	#
795	# Required trailing whitespace is there to avoid conflict with [!charclass]
796	# and ${!indirect}. This is a simpler hack than the one bash has. See
797	# frontend/lex_test.py.
798	R(r'!\??[a-zA-Z_/.][0-9a-zA-Z_/.]+[ \t\r\n]', Id.History_Search),
799
800	# Comment is until end of line
801	R(r"#[^\0]*", Id.History_Other),
802
803	# Single quoted, e.g. 'a' or $'\n'. Terminated by another single quote or
804	# end of string.
805	R(r"'[^'\0]*'?", Id.History_Other),
806
807	# Runs of chars that are definitely not special
808	R(r"[^!\\'#\0]+", Id.History_Other),
809
810	# Escaped characters. \! disables history
811	R(r'\\[^\0]', Id.History_Other),
812	# Other single chars, like a trailing \ or !
813	R(r'[^\0]', Id.History_Other),
814	]
815
816	BRACE_RANGE_DEF = [
817	R(r'-?[0-9]+', Id.Range_Int),
818	R(r'[a-zA-Z]', Id.Range_Char), # just a single character
819	R(r'\.\.', Id.Range_Dots),
820	R(r'[^\0]', Id.Range_Other), # invalid
821	]
822
823	#
824	# YSH lexing
825	#
826
827	# Valid in lex_mode_e.{Expr,DQ}
828	# Used by ysh/grammar_gen.py
829	YSH_LEFT_SUBS = [
830	C('$(', Id.Left_DollarParen),
831	C('${', Id.Left_DollarBrace),
832	C('$[', Id.Left_DollarBracket), # TODO: Implement $[x]
833	]
834
# Valid in lex_mode_e.Expr, but not valid in DQ
# Used by ysh/grammar_gen.py

YSH_LEFT_UNQUOTED = [
    C('"', Id.Left_DoubleQuote),
    # In expression mode, we add the r'' and c'' prefixes for '' and $''.
    C("'", Id.Left_SingleQuote),
    C("r'", Id.Left_RSingleQuote),
    C("u'", Id.Left_USingleQuote),
    C("b'", Id.Left_BSingleQuote),
    C("$'", Id.Left_DollarSingleQuote),
    C('^"', Id.Left_CaretDoubleQuote),
    C('"""', Id.Left_TDoubleQuote),
    # In expression mode, we add the r'' and c'' prefixes for '' and $''.
    C("'''", Id.Left_TSingleQuote),
    C("r'''", Id.Left_RTSingleQuote),
    C("u'''", Id.Left_UTSingleQuote),
    C("b'''", Id.Left_BTSingleQuote),
    C('@(', Id.Left_AtParen),  # Split Command Sub
    C('^(', Id.Left_CaretParen),  # Block literals in expression mode
    C('^[', Id.Left_CaretBracket),  # Expr literals
    C('^{', Id.Left_CaretBrace),  # Unused
    # Constant string, so the pipe is not escaped.
    C(':|', Id.Left_ColonPipe),  # shell-like word arrays.
    C('%(', Id.Left_PercentParen),  # old syntax for shell-like word arrays.
    C('%[', Id.Expr_Reserved),  # Maybe: like %() without unquoted [], {}
    C('%{', Id.Expr_Reserved),  # Table literals
    # t = %{
    #    name:Str  age:Int
    #    'andy c'  10
    # }
    # Significant newlines.  No unquoted [], {}

    # Not sure if we'll use these
    C('@{', Id.Expr_Reserved),
    C('@[', Id.Expr_Reserved),

    # Idea: Set literals are #{a, b} like Clojure
]
873
874	# Used by ysh/grammar_gen.py
875	EXPR_OPS = [
876	# Terminator
877	C(';', Id.Op_Semi),
878	C('(', Id.Op_LParen),
879	C(')', Id.Op_RParen),
880	# NOTE: type expressions are expressions, e.g. Dict[Str, Int]
881	C('[', Id.Op_LBracket),
882	C(']', Id.Op_RBracket),
883	C('{', Id.Op_LBrace),
884	C('}', Id.Op_RBrace),
885	]
886
887	# Newline is significant, but sometimes elided by expr_parse.py.
888	_EXPR_NEWLINE_COMMENT = [
889	C('\n', Id.Op_Newline),
890	R(r'#[^\n\0]*', Id.Ignored_Comment),
891	R(r'[ \t\r]+', Id.Ignored_Space),
892	]
893
894	_WHITESPACE = r'[ \t\r\n]*' # not including legacy \f \v
895
896	# Python allows 0 to be written 00 or 0_0_0, which is weird. But let's be
897	# consistent, and avoid '00' turning into a float!
898	_DECIMAL_INT_RE = r'[0-9](_?[0-9])*'
899
900	# Used for YSH comparison operators > >= < <=
901	LOOKS_LIKE_INTEGER = _WHITESPACE + '-?' + _DECIMAL_INT_RE + _WHITESPACE
902
903	_FLOAT_RE = (
904	_DECIMAL_INT_RE +
905	# Unlike Python, exponent can't be like 42e5_000. There's no use because
906	# 1e309 is already inf. Let's keep our code simple.
907	r'(\.' + _DECIMAL_INT_RE + ')?([eE][+\-]?[0-9]+)?')
908
909	# Ditto, used for comparison operators
910	# Added optional -?
911	# Example: -3_000_000.000_001e12
912	LOOKS_LIKE_FLOAT = _WHITESPACE + '-?' + _FLOAT_RE + _WHITESPACE
913
914	# Python 3 float literals:
915
916	# digitpart ::= digit (["_"] digit)*
917	# fraction ::= "." digitpart
918	# exponent ::= ("e" \| "E") ["+" \| "-"] digitpart
919	# pointfloat ::= [digitpart] fraction \| digitpart "."
920	# exponentfloat ::= (digitpart \| pointfloat) exponent
921	# floatnumber ::= pointfloat \| exponentfloat
922
# Rules for YSH expression mode.  The rule list is the concatenation of the
# shared lists named below, the literal/operator rules spelled out here, and
# the newline/comment and shared-arith lists appended at the end.
#
# NOTE: Borrowing tokens from Arith (i.e. $(( )) ), but not using LexerPairs().
LEXER_DEF[lex_mode_e.Expr] = \
    _VARS + YSH_LEFT_SUBS + YSH_LEFT_UNQUOTED + EXPR_OPS + EXPR_WORDS + \
    EXPR_CHARS + [

    # https://docs.python.org/3/reference/lexical_analysis.html#integer-literals
    #
    # integer      ::=  decinteger | bininteger | octinteger | hexinteger
    # decinteger   ::=  nonzerodigit (["_"] digit)* | "0"+ (["_"] "0")*
    # bininteger   ::=  "0" ("b" | "B") (["_"] bindigit)+
    # octinteger   ::=  "0" ("o" | "O") (["_"] octdigit)+
    # hexinteger   ::=  "0" ("x" | "X") (["_"] hexdigit)+
    # nonzerodigit ::=  "1"..."9"
    # digit        ::=  "0"..."9"
    # bindigit     ::=  "0" | "1"
    # octdigit     ::=  "0"..."7"
    # hexdigit     ::=  digit | "a"..."f" | "A"..."F"

    R(_DECIMAL_INT_RE, Id.Expr_DecInt),

    R(r'0[bB](_?[01])+', Id.Expr_BinInt),
    R(r'0[oO](_?[0-7])+', Id.Expr_OctInt),
    R(r'0[xX](_?[0-9a-fA-F])+', Id.Expr_HexInt),

    R(_FLOAT_RE, Id.Expr_Float),

    # These can be looked up as keywords separately, so you enforce that they have
    # space around them?
    R(VAR_NAME_RE, Id.Expr_Name),

    R('%' + VAR_NAME_RE, Id.Expr_Symbol),

    #
    # Arith
    #

    C(',', Id.Arith_Comma),
    C(':', Id.Arith_Colon),  # for slicing a[1:2], and mylist:pop()

    C('?', Id.Arith_QMark),  # regex postfix

    C('+', Id.Arith_Plus),  # arith infix, regex postfix
    C('-', Id.Arith_Minus),  # arith infix, regex postfix
    C('*', Id.Arith_Star),
    C('^', Id.Arith_Caret),  # xor
    C('/', Id.Arith_Slash),
    C('%', Id.Arith_Percent),

    C('**', Id.Arith_DStar),  # exponentiation
    C('++', Id.Arith_DPlus),  # Option for string/list concatenation

    C('<', Id.Arith_Less),
    C('>', Id.Arith_Great),
    C('<=', Id.Arith_LessEqual),
    C('>=', Id.Arith_GreatEqual),
    C('===', Id.Expr_TEqual),
    C('!==', Id.Expr_NotDEqual),

    C('==', Id.Unknown_DEqual),  # user must choose === or ~==

    # Bitwise operators
    # C() rules are constant strings, so the pipe is written bare (no
    # backslash): a backslash here would be part of the matched text.
    C('&', Id.Arith_Amp),
    C('|', Id.Arith_Pipe),
    C('>>', Id.Arith_DGreat),
    C('<<', Id.Arith_DLess),  # Doesn't Java also have <<< ?

    # Bitwise complement, as well as infix pattern matching
    C('~', Id.Arith_Tilde),
    C('!~', Id.Expr_NotTilde),
    C('~~', Id.Expr_DTilde),
    C('!~~', Id.Expr_NotDTilde),

    # Left out for now:
    # ++ --       -- needed for loops, awk?
    # ! && ||     -- needed for find dialect
    # = += etc.

    C('=', Id.Arith_Equal),

    C('+=', Id.Arith_PlusEqual),
    C('-=', Id.Arith_MinusEqual),
    C('*=', Id.Arith_StarEqual),
    C('/=', Id.Arith_SlashEqual),
    C('%=', Id.Arith_PercentEqual),

    C('>>=', Id.Arith_DGreatEqual),
    C('<<=', Id.Arith_DLessEqual),
    C('&=', Id.Arith_AmpEqual),
    C('|=', Id.Arith_PipeEqual),
    # NOTE(review): this was labeled "Exponentiation", but '^' is commented as
    # xor above and '**=' is listed separately below -- confirm the intent.
    C('^=', Id.Arith_CaretEqual),

    # Augmented assignment that YSH has, but sh and OSH don't have
    C('**=', Id.Expr_DStarEqual),
    C('//=', Id.Expr_DSlashEqual),

    #
    # Expr
    #

    C('!', Id.Expr_Bang),  # For eggex negation

    C('//', Id.Expr_DSlash),  # For YSH integer division
    C('~==', Id.Expr_TildeDEqual),  # approximate equality

    C('.', Id.Expr_Dot),  # d.key is alias for d['key']
    C('..', Id.Expr_DDot),  # range 1..5
    C('->', Id.Expr_RArrow),  # s->startswith()
    C('$', Id.Expr_Dollar),  # legacy regex end: /d+ $/ (better written /d+ >/)

    # Reserved this.  Go uses it for channels, etc.
    # I guess it conflicts with -4<-3, but that's OK -- spaces suffices.
    C('<-', Id.Expr_Reserved),
    C('=>', Id.Expr_RDArrow),  # for df => filter(age > 10)
    # and match (x) { 1 => "one" }
    # note: other languages use |>
    # R/dplyr uses %>%

    C('...', Id.Expr_Ellipsis),  # f(...args) and maybe a[:, ...]

    # For multiline regex literals?
    C('///', Id.Expr_Reserved),

    # Splat operators
    C('@', Id.Expr_At),
    # NOTE: Unused
    C('@@', Id.Expr_DoubleAt),
] + _EXPR_NEWLINE_COMMENT + _EXPR_ARITH_SHARED
1050
# Two rules: an empty pair of parens, with optional blanks before '(' and
# between the parens, yields LookAhead_FuncParens; any other single non-NUL
# character yields Unknown_Tok.
LEXER_DEF[lex_mode_e.FuncParens] = [
    # () with spaces
    # The parens are escaped because this is a regex rule (R), not a constant.
    R(r'[ \t]*\([ \t]*\)', Id.LookAhead_FuncParens),
    # anything else
    # [^\0] rather than '.', because no rule may match the NUL sentinel.
    R(r'[^\0]', Id.Unknown_Tok)
]