1"""
2lexer_def.py -- A lexer for both OSH and YSH.
3
4It consists of a series of lexer modes, each with a regex -> Id mapping.
5
6After changing this file, run:
7
8 build/py.sh all
9
10or at least:
11
12 build/py.sh fastlex
13
14Input Handling
15--------------
16
17Every line is NUL terminated:
18
19 'one\n\0' 'last line\0'
20
21which means that no regexes below should match \0. The core/lexer_gen.py code
22generator adds and extra rule for \0.
23
24For example, use [^'\0]+ instead of [^']+ .
25
26If this rule isn't followed, we would read uninitialized memory past the
27sentinel. Python's regex engine knows where the end of the input string is, so
28it doesn't require need a sentinel like \0.
29"""

from _devbuild.gen.id_kind_asdl import Id, Id_t, Kind
from _devbuild.gen.types_asdl import lex_mode_e

from frontend import id_kind_def

from typing import Tuple

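# Illustrative sketch of the "Input Handling" rule above, NOT used by the
# lexer: excluding \0 from a negated char class stops a match before the
# sentinel. This demo helper is hypothetical.
def _DemoSentinelRule():
    # type: () -> None
    import re
    line = 'abc\0'  # every line the lexer sees ends with the \0 sentinel
    # Following the rule: the match stops before the sentinel.
    assert re.match(r"[^'\0]+", line).group(0) == 'abc'
    # Breaking the rule: the match swallows \0; in the re2c-generated C code
    # this would mean reading past the end of the buffer.
    assert re.match(r"[^']+", line).group(0) == 'abc\0'
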
# Initialize spec that the lexer depends on.
ID_SPEC = id_kind_def.IdSpec({}, {})

id_kind_def.AddKinds(ID_SPEC)
id_kind_def.AddBoolKinds(ID_SPEC)  # must come second
id_kind_def.SetupTestBuiltin(ID_SPEC, {}, {}, {})


def C(pat, tok_type):
    # type: (str, Id_t) -> Tuple[bool, str, Id_t]
    """Lexer rule with a constant string, e.g. C('$*', VSub_Star)"""
    return (False, pat, tok_type)


def R(pat, tok_type):
    # type: (str, Id_t) -> Tuple[bool, str, Id_t]
    """Lexer rule with a regex string, e.g. R('\$[0-9]', VSub_Number)"""
    return (True, pat, tok_type)

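
# A minimal sketch, NOT the real matcher, of how a table of C() / R() rules
# is consumed: every rule is tried at the current position and the longest
# match wins, with earlier rules breaking ties (the discipline re2c follows).
# The (slow) pure-Python matcher actually used lives in frontend/match.py.
def _DemoMatchOneToken(rules, line, start_pos):
    # rules: list of (is_regex, pattern, token id); returns (token id, end pos)
    import re
    best = None
    for is_regex, pat, tok_type in rules:
        if not is_regex:
            pat = re.escape(pat)
        m = re.compile(pat).match(line, start_pos)
        if m and (best is None or m.end(0) > best[1]):
            best = (tok_type, m.end(0))  # longest match wins
    assert best is not None, 'rule tables are expected to end with a catch-all'
    return best
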

# See unit tests in frontend/match_test.py.
# We need the [^\0]* because the re2c translation assumes it's anchored like $.
SHOULD_HIJACK_RE = r'#![^\0]*sh[ \t\r\n][^\0]*'
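
# Hypothetical sketch of anchored use from Python: re.match() alone is not
# anchored at the end, so a caller adds \Z to mirror the anchored re2c rule.
def _DemoShouldHijack(first_line):
    # type: (str) -> bool
    import re
    return bool(re.match(SHOULD_HIJACK_RE + r'\Z', first_line))
    # _DemoShouldHijack('#!/bin/sh\n') is True
    # _DemoShouldHijack('#!/usr/bin/env python\n') is False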

_SIGNIFICANT_SPACE = R(r'[ \t]+', Id.WS_Space)

_BACKSLASH = [
    # To be conservative, we could deny a set of chars similar to
    # _LITERAL_WHITELIST_REGEX, rather than allowing all the operator
    # characters like \( and \;.
    #
    # strict_backslash makes this stricter.
    R(r'\\[^\n\0]', Id.Lit_EscapedChar),
    C('\\\n', Id.Ignored_LineCont),
]

# Only 4 characters are backslash escaped inside "".
# https://www.gnu.org/software/bash/manual/bash.html#Double-Quotes
_DQ_BACKSLASH = [
    R(r'\\[$`"\\]', Id.Lit_EscapedChar),
    C('\\', Id.Lit_BadBackslash),  # syntax error in YSH, but NOT in OSH
]

VAR_NAME_RE = r'[a-zA-Z_][a-zA-Z0-9_]*'

# All Kind.VSub
_VARS = [
    # Unbraced variables
    R(r'\$' + VAR_NAME_RE, Id.VSub_DollarName),
    R(r'\$[0-9]', Id.VSub_Number),
    C(r'$!', Id.VSub_Bang),
    C(r'$@', Id.VSub_At),
    C(r'$#', Id.VSub_Pound),
    C(r'$$', Id.VSub_Dollar),
    C(r'$*', Id.VSub_Star),
    C(r'$-', Id.VSub_Hyphen),
    C(r'$?', Id.VSub_QMark),
]

# Kind.Left that are valid in double-quoted modes.
_LEFT_SUBS = [
    C('`', Id.Left_Backtick),
    C('$(', Id.Left_DollarParen),
    C('${', Id.Left_DollarBrace),
    C('$((', Id.Left_DollarDParen),
    C('$[', Id.Left_DollarBracket),
]

# Additional Kind.Left that are valid in unquoted modes.
_LEFT_UNQUOTED = [
    C('"', Id.Left_DoubleQuote),
    C("'", Id.Left_SingleQuote),
    C('$"', Id.Left_DollarDoubleQuote),
    C("$'", Id.Left_DollarSingleQuote),
]

_LEFT_PROCSUB = [
    C('<(', Id.Left_ProcSubIn),
    C('>(', Id.Left_ProcSubOut),
]

# The regexes below are in Python syntax, but are translated to re2c syntax by
# frontend/lexer_gen.py.
#
# http://re2c.org/manual/syntax/syntax.html
# https://docs.python.org/2/library/re.html
#
# We use a limited set of constructs:
# - + and * for repetition
# - Character classes [] with simple ranges and negation
# - Escapes like \n \0

LEXER_DEF = {}  # TODO: Should be a list so we enforce order.

# Anything until the end of the line is a comment. Does not match the newline
# itself. We want to switch modes and possibly process Op_Newline for here
# docs, etc.
LEXER_DEF[lex_mode_e.Comment] = [R(r'[^\n\0]*', Id.Ignored_Comment)]

# A whitelist for efficiency. The shell language says that "anything else" is
# a literal character. In other words, a single $ \ or ! is a literal, not a
# syntax error. It's defined negatively, but let's define positive runs here.
# TODO: Add + here because it's never special? It's different for YSH though.

# The range \x80-\xff makes sure that UTF-8 sequences are a single token.
_LITERAL_WHITELIST_REGEX = r'[\x80-\xffa-zA-Z0-9_.\-]+'

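# Illustrative check (not used by the lexer): because \x80-\xff is in the
# class, a multi-byte UTF-8 sequence stays inside one Lit_Chars token. This
# demo helper is hypothetical and byte-oriented, like the C++ lexer.
def _DemoUtf8Run():
    # type: () -> None
    import re
    line = 'caf\xc3\xa9!'  # 'café' as UTF-8 bytes, then '!'
    m = re.match(_LITERAL_WHITELIST_REGEX, line)
    assert m.group(0) == 'caf\xc3\xa9'  # one run, not split mid-sequence
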
_UNQUOTED = _BACKSLASH + _LEFT_SUBS + _LEFT_UNQUOTED + _LEFT_PROCSUB + _VARS + [
    # NOTE: We could add anything 128 and above to this character class? So
    # utf-8 characters don't get split?
    R(_LITERAL_WHITELIST_REGEX, Id.Lit_Chars),
    C('~', Id.Lit_Tilde),  # for tilde sub
    C('/', Id.Lit_Slash),  # also for tilde sub
    C(':', Id.Lit_Colon),  # for special PATH=a:~foo tilde detection
    C('$', Id.Lit_Dollar),  # shopt -u parse_dollar
    C('#', Id.Lit_Pound),  # For comments
    _SIGNIFICANT_SPACE,
    C('\n', Id.Op_Newline),
    C('&', Id.Op_Amp),
    C('|', Id.Op_Pipe),
    C('|&', Id.Op_PipeAmp),
    C('&&', Id.Op_DAmp),
    C('||', Id.Op_DPipe),
    C(';', Id.Op_Semi),
    # Case terminators
    C(';;', Id.Op_DSemi),
    C(';&', Id.Op_SemiAmp),
    C(';;&', Id.Op_DSemiAmp),
    C('(', Id.Op_LParen),
    C(')', Id.Op_RParen),
    R(r'[^\0]', Id.Lit_Other),  # any other single char is a literal
]

# In ShCommand and DBracket states.
_EXTGLOB_BEGIN = [
    C(',(', Id.ExtGlob_Comma),  # YSH synonym for @(...)
    C('@(', Id.ExtGlob_At),
    C('*(', Id.ExtGlob_Star),
    C('+(', Id.ExtGlob_Plus),
    C('?(', Id.ExtGlob_QMark),
    C('!(', Id.ExtGlob_Bang),
]

KEYWORDS = [
    # NOTE: { is matched elsewhere
    C('[[', Id.KW_DLeftBracket),
    C('!', Id.KW_Bang),
    C('for', Id.KW_For),
    C('while', Id.KW_While),
    C('until', Id.KW_Until),
    C('do', Id.KW_Do),
    C('done', Id.KW_Done),
    C('in', Id.KW_In),
    C('case', Id.KW_Case),
    C('esac', Id.KW_Esac),
    C('if', Id.KW_If),
    C('fi', Id.KW_Fi),
    C('then', Id.KW_Then),
    C('else', Id.KW_Else),
    C('elif', Id.KW_Elif),
    C('function', Id.KW_Function),
    C('time', Id.KW_Time),

    # YSH
    C('const', Id.KW_Const),  # maybe remove this
    C('var', Id.KW_Var),
    C('setvar', Id.KW_SetVar),
    C('setglobal', Id.KW_SetGlobal),
    C('call', Id.KW_Call),
    C('proc', Id.KW_Proc),
    C('func', Id.KW_Func),
]

# These are treated like builtins in bash, but keywords in OSH. However, we
# maintain compatibility with bash for the 'type' builtin.
CONTROL_FLOW = [
    C('break', Id.ControlFlow_Break),
    C('continue', Id.ControlFlow_Continue),
    C('return', Id.ControlFlow_Return),
    C('exit', Id.ControlFlow_Exit),
]

# Used by ysh/grammar_gen.py too
EXPR_WORDS = [
    C('null', Id.Expr_Null),
    C('true', Id.Expr_True),
    C('false', Id.Expr_False),
    C('and', Id.Expr_And),
    C('or', Id.Expr_Or),
    C('not', Id.Expr_Not),
    C('for', Id.Expr_For),
    C('while', Id.Expr_While),
    C('is', Id.Expr_Is),
    C('in', Id.Expr_In),
    C('if', Id.Expr_If),
    C('else', Id.Expr_Else),

    # for function literals
    C('func', Id.Expr_Func),

    # / <capture d+/
    C('capture', Id.Expr_Capture),
    # / <capture d+ as date> /
    C('as', Id.Expr_As),

    # Tea Control Flow Operators
    C('break', Id.Expr_Break),
    C('continue', Id.Expr_Continue),
    C('return', Id.Expr_Return),
]

FD_VAR_NAME = r'\{' + VAR_NAME_RE + r'\}'

# File descriptors can have at most two digits, like mksh; dash/zsh/etc.
# allow only one.
FD_NUM = r'[0-9]?[0-9]?'
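
# Illustrative check (hypothetical helper): both digits of FD_NUM are
# optional, so the redirect rules below match '>', '2>', and '10>', and the
# {name} form matches '{fd}>'.
def _DemoRedirPrefix():
    # type: () -> None
    import re
    assert re.match(FD_NUM + r'>\Z', '>')
    assert re.match(FD_NUM + r'>\Z', '2>')
    assert re.match(FD_NUM + r'>\Z', '10>')
    assert re.match(FD_VAR_NAME + r'>\Z', '{fd}>')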

# These must be recognized in the ShCommand state, but can't be nested within
# [[.
# Keywords have to be checked before _UNQUOTED so we get <KW_If "if"> instead
# of <Lit_Chars "if">.
LEXER_DEF[lex_mode_e.ShCommand] = [
    # These four are not allowed within [[, so they are in ShCommand but not
    # _UNQUOTED.

    # e.g. beginning of NAME=val, which will always be longer than
    # _LITERAL_WHITELIST_REGEX.
    R(VAR_NAME_RE + r'\+?=', Id.Lit_VarLike),
    R(VAR_NAME_RE + r'\[', Id.Lit_ArrayLhsOpen),
    R(r'\]\+?=', Id.Lit_ArrayLhsClose),
    C('((', Id.Op_DLeftParen),

    # For static globbing, and [] for array literals
    C('[', Id.Lit_LBracket),  # e.g. A=(['x']=1)
    C(']', Id.Lit_RBracket),  # e.g. *.[ch]
    # NOTE: Glob_Star and Glob_QMark are for dynamic parsing
    C('*', Id.Lit_Star),
    C('?', Id.Lit_QMark),
    C('###', Id.Lit_TPound),  # like Lit_Pound, for doc comments
    C('...', Id.Lit_TDot),  # ... for multiline commands

    # For brace expansion {a,b}
    C('{', Id.Lit_LBrace),
    C('}', Id.Lit_RBrace),  # Also for var sub ${a}
    C(',', Id.Lit_Comma),
    C('=', Id.Lit_Equals),  # for = f(x) and x = 1+2*3
    C('@', Id.Lit_At),  # for detecting @[, @' etc. shopt -s parse_at_all

    # @array and @func(1, c)
    R('@' + VAR_NAME_RE, Id.Lit_Splice),  # for YSH splicing
    C('@[', Id.Lit_AtLBracket),  # @[split(x)]
    C('@{.', Id.Lit_AtLBraceDot),  # for split builtin sub @{.myproc arg1}
    R(FD_NUM + r'<', Id.Redir_Less),
    R(FD_NUM + r'>', Id.Redir_Great),
    R(FD_NUM + r'<<', Id.Redir_DLess),
    R(FD_NUM + r'<<<', Id.Redir_TLess),
    R(FD_NUM + r'>>', Id.Redir_DGreat),
    R(FD_NUM + r'<<-', Id.Redir_DLessDash),
    R(FD_NUM + r'>&', Id.Redir_GreatAnd),
    R(FD_NUM + r'<&', Id.Redir_LessAnd),
    R(FD_NUM + r'<>', Id.Redir_LessGreat),
    R(FD_NUM + r'>\|', Id.Redir_Clobber),
    R(FD_VAR_NAME + r'<', Id.Redir_Less),
    R(FD_VAR_NAME + r'>', Id.Redir_Great),
    R(FD_VAR_NAME + r'<<', Id.Redir_DLess),
    R(FD_VAR_NAME + r'<<<', Id.Redir_TLess),
    R(FD_VAR_NAME + r'>>', Id.Redir_DGreat),
    R(FD_VAR_NAME + r'<<-', Id.Redir_DLessDash),
    R(FD_VAR_NAME + r'>&', Id.Redir_GreatAnd),
    R(FD_VAR_NAME + r'<&', Id.Redir_LessAnd),
    R(FD_VAR_NAME + r'<>', Id.Redir_LessGreat),
    R(FD_VAR_NAME + r'>\|', Id.Redir_Clobber),

    # No leading descriptor (2 is implied)
    C(r'&>', Id.Redir_AndGreat),
    C(r'&>>', Id.Redir_AndDGreat),
] + KEYWORDS + CONTROL_FLOW + _UNQUOTED + _EXTGLOB_BEGIN

# Preprocessing before ShCommand
LEXER_DEF[lex_mode_e.Backtick] = [
    C(r'`', Id.Backtick_Right),
    # A backslash, and then $ or ` or \
    R(r'\\[$`\\]', Id.Backtick_Quoted),
    # \" is treated specially, depending on whether backticks are double-quoted!
    R(r'\\"', Id.Backtick_DoubleQuote),
    R(r'[^`\\\0]+', Id.Backtick_Other),  # contiguous run of literals
    R(r'[^\0]', Id.Backtick_Other),  # anything else
]

# DBRACKET: can be like ShCommand, except:
# - Don't really need redirects either... Redir_Less could be Op_Less
# - Id.Op_DLeftParen can't be nested inside.
LEXER_DEF[lex_mode_e.DBracket] = [
    C(']]', Id.Lit_DRightBracket),
    # Must be KW and not Op, because we can have stuff like [[ $foo == !* ]]
    # in addition to [[ ! a && b ]]
    C('!', Id.KW_Bang),
    C('<', Id.Op_Less),
    C('>', Id.Op_Great),
] + ID_SPEC.LexerPairs(Kind.BoolUnary) + \
    ID_SPEC.LexerPairs(Kind.BoolBinary) + \
    _UNQUOTED + _EXTGLOB_BEGIN

# Inside an extended glob, most characters are literals, including spaces and
# punctuation. We also accept \, $var, ${var}, "", etc. They can also be
# nested, so _EXTGLOB_BEGIN appears here.
#
# Example: echo @(<> <>|&&|'foo'|$bar)
LEXER_DEF[lex_mode_e.ExtGlob] = \
    _BACKSLASH + _LEFT_SUBS + _LEFT_UNQUOTED + _VARS + _EXTGLOB_BEGIN + [
    R(r'[^\\$`"\'|)@*+!?\0]+', Id.Lit_Chars),
    C('|', Id.Op_Pipe),
    C(')', Id.Op_RParen),  # may be translated to Id.ExtGlob_RParen
    R(r'[^\0]', Id.Lit_Other),  # everything else is literal
]

# Notes on BASH_REGEX states
#
# From bash manual:
#
# - Any part of the pattern may be quoted to force the quoted portion to be
#   matched as a string.
# - Bracket expressions in regular expressions must be treated carefully, since
#   normal quoting characters lose their meanings between brackets.
# - If the pattern is stored in a shell variable, quoting the variable
#   expansion forces the entire pattern to be matched as a string.
#
# Is there a re.escape function? It's just like EscapeGlob and UnescapeGlob.
#
# TODO: For testing, write a script to extract and save regexes... and compile
# them with regcomp. I've only seen constant regexes.
#
# From code: ( | ) are treated specially.

LEXER_DEF[lex_mode_e.BashRegex] = _LEFT_SUBS + _LEFT_UNQUOTED + _VARS + [

    # NOTE: bash accounts for spaces and non-word punctuation like ; inside ()
    # and []. We will avoid that and ask the user to extract a variable?
    R(r'[a-zA-Z0-9_-]+', Id.Lit_Chars),  # not including period

    # Tokens for Tilde sub. bash weirdness: the RHS of [[ x =~ ~ ]] is expanded.
    C('~', Id.Lit_Tilde),
    C('/', Id.Lit_Slash),
    _SIGNIFICANT_SPACE,

    # Normally, \x evaluates to x. But quoted regex metacharacters like \*
    # should evaluate to \*. Compare with ( | ).
    R(r'\\[*+?.^$\[\]]', Id.Lit_RegexMeta),

    # NOTE: ( | and ) aren't operators!
    R(r'[^\0]', Id.Lit_Other),  # Everything else is a literal
] + _BACKSLASH  # These have to come after RegexMeta

LEXER_DEF[lex_mode_e.DQ] = _DQ_BACKSLASH + [
    C('\\\n', Id.Ignored_LineCont),
] + _LEFT_SUBS + _VARS + [
    R(r'[^$`"\0\\]+', Id.Lit_Chars),  # matches a line at most
    C('$', Id.Lit_Dollar),  # completion of var names relies on this
    # NOTE: When parsing a here doc line, this token doesn't end it.
    C('"', Id.Right_DoubleQuote),
]

_VS_ARG_COMMON = [
    C('/', Id.Lit_Slash),  # for patsub (not Id.VOp2_Slash)
    C('#', Id.Lit_Pound),  # for patsub prefix (not Id.VOp1_Pound)
    C('%', Id.Lit_Percent),  # for patsub suffix (not Id.VOp1_Percent)
    C('}', Id.Right_DollarBrace),  # For var sub "${a}"
    C('$', Id.Lit_Dollar),  # completion of var names relies on this
]

# Kind.{LIT,IGNORED,VS,LEFT,RIGHT,Eof}
LEXER_DEF[lex_mode_e.VSub_ArgUnquoted] = \
    _BACKSLASH + _VS_ARG_COMMON + _LEFT_SUBS + _LEFT_UNQUOTED + _LEFT_PROCSUB + \
    _VARS + _EXTGLOB_BEGIN + [

    # Token for Tilde sub
    C('~', Id.Lit_Tilde),

    # - doesn't match ~ for tilde sub
    # - doesn't match < and > so it doesn't eat <()
    # - doesn't match @ ! ? + * so it doesn't eat _EXTGLOB_BEGIN -- ( alone is
    #   not enough
    R(r'[^$`~/}"\'\0\\#%<>@!?+*]+', Id.Lit_Chars),
    R(r'[^\0]', Id.Lit_Other),  # e.g. "$", must be last
]

# Kind.{LIT,IGNORED,VS,LEFT,RIGHT,Eof}
LEXER_DEF[lex_mode_e.VSub_ArgDQ] = \
    _DQ_BACKSLASH + _VS_ARG_COMMON + _LEFT_SUBS + _VARS + [

    C(r'\}', Id.Lit_EscapedChar),  # For "${var-\}}"

    R(r'[^$`/}"\0\\#%]+', Id.Lit_Chars),  # matches a line at most

    # Weird wart: even in double quoted state, double quotes are allowed
    C('"', Id.Left_DoubleQuote),

    # Another weird wart of bash/mksh: $'' is recognized but NOT ''!
    C("$'", Id.Left_DollarSingleQuote),
]

# NOTE: Id.Ignored_LineCont is NOT supported in SQ state, as opposed to DQ
# state.
LEXER_DEF[lex_mode_e.SQ_Raw] = [
    R(r"[^'\0]+", Id.Lit_Chars),  # matches a line at most
    C("'", Id.Right_SingleQuote),
]

# The main purpose of EXPR_CHARS is regex literals, e.g. [a-z \t \n].
#
# In YSH expressions, Chars are code point integers, so \u{1234} is the same as
# 0x1234. And \0 is 0x0.

# In Python:
#   chr(0x00012345) == u'\U00012345'
#
# In YSH:
#   0x00012345 == \u{12345}
#   chr(0x00012345) == chr(\u{12345}) == $'\u{012345}'

_U_BRACED_CHAR = R(r'\\[uU]\{[0-9a-fA-F]{1,6}\}', Id.Char_UBraced)

_X_CHAR_LOOSE = R(r'\\x[0-9a-fA-F]{1,2}', Id.Char_Hex)  # bash
_X_CHAR_STRICT = R(r'\\x[0-9a-fA-F]{2}', Id.Char_Hex)  # YSH

_U4_CHAR_LOOSE = R(r'\\u[0-9a-fA-F]{1,4}', Id.Char_Unicode4)  # bash

_U4_CHAR_STRICT = R(r'\\u[0-9a-fA-F]{4}', Id.Char_Unicode4)  # JSON-only

EXPR_CHARS = [
    # This is like Rust. We don't have the legacy C escapes like \b.

    # NOTE: \' and \" are more readable versions of '"' and "'" in regexes
    R(r'\\[0rtn\\"%s]' % "'", Id.Char_OneChar),
    _X_CHAR_STRICT,

    # Because 'a' is a string, we use the syntax #'a' for char literals.
    # We explicitly leave out #''' because it's confusing.
    # Note: we're not doing utf-8 validation here.
    R(r"#'[^'\0]'", Id.Char_Pound),
    _U_BRACED_CHAR,
]
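
# Sketch of the decoding a consumer of Id.Char_UBraced would do: strip \u{ }
# and parse the hex digits as a code point. Hypothetical helper, shown only
# to make the comments above concrete.
def _DemoDecodeUBraced(tok_val):
    # type: (str) -> int
    assert tok_val[0] == '\\' and tok_val[1] in 'uU'
    return int(tok_val[3:-1], 16)  # strip the \u{ prefix and } suffix
    # _DemoDecodeUBraced(r'\u{12345}') == 0x12345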

# Shared between echo -e and $''.
_C_STRING_COMMON = [

    # \x6 is valid in bash
    _X_CHAR_LOOSE,
    _U4_CHAR_LOOSE,
    R(r'\\U[0-9a-fA-F]{1,8}', Id.Char_Unicode8),
    R(r'\\[0abeEfrtnv\\]', Id.Char_OneChar),

    # e.g. \A is not an escape, and \x doesn't match a hex escape. We allow it,
    # but a lint tool could warn about it.
    C('\\', Id.Unknown_Backslash),
]

ECHO_E_DEF = _C_STRING_COMMON + [
    # Note: tokens above \0377 can either be truncated or be flagged as a
    # syntax error in strict mode.
    R(r'\\0[0-7]{1,3}', Id.Char_Octal4),
    C(r'\c', Id.Char_Stop),

    # e.g. 'foo', anything that's not a backslash escape
    R(r'[^\\\0]+', Id.Lit_Chars),
]

# https://json.org/

# Note that [0-9] has to come second, because Python chooses the first match.
_JSON_INT = r'([1-9][0-9]*|[0-9])'  # Numbers can't start with leading 0
_JSON_FRACTION = r'(\.[0-9]+)?'
_JSON_EXP = r'([eE][-+]?[0-9]+)?'
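
# Illustrative check of the ordering note above: Python alternation takes the
# FIRST branch that matches, not the longest one. Hypothetical demo helper.
def _DemoJsonIntOrder():
    # type: () -> None
    import re
    assert re.match(_JSON_INT, '123').group(0) == '123'
    # With the branches flipped, '123' would lex as just '1':
    assert re.match(r'([0-9]|[1-9][0-9]*)', '123').group(0) == '1'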

# R5RS extended alphabetic characters
# https://groups.csail.mit.edu/mac/ftpdir/scheme-reports/r5rs-html/r5rs_4.html
#
# ! $ % & * + - . / : < = > ? @ ^ _ ~

# Description from Guile Scheme - https://www.gnu.org/software/guile/manual/html_node/Symbol-Read-Syntax.html
#
# "The read syntax for a symbol is a sequence of letters, digits, and extended
# alphabetic characters, beginning with a character that cannot begin a
# number. In addition, the special cases of +, -, and ... are read as symbols
# even though numbers can begin with +, - or ."
#
# (They should have used regular languages!)

# We take out $ and @ for our splicing syntax, i.e. $unquote and
# @unquote-splicing. And : for now because we use it for name:value.

# Also note Scheme allows |a b| for symbols with funny chars, and Guile scheme
# allows #{a b}#. We could use `a b` or (symbol "a b").

J8_SYMBOL_CHARS = r'!%&*+./<=>?^_~-'  # - is last for regex char class

# yapf: disable
J8_SYMBOL_RE = (
    r'[a-zA-Z' + J8_SYMBOL_CHARS + ']' +
    r'[a-zA-Z0-9' + J8_SYMBOL_CHARS + ']*')
# yapf: enable

J8_DEF = [
    C('"', Id.Left_DoubleQuote),  # JSON string
    # Three left quotes that are J8 only
    C("u'", Id.Left_USingleQuote),  # unicode string
    C("'", Id.Left_USingleQuote),  # '' is alias for u'' in data, not in code
    C("b'", Id.Left_BSingleQuote),  # byte string
    C('[', Id.J8_LBracket),
    C(']', Id.J8_RBracket),
    C('{', Id.J8_LBrace),
    C('}', Id.J8_RBrace),
    C('(', Id.J8_LParen),  # NIL8 only
    C(')', Id.J8_RParen),  # NIL8 only
    C(',', Id.J8_Comma),
    C(':', Id.J8_Colon),
    C('null', Id.J8_Null),
    C('true', Id.J8_Bool),
    C('false', Id.J8_Bool),
    R(_JSON_INT, Id.J8_Int),
    R(_JSON_INT + _JSON_FRACTION + _JSON_EXP, Id.J8_Float),

    # Identifier names come AFTER null true false.
    # - Happens to be the same as shell identifier names.
    # - Note that JS allows $ as an identifier, but we don't.
    # - Used for dict keys / NIL8 field names.
    R(VAR_NAME_RE, Id.J8_Identifier),

    # Symbol is a SUPERSET of Identifier. The first word in NIL8 can be either
    # Symbol or plain Identifier, but field names can only be Identifier.
    # JSON8 only has Identifier.
    #R(J8_SYMBOL_RE, Id.J8_Symbol),  # NIL8 only
    R(r'[~!@$%^&*+=|:;./<>?-]+', Id.J8_Operator),  # NIL8 only

    # TODO: emit Id.Ignored_Newline to count lines for error messages?
    R(r'[ \r\n\t]+', Id.Ignored_Space),
    # comment is # until end of line
    # // comments are JavaScript style, but right now we might want them as
    # symbols?
    R(r'#[^\n\0]*', Id.Ignored_Comment),  # J8 only (JSON8, NIL8)

    # This will reject ASCII control chars
    R(r'[^\0]', Id.Unknown_Tok),
]

# Exclude control characters 0x00-0x1f, aka 0-31 in J8 data
# But \n has to be allowed in multi-line strings
_ASCII_CONTROL = R(r'[\x01-\x1F]', Id.Char_AsciiControl)

# https://json.org list of chars, plus '
_JSON_ONE_CHAR = R(r'\\[\\"/bfnrt]', Id.Char_OneChar)

# Union of escapes that "" u"" b"" accept. Validation is separate.
J8_STR_DEF = [
    C("'", Id.Right_SingleQuote),  # end for J8
    _JSON_ONE_CHAR,
    C("\\'", Id.Char_OneChar),

    # osh/word_parse.py relies on this. It has to match $'', which uses
    # _C_STRING_COMMON.
    C('\\', Id.Unknown_Backslash),
    R(r'\\y[0-9a-fA-F]{2}', Id.Char_YHex),  # \yff - J8 only
    _U_BRACED_CHAR,  # \u{123456} - J8 only
    _ASCII_CONTROL,

    # Note: This will match INVALID UTF-8. UTF-8 validation is another step.
    R(r'''[^\\'\0]+''', Id.Lit_Chars),
]

# For "JSON strings \" \u1234"
JSON_STR_DEF = [
    C('"', Id.Right_DoubleQuote),  # end for JSON
    _JSON_ONE_CHAR,
    _U4_CHAR_STRICT,  # \u1234 - JSON only

    # High surrogate [\uD800, \uDC00)
    # Low surrogate [\uDC00, \uE000)
    # This pattern makes it easier to decode. Unpaired surrogates become
    # Id.Char_Unicode4.
    R(
        r'\\u[dD][89aAbB][0-9a-fA-F][0-9a-fA-F]\\u[dD][cCdDeEfF][0-9a-fA-F][0-9a-fA-F]',
        Id.Char_SurrogatePair),
    _ASCII_CONTROL,

    # Note: This will match INVALID UTF-8. UTF-8 validation is another step.
    R(r'[^\\"\0]+', Id.Lit_Chars),
    R(r'[^\0]', Id.Unknown_Tok),
]
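
# Sketch of decoding an Id.Char_SurrogatePair token value into a code point,
# using the standard UTF-16 formula. Hypothetical helper; the real decoder
# lives with the JSON parser, not here.
def _DemoDecodeSurrogatePair(tok_val):
    # type: (str) -> int
    # tok_val looks like '\\ud83d\\ude00' (12 characters)
    hi = int(tok_val[2:6], 16)   # e.g. 0xD83D
    lo = int(tok_val[8:12], 16)  # e.g. 0xDE00
    return 0x10000 + ((hi - 0xD800) << 10) + (lo - 0xDC00)
    # _DemoDecodeSurrogatePair(r'\ud83d\ude00') == 0x1F600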

LEXER_DEF[lex_mode_e.J8_Str] = J8_STR_DEF

OCTAL3_RE = r'\\[0-7]{1,3}'

# https://www.gnu.org/software/bash/manual/html_node/Controlling-the-Prompt.html#Controlling-the-Prompt
PS1_DEF = [
    R(OCTAL3_RE, Id.PS_Octal3),
    R(r'\\[adehHjlnrstT@AuvVwW!#$\\]', Id.PS_Subst),
    # \D{%H:%M} strftime format
    R(r'\\D\{[^}\0]*\}', Id.PS_Subst),
    C(r'\[', Id.PS_LBrace),  # non-printing
    C(r'\]', Id.PS_RBrace),
    R(r'[^\\\0]+', Id.PS_Literals),
    # e.g. \x is not a valid escape.
    C('\\', Id.PS_BadBackslash),
]

# NOTE: Id.Ignored_LineCont is also not supported here, even though the whole
# point of this mode is that it supports other backslash escapes like \n! A
# backslash-newline just becomes a regular backslash.
LEXER_DEF[lex_mode_e.SQ_C] = _C_STRING_COMMON + [
    # Weird special case matching bash: backslash that ends a line. We emit
    # this token literally in OSH, but disable it in YSH.
    C('\\\n', Id.Unknown_Backslash),

    # Silly difference! In echo -e, the syntax is \0377, but here it's $'\377',
    # with no leading 0.
    R(OCTAL3_RE, Id.Char_Octal3),

    # ' and " are escaped in $'' mode, but not echo -e.
    C(r"\'", Id.Char_OneChar),
    C(r'\"', Id.Char_OneChar),

    # e.g. 'foo', anything that's not a backslash escape or '
    R(r"[^\\'\0]+", Id.Lit_Chars),
    C("'", Id.Right_SingleQuote),
]

LEXER_DEF[lex_mode_e.PrintfOuter] = _C_STRING_COMMON + [
    R(OCTAL3_RE, Id.Char_Octal3),
    R(r"[^%\\\0]+", Id.Lit_Chars),
    C('%%', Id.Format_EscapedPercent),
    C('%', Id.Format_Percent),
]

# Maybe: bash also supports %(strftime)T
LEXER_DEF[lex_mode_e.PrintfPercent] = [
    # Flags
    R('[- +#]', Id.Format_Flag),
    C('0', Id.Format_Zero),
    R('[1-9][0-9]*', Id.Format_Num),
    C('*', Id.Format_Star),
    C('.', Id.Format_Dot),
    # We support %d %s %q. The others are parsed so we can display an error
    # message.
    R('[disqbcouxXeEfFgG]', Id.Format_Type),
    R(r'\([^()\0]*\)T', Id.Format_Time),
    R(r'[^\0]', Id.Unknown_Tok),  # any other char
]

LEXER_DEF[lex_mode_e.VSub_1] = [
    R(VAR_NAME_RE, Id.VSub_Name),
    # ${11} is valid, compared to $11 which is $1 and then literal 1.
    R(r'[0-9]+', Id.VSub_Number),
    C('!', Id.VSub_Bang),
    C('@', Id.VSub_At),
    C('#', Id.VSub_Pound),
    C('$', Id.VSub_Dollar),
    C('*', Id.VSub_Star),
    C('-', Id.VSub_Hyphen),
    C('?', Id.VSub_QMark),
    C('.', Id.VSub_Dot),  # ${.myproc builtin sub}
    C('}', Id.Right_DollarBrace),
    C('\\\n', Id.Ignored_LineCont),
    C('\n', Id.Unknown_Tok),  # newline not allowed inside ${}
    R(r'[^\0]', Id.Unknown_Tok),  # any char except newline
]

LEXER_DEF[lex_mode_e.VSub_2] = \
    ID_SPEC.LexerPairs(Kind.VTest) + \
    ID_SPEC.LexerPairs(Kind.VOp0) + \
    ID_SPEC.LexerPairs(Kind.VOpYsh) + \
    ID_SPEC.LexerPairs(Kind.VOp1) + \
    ID_SPEC.LexerPairs(Kind.VOp2) + \
    ID_SPEC.LexerPairs(Kind.VOp3) + [
    C('}', Id.Right_DollarBrace),

    C('\\\n', Id.Ignored_LineCont),
    C('\n', Id.Unknown_Tok),  # newline not allowed inside ${}
    R(r'[^\0]', Id.Unknown_Tok),  # any char except newline
]

_EXPR_ARITH_SHARED = [
    C('\\\n', Id.Ignored_LineCont),
    R(r'[^\0]', Id.Unknown_Tok)  # any char. This should be a syntax error.
]

# https://www.gnu.org/software/bash/manual/html_node/Shell-Arithmetic.html#Shell-Arithmetic
LEXER_DEF[lex_mode_e.Arith] = \
    _LEFT_SUBS + _VARS + _LEFT_UNQUOTED + [

    # Arithmetic expressions can cross newlines.
    R(r'[ \t\r\n]+', Id.Ignored_Space),

    # Examples of arith constants:
    #   64#azAZ
    #   0xabc 0xABC
    #   0123
    # A separate digits token makes this easier to parse STATICALLY. But this
    # doesn't help with DYNAMIC parsing.
    R(VAR_NAME_RE, Id.Lit_ArithVarLike),  # for variable names or 64#_
    R(r'[0-9]+', Id.Lit_Digits),
    C('@', Id.Lit_At),  # for 64#@ or ${a[@]}
    C('#', Id.Lit_Pound),  # for 64#a

    # TODO: 64#@ interferes with VS_AT. Hm.
] + ID_SPEC.LexerPairs(Kind.Arith) + _EXPR_ARITH_SHARED

# A lexer for the parser that converts globs to extended regexes. Since we're
# only parsing character classes ([^[:space:][:alpha:]]) as opaque blobs, we
# don't need lexer modes here.
GLOB_DEF = [
    # These could be operators in the glob, or just literals in a char class,
    # e.g. touch '?'; echo [?].
    C('*', Id.Glob_Star),
    C('?', Id.Glob_QMark),

    # For negation. Treated as operators inside [], but literals outside.
    C('!', Id.Glob_Bang),
    C('^', Id.Glob_Caret),

    # Character classes.
    C('[', Id.Glob_LBracket),
    C(']', Id.Glob_RBracket),

    # There is no whitelist of characters; backslashes are unconditionally
    # removed. With libc.fnmatch(), the pattern r'\f' matches 'f' but not
    # '\\f'. See libc_test.py.
    R(r'\\[^\0]', Id.Glob_EscapedChar),
    C('\\', Id.Glob_BadBackslash),  # Trailing single backslash

    # For efficiency, combine other characters into a single token, e.g. 'py'
    # in '*.py' or 'alpha' in '[[:alpha:]]'.
    R(r'[a-zA-Z0-9_]+', Id.Glob_CleanLiterals),  # no regex escaping
    R(r'[^\0]', Id.Glob_OtherLiteral),  # anything else -- examine the char
]

# History expansion. We're doing this as "pre-lexing" since that's what bash
# and zsh seem to do. Example:
#
#   $ foo=x
#   $ echo $
#   $ !!foo   # expands to 'echo $foo' and prints x
#
# We can also reuse this in the RootCompleter to expand history interactively.
#
# bash note: handled in lib/readline/histexpand.c. Quite messy and handles
# quotes AGAIN.
#
# Note: \! gets expanded to literal \! for the real lexer, but no history
# expansion occurs.

HISTORY_DEF = [
    # Common operators.
    R(r'![!*^$]', Id.History_Op),

    # By command number.
    R(r'!-?[0-9]+', Id.History_Num),

    # Search by prefix or substring (optional '?').
    # NOTE: there are no numbers allowed here! Bash doesn't seem to support it.
    # No hyphen since it conflicts with $-1 too.
    #
    # Required trailing whitespace is there to avoid conflict with [!charclass]
    # and ${!indirect}. This is a simpler hack than the one bash has. See
    # frontend/lex_test.py.
    R(r'!\??[a-zA-Z_/.][0-9a-zA-Z_/.]+[ \t\r\n]', Id.History_Search),

    # Comment is until end of line
    R(r"#[^\0]*", Id.History_Other),

    # Single quoted, e.g. 'a' or $'\n'. Terminated by another single quote or
    # end of string.
    R(r"'[^'\0]*'?", Id.History_Other),

    # Runs of chars that are definitely not special
    R(r"[^!\\'#\0]+", Id.History_Other),

    # Escaped characters. \! disables history
    R(r'\\[^\0]', Id.History_Other),
    # Other single chars, like a trailing \ or !
    R(r'[^\0]', Id.History_Other),
]
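
# Illustrative check of the trailing-whitespace hack above, in the spirit of
# the cases in frontend/lex_test.py. Hypothetical demo helper.
def _DemoHistorySearch():
    # type: () -> None
    import re
    pat = r'!\??[a-zA-Z_/.][0-9a-zA-Z_/.]+[ \t\r\n]'
    assert re.match(pat, '!echo ')            # expands
    assert not re.match(pat, '[!abc]')        # glob char class, left alone
    assert not re.match(pat, '${!indirect}')  # indirection, left alone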

BRACE_RANGE_DEF = [
    R(r'-?[0-9]+', Id.Range_Int),
    R(r'[a-zA-Z]', Id.Range_Char),  # just a single character
    R(r'\.\.', Id.Range_Dots),
    R(r'[^\0]', Id.Range_Other),  # invalid
]

#
# YSH lexing
#

# Valid in lex_mode_e.{Expr,DQ}
# Used by ysh/grammar_gen.py
YSH_LEFT_SUBS = [
    C('$(', Id.Left_DollarParen),
    C('${', Id.Left_DollarBrace),
    C('$[', Id.Left_DollarBracket),  # TODO: Implement $[x]
]

# Valid in lex_mode_e.Expr, but not valid in DQ
# Used by ysh/grammar_gen.py

YSH_LEFT_UNQUOTED = [
    C('"', Id.Left_DoubleQuote),
    # In expression mode, we add the r'' and c'' prefixes for '' and $''.
    C("'", Id.Left_SingleQuote),
    C("r'", Id.Left_RSingleQuote),
    C("u'", Id.Left_USingleQuote),
    C("b'", Id.Left_BSingleQuote),
    C("$'", Id.Left_DollarSingleQuote),
    C('^"', Id.Left_CaretDoubleQuote),
    C('"""', Id.Left_TDoubleQuote),
    # In expression mode, we add the r'' and c'' prefixes for '' and $''.
    C("'''", Id.Left_TSingleQuote),
    C("r'''", Id.Left_RTSingleQuote),
    C("u'''", Id.Left_UTSingleQuote),
    C("b'''", Id.Left_BTSingleQuote),
    C('@(', Id.Left_AtParen),  # Split Command Sub
    C('^(', Id.Left_CaretParen),  # Block literals in expression mode
    C('^[', Id.Left_CaretBracket),  # Expr literals
    C('^{', Id.Left_CaretBrace),  # Unused
    C(':|', Id.Left_ColonPipe),  # shell-like word arrays.
    C('%(', Id.Left_PercentParen),  # old syntax for shell-like word arrays.
    C('%[', Id.Expr_Reserved),  # Maybe: like %() without unquoted [], {}
    C('%{', Id.Expr_Reserved),  # Table literals
    # t = %{
    #    name:Str  age:Int
    #    'andy c'  10
    # }
    # Significant newlines. No unquoted [], {}

    # Not sure if we'll use these
    C('@{', Id.Expr_Reserved),
    C('@[', Id.Expr_Reserved),

    # Idea: Set literals are #{a, b} like Clojure
]

# Used by ysh/grammar_gen.py
EXPR_OPS = [
    # Terminator
    C(';', Id.Op_Semi),
    C('(', Id.Op_LParen),
    C(')', Id.Op_RParen),
    # NOTE: type expressions are expressions, e.g. Dict[Str, Int]
    C('[', Id.Op_LBracket),
    C(']', Id.Op_RBracket),
    C('{', Id.Op_LBrace),
    C('}', Id.Op_RBrace),
]

# Newline is significant, but sometimes elided by expr_parse.py.
_EXPR_NEWLINE_COMMENT = [
    C('\n', Id.Op_Newline),
    R(r'#[^\n\0]*', Id.Ignored_Comment),
    R(r'[ \t\r]+', Id.Ignored_Space),
]

_WHITESPACE = r'[ \t\r\n]*'  # not including legacy \f \v

# Python allows 0 to be written 00 or 0_0_0, which is weird. But let's be
# consistent, and avoid '00' turning into a float!
_DECIMAL_INT_RE = r'[0-9](_?[0-9])*'

# Used for YSH comparison operators > >= < <=
LOOKS_LIKE_INTEGER = _WHITESPACE + '-?' + _DECIMAL_INT_RE + _WHITESPACE

_FLOAT_RE = (
    _DECIMAL_INT_RE +
    # Unlike Python, the exponent can't be like 42e5_000. There's no use
    # because 1e309 is already inf. Let's keep our code simple.
    r'(\.' + _DECIMAL_INT_RE + ')?([eE][+\-]?[0-9]+)?')

# Ditto, used for comparison operators. Added an optional -?
# Example: -3_000_000.000_001e12
LOOKS_LIKE_FLOAT = _WHITESPACE + '-?' + _FLOAT_RE + _WHITESPACE
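
# Sketch of the intended use (hypothetical helper): before evaluating
# something like x < y on strings, a caller can test whether both operands
# look numeric. The patterns are unanchored, so \Z is added here.
def _DemoLooksLikeInteger(s):
    # type: (str) -> bool
    import re
    return bool(re.match(LOOKS_LIKE_INTEGER + r'\Z', s))
    # _DemoLooksLikeInteger(' -42 ') is True
    # _DemoLooksLikeInteger('4 2') is False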

# Python 3 float literals:

# digitpart     ::= digit (["_"] digit)*
# fraction      ::= "." digitpart
# exponent      ::= ("e" | "E") ["+" | "-"] digitpart
# pointfloat    ::= [digitpart] fraction | digitpart "."
# exponentfloat ::= (digitpart | pointfloat) exponent
# floatnumber   ::= pointfloat | exponentfloat

# NOTE: Borrowing tokens from Arith (i.e. $(( )) ), but not using LexerPairs().
LEXER_DEF[lex_mode_e.Expr] = \
    _VARS + YSH_LEFT_SUBS + YSH_LEFT_UNQUOTED + EXPR_OPS + EXPR_WORDS + \
    EXPR_CHARS + [

    # https://docs.python.org/3/reference/lexical_analysis.html#integer-literals
    #
    # integer      ::= decinteger | bininteger | octinteger | hexinteger
    # decinteger   ::= nonzerodigit (["_"] digit)* | "0"+ (["_"] "0")*
    # bininteger   ::= "0" ("b" | "B") (["_"] bindigit)+
    # octinteger   ::= "0" ("o" | "O") (["_"] octdigit)+
    # hexinteger   ::= "0" ("x" | "X") (["_"] hexdigit)+
    # nonzerodigit ::= "1"..."9"
    # digit        ::= "0"..."9"
    # bindigit     ::= "0" | "1"
    # octdigit     ::= "0"..."7"
    # hexdigit     ::= digit | "a"..."f" | "A"..."F"

    R(_DECIMAL_INT_RE, Id.Expr_DecInt),

    R(r'0[bB](_?[01])+', Id.Expr_BinInt),
    R(r'0[oO](_?[0-7])+', Id.Expr_OctInt),
    R(r'0[xX](_?[0-9a-fA-F])+', Id.Expr_HexInt),

    R(_FLOAT_RE, Id.Expr_Float),

    # These can be looked up as keywords separately, so you enforce that they
    # have space around them?
    R(VAR_NAME_RE, Id.Expr_Name),

    R('%' + VAR_NAME_RE, Id.Expr_Symbol),

    #
    # Arith
    #

    C(',', Id.Arith_Comma),
    C(':', Id.Arith_Colon),  # for slicing a[1:2], and mylist:pop()

    C('?', Id.Arith_QMark),  # regex postfix

    C('+', Id.Arith_Plus),  # arith infix, regex postfix
    C('-', Id.Arith_Minus),  # arith infix, regex postfix
    C('*', Id.Arith_Star),
    C('^', Id.Arith_Caret),  # xor
    C('/', Id.Arith_Slash),
    C('%', Id.Arith_Percent),

    C('**', Id.Arith_DStar),  # exponentiation
    C('++', Id.Arith_DPlus),  # Option for string/list concatenation

    C('<', Id.Arith_Less),
    C('>', Id.Arith_Great),
    C('<=', Id.Arith_LessEqual),
    C('>=', Id.Arith_GreatEqual),
    C('===', Id.Expr_TEqual),
    C('!==', Id.Expr_NotDEqual),

    C('==', Id.Unknown_DEqual),  # user must choose === or ~==

    # Bitwise operators
    C('&', Id.Arith_Amp),
    C('|', Id.Arith_Pipe),
    C('>>', Id.Arith_DGreat),
    C('<<', Id.Arith_DLess),  # Doesn't Java also have <<< ?

    # Bitwise complement, as well as infix pattern matching
    C('~', Id.Arith_Tilde),
    C('!~', Id.Expr_NotTilde),
    C('~~', Id.Expr_DTilde),
    C('!~~', Id.Expr_NotDTilde),

    # Left out for now:
    #   ++ --    -- needed for loops, awk?
    #   ! && ||  -- needed for find dialect
    #   = += etc.

    C('=', Id.Arith_Equal),

    C('+=', Id.Arith_PlusEqual),
    C('-=', Id.Arith_MinusEqual),
    C('*=', Id.Arith_StarEqual),
    C('/=', Id.Arith_SlashEqual),
    C('%=', Id.Arith_PercentEqual),

    C('>>=', Id.Arith_DGreatEqual),
    C('<<=', Id.Arith_DLessEqual),
    C('&=', Id.Arith_AmpEqual),
    C('|=', Id.Arith_PipeEqual),
    C('^=', Id.Arith_CaretEqual),  # xor, not exponentiation

    # Augmented assignment that YSH has, but sh and OSH don't have
    C('**=', Id.Expr_DStarEqual),
    C('//=', Id.Expr_DSlashEqual),

    #
    # Expr
    #

    C('!', Id.Expr_Bang),  # For eggex negation

    C('//', Id.Expr_DSlash),  # For YSH integer division
    C('~==', Id.Expr_TildeDEqual),  # approximate equality

    C('.', Id.Expr_Dot),  # d.key is alias for d['key']
    C('..', Id.Expr_DDot),  # range 1..5
    C('->', Id.Expr_RArrow),  # s->startswith()
    C('$', Id.Expr_Dollar),  # legacy regex end: /d+ $/ (better written as /d+ >/)

    # Reserved. Go uses it for channels, etc.
    # I guess it conflicts with -4<-3, but that's OK -- a space suffices.
    C('<-', Id.Expr_Reserved),
    C('=>', Id.Expr_RDArrow),  # for df => filter(age > 10)
    # and match (x) { 1 => "one" }
    # note: other languages use |>
    # R/dplyr uses %>%

    C('...', Id.Expr_Ellipsis),  # f(...args) and maybe a[:, ...]

    # For multiline regex literals?
    C('///', Id.Expr_Reserved),

    # Splat operators
    C('@', Id.Expr_At),
    # NOTE: Unused
    C('@@', Id.Expr_DoubleAt),
] + _EXPR_NEWLINE_COMMENT + _EXPR_ARITH_SHARED

LEXER_DEF[lex_mode_e.FuncParens] = [
    # () with spaces
    R(r'[ \t]*\([ \t]*\)', Id.LookAhead_FuncParens),
    # anything else
    R(r'[^\0]', Id.Unknown_Tok)
]