| 1 | """
|
| 2 | lex.py -- Shell lexer.
|
| 3 |
|
| 4 | It consists of a series of lexer modes, each with a regex -> Id mapping.
|
| 5 |
|
| 6 | NOTE: If this changes, the lexer may need to be recompiled with
|
| 7 | build/codegen.sh lexer.
|
| 8 |
|
| 9 | Input Handling
|
| 10 | --------------
|
| 11 |
|
| 12 | Note that our style of input Handling affects the regular expressions in the
|
| 13 | lexer.
|
| 14 |
|
| 15 | We pass one line at a time to the Lexer, via LineLexer. We must be able to
|
| 16 | parse one line at a time because of interactive parsing (e.g. using the output
|
| 17 | of GNU readline.)
|
| 18 |
|
| 19 | There are two ways we could handle input:
|
| 20 |
|
| 21 | 1. Every line is NUL terminated:
|
| 22 | 'one\n\0' 'last line\0'
|
| 23 | 2. Every line is terminated by NUL, except the last:
|
| 24 | 'one\n' 'last line\0'
|
| 25 |
|
| 26 | The advantage of #2 is that in the common case of reading files, we don't have
|
| 27 | to do it one line at a time. We could slurp the whole file in, or mmap() it,
|
| 28 | etc.
|
| 29 |
|
| 30 | The second option makes the regular expressions more complicated, so I'm
|
| 31 | punting on it for now. We assume the first.
|
| 32 |
|
| 33 | That means:
|
| 34 |
|
| 35 | - No regexes below should match \0. They are added by
|
| 36 | core/lexer_gen.py for re2c.
|
| 37 |
|
| 38 | For example, [^']+ is not valid. [^'\0]+ is correct. Otherwise we would read
|
| 39 | uninitialized memory past the sentinel.
|
| 40 |
|
| 41 | Python's regex engine knows where the end of the input string is, so it
|
| 42 | doesn't require need a sentinel like \0.
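
For illustration, here is the sentinel convention simulated in plain Python
(Python itself doesn't need the \0, so this is only a sketch of what the
generated re2c code relies on):

    >>> import re
    >>> re.match(r"[^'\0]+", "echo hi\0").group(0)
    'echo hi'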

Note that re2c is not able to work in a mode with a strict length limit. It
would presumably require too many extra checks, and the language would then no
longer be regular!

http://re2c.org/examples/example_03.html

UPDATE: Two More Options
------------------------

3. Change the \n at the end of every line to \0. \0 becomes Id.Op_Newline, at
   least in lex_mode.OUTER.

Advantage: This makes the regular expressions easier to generate, and allows
you to read in the whole file at once instead of allocating lines.

Disadvantages:
- You can't mmap() the file, because the data is mutated. Or it would have to
  be copy-on-write.
- You can't get rid of comment lines if you read the whole file.

4. Read a line at a time. Throw away the lines, unless you're parsing a
   function, which should be obvious.

After you parse the function, you can COPY all the tokens to another location.
Very few tokens need their actual text data. Most of them can just be
identified by ID.

Contents are relevant:

- Lit_Chars, Lit_Other, Lit_EscapedChar, Lit_Digits
- Id.Lit_VarLike -- for the name, and for = vs +=
- Id.Lit_ArithVarLike
- VSub_Name, VSub_Number
- Id.Redir_* for the LHS file descriptor. Although this is one or two bytes
  that could be copied.

You can also take this opportunity to enter the strings in an intern table.
How much memory would that save?

Remaining constructs
--------------------

Case terminators:
  ;;&                  Op_DSemiAmp for case
  ;&                   Op_SemiAmp

Left Index:

  _VAR_NAME_RE + '\['  Lit_LeftIndexLikeOpen
  ]=                   Lit_LeftIndexLikeClose

Indexed array and associative array literals:
  declare -A a=([key]=value [key2]=value2)
  declare -a a=([1 + 2]=value [3 + 4]=value2)  # parsed!

  Lit_LBracket   Lit_RBracketEqual
  Left_Bracket,  Right_BracketEqual?
  Op_LBracket    Op_RBracketEqual
"""

import re

from osh.meta import Id, Kind, ID_SPEC, types
from core.lexer import C, R

lex_mode_e = types.lex_mode_e


# In oil, I hope to have these lexer modes:
# COMMAND
# EXPRESSION (takes the place of ARITH, VS_UNQ_ARG, VS_DQ_ARG)
# SQ RAW_SQ DQ RAW_DQ
# VS -- a single state here? Or switches into expression state, because }
# is an operator
# Problem: DICT_KEY might be a different state, to accept either a bare word
# foo, or an expression (X=a+2), which is allowed in shell. Python doesn't
# allow unquoted words, but we want to.

# TODO: There are 4 shared groups here. I think you should test whether that
# structure should be preserved through re2c. Do a benchmark.
#
# If a group has no matches, then return Id.Unknown_Tok? And then you can
# chain the groups in order. It might make sense to experiment with the order
# too.

_BACKSLASH = [
  R(r'\\[^\n\0]', Id.Lit_EscapedChar),
  C('\\\n', Id.Ignored_LineCont),
]

_VAR_NAME_RE = r'[a-zA-Z_][a-zA-Z0-9_]*'

# Used by osh/cmd_parse.py to validate the for loop name. Note it must be
# anchored on the right.
VAR_NAME_RE = re.compile(_VAR_NAME_RE + '$')
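
# For illustration, a quick sketch of why the right anchor matters (these
# example calls are ours, not from cmd_parse.py):
#
#   VAR_NAME_RE.match('foo')      # matches: a valid loop variable name
#   VAR_NAME_RE.match('foo-bar')  # None: the trailing '$' rejects the '-'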

# All Kind.VSub
_VARS = [
  # Unbraced variables
  R(r'\$' + _VAR_NAME_RE, Id.VSub_Name),
  R(r'\$[0-9]', Id.VSub_Number),
  C(r'$!', Id.VSub_Bang),
  C(r'$@', Id.VSub_At),
  C(r'$#', Id.VSub_Pound),
  C(r'$$', Id.VSub_Dollar),
  C(r'$*', Id.VSub_Star),
  C(r'$-', Id.VSub_Hyphen),
  C(r'$?', Id.VSub_QMark),
]

# Kind.Left that are valid in double-quoted modes.
_LEFT_SUBS = [
  C('`', Id.Left_Backtick),
  C('$(', Id.Left_CommandSub),
  C('${', Id.Left_VarSub),
  C('$((', Id.Left_ArithSub),
  C('$[', Id.Left_ArithSub2),
]

# Additional Kind.Left that are valid in unquoted modes.
_LEFT_UNQUOTED = [
  C('"', Id.Left_DoubleQuote),
  C("'", Id.Left_SingleQuote),
  C('$"', Id.Left_DollarDoubleQuote),
  C("$'", Id.Left_DollarSingleQuote),

  C('<(', Id.Left_ProcSubIn),
  C('>(', Id.Left_ProcSubOut),
]

# Constructs used:
#   Character classes [] with simple ranges and negation, +, *, \n, \0
# It would be nice to express this as CRE ... ? And then compile it to both
# re2c syntax and Python syntax.

# NOTE: Should remain compatible with re2c syntax, for code gen.
# http://re2c.org/manual/syntax/syntax.html

# PROBLEM: \0 in Python re vs \000 in re2c? Can this be unified?
# Yes, Python allows \000 octal escapes.
#
# https://docs.python.org/2/library/re.html

LEXER_DEF = {}  # TODO: Should be a list so we enforce order.

# Anything until the end of the line is a comment. Does not match the newline
# itself. We want to switch modes and possibly process Op_Newline for here
# docs, etc.
LEXER_DEF[lex_mode_e.COMMENT] = [
  R(r'[^\n\0]*', Id.Ignored_Comment)
]

_UNQUOTED = _BACKSLASH + _LEFT_SUBS + _LEFT_UNQUOTED + _VARS + [
  # NOTE: We could add anything 128 and above to this character class, so
  # that utf-8 characters don't get split?
  R(r'[a-zA-Z0-9_/.-]+', Id.Lit_Chars),
  # e.g. the beginning of NAME=val, which will always match longer than the
  # Id.Lit_Chars rule above.
  R(r'[a-zA-Z_][a-zA-Z0-9_]*\+?=', Id.Lit_VarLike),

  C('#', Id.Lit_Pound),  # For comments

  # Needs to be LONGER than any other
  #(_VAR_NAME_RE + r'\[', Id.Lit_Maybe_LHS_ARRAY),
  # Id.Lit_Maybe_LHS_ARRAY2
  #(r'\]\+?=', Id.Lit_Maybe_ARRAY_ASSIGN_RIGHT),

  # For brace expansion {a,b}
  C('{', Id.Lit_LBrace),
  C('}', Id.Lit_RBrace),  # Also for var sub ${a}
  C(',', Id.Lit_Comma),
  C('~', Id.Lit_Tilde),  # For tilde expansion

  R(r'[ \t\r]+', Id.WS_Space),

  C('\n', Id.Op_Newline),

  C('&', Id.Op_Amp),
  C('|', Id.Op_Pipe),
  C('|&', Id.Op_PipeAmp),
  C('&&', Id.Op_DAmp),
  C('||', Id.Op_DPipe),
  C(';', Id.Op_Semi),
  C(';;', Id.Op_DSemi),

  C('(', Id.Op_LParen),
  C(')', Id.Op_RParen),

  R(r'[0-9]*<', Id.Redir_Less),
  R(r'[0-9]*>', Id.Redir_Great),
  R(r'[0-9]*<<', Id.Redir_DLess),
  R(r'[0-9]*<<<', Id.Redir_TLess),
  R(r'[0-9]*>>', Id.Redir_DGreat),
  R(r'[0-9]*<<-', Id.Redir_DLessDash),
  R(r'[0-9]*>&', Id.Redir_GreatAnd),
  R(r'[0-9]*<&', Id.Redir_LessAnd),
  R(r'[0-9]*<>', Id.Redir_LessGreat),
  R(r'[0-9]*>\|', Id.Redir_Clobber),

  R(r'[^\0]', Id.Lit_Other),  # any other single char is a literal
]
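
# For illustration, a minimal sketch of the longest-match rule among the
# _UNQUOTED patterns (hedged: the real dispatch lives in core/lexer.py and
# the generated re2c code, not here):
#
#   line = 'FOO=bar'
#   pairs = [(r'[a-zA-Z0-9_/.-]+', 'Lit_Chars'),
#            (r'[a-zA-Z_][a-zA-Z0-9_]*\+?=', 'Lit_VarLike')]
#   matches = [(re.match(p, line).end(), tok) for p, tok in pairs]
#   assert max(matches) == (4, 'Lit_VarLike')  # 'FOO=' beats 'FOO'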

# In OUTER and DBRACKET states.
_EXTGLOB_BEGIN = [
  C('@(', Id.ExtGlob_At),
  C('*(', Id.ExtGlob_Star),
  C('+(', Id.ExtGlob_Plus),
  C('?(', Id.ExtGlob_QMark),
  C('!(', Id.ExtGlob_Bang),
]

_KEYWORDS = [
  # NOTE: { is matched elsewhere
  C('[[', Id.KW_DLeftBracket),
  C('!', Id.KW_Bang),
  C('for', Id.KW_For),
  C('while', Id.KW_While),
  C('until', Id.KW_Until),
  C('do', Id.KW_Do),
  C('done', Id.KW_Done),
  C('in', Id.KW_In),
  C('case', Id.KW_Case),
  C('esac', Id.KW_Esac),
  C('if', Id.KW_If),
  C('fi', Id.KW_Fi),
  C('then', Id.KW_Then),
  C('else', Id.KW_Else),
  C('elif', Id.KW_Elif),
  C('function', Id.KW_Function),
  C('time', Id.KW_Time),
]

# These are treated like builtins in bash, but are keywords in OSH. However,
# we maintain compatibility with bash for the 'type' builtin.
_MORE_KEYWORDS = [
  C('declare', Id.Assign_Declare),
  C('typeset', Id.Assign_Typeset),
  C('local', Id.Assign_Local),
  C('readonly', Id.Assign_Readonly),

  C('break', Id.ControlFlow_Break),
  C('continue', Id.ControlFlow_Continue),
  C('return', Id.ControlFlow_Return),
  C('exit', Id.ControlFlow_Exit),
]


# C() and R() return (is_regex, pattern, token_id) tuples, so the middle
# element is the keyword string itself.
_TYPE_KEYWORDS = set(name for _, name, _ in _KEYWORDS)
_TYPE_KEYWORDS.add('{')  # not in our lexer list
_TYPE_BUILTINS = set(name for _, name, _ in _MORE_KEYWORDS)


def IsOtherBuiltin(name):
  return name in _TYPE_BUILTINS


def IsKeyword(name):
  return name in _TYPE_KEYWORDS


# These must be recognized in the OUTER state, but can't be nested within [[.
# Keywords have to be checked before _UNQUOTED so we get <KW_If "if"> instead
# of <Lit_Chars "if">.
LEXER_DEF[lex_mode_e.OUTER] = [
  C('((', Id.Op_DLeftParen),  # not allowed within [[
] + _KEYWORDS + _MORE_KEYWORDS + _UNQUOTED + _EXTGLOB_BEGIN

# DBRACKET: can be like OUTER, except:
# - Don't really need redirects either... Redir_Less could be Op_Less
# - Id.Op_DLeftParen can't be nested inside.
LEXER_DEF[lex_mode_e.DBRACKET] = [
  C(']]', Id.Lit_DRightBracket),
  C('!', Id.KW_Bang),
] + ID_SPEC.LexerPairs(Kind.BoolUnary) + \
    ID_SPEC.LexerPairs(Kind.BoolBinary) + \
    _UNQUOTED + _EXTGLOB_BEGIN

# Inside an extended glob, most characters are literals, including spaces and
# punctuation. We also accept \, $var, ${var}, "", etc. Extended globs can
# also be nested, so _EXTGLOB_BEGIN appears here.
#
# Example: echo @(<> <>|&&|'foo'|$bar)
LEXER_DEF[lex_mode_e.EXTGLOB] = \
    _BACKSLASH + _LEFT_SUBS + _VARS + _EXTGLOB_BEGIN + [
  R(r'[^\\$`"\'|)@*+!?\0]+', Id.Lit_Chars),
  C('|', Id.Op_Pipe),
  C(')', Id.Op_RParen),  # may be translated to Id.ExtGlob_RParen
  R(r'[^\0]', Id.Lit_Other),  # everything else is literal
]
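
# For illustration (a hand-worked token stream, not generated output):
# '@(foo|bar)' begins with <ExtGlob_At '@('> in OUTER, then lexes here as
#   <Lit_Chars 'foo'> <Op_Pipe '|'> <Lit_Chars 'bar'> <Op_RParen ')'>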


LEXER_DEF[lex_mode_e.BASH_REGEX] = [
  # Match these literals first, and then the rest of the OUTER state I guess.
  # That's how bash works.
  #
  # At a minimum, you do need $ and ~ expansions to happen. <>;& could have
  # been allowed unescaped too, but that's not what bash does. The criterion
  # was whether they were "special" in both languages, which seems dubious.
  C('(', Id.Lit_Chars),
  C(')', Id.Lit_Chars),
  C('|', Id.Lit_Chars),
] + [
  # Reuse the OUTER rules, minus the ( ) | constants above, which would
  # otherwise cause an "unreachable rule" error in re2c.
  (is_regex, pat, token_id) for (is_regex, pat, token_id) in _UNQUOTED
  if is_regex or pat not in ('(', ')', '|')
]


LEXER_DEF[lex_mode_e.DQ] = [
  # Only 4 characters are backslash escaped inside "".
  # https://www.gnu.org/software/bash/manual/bash.html#Double-Quotes
  R(r'\\[$`"\\]', Id.Lit_EscapedChar),
  C('\\\n', Id.Ignored_LineCont),
] + _LEFT_SUBS + _VARS + [
  R(r'[^$`"\0\\]+', Id.Lit_Chars),  # matches a line at most
  # NOTE: When parsing a here doc line, this token doesn't end it.
  C('"', Id.Right_DoubleQuote),
  R(r'[^\0]', Id.Lit_Other),  # e.g. "$"
]
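
# For illustration (a hand-worked token stream, hedged): the body of
# "hi $name" lexes in DQ mode as
#   <Lit_Chars 'hi '> <VSub_Name '$name'> <Right_DoubleQuote '"'>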

_VS_ARG_COMMON = _BACKSLASH + [
  C('/', Id.Lit_Slash),  # for patsub (not Id.VOp2_Slash)
  C('#', Id.Lit_Pound),  # for patsub prefix (not Id.VOp1_Pound)
  C('%', Id.Lit_Percent),  # for patsub suffix (not Id.VOp1_Percent)
  C('}', Id.Right_VarSub),  # for var sub "${a}"
]

# Kind.{LIT,IGNORED,VS,LEFT,RIGHT,Eof}
LEXER_DEF[lex_mode_e.VS_ARG_UNQ] = \
    _VS_ARG_COMMON + _LEFT_SUBS + _LEFT_UNQUOTED + _VARS + [
  # NOTE: added < and > so it doesn't eat <()
  R(r'[^$`/}"\'\0\\#%<>]+', Id.Lit_Chars),
  R(r'[^\0]', Id.Lit_Other),  # e.g. "$", must be last
]

# Kind.{LIT,IGNORED,VS,LEFT,RIGHT,Eof}
LEXER_DEF[lex_mode_e.VS_ARG_DQ] = _VS_ARG_COMMON + _LEFT_SUBS + _VARS + [
  R(r'[^$`/}"\0\\#%]+', Id.Lit_Chars),  # matches a line at most
  # Weird wart: even in double quoted state, double quotes are allowed
  C('"', Id.Left_DoubleQuote),
  R(r'[^\0]', Id.Lit_Other),  # e.g. "$", must be last
]

# NOTE: Id.Ignored_LineCont is NOT supported in SQ state, as opposed to DQ
# state.
LEXER_DEF[lex_mode_e.SQ] = [
  R(r"[^'\0]+", Id.Lit_Chars),  # matches a line at most
  C("'", Id.Right_SingleQuote),
]

# Shared between echo -e and $''.
_C_STRING_COMMON = [

  # \x6 is valid in bash
  R(r'\\x[0-9a-fA-F]{1,2}', Id.Char_Hex),
  R(r'\\u[0-9a-fA-F]{1,4}', Id.Char_Unicode4),
  R(r'\\U[0-9a-fA-F]{1,8}', Id.Char_Unicode8),

  R(r'\\[0abeEfrtnv\\]', Id.Char_OneChar),

  # Backslash that ends a line. Note '.' doesn't match a newline character.
  C('\\\n', Id.Char_Literals),

  # e.g. \A is not an escape, and a bare \x doesn't match the hex escape
  # above. We allow these, but a lint tool could warn about them.
  C('\\', Id.Char_BadBackslash),
]

# Used by ECHO_LEXER in core/builtin.py.
ECHO_E_DEF = _C_STRING_COMMON + [
  # Note: tokens above \0377 can either be truncated or be flagged as a
  # syntax error in strict mode.
  R(r'\\0[0-7]{1,3}', Id.Char_Octal4),

  C(r'\c', Id.Char_Stop),

  # A bad backslash should not end the string. We allow it, but a lint tool
  # should warn about it.
  R(r'\\$', Id.Char_BadBackslash),

  # e.g. 'foo', anything that's not a backslash escape
  R(r'[^\\]+', Id.Char_Literals),
]
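
# For illustration (a hand-worked token stream, hedged): the argument of
# echo -e 'a\x41\c' lexes as
#   <Char_Literals 'a'> <Char_Hex '\x41'> <Char_Stop '\c'>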

# NOTE: Id.Ignored_LineCont is also not supported here, even though the whole
# point of $'' is that it supports other backslash escapes like \n! A
# backslash-newline just becomes literal text (C('\\\n') above).
LEXER_DEF[lex_mode_e.DOLLAR_SQ] = _C_STRING_COMMON + [
  # Silly difference! In echo -e, the syntax is \0377, but here it's $'\377',
  # with no leading 0.
  R(r'\\[0-7]{1,3}', Id.Char_Octal3),

  # ' is escaped in $'' mode, but not in echo -e. Ditto for ", not sure why.
  C(r"\'", Id.Char_OneChar),
  C(r'\"', Id.Char_OneChar),

  # e.g. 'foo', anything that's not a backslash escape. Need to exclude ' as
  # well.
  R(r"[^\\'\0]+", Id.Char_Literals),

  C("'", Id.Right_SingleQuote),

  # Backslash that ends the file! Caught by the re2c exhaustiveness check.
  # The parser will assert; it should give a better syntax error.
  C('\\\0', Id.Unknown_Tok),
]
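
# For illustration (a hand-worked token stream, hedged): the body of $'a\n\''
# lexes as
#   <Char_Literals 'a'> <Char_OneChar '\n'> <Char_OneChar "\'">
#   <Right_SingleQuote "'">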

LEXER_DEF[lex_mode_e.VS_1] = [
  R(_VAR_NAME_RE, Id.VSub_Name),
  # ${11} is valid, compared to $11 which is $1 and then literal 1.
  R(r'[0-9]+', Id.VSub_Number),
  C('!', Id.VSub_Bang),
  C('@', Id.VSub_At),
  C('#', Id.VSub_Pound),
  C('$', Id.VSub_Dollar),
  C('*', Id.VSub_Star),
  C('-', Id.VSub_Hyphen),
  C('?', Id.VSub_QMark),

  C('}', Id.Right_VarSub),

  C('\\\n', Id.Ignored_LineCont),

  C('\n', Id.Unknown_Tok),  # newline not allowed inside ${}
  R(r'[^\0]', Id.Unknown_Tok),  # any char except newline
]

LEXER_DEF[lex_mode_e.VS_2] = \
    ID_SPEC.LexerPairs(Kind.VTest) + \
    ID_SPEC.LexerPairs(Kind.VOp1) + \
    ID_SPEC.LexerPairs(Kind.VOp2) + [
  C('}', Id.Right_VarSub),

  C('\\\n', Id.Ignored_LineCont),
  C('\n', Id.Unknown_Tok),  # newline not allowed inside ${}
  R(r'[^\0]', Id.Unknown_Tok),  # any char except newline
]
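
# For illustration, a hedged sketch of the mode flow for ${foo:-default}
# (the exact Id name for ':-' comes from ID_SPEC, so this is approximate):
#   VS_1:        <VSub_Name 'foo'>
#   VS_2:        a Kind.VTest token for ':-'
#   VS_ARG_UNQ:  <Lit_Chars 'default'> <Right_VarSub '}'>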

# https://www.gnu.org/software/bash/manual/html_node/Shell-Arithmetic.html#Shell-Arithmetic
LEXER_DEF[lex_mode_e.ARITH] = \
    _LEFT_SUBS + _VARS + _LEFT_UNQUOTED + [
  # newline is ignored space, unlike in OUTER
  R(r'[ \t\r\n]+', Id.Ignored_Space),

  # Examples of arith constants:
  #   64#azAZ
  #   0xabc 0xABC
  #   0123
  # A separate digits token makes this easier to parse STATICALLY. But this
  # doesn't help with DYNAMIC parsing.
  R(_VAR_NAME_RE, Id.Lit_ArithVarLike),  # for variable names or 64#_
  R(r'[0-9]+', Id.Lit_Digits),
  C('@', Id.Lit_At),  # for 64#@ or ${a[@]}
  C('#', Id.Lit_Pound),  # for 64#a

  # TODO: 64#@ interferes with VS_AT. Hm.
] + ID_SPEC.LexerPairs(Kind.Arith) + [
  C('\\\n', Id.Ignored_LineCont),
  R(r'[^\0]', Id.Unknown_Tok)  # any char. This should be a syntax error.
]
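
# For illustration (a hand-worked token stream, hedged -- operator Ids come
# from ID_SPEC): '1 + x' in ARITH mode lexes roughly as
#   <Lit_Digits '1'> <Ignored_Space ' '> <a Kind.Arith token for '+'>
#   <Ignored_Space ' '> <Lit_ArithVarLike 'x'>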

# Notes on BASH_REGEX states
#
# From the bash manual:
#
# - Any part of the pattern may be quoted to force the quoted portion to be
#   matched as a string.
# - Bracket expressions in regular expressions must be treated carefully,
#   since normal quoting characters lose their meanings between brackets.
# - If the pattern is stored in a shell variable, quoting the variable
#   expansion forces the entire pattern to be matched as a string.
#
# Is there a re.escape function? It's just like EscapeGlob and UnescapeGlob.
#
# TODO: For testing, write a script to extract and save regexes... and compile
# them with regcomp. I've only seen constant regexes.
#
# From the code above: ( | ) are treated specially.
|