| 1 | """expr_parse.py."""
|
| 2 | from __future__ import print_function
|
| 3 |
|
| 4 | from _devbuild.gen.syntax_asdl import (loc, Token, DoubleQuoted, SingleQuoted,
|
| 5 | CommandSub, ShArrayLiteral,
|
| 6 | CompoundWord, word_part_t, word_e)
|
| 7 | from _devbuild.gen.id_kind_asdl import Id, Kind, Id_str
|
| 8 | from _devbuild.gen.types_asdl import lex_mode_e
|
| 9 |
|
| 10 | from display import ui
|
| 11 | from core.error import p_die
|
| 12 | from frontend import consts
|
| 13 | from frontend import lexer
|
| 14 | from frontend import reader
|
| 15 | from mycpp import mylib
|
| 16 | from mycpp.mylib import log, tagswitch
|
| 17 | from osh import braces
|
| 18 | from osh import word_
|
| 19 | from osh import word_compile
|
| 20 | from pgen2 import parse
|
| 21 | from pgen2.pnode import PNodeAllocator
|
| 22 |
|
| 23 | _ = log
|
| 24 |
|
| 25 | from typing import TYPE_CHECKING, Any, Dict, Tuple, List, cast, Optional
|
| 26 | if TYPE_CHECKING:
|
| 27 | from frontend.lexer import Lexer
|
| 28 | from frontend.parse_lib import ParseContext
|
| 29 | from pgen2.grammar import Grammar
|
| 30 | from pgen2.pnode import PNode
|
| 31 |
|
| 32 | if mylib.PYTHON:
|
| 33 |
|
| 34 | class ParseTreePrinter(object):
|
| 35 | """Prints a tree of PNode instances."""
|
| 36 |
|
| 37 | def __init__(self, names):
|
| 38 | # type: (Dict[int, str]) -> None
|
| 39 | self.names = names
|
| 40 | self.f = mylib.Stdout()
|
| 41 |
|
| 42 | def _Print(self, pnode, indent, i):
|
| 43 | # type: (PNode, int, int) -> None
|
| 44 |
|
| 45 | ind = ' ' * indent
|
| 46 | # NOTE:
|
| 47 | # - why isn't 'tok' None for PRODUCTIONS? There is some redundancy to get
|
| 48 | # rid of.
|
| 49 | if pnode.tok:
|
| 50 | if isinstance(pnode.tok, Token):
|
| 51 | v = lexer.TokenVal(pnode.tok)
|
| 52 | else:
|
| 53 | # e.g. CommandSub for x = $(echo hi)
|
| 54 | v = repr(pnode.tok)
|
| 55 | else:
|
| 56 | v = '-'
|
| 57 | self.f.write('%s%d %s %s\n' % (ind, i, self.names[pnode.typ], v))
|
| 58 | if pnode.children is not None:
|
| 59 | for i, child in enumerate(pnode.children):
|
| 60 | self._Print(child, indent + 1, i)
|
| 61 |
|
| 62 | def Print(self, pnode):
|
| 63 | # type: (PNode) -> None
|
| 64 | self._Print(pnode, 0, 0)
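
        # Hypothetical usage for debugging, assuming a names dict built from
        # the grammar's number -> symbol tables, and a root PNode returned by
        # ExprParser.Parse() (neither is defined in this file):
        #
        #   printer = ParseTreePrinter(names)
        #   printer.Print(pnode)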


def _Classify(gr, tok):
    # type: (Grammar, Token) -> int

    # We have to match up what ParserGenerator.make_grammar() did when
    # calling make_label() and make_first().  See classify() in
    # opy/pgen2/driver.py.

    id_ = tok.id  # mycpp fix: we need C++ to do uint16_t -> int conversion

    # TODO: use something more efficient than a Dict
    if id_ in gr.tokens:
        return gr.tokens[id_]

    if id_ == Id.Unknown_DEqual:
        p_die('Use === to be exact, or ~== to convert types', tok)

    if id_ == Id.Unknown_Tok:
        type_str = ''
    else:
        type_str = ' (%s)' % ui.PrettyId(tok.id)
    p_die('Unexpected token in expression mode%s' % type_str, tok)


# Newlines are ignored between these pairs.
# yapf: disable
_OTHER_BALANCE = {

    # Parenthesized expressions (tuples) and func/proc parameter lists
    Id.Op_LParen: 1,
    Id.Op_RParen: -1,
    Id.Op_LBracket: 1,
    Id.Op_RBracket: -1,

    # Dicts are {}, and the grammar respects Op_Newline.
}
# yapf: enable
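
# Example of how the table above is used (illustrative, not from the original
# file): in a multi-line literal like
#
#     var x = [1,
#              2]
#
# the Op_Newline after '1,' arrives while balance > 0, so _PushYshTokens
# below drops it and the literal can span lines.  Newlines inside {} are not
# special-cased here because the grammar itself handles Op_Newline for dict
# literals, per the comment above.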


def _PushYshTokens(parse_ctx, gr, p, lex):
    # type: (ParseContext, Grammar, parse.Parser, Lexer) -> Token
    """Push tokens onto pgen2's parser.

    Returns the last token so it can be reused/seen by the CommandParser.
    """
    #log('keywords = %s', gr.keywords)
    #log('tokens = %s', gr.tokens)

    last_token = None  # type: Optional[Token]
    prev_was_newline = False

    balance = 0  # to ignore newlines

    while True:
        if last_token:  # e.g. left over from WordParser
            tok = last_token
            #log('last_token = %s', last_token)
            last_token = None
        else:
            tok = lex.Read(lex_mode_e.Expr)
            #log('tok = %s', tok)

        # Comments and whitespace.  Newlines aren't ignored.
        if consts.GetKind(tok.id) == Kind.Ignored:
            continue

        # For multiline lists, maps, etc.
        if tok.id == Id.Op_Newline:
            if balance > 0:
                #log('*** SKIPPING NEWLINE')
                continue
            # Eliminate duplicate newline tokens.  It makes the grammar
            # simpler, and it's consistent with CPython's lexer and our own
            # WordParser.
            if prev_was_newline:
                continue
            prev_was_newline = True
        else:
            prev_was_newline = False

        balance += _OTHER_BALANCE.get(tok.id, 0)
        #log('BALANCE after seeing %s = %d', tok.id, balance)

        if tok.id == Id.Op_LParen:
            # For nesting inside $()
            lex.PushHint(Id.Op_RParen, Id.Op_RParen)

        #if tok.id == Id.Expr_Name and tok.val in KEYWORDS:
        #  tok.id = KEYWORDS[tok.val]
        #  log('Replaced with %s', tok.id)

        assert tok.id < 256, Id_str(tok.id)
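        # (The assert above reflects pgen2's numbering convention: terminal
        # token Ids must fit below 256, because grammar nonterminals are
        # numbered from 256 upward.)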

        ilabel = _Classify(gr, tok)
        #log('tok = %s, ilabel = %d', tok, ilabel)

        if p.addtoken(tok.id, tok, ilabel):
            return tok

        #
        # Mutually recursive calls into the command/word parsers.
        #
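
        # For example (illustrative), in
        #
        #     var x = $(echo hi)
        #
        # the $( token hands control to a newly created CommandParser, and
        # the resulting CommandSub node is pushed into pgen2 as a single
        # opaque "token".  The same re-entry pattern applies to :| |, quoted
        # strings, and ${ }.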

        if tok.id in (Id.Left_ColonPipe,
                      Id.Left_PercentParen):  # :| %( LEGACY!
            left_tok = tok
            if tok.id == Id.Left_PercentParen:
                lex.PushHint(Id.Op_RParen, Id.Right_ShArrayLiteral)

            # Blame the opening token
            line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
            w_parser = parse_ctx.MakeWordParser(lex, line_reader)
            words = []  # type: List[CompoundWord]
            close_tok = None  # type: Optional[Token]
            done = False
            while not done:
                w = w_parser.ReadWord(lex_mode_e.ShCommand)
                with tagswitch(w) as case:
                    if case(word_e.Operator):
                        tok = cast(Token, w)
                        if tok.id == Id.Right_ShArrayLiteral:
                            if left_tok.id != Id.Left_PercentParen:
                                p_die('Expected | to close', left_tok)
                            close_tok = tok
                            done = True  # can't use break here
                        elif tok.id == Id.Op_Pipe:
                            if left_tok.id != Id.Left_ColonPipe:
                                p_die('Expected ) to close', left_tok)
                            close_tok = tok
                            done = True  # can't use break here
                        elif tok.id == Id.Op_Newline:  # internal newlines allowed
                            continue
                        else:
                            p_die('Unexpected token in array literal',
                                  loc.Word(w))

                    elif case(word_e.Compound):
                        words.append(cast(CompoundWord, w))

                    else:
                        raise AssertionError()

            words2 = braces.BraceDetectAll(words)
            words3 = word_.TildeDetectAll(words2)

            typ = Id.Expr_CastedDummy

            lit_part = ShArrayLiteral(left_tok, words3, close_tok)
            opaque = cast(Token, lit_part)  # HACK for expr_to_ast
            done = p.addtoken(typ, opaque, gr.tokens[typ])
            assert not done  # can't end the expression

            # Now push the closing )
            ilabel = _Classify(gr, close_tok)
            done = p.addtoken(tok.id, close_tok, ilabel)
            assert not done  # can't end the expression

            continue
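
        # The branch above and the branches below all use the same
        # Expr_CastedDummy trick: a node produced by the word/command parser
        # is cast to Token (a type-system lie) and pushed into pgen2 as one
        # opaque terminal; expr_to_ast later casts pnode.tok back to the real
        # type based on the surrounding production.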

        # $( @( ^(
        if tok.id in (Id.Left_DollarParen, Id.Left_AtParen,
                      Id.Left_CaretParen):

            left_token = tok

            lex.PushHint(Id.Op_RParen, Id.Eof_RParen)
            line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
            c_parser = parse_ctx.MakeParserForCommandSub(
                line_reader, lex, Id.Eof_RParen)
            node = c_parser.ParseCommandSub()
            # A little gross: Copied from osh/word_parse.py
            right_token = c_parser.w_parser.cur_token

            cs_part = CommandSub(left_token, node, right_token)

            typ = Id.Expr_CastedDummy
            opaque = cast(Token, cs_part)  # HACK for expr_to_ast
            done = p.addtoken(typ, opaque, gr.tokens[typ])
            assert not done  # can't end the expression

            # Now push the closing )
            ilabel = _Classify(gr, right_token)
            done = p.addtoken(right_token.id, right_token, ilabel)
            assert not done  # can't end the expression

            continue

        # " $" """ $""" ^"
        if tok.id in (Id.Left_DoubleQuote, Id.Left_DollarDoubleQuote,
                      Id.Left_TDoubleQuote, Id.Left_DollarTDoubleQuote,
                      Id.Left_CaretDoubleQuote):

            left_token = tok
            line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
            w_parser = parse_ctx.MakeWordParser(lex, line_reader)

            parts = []  # type: List[word_part_t]
            last_token = w_parser.ReadDoubleQuoted(left_token, parts)
            expr_dq_part = DoubleQuoted(left_token, parts, last_token)

            typ = Id.Expr_CastedDummy
            opaque = cast(Token, expr_dq_part)  # HACK for expr_to_ast
            done = p.addtoken(typ, opaque, gr.tokens[typ])
            assert not done  # can't end the expression

            continue

        # ${
        if tok.id == Id.Left_DollarBrace:
            left_token = tok
            line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
            w_parser = parse_ctx.MakeWordParser(lex, line_reader)

            part, last_token = w_parser.ReadBracedVarSub(left_token)

            # It's casted word_part.BracedVarSub -> dummy -> expr.BracedVarSub!
            typ = Id.Expr_CastedDummy
            opaque = cast(Token, part)  # HACK for expr_to_ast
            done = p.addtoken(typ, opaque, gr.tokens[typ])
            assert not done  # can't end the expression

            continue

        # 'x' '''x'''
        # r'x' r'''x'''
        # u'x' u'''x'''
        # b'x' b'''x'''
        # $'x'
        if tok.id in (Id.Left_SingleQuote, Id.Left_TSingleQuote,
                      Id.Left_RSingleQuote, Id.Left_RTSingleQuote,
                      Id.Left_USingleQuote, Id.Left_UTSingleQuote,
                      Id.Left_BSingleQuote, Id.Left_BTSingleQuote,
                      Id.Left_DollarSingleQuote):
            if tok.id == Id.Left_DollarSingleQuote:
                sq_mode = lex_mode_e.SQ_C
            elif tok.id in (Id.Left_USingleQuote, Id.Left_UTSingleQuote,
                            Id.Left_BSingleQuote, Id.Left_BTSingleQuote):
                sq_mode = lex_mode_e.J8_Str
            else:
                sq_mode = lex_mode_e.SQ_Raw

            left_token = tok
            line_reader = reader.DisallowedLineReader(parse_ctx.arena, tok)
            w_parser = parse_ctx.MakeWordParser(lex, line_reader)

            tokens = []  # type: List[Token]
            last_token = w_parser.ReadSingleQuoted(sq_mode, left_token, tokens,
                                                   True)

            sval = word_compile.EvalSingleQuoted(left_token.id, tokens)
            sq_part = SingleQuoted(left_token, sval, last_token)

            typ = Id.Expr_CastedDummy
            opaque = cast(Token, sq_part)  # HACK for expr_to_ast
            done = p.addtoken(typ, opaque, gr.tokens[typ])
            assert not done  # can't end the expression
            continue

    else:
        # We never broke out -- EOF is too soon (how can this happen???)
        raise parse.ParseError("incomplete input", tok.id, tok)
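        # (Note: because the loop above is 'while True', this else clause is
        # unreachable -- a while-else runs only when the condition becomes
        # false.  The pattern appears to be inherited from pgen2's driver,
        # where the token loop could genuinely run out of input.)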


class ExprParser(object):
    """A wrapper around a pgen2 parser."""

    def __init__(self, parse_ctx, gr):
        # type: (ParseContext, Grammar) -> None
        self.parse_ctx = parse_ctx
        self.gr = gr
        # Reused multiple times.
        self.push_parser = parse.Parser(gr)
        self.pnode_alloc = None  # type: Optional[PNodeAllocator]

    def Parse(self, lexer, start_symbol):
        # type: (Lexer, int) -> Tuple[PNode, Token]

        # Reuse the parser
        self.push_parser.setup(start_symbol, self.pnode_alloc)
        try:
            last_token = _PushYshTokens(self.parse_ctx, self.gr,
                                        self.push_parser, lexer)
        except parse.ParseError as e:
            #log('ERROR %s', e)
            # TODO:
            # - Describe what lexer mode we're in (Invalid syntax in regex)
            # - Maybe say where the mode started
            # - Id.Unknown_Tok could say "This character is invalid"

            # ParseError has a "too much input" case but I haven't been able
            # to tickle it.  Maybe it's because of the Eof tokens?

            p_die(
                'Syntax error in expression (near %s)' % ui.PrettyId(e.tok.id),
                e.tok)

        return self.push_parser.rootnode, last_token


class ctx_PNodeAllocator(object):

    def __init__(self, ep):
        # type: (ExprParser) -> None
        self.expr_parser = ep
        self.expr_parser.pnode_alloc = PNodeAllocator()

    def __enter__(self):
        # type: () -> None
        pass

    def __exit__(self, type, value, traceback):
        # type: (Any, Any, Any) -> None
        self.expr_parser.pnode_alloc.Clear()
        self.expr_parser.pnode_alloc = None
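

# Hypothetical usage sketch; parse_ctx, ysh_grammar, lexer, and grammar_nt
# are illustrative names from the surrounding codebase, not defined in this
# file:
#
#   e_parser = ExprParser(parse_ctx, ysh_grammar)
#   with ctx_PNodeAllocator(e_parser):
#       pnode, last_tok = e_parser.Parse(lexer, grammar_nt.command_expr)
#       # Transform pnode before the context exits: __exit__ clears the
#       # allocator that owns it.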