1 | # Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
|
2 | # Licensed to PSF under a Contributor Agreement.
|
3 |
|
4 | """Parser engine for the grammar tables generated by pgen.
|
5 |
|
6 | The grammar table must be loaded first.
|
7 |
|
8 | See Parser/parser.c in the Python distribution for additional info on
|
9 | how this parsing engine works.
|
10 | """
|
11 |
|
12 | from mycpp.mylib import log
|
13 | _ = log
|
14 |
|
15 | from typing import TYPE_CHECKING, Optional, Any, List
|
16 | from pgen2.pnode import PNode, PNodeAllocator
|
17 |
|
18 | if TYPE_CHECKING:
|
19 | from _devbuild.gen.syntax_asdl import Token
|
20 | from pgen2.grammar import Grammar, dfa_t
|
21 |
|
22 |
|
23 | class ParseError(Exception):
|
24 | """Exception to signal the parser is stuck."""
|
25 |
|
26 | def __init__(self, msg, type_, tok):
|
27 | # type: (str, int, Token) -> None
|
28 | self.msg = msg
|
29 | self.type = type_
|
30 | self.tok = tok
|
31 |
|
32 | def __repr__(self):
|
33 | # type: () -> str
|
34 | return "%s: type=%d, tok=%r" % (self.msg, self.type, self.tok)
|
35 |
|
36 |
|
37 | class _StackItem(object):
|
38 | def __init__(self, dfa, state, node):
|
39 | # type: (dfa_t, int, PNode) -> None
|
40 | self.dfa = dfa
|
41 | self.state = state
|
42 | self.node = node
|
43 |
|
44 |
|
45 | class Parser(object):
|
46 | """Parser engine.
|
47 |
|
48 | The proper usage sequence is:
|
49 |
|
50 | p = Parser(grammar, [converter]) # create instance
|
51 | p.setup(start) # prepare for parsing
|
52 | <for each input token>:
|
53 | if p.addtoken(...): # parse a token; may raise ParseError
|
54 | break
|
55 | root = p.rootnode # root of abstract syntax tree
|
56 |
|
57 | A Parser instance may be reused by calling setup() repeatedly.
|
58 |
|
59 | A Parser instance contains state pertaining to the current token
|
60 | sequence, and should not be used concurrently by different threads
|
61 | to parse separate token sequences.
|
62 |
|
63 | See driver.py for how to get input tokens by tokenizing a file or
|
64 | string.
|
65 |
|
66 | Parsing is complete when addtoken() returns True; the root of the
|
67 | abstract syntax tree can then be retrieved from the rootnode
|
68 | instance variable. When a syntax error occurs, addtoken() raises
|
69 | the ParseError exception. There is no error recovery; the parser
|
70 | cannot be used after a syntax error was reported (but it can be
|
71 | reinitialized by calling setup()).
|
72 | """
|
73 |
|
74 | def __init__(self, grammar):
|
75 | # type: (Grammar) -> None
|
76 | """Constructor.
|
77 |
|
78 | The grammar argument is a grammar.Grammar instance; see the
|
79 | grammar module for more information.
|
80 |
|
81 | The parser is not ready yet for parsing; you must call the
|
82 | setup() method to get it started.
|
83 |
|
84 | A concrete syntax tree node is a (type, value, context, nodes)
|
85 | tuple, where type is the node type (a token or symbol number),
|
86 | value is None for symbols and a string for tokens, context is
|
87 | None or an opaque value used for error reporting (typically a
|
88 | (lineno, offset) pair), and nodes is a list of children for
|
89 | symbols, and None for tokens.
|
90 |
|
91 | An abstract syntax tree node may be anything; this is entirely
|
92 | up to the converter function.
|
93 | """
|
94 | self.grammar = grammar
|
95 | self.rootnode = None # type: Optional[PNode]
|
96 | self.stack = [] # type: List[_StackItem]
|
97 | self.pnode_alloc = None # type: Optional[PNodeAllocator]
|
98 |
|
99 | def setup(self, start, pnode_alloc):
|
100 | # type: (int, PNodeAllocator) -> None
|
101 | """Prepare for parsing.
|
102 |
|
103 | This *must* be called before starting to parse.
|
104 |
|
105 | The optional argument is an alternative start symbol; it
|
106 | defaults to the grammar's start symbol.
|
107 |
|
108 | You can use a Parser instance to parse any number of programs;
|
109 | each time you call setup() the parser is reset to an initial
|
110 | state determined by the (implicit or explicit) start symbol.
|
111 | """
|
112 | self.pnode_alloc = pnode_alloc
|
113 | newnode = self.pnode_alloc.NewPNode(start, None)
|
114 | # Each stack entry is a tuple: (dfa, state, node).
|
115 | self.stack = [_StackItem(self.grammar.dfas[start], 0, newnode)]
|
116 | self.rootnode = None
|
117 |
|
118 | def addtoken(self, typ, opaque, ilabel):
|
119 | # type: (int, Token, int) -> bool
|
120 | """Add a token; return True iff this is the end of the program."""
|
121 | # Loop until the token is shifted; may raise exceptions
|
122 |
|
123 | # Andy NOTE: This is not linear time, i.e. a constant amount of work
|
124 | # for each token? Is it O(n^2) as the ANTLR paper says?
|
125 | # Do the "accelerators" in pgen.c have anything to do with it?
|
126 |
|
127 | while True:
|
128 | top = self.stack[-1]
|
129 | states, _ = top.dfa
|
130 | state = top.state
|
131 |
|
132 | arcs = states[state]
|
133 |
|
134 | # Look for a state with this label
|
135 | found = False
|
136 | for ilab, newstate in arcs:
|
137 | t = self.grammar.labels[ilab]
|
138 | if ilabel == ilab:
|
139 | # Look it up in the list of labels
|
140 | assert t < 256, t
|
141 | # Shift a token; we're done with it
|
142 | self.shift(typ, opaque, newstate)
|
143 | # Pop while we are in an accept-only state
|
144 | state = newstate
|
145 |
|
146 | while True:
|
147 | # mycpp: rewrite of tuple-in-list comparison
|
148 | if len(states[state]) != 1:
|
149 | break
|
150 |
|
151 | s0, s1 = states[state][0]
|
152 | if s0 != 0 or s1 != state:
|
153 | break
|
154 |
|
155 | self.pop()
|
156 | if len(self.stack) == 0:
|
157 | # Done parsing!
|
158 | return True
|
159 | top = self.stack[-1]
|
160 | states, _ = top.dfa
|
161 | state = top.state
|
162 |
|
163 | # Done with this token
|
164 | return False
|
165 | elif t >= 256:
|
166 | # See if it's a symbol and if we're in its first set
|
167 | itsdfa = self.grammar.dfas[t]
|
168 | _, itsfirst = itsdfa
|
169 | if ilabel in itsfirst:
|
170 | # Push a symbol
|
171 | self.push(t, opaque, self.grammar.dfas[t], newstate)
|
172 | found = True
|
173 | break # To continue the outer while loop
|
174 |
|
175 | if not found:
|
176 | # Note: this condition was rewritten for mycpp tarnslation.
|
177 | # if (0, state) in arcs:
|
178 | # ...
|
179 | # else:
|
180 | # ...
|
181 | found2 = False
|
182 | for left, right in arcs:
|
183 | if left == 0 and right == state:
|
184 | # An accepting state, pop it and try something else
|
185 | self.pop()
|
186 | if len(self.stack) == 0:
|
187 | # Done parsing, but another token is input
|
188 | raise ParseError("too much input", typ, opaque)
|
189 | found2 = True
|
190 |
|
191 | if not found2:
|
192 | # No success finding a transition
|
193 | raise ParseError("bad input", typ, opaque)
|
194 |
|
195 | def shift(self, typ, opaque, newstate):
|
196 | # type: (int, Token, int) -> None
|
197 | """Shift a token. (Internal)"""
|
198 | top = self.stack[-1]
|
199 | newnode = self.pnode_alloc.NewPNode(typ, opaque)
|
200 | top.node.AddChild(newnode)
|
201 | self.stack[-1].state = newstate
|
202 |
|
203 | def push(self, typ, opaque, newdfa, newstate):
|
204 | # type: (int, Token, dfa_t, int) -> None
|
205 | """Push a nonterminal. (Internal)"""
|
206 | top = self.stack[-1]
|
207 | newnode = self.pnode_alloc.NewPNode(typ, opaque)
|
208 | self.stack[-1].state = newstate
|
209 | self.stack.append(_StackItem(newdfa, 0, newnode))
|
210 |
|
211 | def pop(self):
|
212 | # type: () -> None
|
213 | """Pop a nonterminal. (Internal)"""
|
214 | top = self.stack.pop()
|
215 | newnode = top.node
|
216 | if len(self.stack):
|
217 | top2 = self.stack[-1]
|
218 | top2.node.AddChild(newnode)
|
219 | else:
|
220 | self.rootnode = newnode
|