OILS / pgen2 / parse.py View on Github | oilshell.org

225 lines, 92 significant
1# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
2# Licensed to PSF under a Contributor Agreement.
3
4"""Parser engine for the grammar tables generated by pgen.
5
6The grammar table must be loaded first.
7
8See Parser/parser.c in the Python distribution for additional info on
9how this parsing engine works.
10"""
11
12from mycpp.mylib import log
13_ = log
14
15from typing import TYPE_CHECKING, Optional, Any, List
16from pgen2.pnode import PNode, PNodeAllocator
17
18if TYPE_CHECKING:
19 from _devbuild.gen.syntax_asdl import Token
20 from pgen2.grammar import Grammar, dfa_t
21
22# Duplicated to avoid dependency
23# from pgen2 import token
24
25NT_OFFSET = 256
26
27
28class ParseError(Exception):
29 """Exception to signal the parser is stuck."""
30
31 def __init__(self, msg, type_, tok):
32 # type: (str, int, Token) -> None
33 self.msg = msg
34 self.type = type_
35 self.tok = tok
36
37 def __repr__(self):
38 # type: () -> str
39 return "%s: type=%d, tok=%r" % (self.msg, self.type, self.tok)
40
41
42class _StackItem(object):
43 def __init__(self, dfa, state, node):
44 # type: (dfa_t, int, PNode) -> None
45 self.dfa = dfa
46 self.state = state
47 self.node = node
48
49
50class Parser(object):
51 """Parser engine.
52
53 The proper usage sequence is:
54
55 p = Parser(grammar, [converter]) # create instance
56 p.setup(start) # prepare for parsing
57 <for each input token>:
58 if p.addtoken(...): # parse a token; may raise ParseError
59 break
60 root = p.rootnode # root of abstract syntax tree
61
62 A Parser instance may be reused by calling setup() repeatedly.
63
64 A Parser instance contains state pertaining to the current token
65 sequence, and should not be used concurrently by different threads
66 to parse separate token sequences.
67
68 See driver.py for how to get input tokens by tokenizing a file or
69 string.
70
71 Parsing is complete when addtoken() returns True; the root of the
72 abstract syntax tree can then be retrieved from the rootnode
73 instance variable. When a syntax error occurs, addtoken() raises
74 the ParseError exception. There is no error recovery; the parser
75 cannot be used after a syntax error was reported (but it can be
76 reinitialized by calling setup()).
77 """
78
79 def __init__(self, grammar):
80 # type: (Grammar) -> None
81 """Constructor.
82
83 The grammar argument is a grammar.Grammar instance; see the
84 grammar module for more information.
85
86 The parser is not ready yet for parsing; you must call the
87 setup() method to get it started.
88
89 A concrete syntax tree node is a (type, value, context, nodes)
90 tuple, where type is the node type (a token or symbol number),
91 value is None for symbols and a string for tokens, context is
92 None or an opaque value used for error reporting (typically a
93 (lineno, offset) pair), and nodes is a list of children for
94 symbols, and None for tokens.
95
96 An abstract syntax tree node may be anything; this is entirely
97 up to the converter function.
98 """
99 self.grammar = grammar
100 self.rootnode = None # type: Optional[PNode]
101 self.stack = [] # type: List[_StackItem]
102 self.pnode_alloc = None # type: Optional[PNodeAllocator]
103
104 def setup(self, start, pnode_alloc):
105 # type: (int, PNodeAllocator) -> None
106 """Prepare for parsing.
107
108 This *must* be called before starting to parse.
109
110 The optional argument is an alternative start symbol; it
111 defaults to the grammar's start symbol.
112
113 You can use a Parser instance to parse any number of programs;
114 each time you call setup() the parser is reset to an initial
115 state determined by the (implicit or explicit) start symbol.
116 """
117 self.pnode_alloc = pnode_alloc
118 newnode = self.pnode_alloc.NewPNode(start, None)
119 # Each stack entry is a tuple: (dfa, state, node).
120 self.stack = [_StackItem(self.grammar.dfas[start], 0, newnode)]
121 self.rootnode = None
122
123 def addtoken(self, typ, opaque, ilabel):
124 # type: (int, Token, int) -> bool
125 """Add a token; return True iff this is the end of the program."""
126 # Loop until the token is shifted; may raise exceptions
127
128 # Andy NOTE: This is not linear time, i.e. a constant amount of work
129 # for each token? Is it O(n^2) as the ANTLR paper says?
130 # Do the "accelerators" in pgen.c have anything to do with it?
131
132 while True:
133 top = self.stack[-1]
134 states, _ = top.dfa
135 state = top.state
136
137 arcs = states[state]
138
139 # Look for a state with this label
140 found = False
141 for ilab, newstate in arcs:
142 t = self.grammar.labels[ilab]
143 if ilabel == ilab:
144 # Look it up in the list of labels
145 assert t < NT_OFFSET, t
146 # Shift a token; we're done with it
147 self.shift(typ, opaque, newstate)
148 # Pop while we are in an accept-only state
149 state = newstate
150
151 while True:
152 # mycpp: rewrite of tuple-in-list comparison
153 if len(states[state]) != 1:
154 break
155
156 s0, s1 = states[state][0]
157 if s0 != 0 or s1 != state:
158 break
159
160 self.pop()
161 if len(self.stack) == 0:
162 # Done parsing!
163 return True
164 top = self.stack[-1]
165 states, _ = top.dfa
166 state = top.state
167
168 # Done with this token
169 return False
170 elif t >= NT_OFFSET:
171 # See if it's a symbol and if we're in its first set
172 itsdfa = self.grammar.dfas[t]
173 _, itsfirst = itsdfa
174 if ilabel in itsfirst:
175 # Push a symbol
176 self.push(t, opaque, self.grammar.dfas[t], newstate)
177 found = True
178 break # To continue the outer while loop
179
180 if not found:
181 # Note: this condition was rewritten for mycpp tarnslation.
182 # if (0, state) in arcs:
183 # ...
184 # else:
185 # ...
186 found2 = False
187 for left, right in arcs:
188 if left == 0 and right == state:
189 # An accepting state, pop it and try something else
190 self.pop()
191 if len(self.stack) == 0:
192 # Done parsing, but another token is input
193 raise ParseError("too much input", typ, opaque)
194 found2 = True
195
196 if not found2:
197 # No success finding a transition
198 raise ParseError("bad input", typ, opaque)
199
200 def shift(self, typ, opaque, newstate):
201 # type: (int, Token, int) -> None
202 """Shift a token. (Internal)"""
203 top = self.stack[-1]
204 newnode = self.pnode_alloc.NewPNode(typ, opaque)
205 top.node.AddChild(newnode)
206 self.stack[-1].state = newstate
207
208 def push(self, typ, opaque, newdfa, newstate):
209 # type: (int, Token, dfa_t, int) -> None
210 """Push a nonterminal. (Internal)"""
211 top = self.stack[-1]
212 newnode = self.pnode_alloc.NewPNode(typ, opaque)
213 self.stack[-1].state = newstate
214 self.stack.append(_StackItem(newdfa, 0, newnode))
215
216 def pop(self):
217 # type: () -> None
218 """Pop a nonterminal. (Internal)"""
219 top = self.stack.pop()
220 newnode = top.node
221 if len(self.stack):
222 top2 = self.stack[-1]
223 top2.node.AddChild(newnode)
224 else:
225 self.rootnode = newnode