OILS / pgen2 / pgen.py

# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
# Licensed to PSF under a Contributor Agreement.

# Pgen imports
#import grammar, token, tokenize
# NOTE: Need these special versions of token/tokenize for BACKQUOTE and such.
from . import grammar, token, tokenize
from mycpp.mylib import log


class PythonTokDef(object):

    def GetTerminalNum(self, label):
        """e.g. NAME -> 1"""
        itoken = getattr(token, label, None)
        assert isinstance(itoken, int), label
        assert itoken in token.tok_name, label
        return itoken

    def GetKeywordNum(self, value):
        """
        e.g. 'xor' -> Id.Expr_Xor

        Python doesn't have this, but Oil does.  Returns None if not found.
        """
        return None

    def GetOpNum(self, op_str):
        """
        e.g. '(' -> LPAR

        Raises KeyError if op_str isn't found.
        """
        return token.opmap[op_str]


class ParserGenerator(object):

    def __init__(self, lexer):
        self.lexer = lexer

    def parse(self):
        self.gettoken()  # Initialize lookahead

        dfas = {}
        startsymbol = None
        # MSTART: (NEWLINE | RULE)* ENDMARKER
        while self.type != token.ENDMARKER:
            while self.type == token.NEWLINE:
                self.gettoken()
            # RULE: NAME ':' RHS NEWLINE
            name = self.expect(token.NAME)
            self.expect(token.OP, ":")
            a, z = self.parse_rhs()
            self.expect(token.NEWLINE)
            #self.dump_nfa(name, a, z)
            dfa = self.make_dfa(a, z)
            #self.dump_dfa(name, dfa)
            oldlen = len(dfa)
            self.simplify_dfa(dfa)
            newlen = len(dfa)
            dfas[name] = dfa
            #print name, oldlen, newlen
            if startsymbol is None:
                startsymbol = name
        return dfas, startsymbol

    def make_dfa(self, start, finish):
        # To turn an NFA into a DFA, we define the states of the DFA
        # to correspond to *sets* of states of the NFA.  Then do some
        # state reduction.  Let's represent sets as dicts with 1 for
        # values.
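        #
        # Editor's illustration: for RULE: 'a' | 'a' 'b', the NFA reaches
        # both alternatives through epsilon arcs.  The epsilon-closure of
        # the start becomes DFA state 0; reading 'a' yields a DFA state
        # containing both "after 'a'" NFA states (final, since the NFA
        # accept state is epsilon-reachable), and reading 'b' from there
        # yields another final state.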
        assert isinstance(start, NFAState)
        assert isinstance(finish, NFAState)

        def closure(state):
            base = {}
            addclosure(state, base)
            return base

        def addclosure(state, base):
            assert isinstance(state, NFAState)
            if state in base:
                return
            base[state] = 1
            for label, next in state.arcs:
                if label is None:
                    addclosure(next, base)

        states = [DFAState(closure(start), finish)]
        for state in states:  # NB states grows while we're iterating
            arcs = {}
            for nfastate in state.nfaset:
                for label, next in nfastate.arcs:
                    if label is not None:
                        addclosure(next, arcs.setdefault(label, {}))
            for label, nfaset in sorted(arcs.items()):
                for st in states:
                    if st.nfaset == nfaset:
                        break
                else:
                    st = DFAState(nfaset, finish)
                    states.append(st)
                state.addarc(st, label)
        return states  # List of DFAState instances; first one is start

    def dump_nfa(self, name, start, finish):
        print("Dump of NFA for", name)
        todo = [start]
        for i, state in enumerate(todo):
            print("  State", i, state is finish and "(final)" or "")
            for label, next in state.arcs:
                if next in todo:
                    j = todo.index(next)
                else:
                    j = len(todo)
                    todo.append(next)
                if label is None:
                    print("    -> %d" % j)
                else:
                    print("    %s -> %d" % (label, j))

    def dump_dfa(self, name, dfa):
        print("Dump of DFA for", name)
        for i, state in enumerate(dfa):
            print("  State", i, state.isfinal and "(final)" or "")
            for label, next in sorted(state.arcs.items()):
                print("    %s -> %d" % (label, dfa.index(next)))

    def simplify_dfa(self, dfa):
        # This is not theoretically optimal, but works well enough.
        # Algorithm: repeatedly look for two states that have the same
        # set of arcs (same labels pointing to the same nodes) and
        # unify them, until things stop changing.

        # dfa is a list of DFAState instances
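        #
        # Editor's illustration: if states 2 and 5 are both final and
        # both have exactly the arc NAME -> 3, they compare equal under
        # DFAState.__eq__; state 5 is deleted and every arc pointing at
        # it is redirected to state 2, then the scan restarts.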
        changes = True
        while changes:
            changes = False
            for i, state_i in enumerate(dfa):
                for j in range(i + 1, len(dfa)):
                    state_j = dfa[j]
                    if state_i == state_j:
                        #print "  unify", i, j
                        del dfa[j]
                        for state in dfa:
                            state.unifystate(state_j, state_i)
                        changes = True
                        break

    def parse_rhs(self):
        # RHS: ALT ('|' ALT)*
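        # For alternation, fresh start/end states (aa, zz below) are
        # connected to each alternative with epsilon arcs (label None),
        # so a | b | c becomes a fan-out from aa and a fan-in to zz.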
        a, z = self.parse_alt()
        if self.value != "|":
            return a, z
        else:
            aa = NFAState()
            zz = NFAState()
            aa.addarc(a)
            z.addarc(zz)
            while self.value == "|":
                self.gettoken()
                a, z = self.parse_alt()
                aa.addarc(a)
                z.addarc(zz)
            return aa, zz

    def parse_alt(self):
        # ALT: ITEM+
        a, b = self.parse_item()
        while (self.value in ("(", "[") or
               self.type in (token.NAME, token.STRING)):
            c, d = self.parse_item()
            b.addarc(c)
            b = d
        return a, b

    def parse_item(self):
        # ITEM: '[' RHS ']' | ATOM ['+' | '*']
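        # For ATOM '+', the exit state loops back to the start, so the
        # atom matches one or more times; for ATOM '*', returning (a, a)
        # makes the start state double as the exit, allowing zero matches.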
        if self.value == "[":
            self.gettoken()
            a, z = self.parse_rhs()
            self.expect(token.OP, "]")
            a.addarc(z)
            return a, z
        else:
            a, z = self.parse_atom()
            value = self.value
            if value not in ("+", "*"):
                return a, z
            self.gettoken()
            z.addarc(a)
            if value == "+":
                return a, z
            else:
                return a, a

    def parse_atom(self):
        # ATOM: '(' RHS ')' | NAME | STRING
        if self.value == "(":
            self.gettoken()
            a, z = self.parse_rhs()
            self.expect(token.OP, ")")
            return a, z
        elif self.type in (token.NAME, token.STRING):
            a = NFAState()
            z = NFAState()
            a.addarc(z, self.value)
            self.gettoken()
            return a, z
        else:
            self.raise_error("expected (...) or NAME or STRING, got %s/%s",
                             self.type, self.value)

    def expect(self, type, value=None):
        if self.type != type or (value is not None and self.value != value):
            self.raise_error("expected %s/%s, got %s/%s",
                             type, value, self.type, self.value)
        value = self.value
        self.gettoken()
        return value

    def gettoken(self):
        tup = next(self.lexer)
        while tup[0] in (tokenize.COMMENT, tokenize.NL):
            tup = next(self.lexer)
        self.type, self.value, self.begin, self.end, self.line = tup
        #print token.tok_name[self.type], repr(self.value)

    def raise_error(self, msg, *args):
        if args:
            msg = msg % args
        raise SyntaxError(msg, ('<grammar>', self.end[0],
                                self.end[1], self.line))

class NFAState(object):

    def __init__(self):
        self.arcs = []  # list of (label, NFAState) pairs

    def addarc(self, next, label=None):
        assert label is None or isinstance(label, str)
        assert isinstance(next, NFAState)
        self.arcs.append((label, next))

class DFAState(object):

    def __init__(self, nfaset, final):
        assert isinstance(nfaset, dict)
        assert isinstance(next(iter(nfaset)), NFAState)
        assert isinstance(final, NFAState)
        self.nfaset = nfaset
        self.isfinal = final in nfaset
        self.arcs = {}  # map from label to DFAState

    def addarc(self, next, label):
        assert isinstance(label, str)
        assert label not in self.arcs
        assert isinstance(next, DFAState)
        self.arcs[label] = next

    def unifystate(self, old, new):
        for label, next in self.arcs.items():
            if next is old:
                self.arcs[label] = new

    def __eq__(self, other):
        # Equality test -- ignore the nfaset instance variable
        assert isinstance(other, DFAState)
        if self.isfinal != other.isfinal:
            return False
        # Can't just return self.arcs == other.arcs, because that
        # would invoke this method recursively, with cycles...
        if len(self.arcs) != len(other.arcs):
            return False
        for label, next in self.arcs.items():
            if next is not other.arcs.get(label):
                return False
        return True

    __hash__ = None  # For Py3 compatibility.


def calcfirst(dfas, first, name):
    """Recursive function that mutates first."""
    dfa = dfas[name]
    first[name] = None  # dummy to detect left recursion
    state = dfa[0]
    totalset = {}
    overlapcheck = {}
    for label, _ in state.arcs.items():
        if label in dfas:
            if label in first:
                fset = first[label]
                if fset is None:
                    raise ValueError("recursion for rule %r" % name)
            else:
                calcfirst(dfas, first, label)
                fset = first[label]
            totalset.update(fset)
            overlapcheck[label] = fset
        else:
            totalset[label] = 1
            overlapcheck[label] = {label: 1}
    inverse = {}
    for label, itsfirst in overlapcheck.items():
        for symbol in itsfirst:
            if symbol in inverse:
                raise ValueError("rule %s is ambiguous; %s is in the"
                                 " first sets of %s as well as %s" %
                                 (name, symbol, label, inverse[symbol]))
            inverse[symbol] = label
    first[name] = totalset

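# Editor's illustration: given
#     expr: term ('+' term)*
#     term: NAME | NUMBER
# calcfirst() computes first['term'] = {'NAME': 1, 'NUMBER': 1}, then
# first['expr'] as the same set, since expr can only start with term.
# A rule like expr: term | NAME would raise ValueError, because 'NAME'
# appears in the first sets of two alternatives.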

def make_label(tok_def, gr, label):
    """Given a grammar item, return a unique integer representing it.

    It could be:
    1. or_test - a non-terminal
    2. Expr_Name - a terminal
    3. 'for' - keyword (quotes)
    4. '>=' - operator (quotes)

    Oil addition
    5. Op_RBracket -- anything with _ is assumed to be in the Id namespace.
    """
    #log('make_label %r', label)
    # XXX Maybe this should be a method on a subclass of converter?
    ilabel = len(gr.labels)
    if label[0].isalpha():
        if label in gr.symbol2number:  # NON-TERMINAL
            if label in gr.symbol2label:
                return gr.symbol2label[label]
            else:
                gr.labels.append(gr.symbol2number[label])
                gr.symbol2label[label] = ilabel
                return ilabel
        else:  # TERMINAL like (NAME, NUMBER, STRING)
            itoken = tok_def.GetTerminalNum(label)
            if itoken in gr.tokens:
                return gr.tokens[itoken]
            else:
                gr.labels.append(itoken)
                #log('%s %d -> %s', token.tok_name[itoken], itoken, ilabel)
                gr.tokens[itoken] = ilabel
                return ilabel
    else:
        # Either a keyword or an operator
        assert label[0] in ('"', "'"), label
        value = eval(label)

        # Treat 'xor' just like '^'.  TODO: I think this code can be
        # simplified.
        n = tok_def.GetKeywordNum(value)  # int or None

        if value[0].isalpha() and n is None:  # A word like 'for', 'xor'
            # Then look in the keywords automatically extracted from the
            # grammar.
            if value in gr.keywords:
                return gr.keywords[value]
            else:
                gr.labels.append(token.NAME)  # arbitrary number < NT_OFFSET
                gr.keywords[value] = ilabel
                return ilabel

        else:  # An operator e.g. '>='
            if n is None:
                itoken = tok_def.GetOpNum(value)
            else:
                itoken = n

            if itoken in gr.tokens:
                return gr.tokens[itoken]
            else:
                gr.labels.append(itoken)
                gr.tokens[itoken] = ilabel
                return ilabel


def make_first(tok_def, rawfirst, gr):
    first = {}
    for label in sorted(rawfirst):
        ilabel = make_label(tok_def, gr, label)
        ##assert ilabel not in first # XXX failed on <> ... !=
        first[ilabel] = 1
    return first


def MakeGrammar(f, tok_def=None):
    """Construct a Grammar object from a file."""

    lexer = tokenize.generate_tokens(f.readline)
    p = ParserGenerator(lexer)
    dfas, startsymbol = p.parse()

    first = {}  # map from symbol name to set of tokens
    for name in sorted(dfas):
        if name not in first:
            calcfirst(dfas, first, name)
        #print name, self.first[name].keys()

    # TODO: startsymbol support could be removed.  The call to p.setup() in
    # PushTokens() can always specify it explicitly.
    names = sorted(dfas)
    names.remove(startsymbol)
    names.insert(0, startsymbol)

    gr = grammar.Grammar()
    for name in names:
        i = token.NT_OFFSET + len(gr.symbol2number)
        gr.symbol2number[name] = i
        gr.number2symbol[i] = name

    tok_def = tok_def or PythonTokDef()
    for name in names:
        dfa = dfas[name]
        states = []
        for state in dfa:
            arcs = []
            for label, next_ in sorted(state.arcs.items()):
                arcs.append((make_label(tok_def, gr, label), dfa.index(next_)))
            if state.isfinal:
                arcs.append((0, dfa.index(state)))
            states.append(arcs)
        gr.states.append(states)
        fi = make_first(tok_def, first[name], gr)
        gr.dfas[gr.symbol2number[name]] = (states, fi)

    gr.start = gr.symbol2number[startsymbol]
    return gr
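

# A minimal usage sketch (editor's addition; the file name and grammar
# contents are hypothetical):
#
#     with open('minimal.grammar') as f:
#         gr = MakeGrammar(f)
#     print(gr.number2symbol[gr.start])
#
# tok_def defaults to PythonTokDef; Oil presumably passes its own
# implementation so keywords and operators resolve to Id constants.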