| 1 | # Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
|
| 2 | # Licensed to PSF under a Contributor Agreement.
|
| 3 |
|
| 4 | """This module defines the data structures used to represent a grammar.
|
| 5 |
|
| 6 | These are a bit arcane because they are derived from the data
|
| 7 | structures used by Python's 'pgen' parser generator.
|
| 8 |
|
| 9 | There's also a table here mapping operators to their names in the
|
| 10 | token module; the Python tokenize module reports all operators as the
|
| 11 | fallback token code OP, but the parser needs the actual token code.
|
| 12 |
|
| 13 | """
|
| 14 |
|
| 15 | import marshal
|
| 16 |
|
| 17 | from mycpp.mylib import log
|
| 18 | from mycpp import mylib
|
| 19 |
|
| 20 | from typing import TYPE_CHECKING
|
| 21 |
|
| 22 | if TYPE_CHECKING:
|
| 23 | from typing import IO, Dict, List, Tuple
|
| 24 |
|
| 25 | # Type aliases
|
| 26 | arc_t = Tuple[int, int]
|
| 27 | first_t = Dict[int, int]
|
| 28 | states_t = List[List[arc_t]]
|
| 29 | dfa_t = Tuple[states_t, first_t]
|
| 30 |
|
| 31 |
|
| 32 | class Grammar(object):
|
| 33 | """Pgen parsing tables conversion class.
|
| 34 |
|
| 35 | Once initialized, this class supplies the grammar tables for the
|
| 36 | parsing engine implemented by parse.py. The parsing engine
|
| 37 | accesses the instance variables directly. The class here does not
|
| 38 | provide initialization of the tables; several subclasses exist to
|
| 39 | do this (see the conv and pgen modules).
|
| 40 |
|
| 41 | The load() method reads the tables from a pickle file, which is
|
| 42 | much faster than the other ways offered by subclasses. The pickle
|
| 43 | file is written by calling dump() (after loading the grammar
|
| 44 | tables using a subclass). The report() method prints a readable
|
| 45 | representation of the tables to stdout, for debugging.
|
| 46 |
|
| 47 | The instance variables are as follows:
|
| 48 |
|
| 49 | symbol2number -- a dict mapping symbol names to numbers. Symbol
|
| 50 | numbers are always NT_OFFSET or higher, to distinguish
|
| 51 | them from token numbers, which are between 0 and 255
|
| 52 | (inclusive).
|
| 53 |
|
| 54 | number2symbol -- a dict mapping numbers to symbol names;
|
| 55 | these two are each other's inverse.
|
| 56 |
|
| 57 | states -- a list of DFAs, where each DFA is a list of
|
| 58 | states, each state is a list of arcs, and each
|
| 59 | arc is a (i, j) pair where i is a label and j is
|
| 60 | a state number. The DFA number is the index into
|
| 61 | this list. (This name is slightly confusing.)
|
| 62 | Final states are represented by a special arc of
|
| 63 | the form (0, j) where j is its own state number.
|
| 64 |
|
| 65 | dfas -- a dict mapping symbol numbers to (DFA, first)
|
| 66 | pairs, where DFA is an item from the states list
|
| 67 | above, and first is a set of tokens that can
|
| 68 | begin this grammar rule (represented by a dict
|
| 69 | whose values are always 1).
|
| 70 |
|
| 71 | labels -- a list of (x, y) pairs where x is either a token
|
| 72 | number or a symbol number, and y is either None
|
| 73 | or a string; the strings are keywords. The label
|
| 74 | number is the index in this list; label numbers
|
| 75 | are used to mark state transitions (arcs) in the
|
| 76 | DFAs.
|
| 77 |
|
| 78 | Oils patch: this became List[int] where int is the
|
| 79 | token/symbol number.
|
| 80 |
|
| 81 | start -- the number of the grammar's start symbol.
|
| 82 |
|
| 83 | keywords -- a dict mapping keyword strings to arc labels.
|
| 84 |
|
| 85 | tokens -- a dict mapping token numbers to arc labels.
|
| 86 |
|
| 87 | """
|
| 88 |
|
| 89 | def __init__(self):
|
| 90 | # type: () -> None
|
| 91 | self.symbol2number = {} # type: Dict[str, int]
|
| 92 | self.number2symbol = {} # type: Dict[int, str]
|
| 93 |
|
| 94 | # TODO: See MakeGrammar in pgen2/pgen.py
|
| 95 | # To see the type
|
| 96 | # states: List[List[arcs]]
|
| 97 | # arc: (int, int)
|
| 98 | # dfs = Dict[int, Tuple[states, ...]]
|
| 99 |
|
| 100 | self.states = [] # type: states_t
|
| 101 | self.dfas = {} # type: Dict[int, dfa_t]
|
| 102 | # Oils patch: used to be [(0, "EMPTY")]. I suppose 0 is a special value?
|
| 103 | # Or is it ENDMARKER?
|
| 104 | self.labels = [0] # type: List[int]
|
| 105 | self.keywords = {} # type: Dict[str, int]
|
| 106 | self.tokens = {} # type: Dict[int, int]
|
| 107 | self.symbol2label = {} # type: Dict[str, int]
|
| 108 | self.start = 256 # TODO: move to pgen.NT_OFFSET, breaks OVM tarball
|
| 109 |
|
| 110 | if mylib.PYTHON:
|
| 111 | def dump(self, f):
|
| 112 | # type: (IO[str]) -> None
|
| 113 | """Dump the grammar tables to a marshal file.
|
| 114 |
|
| 115 | Oils patch: changed pickle to marshal.
|
| 116 |
|
| 117 | dump() recursively changes all dict to OrderedDict, so the pickled file
|
| 118 | is not exactly the same as what was passed in to dump(). load() uses the
|
| 119 | pickled file to create the tables, but only changes OrderedDict to dict
|
| 120 | at the top level; it does not recursively change OrderedDict to dict.
|
| 121 | So, the loaded tables are different from the original tables that were
|
| 122 | passed to load() in that some of the OrderedDict (from the pickled file)
|
| 123 | are not changed back to dict. For parsing, this has no effect on
|
| 124 | performance because OrderedDict uses dict's __getitem__ with nothing in
|
| 125 | between.
|
| 126 | """
|
| 127 | # Hack to get rid of Id_t
|
| 128 | labels = [int(i) for i in self.labels]
|
| 129 | tokens = dict((int(k), v) for (k, v) in self.tokens.iteritems())
|
| 130 |
|
| 131 | #self.report()
|
| 132 | payload = (
|
| 133 | self.MARSHAL_HEADER,
|
| 134 | self.symbol2number,
|
| 135 | self.number2symbol,
|
| 136 | self.states,
|
| 137 | self.dfas,
|
| 138 | labels,
|
| 139 | self.keywords,
|
| 140 | tokens,
|
| 141 | self.symbol2label,
|
| 142 | self.start,
|
| 143 | ) # tuple
|
| 144 | marshal.dump(payload, f) # version 2 is latest
|
| 145 |
|
| 146 | def dump_nonterminals_py(self, f):
|
| 147 | # type: (IO[str]) -> None
|
| 148 | """Write a Python module with nonterminals.
|
| 149 |
|
| 150 | Analogous to the 'symbol' module in Python.
|
| 151 | """
|
| 152 | f.write('# This code is generated by pgen2/grammar.py\n\n')
|
| 153 | for num in sorted(self.number2symbol):
|
| 154 | name = self.number2symbol[num]
|
| 155 | f.write('%s = %d\n' % (name, num))
|
| 156 |
|
| 157 | def dump_nonterminals_cpp(self, f):
|
| 158 | # type: (IO[str]) -> None
|
| 159 | """Write a Python module with nonterminals.
|
| 160 |
|
| 161 | Analogous to the 'symbol' module in Python.
|
| 162 | """
|
| 163 | f.write("""\
|
| 164 | // This code is generated by pgen2/grammar.py
|
| 165 |
|
| 166 | namespace grammar_nt {
|
| 167 | """)
|
| 168 | for num in sorted(self.number2symbol):
|
| 169 | name = self.number2symbol[num]
|
| 170 | f.write(' const int %s = %d;\n' % (name, num))
|
| 171 | f.write("""\
|
| 172 |
|
| 173 | } // namespace grammar_nt
|
| 174 | """)
|
| 175 |
|
| 176 | def dump_cpp(self, src_f):
|
| 177 | # type: (IO[str]) -> None
|
| 178 | """Dump a C++ implementation of the grammar tables."""
|
| 179 | src_f.write("""\
|
| 180 | // This code is generated by pgen2/grammar.py
|
| 181 | # include "cpp/pgen2.h"
|
| 182 | # include "mycpp/runtime.h"
|
| 183 |
|
| 184 | namespace grammar {
|
| 185 |
|
| 186 | """)
|
| 187 |
|
| 188 | src_f.write('Grammar::Grammar() {\n')
|
| 189 | src_f.write(' symbol2number = Alloc<Dict<BigStr*, int>>();\n')
|
| 190 | src_f.write(' number2symbol = Alloc<Dict<int, BigStr*>>();\n')
|
| 191 | src_f.write(' dfas = Alloc<Dict<int, dfa_t*>>();\n')
|
| 192 | src_f.write(' keywords = Alloc<Dict<BigStr*, int>>();\n')
|
| 193 | src_f.write(' tokens = Alloc<Dict<int, int>>();\n')
|
| 194 | src_f.write(' symbol2label = Alloc<Dict<BigStr*, int>>();\n')
|
| 195 | src_f.write(' start = %d;\n' % self.start)
|
| 196 |
|
| 197 | src_f.write('\n')
|
| 198 | for i, (states, first) in self.dfas.items():
|
| 199 | src_f.write(' {\n')
|
| 200 | src_f.write(' states_t* st = Alloc<states_t>();\n')
|
| 201 | for s in states:
|
| 202 | src_f.write(' st->append(NewList<arc_t*>(\n')
|
| 203 | src_f.write(' std::initializer_list<arc_t*>{\n')
|
| 204 | for a, b in s:
|
| 205 | src_f.write(' Alloc<Tuple2<int, int>>(%d, %d),\n' % (a, b))
|
| 206 |
|
| 207 | src_f.write(' })\n')
|
| 208 | src_f.write(' );\n')
|
| 209 |
|
| 210 | src_f.write(' first_t* first = Alloc<first_t>();\n')
|
| 211 | for k, v in first.items():
|
| 212 | src_f.write(' first->set(%d, %d);\n' % (k, v))
|
| 213 |
|
| 214 | src_f.write(' dfa_t* dfa = Alloc<dfa_t>(st, first);\n')
|
| 215 | src_f.write(' dfas->set(%d, dfa);\n' % i)
|
| 216 | src_f.write(' \n')
|
| 217 | src_f.write(' }\n')
|
| 218 | src_f.write('\n')
|
| 219 |
|
| 220 | for symbol, num in self.symbol2number.items():
|
| 221 | src_f.write(' symbol2number->set(StrFromC("%s"), %d);\n' % (symbol, num))
|
| 222 |
|
| 223 | src_f.write('\n')
|
| 224 | for num, symbol in self.number2symbol.items():
|
| 225 | src_f.write(' number2symbol->set(%d, StrFromC("%s"));\n' % (num, symbol))
|
| 226 |
|
| 227 | src_f.write('\n')
|
| 228 | for symbol, label in self.symbol2label.items():
|
| 229 | src_f.write(' symbol2label->set(StrFromC("%s"), %d);\n' % (symbol, label))
|
| 230 |
|
| 231 | src_f.write('\n')
|
| 232 | for ks, ki in self.keywords.items():
|
| 233 | src_f.write(' keywords->set(StrFromC("%s"), %d);\n' % (ks, ki))
|
| 234 |
|
| 235 | src_f.write('\n')
|
| 236 | for k, v in self.tokens.items():
|
| 237 | src_f.write(' tokens->set(%d, %d);\n' % (k, v))
|
| 238 |
|
| 239 | src_f.write('\n')
|
| 240 | src_f.write('\n')
|
| 241 | src_f.write(' labels = NewList<int>(\n')
|
| 242 | src_f.write(' std::initializer_list<int>{\n')
|
| 243 | src_f.write(' ')
|
| 244 | src_f.write(',\n '.join([str(label) for label in self.labels]))
|
| 245 | src_f.write('\n')
|
| 246 | src_f.write(' }\n')
|
| 247 | src_f.write(' );\n')
|
| 248 |
|
| 249 | src_f.write('\n')
|
| 250 | src_f.write('};\n')
|
| 251 |
|
| 252 | src_f.write("""\
|
| 253 |
|
| 254 | } // namespace grammar
|
| 255 | """)
|
| 256 |
|
| 257 | MARSHAL_HEADER = 'PGEN2\n' # arbitrary header
|
| 258 |
|
| 259 | def loads(self, s):
|
| 260 | # type: (str) -> None
|
| 261 | """Load the grammar from a string.
|
| 262 |
|
| 263 | We have to use a string rather than a file because the marshal module
|
| 264 | doesn't "fake" support zipimport files.
|
| 265 | """
|
| 266 | payload = marshal.loads(s)
|
| 267 | if payload[0] != self.MARSHAL_HEADER:
|
| 268 | raise RuntimeError('Invalid header %r' % payload[0])
|
| 269 |
|
| 270 | ( _,
|
| 271 | self.symbol2number,
|
| 272 | self.number2symbol,
|
| 273 | self.states,
|
| 274 | self.dfas,
|
| 275 | self.labels,
|
| 276 | self.keywords,
|
| 277 | self.tokens,
|
| 278 | self.symbol2label,
|
| 279 | self.start,
|
| 280 | ) = payload
|
| 281 | #self.report()
|
| 282 |
|
| 283 | assert isinstance(self.symbol2number, dict), self.symbol2number
|
| 284 | assert isinstance(self.number2symbol, dict), self.number2symbol
|
| 285 |
|
| 286 | def report(self):
|
| 287 | # type: () -> None
|
| 288 | """Dump the grammar tables to standard output, for debugging."""
|
| 289 | log("symbol2number: %d entries", len(self.symbol2number))
|
| 290 | log("number2symbol: %d entries", len(self.number2symbol))
|
| 291 | log("states: %d entries", len(self.states))
|
| 292 | log("dfas: %d entries", len(self.dfas))
|
| 293 | return
|
| 294 | from pprint import pprint
|
| 295 | print("labels")
|
| 296 | pprint(self.labels)
|
| 297 | print("keywords")
|
| 298 | pprint(self.labels)
|
| 299 | print("tokens")
|
| 300 | pprint(self.tokens)
|
| 301 | print("symbol2label")
|
| 302 | pprint(self.symbol2label)
|
| 303 | print("start", self.start)
|