| 1 | # Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved. | 
| 2 | # Licensed to PSF under a Contributor Agreement. | 
| 3 |  | 
| 4 | """This module defines the data structures used to represent a grammar. | 
| 5 |  | 
| 6 | These are a bit arcane because they are derived from the data | 
| 7 | structures used by Python's 'pgen' parser generator. | 
| 8 |  | 
| 9 | There's also a table here mapping operators to their names in the | 
| 10 | token module; the Python tokenize module reports all operators as the | 
| 11 | fallback token code OP, but the parser needs the actual token code. | 
| 12 |  | 
| 13 | """ | 
| 14 |  | 
| 15 | import marshal | 
| 16 |  | 
| 17 | from mycpp.mylib import log | 
| 18 | from mycpp import mylib | 
| 19 |  | 
| 20 | from typing import TYPE_CHECKING | 
| 21 |  | 
| 22 | if TYPE_CHECKING: | 
| 23 | from typing import IO, Dict, List, Tuple | 
| 24 |  | 
| 25 | # Type aliases | 
| 26 | arc_t = Tuple[int, int] | 
| 27 | first_t = Dict[int, int] | 
| 28 | states_t = List[List[arc_t]] | 
| 29 | dfa_t = Tuple[states_t, first_t] | 
| 30 |  | 
| 31 |  | 
| 32 | class Grammar(object): | 
| 33 | """Pgen parsing tables conversion class. | 
| 34 |  | 
| 35 | Once initialized, this class supplies the grammar tables for the | 
| 36 | parsing engine implemented by parse.py.  The parsing engine | 
| 37 | accesses the instance variables directly.  The class here does not | 
| 38 | provide initialization of the tables; several subclasses exist to | 
| 39 | do this (see the conv and pgen modules). | 
| 40 |  | 
| 41 | The load() method reads the tables from a pickle file, which is | 
| 42 | much faster than the other ways offered by subclasses.  The pickle | 
| 43 | file is written by calling dump() (after loading the grammar | 
| 44 | tables using a subclass).  The report() method prints a readable | 
| 45 | representation of the tables to stdout, for debugging. | 
| 46 |  | 
| 47 | The instance variables are as follows: | 
| 48 |  | 
| 49 | symbol2number -- a dict mapping symbol names to numbers.  Symbol | 
| 50 | numbers are always NT_OFFSET or higher, to distinguish | 
| 51 | them from token numbers, which are between 0 and 255 | 
| 52 | (inclusive). | 
| 53 |  | 
| 54 | number2symbol -- a dict mapping numbers to symbol names; | 
| 55 | these two are each other's inverse. | 
| 56 |  | 
| 57 | states        -- a list of DFAs, where each DFA is a list of | 
| 58 | states, each state is a list of arcs, and each | 
| 59 | arc is a (i, j) pair where i is a label and j is | 
| 60 | a state number.  The DFA number is the index into | 
| 61 | this list.  (This name is slightly confusing.) | 
| 62 | Final states are represented by a special arc of | 
| 63 | the form (0, j) where j is its own state number. | 
| 64 |  | 
| 65 | dfas          -- a dict mapping symbol numbers to (DFA, first) | 
| 66 | pairs, where DFA is an item from the states list | 
| 67 | above, and first is a set of tokens that can | 
| 68 | begin this grammar rule (represented by a dict | 
| 69 | whose values are always 1). | 
| 70 |  | 
| 71 | labels        -- a list of (x, y) pairs where x is either a token | 
| 72 | number or a symbol number, and y is either None | 
| 73 | or a string; the strings are keywords.  The label | 
| 74 | number is the index in this list; label numbers | 
| 75 | are used to mark state transitions (arcs) in the | 
| 76 | DFAs. | 
| 77 |  | 
| 78 | Oils patch: this became List[int] where int is the | 
| 79 | token/symbol number. | 
| 80 |  | 
| 81 | start         -- the number of the grammar's start symbol. | 
| 82 |  | 
| 83 | keywords      -- a dict mapping keyword strings to arc labels. | 
| 84 |  | 
| 85 | tokens        -- a dict mapping token numbers to arc labels. | 
| 86 |  | 
| 87 | """ | 
| 88 |  | 
| 89 | def __init__(self): | 
| 90 | # type: () -> None | 
| 91 | self.symbol2number = {}  # type: Dict[str, int] | 
| 92 | self.number2symbol = {}  # type: Dict[int, str] | 
| 93 |  | 
| 94 | # TODO: See MakeGrammar in pgen2/pgen.py | 
| 95 | # To see the type | 
| 96 | # states: List[List[arcs]] | 
| 97 | # arc: (int, int) | 
| 98 | # dfs = Dict[int, Tuple[states, ...]] | 
| 99 |  | 
| 100 | self.states = []  # type: states_t | 
| 101 | self.dfas = {}  # type: Dict[int, dfa_t] | 
| 102 | # Oils patch: used to be [(0, "EMPTY")].  I suppose 0 is a special value? | 
| 103 | # Or is it ENDMARKER? | 
| 104 | self.labels = [0]  # type: List[int] | 
| 105 | self.keywords = {}  # type: Dict[str, int] | 
| 106 | self.tokens = {}  # type: Dict[int, int] | 
| 107 | self.symbol2label = {}  # type: Dict[str, int] | 
| 108 | self.start = 256  # TODO: move to pgen.NT_OFFSET, breaks OVM tarball | 
| 109 |  | 
| 110 | if mylib.PYTHON: | 
| 111 | def dump(self, f): | 
| 112 | # type: (IO[str]) -> None | 
| 113 | """Dump the grammar tables to a marshal file. | 
| 114 |  | 
| 115 | Oils patch: changed pickle to marshal. | 
| 116 |  | 
| 117 | dump() recursively changes all dict to OrderedDict, so the pickled file | 
| 118 | is not exactly the same as what was passed in to dump(). load() uses the | 
| 119 | pickled file to create the tables, but  only changes OrderedDict to dict | 
| 120 | at the top level; it does not recursively change OrderedDict to dict. | 
| 121 | So, the loaded tables are different from the original tables that were | 
| 122 | passed to load() in that some of the OrderedDict (from the pickled file) | 
| 123 | are not changed back to dict. For parsing, this has no effect on | 
| 124 | performance because OrderedDict uses dict's __getitem__ with nothing in | 
| 125 | between. | 
| 126 | """ | 
| 127 | # Hack to get rid of Id_t | 
| 128 | labels = [int(i) for i in self.labels] | 
| 129 | tokens = dict((int(k), v) for (k, v) in self.tokens.iteritems()) | 
| 130 |  | 
| 131 | #self.report() | 
| 132 | payload = ( | 
| 133 | self.MARSHAL_HEADER, | 
| 134 | self.symbol2number, | 
| 135 | self.number2symbol, | 
| 136 | self.states, | 
| 137 | self.dfas, | 
| 138 | labels, | 
| 139 | self.keywords, | 
| 140 | tokens, | 
| 141 | self.symbol2label, | 
| 142 | self.start, | 
| 143 | )  # tuple | 
| 144 | marshal.dump(payload, f)  # version 2 is latest | 
| 145 |  | 
| 146 | def dump_nonterminals_py(self, f): | 
| 147 | # type: (IO[str]) -> None | 
| 148 | """Write a Python module with nonterminals. | 
| 149 |  | 
| 150 | Analogous to the 'symbol' module in Python. | 
| 151 | """ | 
| 152 | f.write('# This code is generated by pgen2/grammar.py\n\n') | 
| 153 | for num in sorted(self.number2symbol): | 
| 154 | name = self.number2symbol[num] | 
| 155 | f.write('%s = %d\n' % (name, num)) | 
| 156 |  | 
| 157 | def dump_nonterminals_cpp(self, f): | 
| 158 | # type: (IO[str]) -> None | 
| 159 | """Write a Python module with nonterminals. | 
| 160 |  | 
| 161 | Analogous to the 'symbol' module in Python. | 
| 162 | """ | 
| 163 | f.write("""\ | 
| 164 | // This code is generated by pgen2/grammar.py | 
| 165 |  | 
| 166 | namespace grammar_nt { | 
| 167 | """) | 
| 168 | for num in sorted(self.number2symbol): | 
| 169 | name = self.number2symbol[num] | 
| 170 | f.write('  const int %s = %d;\n' % (name, num)) | 
| 171 | f.write("""\ | 
| 172 |  | 
| 173 | }  // namespace grammar_nt | 
| 174 | """) | 
| 175 |  | 
| 176 | def dump_cpp(self, src_f): | 
| 177 | # type: (IO[str]) -> None | 
| 178 | """Dump a C++ implementation of the grammar tables.""" | 
| 179 | src_f.write("""\ | 
| 180 | // This code is generated by pgen2/grammar.py | 
| 181 | # include "cpp/pgen2.h" | 
| 182 | # include "mycpp/runtime.h" | 
| 183 |  | 
| 184 | namespace grammar { | 
| 185 |  | 
| 186 | """) | 
| 187 |  | 
| 188 | src_f.write('Grammar::Grammar() {\n') | 
| 189 | src_f.write('  symbol2number = Alloc<Dict<BigStr*, int>>();\n') | 
| 190 | src_f.write('  number2symbol = Alloc<Dict<int, BigStr*>>();\n') | 
| 191 | src_f.write('  dfas = Alloc<Dict<int, dfa_t*>>();\n') | 
| 192 | src_f.write('  keywords = Alloc<Dict<BigStr*, int>>();\n') | 
| 193 | src_f.write('  tokens = Alloc<Dict<int, int>>();\n') | 
| 194 | src_f.write('  symbol2label = Alloc<Dict<BigStr*, int>>();\n') | 
| 195 | src_f.write('  start = %d;\n' % self.start) | 
| 196 |  | 
| 197 | src_f.write('\n') | 
| 198 | for i, (states, first) in self.dfas.items(): | 
| 199 | src_f.write('  {\n') | 
| 200 | src_f.write('    states_t* st = Alloc<states_t>();\n') | 
| 201 | for s in states: | 
| 202 | src_f.write('    st->append(NewList<arc_t*>(\n') | 
| 203 | src_f.write('      std::initializer_list<arc_t*>{\n') | 
| 204 | for a, b in s: | 
| 205 | src_f.write('        Alloc<Tuple2<int, int>>(%d, %d),\n' % (a, b)) | 
| 206 |  | 
| 207 | src_f.write('      })\n') | 
| 208 | src_f.write('    );\n') | 
| 209 |  | 
| 210 | src_f.write('    first_t* first = Alloc<first_t>();\n') | 
| 211 | for k, v in first.items(): | 
| 212 | src_f.write('    first->set(%d, %d);\n' % (k, v)) | 
| 213 |  | 
| 214 | src_f.write('    dfa_t* dfa = Alloc<dfa_t>(st, first);\n') | 
| 215 | src_f.write('    dfas->set(%d, dfa);\n' % i) | 
| 216 | src_f.write('    \n') | 
| 217 | src_f.write('  }\n') | 
| 218 | src_f.write('\n') | 
| 219 |  | 
| 220 | for symbol, num in self.symbol2number.items(): | 
| 221 | src_f.write('  symbol2number->set(StrFromC("%s"), %d);\n' % (symbol, num)) | 
| 222 |  | 
| 223 | src_f.write('\n') | 
| 224 | for num, symbol in self.number2symbol.items(): | 
| 225 | src_f.write('  number2symbol->set(%d, StrFromC("%s"));\n' % (num, symbol)) | 
| 226 |  | 
| 227 | src_f.write('\n') | 
| 228 | for symbol, label in self.symbol2label.items(): | 
| 229 | src_f.write('  symbol2label->set(StrFromC("%s"), %d);\n' % (symbol, label)) | 
| 230 |  | 
| 231 | src_f.write('\n') | 
| 232 | for ks, ki in self.keywords.items(): | 
| 233 | src_f.write('  keywords->set(StrFromC("%s"), %d);\n' % (ks, ki)) | 
| 234 |  | 
| 235 | src_f.write('\n') | 
| 236 | for k, v in self.tokens.items(): | 
| 237 | src_f.write('  tokens->set(%d, %d);\n' % (k, v)) | 
| 238 |  | 
| 239 | src_f.write('\n') | 
| 240 | src_f.write('\n') | 
| 241 | src_f.write('  labels = NewList<int>(\n') | 
| 242 | src_f.write('    std::initializer_list<int>{\n') | 
| 243 | src_f.write('      ') | 
| 244 | src_f.write(',\n      '.join([str(label) for label in self.labels])) | 
| 245 | src_f.write('\n') | 
| 246 | src_f.write('    }\n') | 
| 247 | src_f.write('  );\n') | 
| 248 |  | 
| 249 | src_f.write('\n') | 
| 250 | src_f.write('};\n') | 
| 251 |  | 
| 252 | src_f.write("""\ | 
| 253 |  | 
| 254 | }  // namespace grammar | 
| 255 | """) | 
| 256 |  | 
| 257 | MARSHAL_HEADER = 'PGEN2\n'  # arbitrary header | 
| 258 |  | 
| 259 | def loads(self, s): | 
| 260 | # type: (str) -> None | 
| 261 | """Load the grammar from a string. | 
| 262 |  | 
| 263 | We have to use a string rather than a file because the marshal module | 
| 264 | doesn't "fake" support zipimport files. | 
| 265 | """ | 
| 266 | payload = marshal.loads(s) | 
| 267 | if payload[0] != self.MARSHAL_HEADER: | 
| 268 | raise RuntimeError('Invalid header %r' % payload[0]) | 
| 269 |  | 
| 270 | ( _, | 
| 271 | self.symbol2number, | 
| 272 | self.number2symbol, | 
| 273 | self.states, | 
| 274 | self.dfas, | 
| 275 | self.labels, | 
| 276 | self.keywords, | 
| 277 | self.tokens, | 
| 278 | self.symbol2label, | 
| 279 | self.start, | 
| 280 | ) = payload | 
| 281 | #self.report() | 
| 282 |  | 
| 283 | assert isinstance(self.symbol2number, dict), self.symbol2number | 
| 284 | assert isinstance(self.number2symbol, dict), self.number2symbol | 
| 285 |  | 
| 286 | def report(self): | 
| 287 | # type: () -> None | 
| 288 | """Dump the grammar tables to standard output, for debugging.""" | 
| 289 | log("symbol2number: %d entries", len(self.symbol2number)) | 
| 290 | log("number2symbol: %d entries", len(self.number2symbol)) | 
| 291 | log("states: %d entries", len(self.states)) | 
| 292 | log("dfas: %d entries", len(self.dfas)) | 
| 293 | return | 
| 294 | from pprint import pprint | 
| 295 | print("labels") | 
| 296 | pprint(self.labels) | 
| 297 | print("keywords") | 
| 298 | pprint(self.labels) | 
| 299 | print("tokens") | 
| 300 | pprint(self.tokens) | 
| 301 | print("symbol2label") | 
| 302 | pprint(self.symbol2label) | 
| 303 | print("start", self.start) |