asdl/front_end.py

OILS / asdl / front_end.py View on Github | oilshell.org

514 lines, 295 significant

1	"""front_end.py: Lexer and parser for the ASDL schema language."""
2	from __future__ import print_function
3
4	import re
5
6	from asdl import ast
7	from asdl.ast import (AST, Use, Module, TypeDecl, Constructor, Field, Sum,
8	SimpleSum, Product)
9	from asdl.util import log
10
11	_ = log
12
13	_KEYWORDS = ['use', 'module', 'generate']
14
15	_TOKENS = [
16	('Keyword', ''),
17	('Name', ''),
18
19	# For operators, the string matters
20	('Equals', '='),
21	('Comma', ','),
22	('Question', '?'),
23	('Pipe', '\|'),
24	('Asterisk', '*'),
25	('LParen', '('),
26	('RParen', ')'),
27	('LBrace', '{'),
28	('RBrace', '}'),
29	('Percent', '%'),
30
31	# Oil addition for parameterized types.
32	('LBracket', '['),
33	('RBracket', ']'),
34
35	# - Start with Dict[string, bool].
36	# - List[string] is an alias for string*
37	#
38	# statically typed: Dict and List
39	# dynamically typed: dict and list
40	]
41
42	_TOKEN_STR = [name for name, _ in _TOKENS] # integer -> string like LParen
43	_TOKEN_INT = {} # string like '(' -> integer
44
45
46	class TokenKind(object):
47	"""ASDL tokens.
48
49	TokenKind.LBrace = 5, etc.
50	"""
51	pass
52
53
54	for i, (name, val) in enumerate(_TOKENS):
55	setattr(TokenKind, name, i)
56	_TOKEN_INT[val] = i
57
58
59	class Token(object):
60
61	def __init__(self, kind, value, lineno):
62	self.kind = kind
63	self.value = value
64	self.lineno = lineno
65
66
67	class ASDLSyntaxError(Exception):
68
69	def __init__(self, msg, lineno=None):
70	self.msg = msg
71	self.lineno = lineno or '<unknown>'
72
73	def __str__(self):
74	return 'Syntax error on line {0.lineno}: {0.msg}'.format(self)
75
76
77	def _Tokenize(f):
78	"""Tokenize the given buffer.
79
80	Yield Token objects.
81	"""
82	for lineno, line in enumerate(f, 1):
83	for m in re.finditer(r'\s(\w+\|--.\|#.*\|.)', line.strip()):
84	c = m.group(1)
85	if c in _KEYWORDS:
86	yield Token(TokenKind.Keyword, c, lineno)
87
88	elif c[0].isalpha():
89	yield Token(TokenKind.Name, c, lineno)
90
91	elif c.startswith('--') or c.startswith('#'):
92	# ASDL comments start with --
93	# Added # comments like Python and shell
94	break
95
96	else:
97	# Operators
98	try:
99	op_kind = _TOKEN_INT[c]
100	except KeyError:
101	raise ASDLSyntaxError('Invalid operator %s' % c, lineno)
102	yield Token(op_kind, c, lineno)
103
104
105	def _SumIsSimple(variant_list):
106	"""Return True if a sum is a simple.
107
108	A sum is simple if its types have no fields, e.g. unaryop = Invert \|
109	Not \| UAdd \| USub
110	"""
111	for t in variant_list:
112	if t.fields or t.shared_type:
113	return False
114	return True
115
116
117	_CODE_GEN_OPTIONS = [
118	'no_namespace_suffix', # Id.Foo instead of Id_e.Foo
119	'integers', # integer builtin_i instead of strongly typed builtin_e
120	'bit_set', # not implemented: 1 << n instead of n
121
122	# probably don't need this
123	# 'common_synthetic_field:left_tok',
124
125	# Put this type, and transitive closure of types it references, in the
126	# unique "first class variant" namespace, and generate type reflection.
127	'reflect_all_types',
128
129	# Squeeze and Freeze, with the number of bits as a option Hm the headers
130	# here still need type reflection. Probably OK.
131	'mirror_all_types:16',
132	]
133
134
135	class ASDLParser(object):
136	"""Parser for ASDL files.
137
138	Create, then call the parse method on a buffer containing ASDL. This
139	is a simple recursive descent parser that uses _Tokenize for the
140	lexing.
141	"""
142
143	def __init__(self):
144	self._tokenizer = None
145	self.cur_token = None
146
147	def parse(self, f):
148	"""Parse the ASDL in the file and return an AST with a Module root."""
149	self._tokenizer = _Tokenize(f)
150	self._advance()
151	return self._parse_module()
152
153	def _parse_module(self):
154	"""
155	type_decl : NAME (':' NAME) '=' compound_type
156	module : 'module' NAME '{' use* type_decl* '}'
157
158	We added:
159	: for code gen options
160	use for imports
161
162	alloc_members =
163	List
164	\| Dict
165	\| Struct
166	generate [bit_set]
167
168	-- color::Red, not color_e::Red or color_i::Red
169	color = Red \| Green
170	generate [integers, no_sum_suffix]
171	"""
172	if not self._at_keyword('module'):
173	raise ASDLSyntaxError(
174	'Expected "module" (found {})'.format(self.cur_token.value),
175	self.cur_token.lineno)
176	self._advance()
177	name = self._match(TokenKind.Name)
178	self._match(TokenKind.LBrace)
179
180	uses = []
181	while self._at_keyword('use'):
182	uses.append(self._parse_use())
183
184	defs = []
185	while self.cur_token.kind == TokenKind.Name:
186	typename = self._advance()
187	self._match(TokenKind.Equals)
188	type_ = self._parse_compound_type()
189	defs.append(TypeDecl(typename, type_))
190
191	self._match(TokenKind.RBrace)
192	return Module(name, uses, defs)
193
194	def _parse_use(self):
195	"""Use: 'use' NAME+ '{' NAME+ '}'.
196
197	example: use frontend syntax { Token }
198
199	This means frontend/syntax.asdl.h :: Token
200	"""
201	self._advance() # past 'use'
202	module_parts = []
203	while self.cur_token.kind == TokenKind.Name:
204	part = self._advance()
205	module_parts.append(part)
206
207	self._match(TokenKind.LBrace)
208
209	type_names = []
210	while self.cur_token.kind == TokenKind.Name:
211	t = self._advance()
212	type_names.append(t)
213	if self.cur_token.kind == TokenKind.RParen:
214	break
215	elif self.cur_token.kind == TokenKind.Comma:
216	self._advance()
217
218	self._match(TokenKind.RBrace)
219	#print('MOD %s' % module_parts)
220	return Use(module_parts, type_names)
221
222	def _parse_compound_type(self):
223	"""Constructor : NAME fields? \| NAME '%' NAME # shared variant.
224
225	compound_type : product
226	\| constructor ('\|' constructor)*
227	"""
228	if self.cur_token.kind == TokenKind.LParen:
229	# If we see a (, it's a product
230	return self._parse_product()
231	else:
232	# Otherwise it's a sum. Look for ConstructorId
233	sumlist = []
234	while True:
235	cons_name = self._match(TokenKind.Name)
236
237	shared_type = None
238	fields = None
239	if self.cur_token.kind == TokenKind.LParen:
240	fields = self._parse_fields()
241	elif self.cur_token.kind == TokenKind.Percent:
242	self._advance()
243	shared_type = self._match(TokenKind.Name)
244	else:
245	pass
246
247	cons = Constructor(cons_name, shared_type, fields)
248	sumlist.append(cons)
249
250	if self.cur_token.kind != TokenKind.Pipe:
251	break
252	self._advance()
253	generate = self._parse_optional_generate()
254
255	# Additional validation
256	if generate is not None:
257	for g in generate:
258	if g not in _CODE_GEN_OPTIONS:
259	raise ASDLSyntaxError('Invalid code gen option %r' % g,
260	self.cur_token.lineno)
261
262	if _SumIsSimple(sumlist):
263	return SimpleSum(sumlist, generate)
264	else:
265	return Sum(sumlist, generate)
266
267	def _parse_type_expr(self):
268	"""One or two params:
269
270	type_params : '[' type_expr ( ',' type_expr )* ']'
271
272	type_expr : NAME type_params? (''?' \| '*')? # allow one suffix
273
274	NAME is validated against Optional, List, Dict afterward
275	"""
276	type_name = self._match(TokenKind.Name)
277
278	# Accept Python-like naming!
279	if type_name == 'str':
280	type_name = 'string'
281
282	children = []
283	if self.cur_token.kind == TokenKind.LBracket:
284	self._advance()
285	children.append(self._parse_type_expr())
286	if self.cur_token.kind == TokenKind.Comma:
287	self._advance()
288	children.append(self._parse_type_expr())
289
290	self._match(TokenKind.RBracket)
291
292	if type_name in ('List', 'Optional'):
293	if len(children) != 1:
294	raise ASDLSyntaxError(
295	'Expected 1 type param to {}'.format(type_name),
296	self.cur_token.lineno)
297	elif type_name == 'Dict':
298	if len(children) != 2:
299	raise ASDLSyntaxError(
300	'Expected 2 type params to {}'.format(type_name),
301	self.cur_token.lineno)
302	else:
303	if len(children) != 0:
304	raise ASDLSyntaxError(
305	'Expected zero type params to {}'.format(type_name),
306	self.cur_token.lineno)
307
308	if len(children):
309	typ = ast.ParameterizedType(type_name, children)
310	else:
311	typ = ast.NamedType(type_name)
312
313	if self.cur_token.kind == TokenKind.Asterisk:
314	# string* is equivalent to List[string]
315	typ = ast.ParameterizedType('List', [typ])
316	self._advance()
317
318	elif self.cur_token.kind == TokenKind.Question:
319	# string* is equivalent to Optional[string]
320	typ = ast.ParameterizedType('Optional', [typ])
321	self._advance()
322
323	return typ
324
325	def _parse_fields(self):
326	"""fields_inner: type_expr NAME ( ',' type_expr NAME )* ','?
327
328	fields : '(' fields_inner? ')'
329
330	Name Quantifier? should be changed to typename.
331	"""
332	fields = []
333	self._match(TokenKind.LParen)
334	while self.cur_token.kind == TokenKind.Name:
335	typ = self._parse_type_expr()
336	field_name = self._match(TokenKind.Name)
337
338	fields.append(Field(typ, field_name))
339
340	if self.cur_token.kind == TokenKind.RParen:
341	break
342	elif self.cur_token.kind == TokenKind.Comma:
343	self._advance()
344
345	self._match(TokenKind.RParen)
346	return fields
347
348	def _parse_list(self):
349	"""list_inner: NAME ( ',' NAME )* ','?
350
351	list : '[' list_inner? ']'
352	"""
353	generate = []
354	self._match(TokenKind.LBracket)
355	while self.cur_token.kind == TokenKind.Name:
356	name = self._match(TokenKind.Name)
357
358	generate.append(name)
359
360	if self.cur_token.kind == TokenKind.RBracket:
361	break
362	elif self.cur_token.kind == TokenKind.Comma:
363	self._advance()
364
365	self._match(TokenKind.RBracket)
366	return generate
367
368	def _parse_optional_generate(self):
369	"""Attributes = 'generate' list."""
370	if self._at_keyword('generate'):
371	self._advance()
372	return self._parse_list()
373	else:
374	return None
375
376	def _parse_product(self):
377	"""Product: fields attributes?"""
378	return Product(self._parse_fields())
379
380	def _advance(self):
381	"""Return current token; read next token into self.cur_token."""
382	cur_val = None if self.cur_token is None else self.cur_token.value
383	try:
384	self.cur_token = next(self._tokenizer)
385	except StopIteration:
386	self.cur_token = None
387	return cur_val
388
389	def _match(self, kind):
390	"""The 'match' primitive of RD parsers.
391
392	* Verifies that the current token is of the given kind (kind can
393	be a tuple, in which the kind must match one of its members).
394	* Returns the value of the current token
395	* Reads in the next token
396
397	Args:
398	kind: A TokenKind, or a tuple of TokenKind
399	"""
400	if self.cur_token.kind == kind:
401	value = self.cur_token.value
402	self._advance()
403	return value
404	else:
405	raise ASDLSyntaxError(
406	'Expected token {}, got {}'.format(_TOKEN_STR[kind],
407	self.cur_token.value),
408	self.cur_token.lineno)
409
410	def _at_keyword(self, keyword):
411	return (self.cur_token.kind == TokenKind.Keyword and
412	self.cur_token.value == keyword)
413
414
415	_PRIMITIVE_TYPES = [
416	'string',
417	'int',
418	'BigInt',
419	'float',
420	'bool',
421
422	# 'any' is used:
423	# - for value.Obj in the the Oil expression evaluator. We're not doing any
424	# dynamic or static checking now.
425	'any',
426	]
427
428
429	def _ResolveType(typ, type_lookup):
430	# type: (AST, dict) -> None
431	"""Recursively attach a 'resolved' field to AST nodes."""
432	if isinstance(typ, ast.NamedType):
433	if typ.name not in _PRIMITIVE_TYPES:
434	ast_node = type_lookup.get(typ.name)
435	if ast_node is None:
436	raise ASDLSyntaxError("Couldn't find type %r" % typ.name)
437	typ.resolved = ast_node
438
439	elif isinstance(typ, ast.ParameterizedType):
440	for child in typ.children:
441	_ResolveType(child, type_lookup)
442
443	if typ.type_name == 'Optional':
444	child = typ.children[0]
445	if isinstance(child, ast.NamedType):
446	if child.name in _PRIMITIVE_TYPES and child.name != 'string':
447	raise ASDLSyntaxError(
448	'Optional primitive type {} not allowed'.format(
449	child.name))
450
451	if child.resolved and isinstance(child.resolved,
452	ast.SimpleSum):
453	raise ASDLSyntaxError(
454	'Optional simple sum type {} not allowed'.format(
455	child.name))
456
457	else:
458	raise AssertionError()
459
460
461	def _ResolveFields(field_ast_nodes, type_lookup):
462	"""
463	Args:
464	type_lookup: Populated by name resolution
465	"""
466	for field in field_ast_nodes:
467	_ResolveType(field.typ, type_lookup)
468
469
470	def _ResolveModule(module, app_types):
471	"""Name resolution for NamedType."""
472	# Types that fields are declared with: int, id, word_part, etc.
473	# Fields are NOT declared with Constructor names.
474	type_lookup = dict(app_types)
475
476	# Note: we don't actually load the type, and instead leave that to MyPy /
477	# C++. A consequence of this is TypeNameHeuristic().
478	for u in module.uses:
479	for type_name in u.type_names:
480	type_lookup[type_name] = u # type: ast.Use()
481
482	# NOTE: We need two passes because types can be mutually recursive, e.g.
483	# asdl/arith.asdl.
484
485	# First pass: collect declared types and make entries for them.
486	for d in module.dfns:
487	type_lookup[d.name] = d.value
488
489	# Second pass: add NamedType.resolved field
490	for d in module.dfns:
491	ast_node = d.value
492	if isinstance(ast_node, ast.Product):
493	#log('fields %s', ast_node.fields)
494	_ResolveFields(ast_node.fields, type_lookup)
495
496	elif isinstance(ast_node, ast.Sum):
497	for cons in ast_node.types:
498	_ResolveFields(cons.fields, type_lookup)
499
500	else:
501	raise AssertionError(ast_node)
502
503
504	def LoadSchema(f, app_types, verbose=False):
505	"""Returns an AST for the schema."""
506	p = ASDLParser()
507	schema_ast = p.parse(f)
508	if verbose:
509	import sys
510	schema_ast.Print(sys.stdout, 0)
511
512	# Make sure all the names are valid
513	_ResolveModule(schema_ast, app_types)
514	return schema_ast