#!/usr/bin/env python2
from __future__ import print_function
"""osh/word_compile.py.

These functions are called after parsing, but don't depend on any runtime
values.
"""

from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
from _devbuild.gen.syntax_asdl import (
    Token,
    CharCode,
    word_part_e,
    word_part_t,
)
from data_lang import j8
from frontend import consts
from frontend import lexer
from mycpp import mylib
from mycpp.mylib import log, switch

from typing import List, Optional, cast


def EvalCharLiteralForRegex(tok):
    # type: (Token) -> CharCode
    """For regex char classes.

    Similar logic as EvalCStringToken() below.
    """
    id_ = tok.id
    value = lexer.TokenVal(tok)

    with switch(id_) as case:
        if case(Id.Char_UBraced):
            s = lexer.TokenSlice(tok, 3, -1)  # \u{123}
            i = int(s, 16)
            return CharCode(tok, i, True)  # u_braced

        elif case(Id.Char_OneChar):  # \'
            # value[1] -> mylib.ByteAt()
            one_char_str = consts.LookupCharC(value[1])
            return CharCode(tok, ord(one_char_str), False)

        elif case(Id.Char_Hex):
            s = lexer.TokenSliceLeft(tok, 2)
            i = int(s, 16)
            return CharCode(tok, i, False)

        elif case(Id.Lit_Chars, Id.Expr_Name, Id.Expr_DecInt):
            # Id.Lit_Chars: a Token in a single-quoted string like ['a']
            # Id.Expr_Name: [a-z] is ['a'-'z'], and [a z] is ['a' 'z']
            # Id.Expr_DecInt: [0-9] is ['0'-'9'], and [0 9] is ['0' '9']

            assert len(value) == 1, tok
            # value[0] -> mylib.ByteAt()
            return CharCode(tok, ord(value[0]), False)

        else:
            raise AssertionError(tok)


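# A hedged illustration of the mapping above (not an executable doctest; real
# Tokens come from the lexer, and CharCode fields follow syntax.asdl):
#
#   \u{2603} -> CharCode(tok, 0x2603, True)   # u_braced; encoded later
#   \n       -> CharCode(tok, 10, False)      # via consts.LookupCharC('n')
#   \x41     -> CharCode(tok, 65, False)
#   a        -> CharCode(tok, 97, False)      # literal char in a class

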
def EvalCStringToken(id_, value):
    # type: (Id_t, str) -> Optional[str]
    """Decode a single escape token; shared between echo -e and $''.

    $'' could use it at compile time, much like brace expansion in braces.py.
    """
    if id_ in (Id.Lit_Chars, Id.Lit_CharsWithoutPrefix, Id.Unknown_Backslash):
        # shopt -u parse_backslash detects Unknown_Backslash at PARSE time in
        # YSH.
        return value

    # single quotes in the middle of a triple quoted string
    elif id_ == Id.Right_SingleQuote:
        return value

    elif id_ == Id.Char_OneChar:
        c = value[1]
        return consts.LookupCharC(c)

    elif id_ == Id.Char_Stop:  # \c returns a special sentinel
        return None

    elif id_ in (Id.Char_Octal3, Id.Char_Octal4):
        if id_ == Id.Char_Octal3:  # $'\377' (disallowed at parse time in YSH)
            s = value[1:]
        else:  # echo -e '\0377'
            s = value[2:]

        i = int(s, 8)
        if i >= 256:
            i = i % 256
            # NOTE: a strict mode could raise instead of wrapping:
            #raise AssertionError('Out of range')
        return chr(i)

    elif id_ in (Id.Char_Hex, Id.Char_YHex):
        s = value[2:]
        i = int(s, 16)
        return chr(i)

    elif id_ in (Id.Char_Unicode4, Id.Char_Unicode8):
        s = value[2:]
        i = int(s, 16)
        #util.log('i = %d', i)
        return j8.Utf8Encode(i)

    elif id_ == Id.Char_UBraced:
        s = value[3:-1]  # \u{123}
        i = int(s, 16)
        return j8.Utf8Encode(i)

    else:
        raise AssertionError(Id_str(id_))


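# Hedged examples of the decoding above (values are the raw token text):
#
#   EvalCStringToken(Id.Char_OneChar, '\\n')       == '\n'
#   EvalCStringToken(Id.Char_Octal3, '\\101')      == 'A'    # 0o101 == 65
#   EvalCStringToken(Id.Char_Hex, '\\x41')         == 'A'
#   EvalCStringToken(Id.Char_UBraced, '\\u{2603}') == UTF-8 bytes of U+2603
#   EvalCStringToken(Id.Char_Stop, '\\c')          is None   # stop sentinel

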
def EvalSingleQuoted2(id_, tokens):
    # type: (Id_t, List[Token]) -> str
    """Done at parse time."""
    if id_ in (Id.Left_SingleQuote, Id.Left_RSingleQuote, Id.Left_TSingleQuote,
               Id.Left_RTSingleQuote):
        strs = [lexer.TokenVal(t) for t in tokens]

    elif id_ in (Id.Left_DollarSingleQuote, Id.Left_USingleQuote,
                 Id.Left_BSingleQuote, Id.Left_UTSingleQuote,
                 Id.Left_BTSingleQuote):
        if 0:  # for debugging
            for t in tokens:
                print('T %s' % t)

        strs = [EvalCStringToken(t.id, lexer.TokenVal(t)) for t in tokens]

    else:
        raise AssertionError(id_)
    return ''.join(strs)


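# Hedged usage sketch: for r'''x''' the left token is Id.Left_RTSingleQuote,
# so the token values are joined raw; for $'a\n' it is
# Id.Left_DollarSingleQuote, so each token goes through EvalCStringToken()
# first.

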
def _TokenConsistsOf(tok, byte_set):
    # type: (Token, str) -> bool
    """Return True if every byte in the token's span is in byte_set."""
    start = tok.col
    end = tok.col + tok.length
    for i in xrange(start, end):
        b = mylib.ByteAt(tok.line.content, i)
        if not mylib.ByteInSet(b, byte_set):
            return False
    return True


def _IsLeadingSpace(tok):
    # type: (Token) -> bool
    """Determine if the token before the closing ''' etc. is space to trim."""
    return _TokenConsistsOf(tok, ' \t')


def _IsTrailingSpace(tok):
    # type: (Token) -> bool
    """Determine if the space/newlines after the opening ''' should be trimmed.

    Like s.isspace(), without legacy \f \v and Unicode.
    """
    return _TokenConsistsOf(tok, ' \n\r\t')


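# Hedged example: for a token spanning the 4 bytes '    ' at col 0 of its
# line, _IsLeadingSpace() is True; a token spanning '\n' right after an
# opening ''' makes _IsTrailingSpace() True, so that token can be dropped.

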
# Whitespace trimming algorithm:
#
# 1. Trim what's after the opening ''' or """, if it's whitespace
# 2. Determine what's before the closing ''' or """ -- this is what you strip
# 3. Strip each line by mutating its token
#    - Change the ID from Id.Lit_Chars -> Id.Lit_CharsWithoutPrefix to
#      maintain the lossless invariant


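# A hedged worked example (leading spaces shown as dots):
#
#     var x = '''
#     ....hello
#     ....'''
#
# Step 1 drops the whitespace token after the opening '''.  Step 2 sees '....'
# before the closing ''' and pops that token; '....' is what to strip.  Step 3
# bumps col by 4 and shrinks length by 4 on each remaining line's token, so
# the value is 'hello\n' while the original source bytes stay intact.

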
def RemoveLeadingSpaceDQ(parts):
    # type: (List[word_part_t]) -> None
    if len(parts) <= 1:  # We need at least 2 parts to strip anything
        return

    # The first token may have a newline
    UP_first = parts[0]
    if UP_first.tag() == word_part_e.Literal:
        first = cast(Token, UP_first)
        #log('T %s', first)
        if _IsTrailingSpace(first):
            # Remove the first part.  TODO: This could be expensive if there
            # are many lines.
            parts.pop(0)

    UP_last = parts[-1]
    to_strip = None  # type: Optional[str]
    if UP_last.tag() == word_part_e.Literal:
        last = cast(Token, UP_last)
        if _IsLeadingSpace(last):
            to_strip = lexer.TokenVal(last)
            parts.pop()  # Remove the last part

    if to_strip is None:
        return

    n = len(to_strip)
    for part in parts:
        if part.tag() != word_part_e.Literal:
            continue  # skip non-literal parts like ${x}

        lit_tok = cast(Token, part)

        if lit_tok.col == 0 and lexer.TokenStartsWith(lit_tok, to_strip):
            # TODO: Lexer should not populate this!
            assert lit_tok.tval is None, lit_tok.tval

            lit_tok.col = n
            lit_tok.length -= n
            #log('n = %d, %s', n, lit_tok)

            assert lit_tok.id == Id.Lit_Chars, lit_tok
            # --tool lossless-cat has a special case for this
            lit_tok.id = Id.Lit_CharsWithoutPrefix


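# Hedged note on the mutation above: setting col = n and shrinking length by n
# narrows the token to the text after the indent without copying the line; the
# id change is what lets --tool lossless-cat re-emit the trimmed prefix.

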
def RemoveLeadingSpaceSQ(tokens):
    # type: (List[Token]) -> None
    """Strip leading whitespace from tokens, mutating the list in place.

    Must respect the lossless invariant - see test/lossless/multiline-str.sh

    Tokens that begin a line are narrowed (col, length) and their ID changed
    to Id.Lit_CharsWithoutPrefix; no new tokens are created, so nothing new
    goes in the arena.
    """
    if 0:  # for debugging
        log('--')
        for tok in tokens:
            #log('tok %s', tok)
            import sys
            from asdl import format as fmt
            ast_f = fmt.DetectConsoleOutput(mylib.Stderr())
            tree = tok.AbbreviatedTree()
            fmt.PrintTree(tree, ast_f)
            print('', file=sys.stderr)
        log('--')

    if len(tokens) <= 1:  # We need at least 2 tokens to strip anything
        return

    # var x = '''      # strip initial newline/whitespace
    #     x
    #     '''
    first = tokens[0]
    if first.id == Id.Lit_Chars:
        if _IsTrailingSpace(first):
            tokens.pop(0)  # Remove the first token

    # Figure out what to strip, based on the last token
    last = tokens[-1]
    to_strip = None  # type: Optional[str]
    if last.id == Id.Lit_Chars:
        if _IsLeadingSpace(last):
            to_strip = lexer.TokenVal(last)
            tokens.pop()  # Remove the last token

    if to_strip is None:
        return

    #log('SQ Stripping %r', to_strip)
    n = len(to_strip)

    #log('--')
    for tok in tokens:
        #log('tok %s', tok)
        # Strip leading space on tokens that begin lines, by bumping start col
        if tok.col == 0 and lexer.TokenStartsWith(tok, to_strip):
            tok.col = n
            tok.length -= n

            assert tok.id == Id.Lit_Chars, tok
            # --tool lossless-cat has a special case for this
            tok.id = Id.Lit_CharsWithoutPrefix

            #log('STRIP tok %s', tok)
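

# Hedged end-to-end sketch: at parse time, a multi-line ''' string yields a
# list of Lit_Chars tokens; RemoveLeadingSpaceSQ(tokens) dedents them in
# place, and EvalSingleQuoted2(left_id, tokens) then joins the values into
# the final string.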