osh/word_compile.py

OILS / osh / word_compile.py View on Github | oilshell.org

280 lines, 160 significant

1	#!/usr/bin/env python2
2	"""osh/word_compile.py.
3
4	These functions are called after parsing, but don't depend on any runtime
5	values.
6	"""
7	from _devbuild.gen.id_kind_asdl import Id, Id_str
8	from _devbuild.gen.syntax_asdl import (
9	Token,
10	SingleQuoted,
11	CharCode,
12	word_part_e,
13	word_part_t,
14	)
15	from data_lang import j8
16	from frontend import consts
17	from mycpp.mylib import log, switch
18
19	from typing import List, Optional, cast
20
21
def EvalCharLiteralForRegex(tok):
    # type: (Token) -> CharCode
    """Turn a character token from a regex char class into a CharCode.

    The decoding is similar to what EvalCStringToken() does for strings, but
    the result keeps the originating token.

    Args:
      tok: Token with id Char_UBraced, Char_OneChar, Char_Hex, Lit_Chars,
        Expr_Name or Expr_DecInt.

    Returns:
      CharCode built from the integer code, a flag that is True only for the
      braced form (u_braced), and the token itself.

    Raises:
      AssertionError: for any other token id, or when a literal-style token
        is not exactly one character long.
    """
    tok_id = tok.id
    text = tok.tval

    with switch(tok_id) as case:
        if case(Id.Char_UBraced):
            # \u{123}: drop the 3-char opener and the closing brace
            code_point = int(text[3:-1], 16)
            return CharCode(code_point, True, tok)  # u_braced

        elif case(Id.Char_OneChar):  # \'
            decoded = consts.LookupCharC(text[1])
            return CharCode(ord(decoded), False, tok)

        elif case(Id.Char_Hex):
            # Hex digits follow a 2-char prefix
            byte_val = int(text[2:], 16)
            return CharCode(byte_val, False, tok)

        elif case(Id.Lit_Chars, Id.Expr_Name, Id.Expr_DecInt):
            # Id.Lit_Chars: Token in single quoted string ['a'] is Id.Lit_Chars
            # Id.Expr_Name: [a-z] is ['a'-'z'], and [a z] is ['a' 'z']
            # Id.Expr_DecInt: [0-9] is ['0'-'9'], and [0 9] is ['0' '9']

            # Only single characters are meaningful here
            assert len(text) == 1, tok
            return CharCode(ord(text[0]), False, tok)

        else:
            raise AssertionError(tok)
56
57
def EvalCStringToken(tok):
    # type: (Token) -> Optional[str]
    """Decode a single token of a C-style string.

    Shared between echo -e and $''.  $'' could use it at compile time, much
    like brace expansion in braces.py.

    Returns:
      The string the token stands for, or None for Char_Stop, which serves as
      a special sentinel.

    Raises:
      AssertionError: if the token id is not one of those handled below.
    """
    kind = tok.id
    text = tok.tval

    if 0:
        log('tok %s', tok)

    # Tokens whose text is passed through unchanged:
    # - Unknown_Backslash: shopt -u parse_backslash detects it at PARSE time
    #   in YSH.
    # - Char_AsciiControl: allowed in YSH code, for newlines in u'' strings,
    #   just like r'' has.  TODO: could allow ONLY newline?
    # - Right_SingleQuote: single quotes in the middle of a triple quoted
    #   string.
    if kind in (Id.Char_Literals, Id.Unknown_Backslash, Id.Char_AsciiControl,
                Id.Right_SingleQuote):
        return text

    if kind == Id.Char_OneChar:
        # Look up the character that follows the backslash
        return consts.LookupCharC(text[1])

    if kind == Id.Char_Stop:  # \c returns a special sentinel
        return None

    if kind in (Id.Char_Octal3, Id.Char_Octal4):
        # Char_Octal3 is $'\377' (disallowed at parse time in YSH): the digits
        # follow 1 char.  Char_Octal4 is echo -e '\0377': they follow 2 chars.
        skip = 1
        if kind == Id.Char_Octal4:
            skip = 2

        # Values of 256 and above wrap around.  NOTE: a strict mode could
        # treat them as 'Out of range' instead.
        return chr(int(text[skip:], 8) % 256)

    if kind in (Id.Char_Hex, Id.Char_YHex):
        # Hex digits follow a 2-char prefix; the result is a single byte
        return chr(int(text[2:], 16))

    if kind in (Id.Char_Unicode4, Id.Char_Unicode8):
        # Hex digits follow a 2-char prefix; the code point is UTF-8 encoded
        return j8.Utf8Encode(int(text[2:], 16))

    if kind == Id.Char_UBraced:
        # \u{123}: the digits sit between the 3-char opener and the brace
        return j8.Utf8Encode(int(text[3:-1], 16))

    raise AssertionError(Id_str(kind))
120
121
def EvalSingleQuoted(part):
    # type: (SingleQuoted) -> str
    """Compute the string value of a SingleQuoted part.

    The id of part.left selects how the tokens are combined: for the first
    group of ids below the token text is joined as is, and for the second
    group every token is first decoded with EvalCStringToken().

    Raises:
      AssertionError: if part.left.id belongs to neither group.
    """
    left_id = part.left.id

    if left_id in (Id.Left_SingleQuote, Id.Left_RSingleQuote,
                   Id.Left_TSingleQuote, Id.Left_RTSingleQuote):

        # TODO: Strip leading whitespace for ''' and r'''
        if 0:
            for t in part.tokens:
                log('sq tok %s', t)

        verbatim = [t.tval for t in part.tokens]
        return ''.join(verbatim)

    if left_id in (Id.Left_DollarSingleQuote, Id.Left_USingleQuote,
                   Id.Left_BSingleQuote, Id.Left_UTSingleQuote,
                   Id.Left_BTSingleQuote):
        # NOTE: This could be done at compile time
        decoded = [EvalCStringToken(t) for t in part.tokens]
        return ''.join(decoded)

    raise AssertionError(left_id)
145
146
def IsLeadingSpace(s):
    # type: (str) -> bool
    """Report whether s is made up only of spaces and tabs.

    Used to decide if the token before the closing triple quote can be
    stripped.  An empty string qualifies.  Similar to IsWhitespace().
    """
    for ch in s:
        if ch == ' ' or ch == '\t':
            continue
        # Anything else, including a newline, disqualifies the string
        return False
    return True
157
158
def IsWhitespace(s):
    # type: (str) -> bool
    r"""Alternative to s.isspace() that doesn't have legacy \f \v codes.

    Only space, newline, carriage return and tab count as whitespace.  Unlike
    s.isspace(), an empty string is accepted (the result is True).

    This is a raw docstring so that the backslash sequences above are shown
    literally instead of being turned into control characters.
    """
    for ch in s:
        if ch not in ' \n\r\t':
            return False
    return True
167
168
# Whitespace stripping algorithm
#
# - The first token should be WHITESPACE* NEWLINE.  Omit it.
# - The last token should be WHITESPACE*.
# - Then go through all the other tokens that come AFTER a token that ends
#   with \n
#   - if tok.tval[:n] is the same as the last token, then STRIP THAT PREFIX
# - Do you need to set a flag on the SingleQuoted part?
#
# TODO: do this all at compile time?

# These functions may mutate tok.tval.  TODO: mutate the parts instead, after
# we remove .tval
181
182
def RemoveLeadingSpaceDQ(parts):
    # type: (List[word_part_t]) -> None
    """Strip a common leading-whitespace prefix from word parts, in place.

    Steps:
    - If the first part is a Literal consisting only of whitespace (see
      IsWhitespace), it is removed from the list.
    - If the last part is a Literal consisting only of spaces and tabs (see
      IsLeadingSpace), it is removed and its text becomes the prefix.
    - Every Literal that directly follows a Literal ending in a newline has
      that prefix cut off, when it starts with it.

    Args:
      parts: list of word parts.  The list may shrink, and the tval of
        Literal tokens may be rewritten.  Lists with fewer than 2 parts are
        left alone.
    """
    if len(parts) < 2:  # We need at least 2 parts to strip anything
        return

    # Tiny state machine: True when the previous Literal ended with a newline
    at_line_start = False

    # The first token may have a newline
    UP_head = parts[0]
    if UP_head.tag() == word_part_e.Literal:
        head = cast(Token, UP_head)
        head_text = head.tval
        if IsWhitespace(head_text):
            # Remove the first part.  TODO: This could be expensive if there
            # are many lines.
            parts.pop(0)
        at_line_start = head_text.endswith('\n')

    # Looked up only after the possible removal above
    UP_tail = parts[-1]
    indent = None  # type: Optional[str]
    if UP_tail.tag() == word_part_e.Literal:
        tail = cast(Token, UP_tail)
        if IsLeadingSpace(tail.tval):
            indent = tail.tval
            parts.pop()  # Remove the last part

    if indent is None:
        return

    n = len(indent)
    for UP_part in parts:
        if UP_part.tag() != word_part_e.Literal:
            # A non-literal part means the next literal isn't at a line start
            at_line_start = False
            continue

        lit = cast(Token, UP_part)

        if at_line_start and lit.tval.startswith(indent):
            # MUTATING the part here
            lit.tval = lit.tval[n:]

        # Evaluated on the possibly shortened text
        at_line_start = lit.tval.endswith('\n')
228
229
def RemoveLeadingSpaceSQ(tokens):
    # type: (List[Token]) -> None
    r"""Strip a common leading-whitespace prefix from tokens, in place.

    Only tokens with id Lit_Chars, Char_Literals or Char_AsciiControl take
    part:

    - If the first token is one of those and is all whitespace (see
      IsWhitespace), it is removed from the list.
    - If the last token is one of those and is only spaces and tabs (see
      IsLeadingSpace), it is removed and its text becomes the prefix.
    - Each such token that directly follows one ending in a newline has the
      prefix cut from its tval, when it starts with it.

    Lists with fewer than 2 tokens are left alone.

    The newline shows up under different token ids:

      In $''', we have Char_Literals \n
      In r''' and ''', we have Lit_Chars \n
      In u''' and b''', we have Char_AsciiControl \n

    Should make these more consistent.

    This is a raw docstring so the backslash-n sequences above are shown
    literally instead of becoming line breaks.
    """
    if 0:
        log('--')
        for tok in tokens:
            log('tok %s', tok)
        log('--')

    if len(tokens) <= 1:  # We need at least 2 parts to strip anything
        return

    # True when the previous eligible token ended with a newline
    line_ended = False

    first = tokens[0]
    if first.id in (Id.Lit_Chars, Id.Char_Literals, Id.Char_AsciiControl):
        if IsWhitespace(first.tval):
            tokens.pop(0)  # Remove the first part
        if first.tval.endswith('\n'):
            line_ended = True

    # Looked up only after the possible removal above
    last = tokens[-1]
    to_strip = None  # type: Optional[str]
    if last.id in (Id.Lit_Chars, Id.Char_Literals, Id.Char_AsciiControl):
        if IsLeadingSpace(last.tval):
            to_strip = last.tval
            tokens.pop()  # Remove the last part

    if to_strip is None:
        return

    #log('SQ Stripping %r', to_strip)
    n = len(to_strip)
    for tok in tokens:
        if tok.id not in (Id.Lit_Chars, Id.Char_Literals,
                          Id.Char_AsciiControl):
            # Any other token means the next one isn't at a line start
            line_ended = False
            continue

        if line_ended and tok.tval.startswith(to_strip):
            # MUTATING the token here
            tok.tval = tok.tval[n:]

        # Evaluated on the possibly shortened text
        line_ended = tok.tval.endswith('\n')