osh/word_compile.py

OILS / osh / word_compile.py View on Github | oilshell.org

281 lines, 149 significant

1	#!/usr/bin/env python2
2	"""osh/word_compile.py.
3
4	These functions are called after parsing, but don't depend on any runtime
5	values.
6	"""
7	from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
8	from _devbuild.gen.syntax_asdl import (
9	Token,
10	CharCode,
11	word_part_e,
12	word_part_t,
13	)
14	from data_lang import j8
15	from frontend import consts
16	from frontend import lexer
17	from mycpp import mylib
18	from mycpp.mylib import log, switch
19
20	from typing import List, Optional, cast
21
22
def EvalCharLiteralForRegex(tok):
    # type: (Token) -> CharCode
    """Turn one token inside a regex char class into a CharCode.

    The decoding resembles what EvalCStringToken() does for C-style strings.

    Raises:
      AssertionError: if the token ID isn't handled here, or if a literal
        token isn't exactly one character long.
    """
    tok_id = tok.id
    text = lexer.TokenVal(tok)

    with switch(tok_id) as case:
        if case(Id.Char_UBraced):
            # \u{123} -> 123
            hex_digits = lexer.TokenSlice(tok, 3, -1)
            return CharCode(tok, int(hex_digits, 16), True)  # u_braced

        elif case(Id.Char_OneChar):  # \'
            # text[1] -> mylib.ByteAt()
            unescaped = consts.LookupCharC(text[1])
            return CharCode(tok, ord(unescaped), False)

        elif case(Id.Char_Hex):
            hex_digits = lexer.TokenSliceLeft(tok, 2)
            return CharCode(tok, int(hex_digits, 16), False)

        elif case(Id.Lit_Chars, Id.Expr_Name, Id.Expr_DecInt):
            # Id.Lit_Chars: Token in single quoted string ['a'] is Id.Lit_Chars
            # Id.Expr_Name: [a-z] is ['a'-'Z'], and [a z] is ['a' 'Z']
            # Id.Expr_DecInt: [0-9] is ['0'-'9'], and [0 9] is ['0' '9']

            # Only single-character literals are valid here
            assert len(text) == 1, tok
            # text[0] -> mylib.ByteAt()
            return CharCode(tok, ord(text[0]), False)

        else:
            raise AssertionError(tok)
59
60
def EvalCStringToken(id_, value):
    # type: (Id_t, str) -> Optional[str]
    """Decode a single token of a C-style string.

    This function is shared between echo -e and $''.  $'' could use it at
    compile time, much like brace expansion in braces.py.

    Args:
      id_: ID of the token, which selects the decoding rule
      value: source text of the token

    Returns:
      The decoded string, or None for Id.Char_Stop.

    Raises:
      AssertionError: for a token ID that has no decoding rule here
    """
    # Tokens whose text is passed through unchanged:
    #
    # - shopt -u parse_backslash detects Unknown_Backslash at PARSE time in
    #   YSH.
    # - Char_AsciiControl is allowed in YSH code, for newlines in u''
    #   strings, just like r'' has.  TODO: could allow ONLY newline?
    # - Right_SingleQuote is a single quote in the middle of a triple quoted
    #   string.
    if id_ in (Id.Lit_Chars, Id.Lit_CharsWithoutPrefix, Id.Unknown_Backslash,
               Id.Char_AsciiControl, Id.Right_SingleQuote):
        return value

    if id_ == Id.Char_OneChar:
        # The character after the backslash determines the result
        return consts.LookupCharC(value[1])

    if id_ == Id.Char_Stop:  # \c returns a special sentinel
        return None

    if id_ in (Id.Char_Octal3, Id.Char_Octal4):
        if id_ == Id.Char_Octal4:  # echo -e '\0377'
            digits = value[2:]
        else:  # $'\377' (disallowed at parse time in YSH)
            digits = value[1:]

        code = int(digits, 8)
        if code >= 256:
            # Wrap around to a single byte.
            # NOTE: This is for strict mode
            #raise AssertionError('Out of range')
            code = code % 256
        return chr(code)

    if id_ in (Id.Char_Hex, Id.Char_YHex):
        return chr(int(value[2:], 16))

    if id_ in (Id.Char_Unicode4, Id.Char_Unicode8):
        return j8.Utf8Encode(int(value[2:], 16))

    if id_ == Id.Char_UBraced:
        # \u{123} -> 123
        return j8.Utf8Encode(int(value[3:-1], 16))

    raise AssertionError(Id_str(id_))
118
119
def EvalSingleQuoted2(id_, tokens):
    # type: (Id_t, List[Token]) -> str
    """Evaluate the tokens of a single quoted string.  Done at parse time.

    Args:
      id_: ID of the left quote token, which selects raw vs. C-style decoding
      tokens: the tokens between the quotes

    Returns:
      The concatenation of the evaluated tokens.

    Raises:
      AssertionError: if id_ is not one of the single quote IDs handled here
    """
    # Raw strings: every token stands for its own text
    if id_ in (Id.Left_SingleQuote, Id.Left_RSingleQuote,
               Id.Left_TSingleQuote, Id.Left_RTSingleQuote):
        raw_pieces = [lexer.TokenVal(t) for t in tokens]
        return ''.join(raw_pieces)

    # C-style strings: each token is decoded by EvalCStringToken()
    if id_ in (Id.Left_DollarSingleQuote, Id.Left_USingleQuote,
               Id.Left_BSingleQuote, Id.Left_UTSingleQuote,
               Id.Left_BTSingleQuote):
        decoded_pieces = [
            EvalCStringToken(t.id, lexer.TokenVal(t)) for t in tokens
        ]
        return ''.join(decoded_pieces)

    raise AssertionError(id_)
139
140
def _TokenConsistsOf(tok, byte_set):
    # type: (Token, str) -> bool
    """Return whether every byte spanned by tok is a member of byte_set.

    The bytes are read from tok.line.content, from tok.col up to (but not
    including) tok.col + tok.length.
    """
    pos = tok.col
    stop = tok.col + tok.length
    while pos < stop:
        if not mylib.ByteInSet(mylib.ByteAt(tok.line.content, pos), byte_set):
            return False
        pos += 1
    return True
150
151
def _IsLeadingSpace(tok):
    # type: (Token) -> bool
    """Return whether tok contains only spaces and tabs.

    Applied to the token before the closing ''' etc. to decide whether it is
    space to trim.
    """
    spaces_and_tabs = ' \t'
    return _TokenConsistsOf(tok, spaces_and_tabs)
156
157
def _IsTrailingSpace(tok):
    # type: (Token) -> bool
    """Return whether tok contains only spaces, tabs, and line endings.

    Applied to the token after the opening ''' etc. to decide whether the
    space/newlines should be trimmed.  Like s.isspace(), but without the
    legacy form feed and vertical tab characters, and without Unicode.
    """
    space_and_newlines = ' \n\r\t'
    return _TokenConsistsOf(tok, space_and_newlines)
165
166
167	# Whitespace trimming algorithms:
168	#
169	# 1. Trim what's after opening ''' or """, if it's whitespace
170	# 2. Determine what's before closing ''' or """ -- this is what you strip
171	# 3. Strip each line by mutating the token
172	# - Change the ID from Id.Lit_Chars -> Id.Lit_CharsWithoutPrefix to maintain
173	# the lossless invariant
174
175
def RemoveLeadingSpaceDQ(parts):
    # type: (List[word_part_t]) -> None
    """Strip leading whitespace from the parts of a double quoted string.

    Mutates `parts`, and the Literal tokens in it, in place:

    1. If the first part is a Literal consisting only of whitespace, remove
       it.
    2. If the last part is a Literal consisting only of spaces and tabs,
       remove it and use its text as the prefix to strip.  Otherwise nothing
       more is done.
    3. Every Literal that starts at column 0 with that prefix has its start
       column moved past the prefix, and is re-tagged as
       Id.Lit_CharsWithoutPrefix.

    Args:
      parts: word parts of the string; a list of fewer than 2 parts is left
        untouched
    """
    if len(parts) <= 1:  # We need at least 2 parts to strip anything
        return

    # The first token may have a newline
    UP_first = parts[0]
    if UP_first.tag() == word_part_e.Literal:
        first = cast(Token, UP_first)
        if _IsTrailingSpace(first):
            # Remove the first part.  TODO: This could be expensive if there
            # are many lines.
            parts.pop(0)

    # Figure out what to strip, based on the last part
    UP_last = parts[-1]
    to_strip = None  # type: Optional[str]
    if UP_last.tag() == word_part_e.Literal:
        last = cast(Token, UP_last)
        if _IsLeadingSpace(last):
            to_strip = lexer.TokenVal(last)
            parts.pop()  # Remove the last part

    if to_strip is None:
        return

    n = len(to_strip)
    for part in parts:
        # Only literal tokens can carry the prefix
        if part.tag() != word_part_e.Literal:
            continue

        lit_tok = cast(Token, part)

        if lit_tok.col == 0 and lexer.TokenStartsWith(lit_tok, to_strip):
            # TODO: Lexer should not populate this!
            assert lit_tok.tval is None, lit_tok.tval

            # Strip the prefix by bumping the start column
            lit_tok.col = n
            lit_tok.length -= n

            assert lit_tok.id == Id.Lit_Chars, lit_tok
            # --tool lossless-cat has a special case for this
            lit_tok.id = Id.Lit_CharsWithoutPrefix
221
222
def RemoveLeadingSpaceSQ(tokens):
    # type: (List[Token]) -> None
    """Strip leading whitespace from the tokens of a single quoted string.

    Mutates `tokens`, and the Token objects in it, in place; nothing is
    returned:

    1. If the first token is Lit_Chars or Char_AsciiControl and consists only
       of whitespace, remove it.
    2. If the last token is Lit_Chars or Char_AsciiControl and consists only
       of spaces and tabs, remove it and use its text as the prefix to strip.
       Otherwise nothing more is done.
    3. Every token that starts at column 0 with that prefix has its start
       column moved past the prefix, and is re-tagged as
       Id.Lit_CharsWithoutPrefix.

    Must respect lossless invariant - see test/lossless/multiline-str.sh

    Quirk to make more consistent:
      In $''' and r''' and ''', the newline is a Lit_Chars token
      In u''' and b''', the newline is a Char_AsciiControl token

    Args:
      tokens: tokens of the string; a list of fewer than 2 tokens is left
        untouched
    """
    if len(tokens) <= 1:  # We need at least 2 parts to strip anything
        return

    # var x = '''    # strip initial newline/whitespace
    #   x
    #   '''
    first = tokens[0]
    if first.id in (Id.Lit_Chars, Id.Char_AsciiControl):
        if _IsTrailingSpace(first):
            tokens.pop(0)  # Remove the first part

    # Figure out what to strip, based on last token
    last = tokens[-1]
    to_strip = None  # type: Optional[str]
    if last.id in (Id.Lit_Chars, Id.Char_AsciiControl):
        if _IsLeadingSpace(last):
            to_strip = lexer.TokenVal(last)
            tokens.pop()  # Remove the last part

    if to_strip is None:
        return

    n = len(to_strip)

    for tok in tokens:
        # Strip leading space on tokens that begin lines, by bumping start col
        if tok.col == 0 and lexer.TokenStartsWith(tok, to_strip):
            tok.col = n
            tok.length -= n

            assert tok.id == Id.Lit_Chars, tok
            # --tool lossless-cat has a special case for this
            tok.id = Id.Lit_CharsWithoutPrefix