OILS / osh / word_compile.py View on Github | oilshell.org

280 lines, 160 significant
#!/usr/bin/env python2
"""osh/word_compile.py.

These functions are called after parsing, but don't depend on any runtime
values.
"""
7from _devbuild.gen.id_kind_asdl import Id, Id_str
8from _devbuild.gen.syntax_asdl import (
9 Token,
10 SingleQuoted,
11 CharCode,
12 word_part_e,
13 word_part_t,
14)
15from data_lang import j8
16from frontend import consts
17from mycpp.mylib import log, switch
18
19from typing import List, Optional, cast
20
21
def EvalCharLiteralForRegex(tok):
    # type: (Token) -> CharCode
    """Convert a character token from a regex char class into a CharCode.

    Similar logic to EvalCStringToken(), but the result is a CharCode node
    holding an integer code instead of a string.

    Args:
      tok: the token to evaluate; dispatch is on tok.id, the text is tok.tval.

    Returns:
      CharCode(code, u_braced, tok), where u_braced is True only for
      Id.Char_UBraced tokens.

    Raises:
      AssertionError: if tok.id is not one of the handled ids, or if a
        literal token (Lit_Chars / Expr_Name / Expr_DecInt) is not exactly
        one character long.
    """
    id_ = tok.id
    value = tok.tval

    with switch(id_) as case:
        if case(Id.Char_UBraced):
            # \u{123}: drop the 3-char prefix and the closing brace
            s = value[3:-1]
            i = int(s, 16)
            return CharCode(i, True, tok)  # u_braced

        elif case(Id.Char_OneChar):  # \'
            # value[1] is the character after the backslash
            one_char_str = consts.LookupCharC(value[1])
            return CharCode(ord(one_char_str), False, tok)

        elif case(Id.Char_Hex):
            # Skip the 2-char prefix (presumably \x), then parse as hex
            s = value[2:]
            i = int(s, 16)
            return CharCode(i, False, tok)

        elif case(Id.Lit_Chars, Id.Expr_Name, Id.Expr_DecInt):
            # Id.Lit_Chars: Token in single quoted string ['a'] is Id.Lit_Chars
            # Id.Expr_Name: [a-z] is ['a'-'z'], and [a z] is ['a' 'z']
            # Id.Expr_DecInt: [0-9] is ['0'-'9'], and [0 9] is ['0' '9']

            assert len(value) == 1, tok
            return CharCode(ord(value[0]), False, tok)

        else:
            raise AssertionError(tok)
56
57
def EvalCStringToken(tok):
    # type: (Token) -> Optional[str]
    """Evaluate one token of a C-style string to the string it denotes.

    This function is shared between echo -e and $''.

    $'' could use it at compile time, much like brace expansion in braces.py.

    Args:
      tok: the token to evaluate; dispatch is on tok.id, the text is tok.tval.

    Returns:
      The evaluated string, or None for Id.Char_Stop (backslash-c), which
      callers must treat as a special sentinel.

    Raises:
      AssertionError: if tok.id is not one of the handled ids.
    """
    id_ = tok.id
    value = tok.tval

    if 0:
        log('tok %s', tok)

    if id_ in (Id.Char_Literals, Id.Unknown_Backslash, Id.Char_AsciiControl):
        # shopt -u parse_backslash detects Unknown_Backslash at PARSE time in
        # YSH.

        # Char_AsciiControl is allowed in YSH code, for newlines in u''
        # strings, just like r'' has
        # TODO: could allow ONLY newline?
        return value

    # single quotes in the middle of a triple quoted string
    elif id_ == Id.Right_SingleQuote:
        return value

    elif id_ == Id.Char_OneChar:
        # value[1] is the character after the backslash
        c = value[1]
        return consts.LookupCharC(c)

    elif id_ == Id.Char_Stop:  # \c returns a special sentinel
        return None

    elif id_ in (Id.Char_Octal3, Id.Char_Octal4):
        if id_ == Id.Char_Octal3:  # $'\377' (disallowed at parse time in YSH)
            s = value[1:]
        else:  # echo -e '\0377'
            s = value[2:]

        i = int(s, 8)
        if i >= 256:
            # Out-of-range octal values wrap around instead of failing
            i = i % 256
            # NOTE: This is for strict mode
            #raise AssertionError('Out of range')
        return chr(i)

    elif id_ in (Id.Char_Hex, Id.Char_YHex):
        # Skip the 2-char prefix, then parse the rest as hex
        s = value[2:]
        i = int(s, 16)
        return chr(i)

    elif id_ in (Id.Char_Unicode4, Id.Char_Unicode8):
        # Skip the 2-char prefix; the hex code point is encoded by j8
        s = value[2:]
        i = int(s, 16)
        return j8.Utf8Encode(i)

    elif id_ == Id.Char_UBraced:
        s = value[3:-1]  # \u{123}
        i = int(s, 16)
        return j8.Utf8Encode(i)

    else:
        raise AssertionError(Id_str(id_))
120
121
def EvalSingleQuoted(part):
    # type: (SingleQuoted) -> str
    """Return the string value of a SingleQuoted word part.

    The opening quote (part.left.id) selects how part.tokens are combined:

    - Left_SingleQuote, Left_RSingleQuote, Left_TSingleQuote,
      Left_RTSingleQuote: the token texts are concatenated verbatim.
    - Left_DollarSingleQuote, Left_USingleQuote, Left_BSingleQuote,
      Left_UTSingleQuote, Left_BTSingleQuote: each token is evaluated with
      EvalCStringToken() and the results are concatenated.

    Raises:
      AssertionError: if part.left.id is none of the ids above.
    """
    if part.left.id in (Id.Left_SingleQuote, Id.Left_RSingleQuote,
                        Id.Left_TSingleQuote, Id.Left_RTSingleQuote):

        # TODO: Strip leading whitespace for ''' and r'''
        if 0:
            for t in part.tokens:
                log('sq tok %s', t)

        tmp = [t.tval for t in part.tokens]
        s = ''.join(tmp)

    elif part.left.id in (Id.Left_DollarSingleQuote, Id.Left_USingleQuote,
                          Id.Left_BSingleQuote, Id.Left_UTSingleQuote,
                          Id.Left_BTSingleQuote):
        # NOTE: This could be done at compile time
        # NOTE(review): join() needs every result to be a string, so this
        # assumes no token here evaluates to None (Id.Char_Stop) -- confirm
        # against the lexer modes for these quote styles.
        tmp = [EvalCStringToken(t) for t in part.tokens]
        s = ''.join(tmp)

    else:
        raise AssertionError(part.left.id)
    return s
145
146
def IsLeadingSpace(s):
    # type: (str) -> bool
    """Determines if the token before ''' etc. can be stripped.

    Similar to IsWhitespace(), but only spaces and tabs qualify; newlines and
    carriage returns do not.

    Returns:
      True if every character of s is a space or a tab.  The empty string
      therefore yields True.
    """
    for ch in s:
        if ch not in ' \t':
            return False
    return True
157
158
def IsWhitespace(s):
    # type: (str) -> bool
    """Alternative to s.isspace() without the legacy whitespace codes.

    Only space, newline, carriage return and tab count as whitespace; form
    feed and vertical tab do not.

    Returns:
      True if every character of s is one of those four characters.  The
      empty string therefore yields True, unlike ''.isspace().
    """
    for ch in s:
        if ch not in ' \n\r\t':
            return False
    return True
167
168
# Whitespace stripping algorithm
#
# - First token should be WHITESPACE* NEWLINE.  Omit it
# - Last token should be WHITESPACE*
# - Then go through all the other tokens that come AFTER a token that ends
#   with \n
#   - if tok.tval[:n] is the same as the last token, then STRIP THAT PREFIX
# - Do you need to set a flag on the SingleQuoted part?
#
# TODO: do this all at compile time?

# These functions may mutate tok.tval.  TODO: mutate the parts instead, after
# we remove .tval
181
182
def RemoveLeadingSpaceDQ(parts):
    # type: (List[word_part_t]) -> None
    """Strip a shared leading-whitespace prefix from Literal parts, in place.

    Steps, all of which mutate `parts` or the tokens in it:

    1. If the first part is a Literal consisting only of whitespace (see
       IsWhitespace), remove it from the list.
    2. If the last part is a Literal consisting only of spaces and tabs (see
       IsLeadingSpace), remove it and remember its text as the prefix.
    3. If a prefix was found, remove it from the front of every Literal part
       that directly follows a Literal ending in a newline and that starts
       with the prefix.

    Nothing happens when there are fewer than 2 parts.  A non-Literal part
    resets the "line just ended" state, so the Literal after it is left
    alone.
    """
    if len(parts) <= 1:  # We need at least 2 parts to strip anything
        return

    line_ended = False  # Think of it as a tiny state machine

    # The first token may have a newline
    UP_first = parts[0]
    if UP_first.tag() == word_part_e.Literal:
        first = cast(Token, UP_first)
        if IsWhitespace(first.tval):
            # Remove the first part.  TODO: This could be expensive if there
            # are many lines.
            parts.pop(0)
        # NOTE(review): this check is reconstructed as independent of the
        # removal above (it runs even when the first part is kept) -- confirm
        # against the upstream file.
        if first.tval.endswith('\n'):
            line_ended = True

    # After the optional pop(0) at least one part remains, so [-1] is safe
    UP_last = parts[-1]
    to_strip = None  # type: Optional[str]
    if UP_last.tag() == word_part_e.Literal:
        last = cast(Token, UP_last)
        if IsLeadingSpace(last.tval):
            to_strip = last.tval
            parts.pop()  # Remove the last part

    if to_strip is None:
        return

    n = len(to_strip)
    for UP_p in parts:
        if UP_p.tag() != word_part_e.Literal:
            line_ended = False
            continue

        p = cast(Token, UP_p)

        if line_ended and p.tval.startswith(to_strip):
            # MUTATING the part here
            p.tval = p.tval[n:]

        # The next part starts a line only if this one ends with a newline
        line_ended = p.tval.endswith('\n')
228
229
def RemoveLeadingSpaceSQ(tokens):
    # type: (List[Token]) -> None
    """Strip a shared leading-whitespace prefix from string tokens, in place.

    The newline token differs by quote style:

      In $''', we have Char_Literals newline
      In r''' and ''', we have Lit_Chars newline
      In u''' and b''', we have Char_AsciiControl newline

    Should make these more consistent.

    Steps, all of which mutate `tokens` or the tokens in it (only tokens with
    one of the three ids above are considered):

    1. If the first token consists only of whitespace (see IsWhitespace),
       remove it from the list.
    2. If the last token consists only of spaces and tabs (see
       IsLeadingSpace), remove it and remember its text as the prefix.
    3. If a prefix was found, remove it from the front of every such token
       that directly follows one ending in a newline and that starts with the
       prefix.

    Nothing happens when there are fewer than 2 tokens.  A token with any
    other id resets the "line just ended" state.
    """
    if 0:
        log('--')
        for tok in tokens:
            log('tok %s', tok)
        log('--')

    if len(tokens) <= 1:  # We need at least 2 parts to strip anything
        return

    line_ended = False

    first = tokens[0]
    if first.id in (Id.Lit_Chars, Id.Char_Literals, Id.Char_AsciiControl):
        if IsWhitespace(first.tval):
            tokens.pop(0)  # Remove the first part
        # NOTE(review): this check is reconstructed as independent of the
        # removal above (it runs even when the first token is kept) --
        # confirm against the upstream file.
        if first.tval.endswith('\n'):
            line_ended = True

    # After the optional pop(0) at least one token remains, so [-1] is safe
    last = tokens[-1]
    to_strip = None  # type: Optional[str]
    if last.id in (Id.Lit_Chars, Id.Char_Literals, Id.Char_AsciiControl):
        if IsLeadingSpace(last.tval):
            to_strip = last.tval
            tokens.pop()  # Remove the last part

    if to_strip is None:
        return

    n = len(to_strip)
    for tok in tokens:
        if tok.id not in (Id.Lit_Chars, Id.Char_Literals,
                          Id.Char_AsciiControl):
            line_ended = False
            continue

        if line_ended and tok.tval.startswith(to_strip):
            # MUTATING the token here
            tok.tval = tok.tval[n:]

        # The next token starts a line only if this one ends with a newline
        line_ended = tok.tval.endswith('\n')