#!/usr/bin/env python2
"""osh/word_compile.py.

These functions are called after parsing, but don't depend on any runtime
values.
"""
from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
from _devbuild.gen.syntax_asdl import (
    Token,
    CharCode,
    word_part_e,
    word_part_t,
)
from data_lang import j8
from frontend import consts
from frontend import lexer
from mycpp import mylib
from mycpp.mylib import log, switch

from typing import List, Optional, cast


def EvalCharLiteralForRegex(tok):
    # type: (Token) -> CharCode
    """For regex char classes.

    Similar logic to EvalCStringToken() below.
    """
    id_ = tok.id
    value = lexer.TokenVal(tok)

    with switch(id_) as case:
        if case(Id.Char_UBraced):
            s = lexer.TokenSlice(tok, 3, -1)  # \u{123}
            i = int(s, 16)
            return CharCode(tok, i, True)  # u_braced

        elif case(Id.Char_OneChar):  # \'
            # value[1] -> mylib.ByteAt()
            one_char_str = consts.LookupCharC(value[1])
            return CharCode(tok, ord(one_char_str), False)

        elif case(Id.Char_Hex):
            s = lexer.TokenSliceLeft(tok, 2)
            i = int(s, 16)
            return CharCode(tok, i, False)

        elif case(Id.Lit_Chars, Id.Expr_Name, Id.Expr_DecInt):
            # Id.Lit_Chars: a token in a single-quoted string, e.g. ['a'], is Id.Lit_Chars
            # Id.Expr_Name: [a-z] is ['a'-'z'], and [a z] is ['a' 'z']
            # Id.Expr_DecInt: [0-9] is ['0'-'9'], and [0 9] is ['0' '9']

            assert len(value) == 1, tok
            # value[0] -> mylib.ByteAt()
            return CharCode(tok, ord(value[0]), False)

        else:
            raise AssertionError(tok)
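
# Illustrative sketch (not from the original source): the mapping implemented
# above, assuming the lexer produced these token values.
#
#   token text   token ID           resulting CharCode
#   \u{3bc}      Id.Char_UBraced    code point 0x3bc, u_braced=True
#   \n           Id.Char_OneChar    byte 10 (via LookupCharC), u_braced=False
#   \x41         Id.Char_Hex        byte 0x41, u_braced=False
#   a            Id.Expr_Name       byte 97, u_braced=False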


def EvalCStringToken(id_, value):
    # type: (Id_t, str) -> Optional[str]
    """This function is shared between echo -e and $''.

    $'' could use it at compile time, much like brace expansion in braces.py.
    """
    if id_ in (Id.Lit_Chars, Id.Lit_CharsWithoutPrefix, Id.Unknown_Backslash,
               Id.Char_AsciiControl):
        # shopt -u parse_backslash detects Unknown_Backslash at PARSE time in
        # YSH.

        # Char_AsciiControl is allowed in YSH code, for newlines in u''
        # strings, just as in r'' strings.
        # TODO: could allow ONLY newline?
        return value

    # single quotes in the middle of a triple quoted string
    elif id_ == Id.Right_SingleQuote:
        return value

    elif id_ == Id.Char_OneChar:
        c = value[1]
        return consts.LookupCharC(c)

    elif id_ == Id.Char_Stop:  # \c returns a special sentinel
        return None

    elif id_ in (Id.Char_Octal3, Id.Char_Octal4):
        if id_ == Id.Char_Octal3:  # $'\377' (disallowed at parse time in YSH)
            s = value[1:]
        else:  # echo -e '\0377'
            s = value[2:]

        i = int(s, 8)
        if i >= 256:
            i = i % 256
            # NOTE: Strict mode could raise instead:
            #raise AssertionError('Out of range')
        return chr(i)

    elif id_ in (Id.Char_Hex, Id.Char_YHex):
        s = value[2:]
        i = int(s, 16)
        return chr(i)

    elif id_ in (Id.Char_Unicode4, Id.Char_Unicode8):
        s = value[2:]
        i = int(s, 16)
        #util.log('i = %d', i)
        return j8.Utf8Encode(i)

    elif id_ == Id.Char_UBraced:
        s = value[3:-1]  # \u{123}
        i = int(s, 16)
        return j8.Utf8Encode(i)

    else:
        raise AssertionError(Id_str(id_))
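
# Illustrative sketch (hand-derived from the branches above, not from the
# original source): what EvalCStringToken() returns for a few escapes.
#
#   EvalCStringToken(Id.Char_OneChar, '\\n')       => '\n'  (via LookupCharC)
#   EvalCStringToken(Id.Char_Octal3, '\\377')      => '\xff'
#   EvalCStringToken(Id.Char_Hex, '\\x41')         => 'A'
#   EvalCStringToken(Id.Char_UBraced, '\\u{3bc}')  => '\xce\xbc' (UTF-8 for U+03BC)
#   EvalCStringToken(Id.Char_Stop, '\\c')          => None  (sentinel for echo -e)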


def EvalSingleQuoted2(id_, tokens):
    # type: (Id_t, List[Token]) -> str
    """Done at parse time."""
    if id_ in (Id.Left_SingleQuote, Id.Left_RSingleQuote, Id.Left_TSingleQuote,
               Id.Left_RTSingleQuote):
        strs = [lexer.TokenVal(t) for t in tokens]

    elif id_ in (Id.Left_DollarSingleQuote, Id.Left_USingleQuote,
                 Id.Left_BSingleQuote, Id.Left_UTSingleQuote,
                 Id.Left_BTSingleQuote):
        if 0:
            for t in tokens:
                print('T %s' % t)

        strs = [EvalCStringToken(t.id, lexer.TokenVal(t)) for t in tokens]

    else:
        raise AssertionError(id_)
    return ''.join(strs)
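
# Illustrative sketch (not from the original source): the difference between
# the two branches.  For r'a\nb' the token values are joined raw, so the
# result keeps a literal backslash and 'n'; for u'a\nb' or $'a\nb' each token
# goes through EvalCStringToken(), so \n becomes an actual newline byte.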


def _TokenConsistsOf(tok, byte_set):
    # type: (Token, str) -> bool
    start = tok.col
    end = tok.col + tok.length
    for i in xrange(start, end):
        b = mylib.ByteAt(tok.line.content, i)
        if not mylib.ByteInSet(b, byte_set):
            return False
    return True


def _IsLeadingSpace(tok):
    # type: (Token) -> bool
    """Determine if the token before the closing ''' etc. is space to trim."""
    return _TokenConsistsOf(tok, ' \t')


def _IsTrailingSpace(tok):
    # type: (Token) -> bool
    r"""Determine if the space/newlines after the opening ''' should be trimmed.

    Like s.isspace(), without legacy \f \v and Unicode.
    """
    return _TokenConsistsOf(tok, ' \n\r\t')
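
# Illustrative sketch (not from the original source): _TokenConsistsOf()
# tests bytes within the token's span of tok.line.content, without allocating
# a substring.  A token spanning '  \t' passes _IsLeadingSpace(); one
# spanning ' \n' passes _IsTrailingSpace(); one spanning '  x' passes neither.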


# Whitespace trimming algorithm:
#
# 1. Trim what's after the opening ''' or """, if it's whitespace
# 2. Determine what's before the closing ''' or """ -- this is what you strip
# 3. Strip each line by mutating the token
#    - Change the ID from Id.Lit_Chars -> Id.Lit_CharsWithoutPrefix to
#      maintain the lossless invariant
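#
# Example (illustrative, not from the original source): for
#
#     var x = """
#       hello
#       """
#
# step 1 removes the newline after the opening """, step 2 finds the 6 spaces
# before the closing """, and step 3 strips that 6-byte prefix from the
# '      hello\n' token by bumping its col and shrinking its length.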


def RemoveLeadingSpaceDQ(parts):
    # type: (List[word_part_t]) -> None
    """Trim whitespace from a multi-line double-quoted string, in place."""
    if len(parts) <= 1:  # We need at least 2 parts to strip anything
        return

    # The first token may have a newline
    UP_first = parts[0]
    if UP_first.tag() == word_part_e.Literal:
        first = cast(Token, UP_first)
        #log('T %s', first)
        if _IsTrailingSpace(first):
            # Remove the first part.  TODO: This could be expensive if there
            # are many lines.
            parts.pop(0)

    UP_last = parts[-1]
    to_strip = None  # type: Optional[str]
    if UP_last.tag() == word_part_e.Literal:
        last = cast(Token, UP_last)
        if _IsLeadingSpace(last):
            to_strip = lexer.TokenVal(last)
            parts.pop()  # Remove the last part

    if to_strip is None:
        return

    n = len(to_strip)
    for part in parts:
        if part.tag() != word_part_e.Literal:
            continue

        lit_tok = cast(Token, part)

        if lit_tok.col == 0 and lexer.TokenStartsWith(lit_tok, to_strip):
            # TODO: Lexer should not populate this!
            assert lit_tok.tval is None, lit_tok.tval

            lit_tok.col = n
            lit_tok.length -= n
            #log('n = %d, %s', n, lit_tok)

            assert lit_tok.id == Id.Lit_Chars, lit_tok
            # --tool lossless-cat has a special case for this
            lit_tok.id = Id.Lit_CharsWithoutPrefix
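
# Illustrative sketch (not from the original source): the in-place mutation
# above.  With to_strip = '      ' (n = 6), a token
#
#     Token(id=Lit_Chars, col=0, length=12)              spanning '      hello\n'
#
# becomes
#
#     Token(id=Lit_CharsWithoutPrefix, col=6, length=6)  spanning 'hello\n'
#
# and --tool lossless-cat can still reconstruct the original line from col 0.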


def RemoveLeadingSpaceSQ(tokens):
    # type: (List[Token]) -> None
    r"""Trim whitespace from a multi-line single-quoted string.

    The token list is mutated in place; nothing is returned.

    Must respect the lossless invariant - see test/lossless/multiline-str.sh

    Quirk to make more consistent:
    In $''' and r''' and ''', we have Lit_Chars \n
    In u''' and b''', we have Char_AsciiControl \n
    """
    if 0:
        log('--')
        for tok in tokens:
            log('tok %s', tok)
        log('--')

    if len(tokens) <= 1:  # We need at least 2 tokens to strip anything
        return

    # var x = '''   # strip initial newline/whitespace
    #   x
    #   '''
    first = tokens[0]
    if first.id in (Id.Lit_Chars, Id.Char_AsciiControl):
        if _IsTrailingSpace(first):
            tokens.pop(0)  # Remove the first token

    # Figure out what to strip, based on the last token
    last = tokens[-1]
    to_strip = None  # type: Optional[str]
    if last.id in (Id.Lit_Chars, Id.Char_AsciiControl):
        if _IsLeadingSpace(last):
            to_strip = lexer.TokenVal(last)
            tokens.pop()  # Remove the last token

    if to_strip is None:
        return

    #log('SQ Stripping %r', to_strip)
    n = len(to_strip)

    #log('--')
    for tok in tokens:
        #log('tok %s', tok)
        # Strip leading space on tokens that begin lines, by bumping the
        # start column
        if tok.col == 0 and lexer.TokenStartsWith(tok, to_strip):
            tok.col = n
            tok.length -= n

            assert tok.id == Id.Lit_Chars, tok
            # --tool lossless-cat has a special case for this
            tok.id = Id.Lit_CharsWithoutPrefix

            #log('STRIP tok %s', tok)
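
# Illustrative sketch (not from the original source): why both token IDs are
# checked at the string's edges.  In these equivalent strings:
#
#     var a = '''      # edge \n tokens are Id.Lit_Chars
#       hi
#       '''
#     var b = u'''     # edge \n tokens are Id.Char_AsciiControl
#       hi
#       '''
#
# the edge tokens differ in ID but are trimmed the same way.  Only tokens at
# col 0 that start with the prefix get mutated, and those are Id.Lit_Chars,
# which the assert above relies on.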