OILS / osh / word_compile.py

280 lines, 154 significant
#!/usr/bin/env python2
from __future__ import print_function
"""osh/word_compile.py.

These functions are called after parsing, but don't depend on any runtime
values.
"""

from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
from _devbuild.gen.syntax_asdl import (
    Token,
    CharCode,
    word_part_e,
    word_part_t,
)
from data_lang import j8
from frontend import consts
from frontend import lexer
from mycpp import mylib
from mycpp.mylib import log, switch

from typing import List, Optional, cast


def EvalCharLiteralForRegex(tok):
    # type: (Token) -> CharCode
    """For regex char classes.

    Similar logic as below.
    """
    id_ = tok.id
    value = lexer.TokenVal(tok)

    with switch(id_) as case:
        if case(Id.Char_UBraced):
            s = lexer.TokenSlice(tok, 3, -1)  # \u{123}
            i = int(s, 16)
            return CharCode(tok, i, True)  # u_braced

        elif case(Id.Char_OneChar):  # \'
            # value[1] -> mylib.ByteAt()
            one_char_str = consts.LookupCharC(value[1])
            return CharCode(tok, ord(one_char_str), False)

        elif case(Id.Char_Hex):
            s = lexer.TokenSliceLeft(tok, 2)
            i = int(s, 16)
            return CharCode(tok, i, False)

        elif case(Id.Lit_Chars, Id.Expr_Name, Id.Expr_DecInt):
            # Id.Lit_Chars: Token in single quoted string ['a'] is Id.Lit_Chars
            # Id.Expr_Name: [a-z] is ['a'-'z'], and [a z] is ['a' 'z']
            # Id.Expr_DecInt: [0-9] is ['0'-'9'], and [0 9] is ['0' '9']

            assert len(value) == 1, tok
            # value[0] -> mylib.ByteAt()
            return CharCode(tok, ord(value[0]), False)

        else:
            raise AssertionError(tok)


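# A rough sketch of the mapping this function computes.  The token text on the
# left is hypothetical, and it assumes consts.LookupCharC is the usual C
# escape table; the CharCode fields are (tok, i, u_braced):
#
#   \u{3bc}  Id.Char_UBraced  -> CharCode(tok, 0x3bc, True)
#   \n       Id.Char_OneChar  -> CharCode(tok, 0x0a, False)
#   \x41     Id.Char_Hex      -> CharCode(tok, 0x41, False)
#   a        Id.Lit_Chars     -> CharCode(tok, 0x61, False)
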
def EvalCStringToken(id_, value):
    # type: (Id_t, str) -> Optional[str]
    """This function is shared between echo -e and $''.

    $'' could use it at compile time, much like brace expansion in braces.py.
    """
    if id_ in (Id.Lit_Chars, Id.Lit_CharsWithoutPrefix, Id.Unknown_Backslash):
        # shopt -u parse_backslash detects Unknown_Backslash at PARSE time in YSH.
        return value

    # single quotes in the middle of a triple quoted string
    elif id_ == Id.Right_SingleQuote:
        return value

    elif id_ == Id.Char_OneChar:
        c = value[1]
        return consts.LookupCharC(c)

    elif id_ == Id.Char_Stop:  # \c returns a special sentinel
        return None

    elif id_ in (Id.Char_Octal3, Id.Char_Octal4):
        if id_ == Id.Char_Octal3:  # $'\377' (disallowed at parse time in YSH)
            s = value[1:]
        else:  # echo -e '\0377'
            s = value[2:]

        i = int(s, 8)
        if i >= 256:
            i = i % 256
            # NOTE: This is for strict mode
            #raise AssertionError('Out of range')
        return chr(i)

    elif id_ in (Id.Char_Hex, Id.Char_YHex):
        s = value[2:]
        i = int(s, 16)
        return chr(i)

    elif id_ in (Id.Char_Unicode4, Id.Char_Unicode8):
        s = value[2:]
        i = int(s, 16)
        #util.log('i = %d', i)
        return j8.Utf8Encode(i)

    elif id_ == Id.Char_UBraced:
        s = value[3:-1]  # \u{123}
        i = int(s, 16)
        return j8.Utf8Encode(i)

    else:
        raise AssertionError(Id_str(id_))


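# A minimal sketch of the escape decoding above, guarded with 'if 0' like the
# other debug code in this file.  The literal token values are assumptions for
# illustration (and assume consts.LookupCharC maps 'n' to a newline, as in C).
if 0:
    assert EvalCStringToken(Id.Char_OneChar, r'\n') == '\n'
    assert EvalCStringToken(Id.Char_Hex, r'\x41') == 'A'
    assert EvalCStringToken(Id.Char_Octal3, r'\101') == 'A'
    assert EvalCStringToken(Id.Char_UBraced, r'\u{3bc}') == j8.Utf8Encode(0x3bc)
    assert EvalCStringToken(Id.Char_Stop, r'\c') is None  # \c is the stop sentinel
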
def EvalSingleQuoted2(id_, tokens):
    # type: (Id_t, List[Token]) -> str
    """Done at parse time."""
    if id_ in (Id.Left_SingleQuote, Id.Left_RSingleQuote, Id.Left_TSingleQuote,
               Id.Left_RTSingleQuote):
        strs = [lexer.TokenVal(t) for t in tokens]

    elif id_ in (Id.Left_DollarSingleQuote, Id.Left_USingleQuote,
                 Id.Left_BSingleQuote, Id.Left_UTSingleQuote,
                 Id.Left_BTSingleQuote):
        if 0:
            for t in tokens:
                print('T %s' % t)

        strs = [EvalCStringToken(t.id, lexer.TokenVal(t)) for t in tokens]

    else:
        raise AssertionError(id_)
    return ''.join(strs)


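# For example (hypothetical input): $'a\n' lexes to roughly
# [Lit_Chars 'a', Char_OneChar '\n' (two source characters)], and
# EvalSingleQuoted2(Id.Left_DollarSingleQuote, tokens) joins the decoded
# pieces into the two-byte string 'a' + newline.
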
def _TokenConsistsOf(tok, byte_set):
    # type: (Token, str) -> bool
    start = tok.col
    end = tok.col + tok.length
    for i in xrange(start, end):
        b = mylib.ByteAt(tok.line.content, i)
        if not mylib.ByteInSet(b, byte_set):
            return False
    return True


def _IsLeadingSpace(tok):
    # type: (Token) -> bool
    """Determine if the token before ''' etc. is space to trim."""
    return _TokenConsistsOf(tok, ' \t')


def _IsTrailingSpace(tok):
    # type: (Token) -> bool
    """Determine if the space/newlines after ''' should be trimmed.

    Like s.isspace(), without legacy \f \v and Unicode.
    """
    return _TokenConsistsOf(tok, ' \n\r\t')


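# For example (hypothetical token text shown as Python strings):
#
#   _IsLeadingSpace   is True for '    ' or '\t', but False for '  \n'
#   _IsTrailingSpace  is also True for '  \n', since its byte set adds \r \n
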
# Whitespace trimming algorithms:
#
# 1. Trim what's after the opening ''' or """, if it's whitespace
# 2. Determine what's before the closing ''' or """ -- this is what you strip
# 3. Strip each line by mutating the token
#    - Change the ID from Id.Lit_Chars -> Id.Lit_CharsWithoutPrefix to maintain
#      the lossless invariant

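# A rough pure-string model of steps 2 and 3, guarded with 'if 0'.  The real
# code below mutates Token col/length instead of building new strings, so the
# original source text is preserved.
if 0:
    lines = ['  echo hi\n', '  echo bye\n', '  ']  # body of a ''' string
    to_strip = lines.pop()  # whitespace before the closing quote (step 2)
    stripped = [  # step 3: strip that prefix from each line that has it
        s[len(to_strip):] if s.startswith(to_strip) else s for s in lines
    ]
    assert stripped == ['echo hi\n', 'echo bye\n']
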
def RemoveLeadingSpaceDQ(parts):
    # type: (List[word_part_t]) -> None
    if len(parts) <= 1:  # We need at least 2 parts to strip anything
        return

    # The first token may have a newline
    UP_first = parts[0]
    if UP_first.tag() == word_part_e.Literal:
        first = cast(Token, UP_first)
        #log('T %s', first_part)
        if _IsTrailingSpace(first):
            # Remove the first part.  TODO: This could be expensive if there
            # are many lines.
            parts.pop(0)

    UP_last = parts[-1]
    to_strip = None  # type: Optional[str]
    if UP_last.tag() == word_part_e.Literal:
        last = cast(Token, UP_last)
        if _IsLeadingSpace(last):
            to_strip = lexer.TokenVal(last)
            parts.pop()  # Remove the last part

    if to_strip is None:
        return

    n = len(to_strip)
    for part in parts:
        if part.tag() != word_part_e.Literal:
            line_ended = False
            continue

        lit_tok = cast(Token, part)

        if lit_tok.col == 0 and lexer.TokenStartsWith(lit_tok, to_strip):
            # TODO: Lexer should not populate this!
            assert lit_tok.tval is None, lit_tok.tval

            lit_tok.col = n
            lit_tok.length -= n
            #log('n = %d, %s', n, lit_tok)

            assert lit_tok.id == Id.Lit_Chars, lit_tok
            # --tool lossless-cat has a special case for this
            lit_tok.id = Id.Lit_CharsWithoutPrefix


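# Rough sketch of the effect on a parsed """ string (hypothetical input, and
# assuming the lexer emits one Literal token per source line, which is what
# makes the col == 0 test above meaningful):
#
#     echo """
#         hello $name
#         """
#
# The leading '\n' part is popped, the whitespace part before the closing
# quote is popped and becomes to_strip, and each Literal token that begins a
# line is narrowed past that prefix, so the word evaluates to
# "hello <value of name>\n".
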
def RemoveLeadingSpaceSQ(tokens):
    # type: (List[Token]) -> None
    """Strip leading whitespace from tokens.

    Mutates the list and the tokens in place: the initial and final whitespace
    tokens may be popped, and the tokens that begin lines have their start
    column bumped past the common prefix.

    Must respect the lossless invariant - see test/lossless/multiline-str.sh
    """
    if 0:
        log('--')
        for tok in tokens:
            #log('tok %s', tok)
            import sys
            from asdl import format as fmt
            ast_f = fmt.DetectConsoleOutput(mylib.Stderr())
            tree = tok.AbbreviatedTree()
            fmt.PrintTree(tree, ast_f)
            print('', file=sys.stderr)
        log('--')

    if len(tokens) <= 1:  # We need at least 2 parts to strip anything
        return

    # var x = '''  # strip initial newline/whitespace
    # x
    # '''
    first = tokens[0]
    if first.id == Id.Lit_Chars:
        if _IsTrailingSpace(first):
            tokens.pop(0)  # Remove the first part

    # Figure out what to strip, based on last token
    last = tokens[-1]
    to_strip = None  # type: Optional[str]
    if last.id == Id.Lit_Chars:
        if _IsLeadingSpace(last):
            to_strip = lexer.TokenVal(last)
            tokens.pop()  # Remove the last part

    if to_strip is None:
        return

    #log('SQ Stripping %r', to_strip)
    n = len(to_strip)

    #log('--')
    for tok in tokens:
        #log('tok %s', tok)
        # Strip leading space on tokens that begin lines, by bumping start col
        if tok.col == 0 and lexer.TokenStartsWith(tok, to_strip):
            tok.col = n
            tok.length -= n

            assert tok.id == Id.Lit_Chars, tok
            # --tool lossless-cat has a special case for this
            tok.id = Id.Lit_CharsWithoutPrefix

            #log('STRIP tok %s', tok)
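

# Rough sketch of the single-quoted case (hypothetical YSH input; same idea as
# the double-quoted case, but on a flat list of tokens rather than word parts):
#
#     var x = '''
#         one
#         two
#         '''
#
# The initial '\n' token is popped, the whitespace token before the closing
# quote becomes to_strip and is popped, and the remaining per-line tokens are
# narrowed past that prefix, so x == "one\ntwo\n".  The narrowed tokens get
# Id.Lit_CharsWithoutPrefix so that --tool lossless-cat can still reproduce
# the original source text.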