#!/usr/bin/env python2
from __future__ import print_function
"""osh/word_compile.py.

These functions are called after parsing, but don't depend on any runtime
values.
"""

from _devbuild.gen.id_kind_asdl import Id, Id_t, Id_str
from _devbuild.gen.syntax_asdl import (
    Token,
    CharCode,
    word_part_e,
    word_part_t,
)
from data_lang import j8
from frontend import consts
from frontend import lexer
from mycpp import mylib
from mycpp.mylib import log, switch

from typing import List, Optional, cast


def EvalCharLiteralForRegex(tok):
    # type: (Token) -> CharCode
    """For regex char classes.

    Similar logic as EvalCStringToken() below.
    """
    id_ = tok.id
    value = lexer.TokenVal(tok)

    with switch(id_) as case:
        if case(Id.Char_UBraced):
            s = lexer.TokenSlice(tok, 3, -1)  # \u{123}
            i = int(s, 16)
            return CharCode(tok, i, True)  # u_braced

        elif case(Id.Char_OneChar):  # \'
            # value[1] -> mylib.ByteAt()
            one_char_str = consts.LookupCharC(value[1])
            return CharCode(tok, ord(one_char_str), False)

        elif case(Id.Char_Hex):
            s = lexer.TokenSliceLeft(tok, 2)
            i = int(s, 16)
            return CharCode(tok, i, False)

        elif case(Id.Lit_Chars, Id.Expr_Name, Id.Expr_DecInt):
            # Id.Lit_Chars: a Token in a single-quoted string like ['a']
            # Id.Expr_Name: [a-z] is ['a'-'z'], and [a z] is ['a' 'z']
            # Id.Expr_DecInt: [0-9] is ['0'-'9'], and [0 9] is ['0' '9']

            assert len(value) == 1, tok
            # value[0] -> mylib.ByteAt()
            return CharCode(tok, ord(value[0]), False)

        else:
            raise AssertionError(tok)


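# A hedged illustration of the mapping above (not an executable doctest; real
# Tokens come from the lexer, and CharCode fields follow syntax.asdl):
#
#   \u{2603} -> CharCode(tok, 0x2603, True)   # u_braced; encoded later
#   \n       -> CharCode(tok, 10, False)      # via consts.LookupCharC('n')
#   \x41     -> CharCode(tok, 65, False)
#   a        -> CharCode(tok, 97, False)      # literal char in a class

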
def EvalCStringToken(id_, value):
    # type: (Id_t, str) -> Optional[str]
    """Decode a single escape token; shared between echo -e and $''.

    $'' could use it at compile time, much like brace expansion in braces.py.
    """
    if id_ in (Id.Lit_Chars, Id.Lit_CharsWithoutPrefix, Id.Unknown_Backslash):
        # shopt -u parse_backslash detects Unknown_Backslash at PARSE time in
        # YSH.
        return value

    # single quotes in the middle of a triple quoted string
    elif id_ == Id.Right_SingleQuote:
        return value

    elif id_ == Id.Char_OneChar:
        c = value[1]
        return consts.LookupCharC(c)

    elif id_ == Id.Char_Stop:  # \c returns a special sentinel
        return None

    elif id_ in (Id.Char_Octal3, Id.Char_Octal4):
        if id_ == Id.Char_Octal3:  # $'\377' (disallowed at parse time in YSH)
            s = value[1:]
        else:  # echo -e '\0377'
            s = value[2:]

        i = int(s, 8)
        if i >= 256:
            i = i % 256
            # NOTE: a strict mode could raise instead of wrapping:
            #raise AssertionError('Out of range')
        return chr(i)

    elif id_ in (Id.Char_Hex, Id.Char_YHex):
        s = value[2:]
        i = int(s, 16)
        return chr(i)

    elif id_ in (Id.Char_Unicode4, Id.Char_Unicode8):
        s = value[2:]
        i = int(s, 16)
        #util.log('i = %d', i)
        return j8.Utf8Encode(i)

    elif id_ == Id.Char_UBraced:
        s = value[3:-1]  # \u{123}
        i = int(s, 16)
        return j8.Utf8Encode(i)

    else:
        raise AssertionError(Id_str(id_))


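# Hedged examples of the decoding above (values are the raw token text):
#
#   EvalCStringToken(Id.Char_OneChar, '\\n')       == '\n'
#   EvalCStringToken(Id.Char_Octal3, '\\101')      == 'A'    # 0o101 == 65
#   EvalCStringToken(Id.Char_Hex, '\\x41')         == 'A'
#   EvalCStringToken(Id.Char_UBraced, '\\u{2603}') == UTF-8 bytes of U+2603
#   EvalCStringToken(Id.Char_Stop, '\\c')          is None   # stop sentinel

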
def EvalSingleQuoted2(id_, tokens):
    # type: (Id_t, List[Token]) -> str
    """Done at parse time."""
    if id_ in (Id.Left_SingleQuote, Id.Left_RSingleQuote, Id.Left_TSingleQuote,
               Id.Left_RTSingleQuote):
        strs = [lexer.TokenVal(t) for t in tokens]

    elif id_ in (Id.Left_DollarSingleQuote, Id.Left_USingleQuote,
                 Id.Left_BSingleQuote, Id.Left_UTSingleQuote,
                 Id.Left_BTSingleQuote):
        if 0:  # for debugging
            for t in tokens:
                print('T %s' % t)

        strs = [EvalCStringToken(t.id, lexer.TokenVal(t)) for t in tokens]

    else:
        raise AssertionError(id_)
    return ''.join(strs)


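# Hedged usage sketch: for r'''x''' the left token is Id.Left_RTSingleQuote,
# so the token values are joined raw; for $'a\n' it is
# Id.Left_DollarSingleQuote, so each token goes through EvalCStringToken()
# first.

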
def _TokenConsistsOf(tok, byte_set):
    # type: (Token, str) -> bool
    """Return True if every byte in the token's span is in byte_set."""
    start = tok.col
    end = tok.col + tok.length
    for i in xrange(start, end):
        b = mylib.ByteAt(tok.line.content, i)
        if not mylib.ByteInSet(b, byte_set):
            return False
    return True


def _IsLeadingSpace(tok):
    # type: (Token) -> bool
    """Determine if the token before the closing ''' etc. is space to trim."""
    return _TokenConsistsOf(tok, ' \t')


def _IsTrailingSpace(tok):
    # type: (Token) -> bool
    """Determine if the space/newlines after the opening ''' should be trimmed.

    Like s.isspace(), without legacy \f \v and Unicode.
    """
    return _TokenConsistsOf(tok, ' \n\r\t')


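# Hedged example: for a token spanning the 4 bytes '    ' at col 0 of its
# line, _IsLeadingSpace() is True; a token spanning '\n' right after an
# opening ''' makes _IsTrailingSpace() True, so that token can be dropped.

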
# Whitespace trimming algorithm:
#
# 1. Trim what's after the opening ''' or """, if it's whitespace
# 2. Determine what's before the closing ''' or """ -- this is what you strip
# 3. Strip each line by mutating its token
#    - Change the ID from Id.Lit_Chars -> Id.Lit_CharsWithoutPrefix to
#      maintain the lossless invariant


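# A hedged worked example (leading spaces shown as dots):
#
#     var x = '''
#     ....hello
#     ....'''
#
# Step 1 drops the whitespace token after the opening '''.  Step 2 sees '....'
# before the closing ''' and pops that token; '....' is what to strip.  Step 3
# bumps col by 4 and shrinks length by 4 on each remaining line's token, so
# the value is 'hello\n' while the original source bytes stay intact.

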
def RemoveLeadingSpaceDQ(parts):
    # type: (List[word_part_t]) -> None
    if len(parts) <= 1:  # We need at least 2 parts to strip anything
        return

    # The first token may have a newline
    UP_first = parts[0]
    if UP_first.tag() == word_part_e.Literal:
        first = cast(Token, UP_first)
        #log('T %s', first)
        if _IsTrailingSpace(first):
            # Remove the first part.  TODO: This could be expensive if there
            # are many lines.
            parts.pop(0)

    UP_last = parts[-1]
    to_strip = None  # type: Optional[str]
    if UP_last.tag() == word_part_e.Literal:
        last = cast(Token, UP_last)
        if _IsLeadingSpace(last):
            to_strip = lexer.TokenVal(last)
            parts.pop()  # Remove the last part

    if to_strip is None:
        return

    n = len(to_strip)
    for part in parts:
        if part.tag() != word_part_e.Literal:
            continue  # skip non-literal parts like ${x}

        lit_tok = cast(Token, part)

        if lit_tok.col == 0 and lexer.TokenStartsWith(lit_tok, to_strip):
            # TODO: Lexer should not populate this!
            assert lit_tok.tval is None, lit_tok.tval

            lit_tok.col = n
            lit_tok.length -= n
            #log('n = %d, %s', n, lit_tok)

            assert lit_tok.id == Id.Lit_Chars, lit_tok
            # --tool lossless-cat has a special case for this
            lit_tok.id = Id.Lit_CharsWithoutPrefix


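# Hedged note on the mutation above: setting col = n and shrinking length by n
# narrows the token to the text after the indent without copying the line; the
# id change is what lets --tool lossless-cat re-emit the trimmed prefix.

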
def RemoveLeadingSpaceSQ(tokens):
    # type: (List[Token]) -> None
    """Strip leading whitespace from tokens, mutating the list in place.

    Must respect the lossless invariant - see test/lossless/multiline-str.sh

    Tokens that begin a line are narrowed (col, length) and their ID changed
    to Id.Lit_CharsWithoutPrefix; no new tokens are created, so nothing new
    goes in the arena.
    """
    if 0:  # for debugging
        log('--')
        for tok in tokens:
            #log('tok %s', tok)
            import sys
            from asdl import format as fmt
            ast_f = fmt.DetectConsoleOutput(mylib.Stderr())
            tree = tok.AbbreviatedTree()
            fmt.PrintTree(tree, ast_f)
            print('', file=sys.stderr)
        log('--')

    if len(tokens) <= 1:  # We need at least 2 tokens to strip anything
        return

    # var x = '''      # strip initial newline/whitespace
    #     x
    #     '''
    first = tokens[0]
    if first.id == Id.Lit_Chars:
        if _IsTrailingSpace(first):
            tokens.pop(0)  # Remove the first token

    # Figure out what to strip, based on the last token
    last = tokens[-1]
    to_strip = None  # type: Optional[str]
    if last.id == Id.Lit_Chars:
        if _IsLeadingSpace(last):
            to_strip = lexer.TokenVal(last)
            tokens.pop()  # Remove the last token

    if to_strip is None:
        return

    #log('SQ Stripping %r', to_strip)
    n = len(to_strip)

    #log('--')
    for tok in tokens:
        #log('tok %s', tok)
        # Strip leading space on tokens that begin lines, by bumping start col
        if tok.col == 0 and lexer.TokenStartsWith(tok, to_strip):
            tok.col = n
            tok.length -= n

            assert tok.id == Id.Lit_Chars, tok
            # --tool lossless-cat has a special case for this
            tok.id = Id.Lit_CharsWithoutPrefix

            #log('STRIP tok %s', tok)
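

# Hedged end-to-end sketch: at parse time, a multi-line ''' string yields a
# list of Lit_Chars tokens; RemoveLeadingSpaceSQ(tokens) dedents them in
# place, and EvalSingleQuoted2(left_id, tokens) then joins the values into
# the final string.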