OILS / osh / split.py View on Github | oilshell.org

307 lines, 158 significant
1"""
2split.py - Word Splitting
3
4Nice blog post on the complexity/corner cases/differing intuition of splitting
5strings:
6
7https://chriszetter.com/blog/2017/10/29/splitting-strings/
8
9python-dev doesn't want to touch it anymore!
10
11Other possible splitters:
12
13- AwkSplitter -- how does this compare to awk -F?
14- RegexSplitter
15- CsvSplitter
16- TSV2Splitter -- Data is transformed because of # \u0065 in JSON. So it's not
17 a pure slice, but neither is IFS splitting because of backslashes.
18- Perl?
19 - does perl have a split context?
20
21with SPLIT_REGEX = / digit+ / {
22 echo $#
23 echo $len(argv)
24 echo $1 $2
25 echo @argv
26}
27"""
28
29from _devbuild.gen.runtime_asdl import (scope_e, span_e, emit_i, char_kind_i,
30 state_i)
31from _devbuild.gen.value_asdl import (value, value_e, value_t)
32from mycpp.mylib import log
33from core import pyutil
34from frontend import consts
35from mycpp import mylib
36from mycpp.mylib import tagswitch
37
38from typing import List, Tuple, Dict, Optional, TYPE_CHECKING, cast
39if TYPE_CHECKING:
40 from core.state import Mem
41 from _devbuild.gen.runtime_asdl import span_t
42 Span = Tuple[span_t, int]
43
# Splitting characters used when the shell variable IFS is undefined: space,
# tab, and newline.
DEFAULT_IFS = ' \t\n'
45
46
def _SpansToParts(s, spans):
    # type: (str, List[Span]) -> List[str]
    """Convert the spans computed for `s` into the list of split fields.

    Helper for SplitForWordEval.

    Args:
      s: the string that was split
      spans: (span type, end_index) pairs; each span covers `s` from the end
        of the previous span (or 0) up to its own end_index

    Returns:
      One string per field.  Every Black span contributes its slice of `s`;
      a Black span that follows "Black, Backslash" is appended to the previous
      field instead of starting a new one.  Other span types contribute no
      text.
    """
    writers = []  # type: List[mylib.BufWriter]
    begin = 0

    # Set when a backslash directly follows a black span, so that the next
    # black span is merged into the field before it.
    merge_pending = False
    prev_was_black = False

    for kind, end in spans:
        if kind == span_e.Black:
            piece = s[begin:end]
            if merge_pending and len(writers) > 0:
                writers[-1].write(piece)
                merge_pending = False
            else:
                w = mylib.BufWriter()
                w.write(piece)
                writers.append(w)
            prev_was_black = True

        elif kind == span_e.Backslash:
            if prev_was_black:
                merge_pending = True
            prev_was_black = False

        else:
            prev_was_black = False

        # The next span starts where this one ended.
        begin = end

    fields = []  # type: List[str]
    for writer in writers:
        fields.append(writer.getvalue())
    return fields
82
83
class SplitContext(object):
    """A polymorphic interface to field splitting.

    It respects a STACK of IFS values, for example:

        echo $x  # uses default shell IFS
        IFS=':' myfunc  # new splitter
        echo $x  # uses default shell IFS again.

    The IFS value is read from `mem` with dynamic scope each time it is
    needed, and one IfsSplitter is cached per distinct IFS string.
    """

    def __init__(self, mem):
        # type: (Mem) -> None
        self.mem = mem
        # Cache: IFS value -> splitter instance built for that value.
        self.splitters = {}  # type: Dict[str, IfsSplitter]

    def _GetSplitter(self, ifs=None):
        # type: (Optional[str]) -> IfsSplitter
        """Based on the current stack frame, get the splitter.

        Args:
          ifs: explicit IFS string to use.  When None, the value of the IFS
            variable is looked up; DEFAULT_IFS is used if it is undefined.

        Returns:
          The (possibly cached) IfsSplitter for that IFS string.

        Raises:
          AssertionError: if IFS is neither undefined nor a string.
        """
        if ifs is None:
            # IFS has dynamic scope, so look it up at the time of the split.
            val = self.mem.GetValue('IFS', scope_e.Dynamic)

            UP_val = val
            with tagswitch(val) as case:
                if case(value_e.Undef):
                    ifs = DEFAULT_IFS
                elif case(value_e.Str):
                    val = cast(value.Str, UP_val)
                    ifs = val.s
                else:
                    # TODO: Raise proper error
                    raise AssertionError("IFS shouldn't be an array")

        sp = self.splitters.get(ifs)
        if sp is None:
            # Partition the IFS characters into whitespace and everything
            # else; IfsSplitter takes the two sets separately.
            ifs_whitespace = mylib.BufWriter()
            ifs_other = mylib.BufWriter()
            for c in ifs:
                if c in ' \t\n':  # Happens to be the same as DEFAULT_IFS
                    ifs_whitespace.write(c)
                else:
                    # TODO: backslash in IFS is not supported
                    ifs_other.write(c)

            sp = IfsSplitter(ifs_whitespace.getvalue(), ifs_other.getvalue())

            # NOTE: Technically, we could make the key more precise.
            # IFS=$' \t' is the same as IFS=$'\t '.  But most programs
            # probably don't do that, and everything should work in any case.
            self.splitters[ifs] = sp

        return sp

    def GetJoinChar(self):
        # type: () -> str
        """Return the string used to join array elements into one field.

        Used when an array decays to a single string by joining.

        Returns:
          ' ' if IFS is undefined, '' if IFS is the empty string, and
          otherwise the first character of IFS.

        Raises:
          AssertionError: if IFS is neither undefined nor a string.
        """
        # https://www.gnu.org/software/bash/manual/bashref.html#Special-Parameters
        # http://pubs.opengroup.org/onlinepubs/9699919799/utilities/V3_chap02.html#tag_18_05_02
        # "When the expansion occurs within a double-quoted string (see
        # Double-Quotes), it shall expand to a single field with the value of
        # each parameter separated by the first character of the IFS variable,
        # or by a <space> if IFS is unset. If IFS is set to a null string, this
        # is not equivalent to unsetting it; its first character does not
        # exist, so the parameter values are concatenated."
        val = self.mem.GetValue('IFS', scope_e.Dynamic)  # type: value_t
        UP_val = val
        with tagswitch(val) as case:
            if case(value_e.Undef):
                return ' '
            elif case(value_e.Str):
                val = cast(value.Str, UP_val)
                if len(val.s):
                    return val.s[0]
                else:
                    return ''
            else:
                # TODO: Raise proper error
                raise AssertionError("IFS shouldn't be an array")

        raise AssertionError('for -Wreturn-type in C++')

    def Escape(self, s):
        # type: (str) -> str
        """Escape IFS chars in `s`, using the splitter for the current IFS."""
        sp = self._GetSplitter()
        return sp.Escape(s)

    def SplitForWordEval(self, s, ifs=None):
        # type: (str, Optional[str]) -> List[str]
        """Split used by word evaluation.

        Also used by the explicit @split() function.

        Args:
          s: string to split
          ifs: explicit IFS string; None means use the current IFS variable

        Returns:
          The list of fields, with backslash escapes honored.
        """
        sp = self._GetSplitter(ifs=ifs)
        spans = sp.Split(s, True)
        if 0:
            for span in spans:
                log('SPAN %s', span)
        return _SpansToParts(s, spans)

    def SplitForRead(self, line, allow_escape):
        # type: (str, bool) -> List[Span]
        """Compute the spans of `line` under the current IFS.

        Unlike SplitForWordEval, this returns the raw (span type, end_index)
        pairs rather than the field strings.

        Args:
          line: string to split
          allow_escape: passed through to IfsSplitter.Split(); whether
            backslash is treated as an escape character
        """
        sp = self._GetSplitter()
        return sp.Split(line, allow_escape)
196
197
class _BaseSplitter(object):
    """Base class that knows which characters Escape() must backslash-escape."""

    def __init__(self, escape_chars):
        # type: (str) -> None
        # Backslash is always escaped, in addition to the given characters.
        always_escaped = '\\'
        self.escape_chars = escape_chars + always_escaped

    def Escape(self, s):
        # type: (str) -> str
        """Backslash-escape every character of `s` that is in escape_chars."""
        # Note the characters here are DYNAMIC, unlike other usages of
        # BackslashEscape().
        to_escape = self.escape_chars
        return pyutil.BackslashEscape(s, to_escape)
209
210
class IfsSplitter(_BaseSplitter):
    """Splitter driven by a set of whitespace and a set of other IFS chars."""

    def __init__(self, ifs_whitespace, ifs_other):
        # type: (str, str) -> None
        # Every IFS character, of either kind, is escaped by Escape().
        _BaseSplitter.__init__(self, ifs_whitespace + ifs_other)
        self.ifs_other = ifs_other
        self.ifs_whitespace = ifs_whitespace

    def Split(self, s, allow_escape):
        # type: (str, bool) -> List[Span]
        """Compute the spans of `s` by running the IFS state machine.

        Args:
          s: string to split
          allow_escape: False for read -r, which means a backslash doesn't do
            anything special.

        Returns:
          List of (runtime.span, end_index) pairs

        TODO: This should be (frag, do_split) pairs, to avoid the
        double-escaping issue when IFS is a backslash.
        """
        # NOTE: in C, could reserve() this to len(s)
        spans = []  # type: List[Span]

        length = len(s)
        if length == 0:
            return spans  # empty

        white = self.ifs_whitespace
        gray = self.ifs_other

        # Ad hoc rule from POSIX: ignore leading whitespace.
        # "IFS white space shall be ignored at the beginning and end of the
        # input".  This can't really be handled by the state machine.
        pos = 0
        while pos < length:
            if not mylib.ByteInSet(mylib.ByteAt(s, pos), white):
                break
            pos += 1

        # The skipped leading whitespace becomes one ignored span.
        if pos != 0:
            spans.append((span_e.Delim, pos))

        # String is ONLY whitespace.  Return now so the loop below doesn't add
        # a final span.
        if pos == length:
            return spans

        state = state_i.Start
        while state != state_i.Done:
            if pos > length:
                raise AssertionError()  # shouldn't happen

            if pos == length:
                # One more iteration for the end of string.
                ch = char_kind_i.Sentinel
            else:
                byte = mylib.ByteAt(s, pos)
                if mylib.ByteInSet(byte, white):
                    ch = char_kind_i.DE_White
                elif mylib.ByteInSet(byte, gray):
                    ch = char_kind_i.DE_Gray
                elif allow_escape and mylib.ByteEquals(byte, '\\'):
                    ch = char_kind_i.Backslash
                else:
                    ch = char_kind_i.Black

            next_state, emit = consts.IfsEdge(state, ch)
            if next_state == state_i.Invalid:
                raise AssertionError('Invalid transition from %r with %r' %
                                     (state, ch))

            if 0:
                log('i %d byte %r ch %s current: %s next: %s %s', pos, byte,
                    ch, state, next_state, emit)

            if emit == emit_i.Nothing:
                pass
            elif emit == emit_i.Part:
                spans.append((span_e.Black, pos))
            elif emit == emit_i.Escape:
                spans.append((span_e.Backslash, pos))  # the backslash itself
            elif emit == emit_i.Delim:
                spans.append((span_e.Delim, pos))  # ignored delimiter
            elif emit == emit_i.Empty:
                spans.append((span_e.Delim, pos))  # ignored delimiter
                # EMPTY part that is NOT ignored
                spans.append((span_e.Black, pos))
            else:
                raise AssertionError()

            state = next_state
            pos += 1

        return spans