OILS / builtin / method_str.py View on Github | oilshell.org

482 lines, 284 significant
1"""YSH Str methods"""
2
3from __future__ import print_function
4
5from _devbuild.gen.syntax_asdl import loc_t
6from _devbuild.gen.value_asdl import (value, value_e, value_t, eggex_ops,
7 eggex_ops_t, RegexMatch)
8from builtin import pure_ysh
9from core import error
10from core import state
11from core import vm
12from frontend import typed_args
13from mycpp import mops
14from mycpp.mylib import log, tagswitch
15from osh import string_ops
16from ysh import expr_eval
17from ysh import regex_translate
18from ysh import val_ops
19
20import libc
21from libc import REG_NOTBOL
22
23from typing import cast, List, Tuple
24
25_ = log
26
27
28def _StrMatchStart(s, p):
29 # type: (str, str) -> Tuple[bool, int, int]
30 """Returns the range of bytes in 's' that match string pattern `p`. the
31 pattern matches if 's' starts with all the characters in 'p'.
32
33 The returned match result is the tuple "(matched, begin, end)". 'matched'
34 is true if the pattern matched. 'begin' and 'end' give the half-open range
35 "[begin, end)" of byte indices from 's' for the match, and are a valid but
36 empty range if 'match' is false.
37
38 Used for shell functions like 'trimStart' when trimming a prefix string.
39 """
40 if s.startswith(p):
41 return (True, 0, len(p))
42 else:
43 return (False, 0, 0)
44
45
46def _StrMatchEnd(s, p):
47 # type: (str, str) -> Tuple[bool, int, int]
48 """Returns a match result for the bytes in 's' that match string pattern
49 `p`. the pattern matches if 's' ends with all the characters in 'p'.
50
51 The returned match result is the tuple "(matched, begin, end)". 'matched'
52 is true if the pattern matched. 'begin' and 'end' give the half-open range
53 "[begin, end)" of byte indices from 's' for the match, and are a valid but
54 empty range if 'match' is false.
55
56 Used for shell functions like 'trimEnd' when trimming a suffix string.
57 """
58 len_s = len(s)
59 if s.endswith(p):
60 return (True, len_s - len(p), len_s)
61 else:
62 return (False, len_s, len_s)
63
64
def _EggexMatchCommon(s, p, ere, empty_p):
    # type: (str, value.Eggex, str, int) -> Tuple[bool, int, int]
    """Run the ERE string 'ere' against 's' and report the overall match.

    'p' is consulted only for its canonical flags, which become the regex
    compilation flags; the pattern text that is executed is 'ere' (the
    callers in this file pass an anchored translation of 'p').

    Returns the tuple "(matched, begin, end)".  On a match, "[begin, end)" is
    the byte range of the whole match; capture groups are not reported.
    Without a match the result is "(False, empty_p, empty_p)", so the caller
    chooses where the empty range sits.
    """
    cflags = regex_translate.LibcFlags(p.canonical_flags)
    indices = libc.regex_search(ere, cflags, s, 0)  # no execution flags
    if indices is not None:
        # The first two entries delimit the whole match.
        return (True, indices[0], indices[1])
    return (False, empty_p, empty_p)
77
78
def _EggexMatchStart(s, p):
    # type: (str, value.Eggex) -> Tuple[bool, int, int]
    """Returns a match result for the bytes in 's' that match Eggex pattern
    `p` when constrained to match at the start of the string.

    Any capturing done by the Eggex pattern is ignored.

    The returned match result is the tuple "(matched, begin, end)". 'matched'
    is true if the pattern matched. 'begin' and 'end' give the half-open range
    "[begin, end)" of byte indices from 's' for the match, and are the valid
    but empty range "[0, 0)" if 'matched' is false.

    Used for shell functions like 'trimStart' when trimming with an Eggex
    pattern.
    """
    ere = regex_translate.AsPosixEre(p)
    if len(ere) == 0:
        # Nothing to group: a bare anchor matches the empty prefix.
        ere = '^'
    else:
        # Group the pattern before anchoring it.  In a POSIX ERE '|' binds
        # more loosely than concatenation, so '^' + 'a|b' would anchor only
        # the first alternative and let 'b' match anywhere in the string.
        # The extra group renumbers the captures, which is harmless because
        # _EggexMatchCommon reads only the span of the whole match.
        ere = '^(' + ere + ')'
    return _EggexMatchCommon(s, p, ere, 0)
98
99
def _EggexMatchEnd(s, p):
    # type: (str, value.Eggex) -> Tuple[bool, int, int]
    """Like _EggexMatchStart, but matches against the end of the
    string.

    Returns the tuple "(matched, begin, end)"; when 'matched' is false the
    range is the valid but empty range "[len(s), len(s))".  Any capturing
    done by the Eggex pattern is ignored.
    """
    ere = regex_translate.AsPosixEre(p)
    if len(ere) == 0:
        # Nothing to group: a bare anchor matches the empty suffix.
        ere = '$'
    else:
        # Always group and anchor, for two reasons:
        # - '|' binds more loosely than concatenation in a POSIX ERE, so
        #   'a|b' + '$' would anchor only the last alternative.
        # - Testing for a trailing '$' cannot tell an anchor from an escaped
        #   literal dollar ('\$'), which would leave the pattern unanchored.
        # The extra group renumbers the captures, which is harmless because
        # _EggexMatchCommon reads only the span of the whole match.
        ere = '(' + ere + ')$'
    return _EggexMatchCommon(s, p, ere, len(s))
109
110
# Bit flags naming the end(s) of a string that a method is anchored to.  They
# are tested with '&', and Trim also accepts the combination START | END.
START = 1
END = 2
113
114
class HasAffix(vm._Callable):
    """Implements `startsWith()` and `endsWith()`.

    The anchor passed to the constructor (START or END) selects which end of
    the string an instance tests.
    """

    def __init__(self, anchor):
        # type: (int) -> None
        assert anchor in (START, END), ("Anchor must be START or END")
        self.anchor = anchor

    def Call(self, rd):
        # type: (typed_args.Reader) -> value_t
        """
        string => startsWith(pattern_str)     # => bool
        string => startsWith(pattern_eggex)   # => bool
        string => endsWith(pattern_str)       # => bool
        string => endsWith(pattern_eggex)     # => bool

        Raises error.TypeErr when the pattern is neither an Eggex nor a Str.
        An error.Strict raised while matching is re-raised as error.Expr.
        """
        string = rd.PosStr()
        pattern_val = rd.PosValue()

        # At most one of these is filled in, according to the pattern's type.
        pattern_str = None  # type: str
        pattern_eggex = None  # type: value.Eggex
        with tagswitch(pattern_val) as case:
            if case(value_e.Eggex):
                pattern_eggex = cast(value.Eggex, pattern_val)
            elif case(value_e.Str):
                pattern_str = cast(value.Str, pattern_val).s
            else:
                raise error.TypeErr(pattern_val,
                                    'expected pattern to be Eggex or Str',
                                    rd.LeftParenToken())
        rd.Done()

        at_start = (self.anchor & START) != 0
        matched = False
        try:
            if pattern_str is None:
                assert pattern_eggex is not None
                if at_start:
                    matched, _, _ = _EggexMatchStart(string, pattern_eggex)
                else:
                    matched, _, _ = _EggexMatchEnd(string, pattern_eggex)
            elif at_start:
                matched, _, _ = _StrMatchStart(string, pattern_str)
            else:
                matched, _, _ = _StrMatchEnd(string, pattern_str)
        except error.Strict as e:
            raise error.Expr(e.msg, e.location)

        return value.Bool(matched)
164
165
class Trim(vm._Callable):
    """Implements `trimStart()`, `trimEnd()`, and `trim()`.

    The anchor passed to the constructor selects the end(s) to trim: START,
    END, or START | END for both.
    """

    def __init__(self, anchor):
        # type: (int) -> None
        assert anchor in (START, END, START
                          | END), ("Anchor must be START, END, or START|END")
        self.anchor = anchor

    def Call(self, rd):
        # type: (typed_args.Reader) -> value_t
        """
        string => trimStart()               # => Str
        string => trimEnd()                 # => Str
        string => trim()                    # => Str
        string => trimStart(pattern_str)    # => Str
        string => trimEnd(pattern_str)      # => Str
        string => trim(pattern_str)         # => Str
        string => trimStart(pattern_eggex)  # => Str
        string => trimEnd(pattern_eggex)    # => Str
        string => trim(pattern_eggex)       # => Str

        Without a pattern, whitespace is trimmed.  Raises error.TypeErr when a
        pattern is given that is neither an Eggex nor a Str.  An error.Strict
        raised while matching is re-raised as error.Expr.
        """
        string = rd.PosStr()
        pattern_val = rd.OptionalValue()

        # Both stay None when no pattern argument was passed.
        pattern_str = None  # type: str
        pattern_eggex = None  # type: value.Eggex
        if pattern_val:
            with tagswitch(pattern_val) as case:
                if case(value_e.Eggex):
                    pattern_eggex = cast(value.Eggex, pattern_val)
                elif case(value_e.Str):
                    pattern_str = cast(value.Str, pattern_val).s
                else:
                    raise error.TypeErr(pattern_val,
                                        'expected pattern to be Eggex or Str',
                                        rd.LeftParenToken())
        rd.Done()

        trim_left = (self.anchor & START) != 0
        trim_right = (self.anchor & END) != 0

        # [start, end) is the slice of 'string' that survives trimming.  Each
        # matcher reports an empty range when nothing matches, which leaves
        # the corresponding bound where it started.
        start = 0
        end = len(string)
        try:
            if trim_left:
                if pattern_str is not None:
                    _, _, start = _StrMatchStart(string, pattern_str)
                elif pattern_eggex is not None:
                    _, _, start = _EggexMatchStart(string, pattern_eggex)
                else:
                    _, start = string_ops.StartsWithWhitespaceByteRange(
                        string)

            if trim_right:
                if pattern_str is not None:
                    _, end, _ = _StrMatchEnd(string, pattern_str)
                elif pattern_eggex is not None:
                    _, end, _ = _EggexMatchEnd(string, pattern_eggex)
                else:
                    end, _ = string_ops.EndsWithWhitespaceByteRange(string)
        except error.Strict as e:
            raise error.Expr(e.msg, e.location)

        return value.Str(string[start:end])
228
229
class Upper(vm._Callable):
    """Callable that returns its string argument converted with str.upper()."""

    def __init__(self):
        # type: () -> None
        pass

    def Call(self, rd):
        # type: (typed_args.Reader) -> value_t
        """Read the single Str argument and return its upper-cased copy."""
        string = rd.PosStr()
        rd.Done()

        # TODO: unicode support
        uppered = string.upper()
        return value.Str(uppered)
244
245
class Lower(vm._Callable):
    """Callable that returns its string argument converted with str.lower()."""

    def __init__(self):
        # type: () -> None
        pass

    def Call(self, rd):
        # type: (typed_args.Reader) -> value_t
        """Read the single Str argument and return its lower-cased copy."""
        string = rd.PosStr()
        rd.Done()

        # TODO: unicode support
        lowered = string.lower()
        return value.Str(lowered)
260
261
# Values for SearchMatch's 'which_method' argument.
SEARCH = 0
LEFT_MATCH = 1


class SearchMatch(vm._Callable):
    """Regex search over a string, optionally anchored on the left.

    'which_method' is SEARCH for an unanchored search, or LEFT_MATCH to force
    the pattern to match at the position where the search begins.
    """

    def __init__(self, which_method):
        # type: (int) -> None
        self.which_method = which_method

    def Call(self, rd):
        # type: (typed_args.Reader) -> value_t
        """
        s => search(eggex, pos=0)

        The pattern may be an Eggex or a Str holding an ERE; anything else
        raises error.TypeErr.  Returns a RegexMatch, or value.Null when the
        pattern does not match.
        """
        string = rd.PosStr()

        pattern = rd.PosValue()  # Eggex or ERE Str
        with tagswitch(pattern) as case:
            if case(value_e.Eggex):
                eggex_val = cast(value.Eggex, pattern)

                # lazily converts to ERE
                ere = regex_translate.AsPosixEre(eggex_val)
                cflags = regex_translate.LibcFlags(eggex_val.canonical_flags)
                capture = eggex_ops.Yes(
                    eggex_val.convert_funcs, eggex_val.convert_toks,
                    eggex_val.capture_names)  # type: eggex_ops_t

            elif case(value_e.Str):
                # A plain string is used as the ERE verbatim, with no flags
                # and no capture metadata.
                ere = cast(value.Str, pattern).s
                cflags = 0
                capture = eggex_ops.No

            else:
                # TODO: add method name to this error
                raise error.TypeErr(pattern, 'expected Eggex or Str',
                                    rd.LeftParenToken())

        # It's called 'pos', not 'start' like Python.  Python has 2 kinds of
        # 'start' in its regex API, which can be confusing.
        pos = mops.BigTruncate(rd.NamedInt('pos', 0))
        rd.Done()

        if self.which_method == LEFT_MATCH:
            # Make it anchored
            if not ere.startswith('^'):
                ere = '^' + ere
            eflags = 0  # ^ matches beginning even if pos=5
        elif pos == 0:
            eflags = 0
        else:
            eflags = REG_NOTBOL  # ^ only matches when pos=0

        indices = libc.regex_search(ere, cflags, string, eflags, pos)
        if indices is not None:
            return RegexMatch(string, indices, capture)

        return value.Null
321
322
class Replace(vm._Callable):
    """Implements `replace()` for a Str or an Eggex pattern.

    The substitution is either a Str, used as is, or an Expr that is
    evaluated to produce the replacement text.
    """

    def __init__(self, mem, expr_ev):
        # type: (state.Mem, expr_eval.ExprEvaluator) -> None
        self.mem = mem
        self.expr_ev = expr_ev

    def EvalSubstExpr(self, expr, blame_loc):
        # type: (value.Expr, loc_t) -> str
        """Evaluate a substitution expression, which must yield a Str.

        Raises error.TypeErr, blamed on 'blame_loc', for any other result.
        """
        res = self.expr_ev.EvalExpr(expr.e, blame_loc)
        if res.tag() == value_e.Str:
            return cast(value.Str, res).s

        raise error.TypeErr(res, "expected expr to eval to a Str", blame_loc)

    def Call(self, rd):
        # type: (typed_args.Reader) -> value_t
        """
        s => replace(string_val, subst_str, count=-1)
        s => replace(string_val, subst_expr, count=-1)
        s => replace(eggex_val, subst_str, count=-1)
        s => replace(eggex_val, subst_expr, count=-1)

        For count in [0, MAX_INT], there will be no more than count
        replacements. Any negative count should read as unset, and replace will
        replace all occurrences of the pattern.

        Raises error.TypeErr when the pattern is not an Eggex or Str, when the
        substitution is not a Str or Expr, or when a substitution Expr does
        not evaluate to a Str.
        """
        string = rd.PosStr()

        string_val = None  # type: value.Str
        eggex_val = None  # type: value.Eggex
        subst_str = None  # type: value.Str
        subst_expr = None  # type: value.Expr

        pattern = rd.PosValue()
        with tagswitch(pattern) as case:
            if case(value_e.Eggex):
                # HACK: mycpp will otherwise generate:
                #  value::Eggex* eggex_val ...
                eggex_val_ = cast(value.Eggex, pattern)
                eggex_val = eggex_val_

            elif case(value_e.Str):
                string_val_ = cast(value.Str, pattern)
                string_val = string_val_

            else:
                raise error.TypeErr(pattern,
                                    'expected pattern to be Eggex or Str',
                                    rd.LeftParenToken())

        subst = rd.PosValue()
        with tagswitch(subst) as case:
            if case(value_e.Str):
                subst_str_ = cast(value.Str, subst)
                subst_str = subst_str_

            elif case(value_e.Expr):
                subst_expr_ = cast(value.Expr, subst)
                subst_expr = subst_expr_

            else:
                raise error.TypeErr(subst,
                                    'expected substitution to be Str or Expr',
                                    rd.LeftParenToken())

        count = mops.BigTruncate(rd.NamedInt("count", -1))
        rd.Done()

        if count == 0:
            return value.Str(string)

        if string_val is not None:
            if subst_str is not None:
                s = subst_str.s
            if subst_expr is not None:
                # Eval with $0 set to string_val (the matched substring)
                with state.ctx_Eval(self.mem, string_val.s, None, None):
                    s = self.EvalSubstExpr(subst_expr, rd.LeftParenToken())
            assert s is not None

            result = string.replace(string_val.s, s, count)

            return value.Str(result)

        if eggex_val is not None:
            ere = regex_translate.AsPosixEre(eggex_val)
            cflags = regex_translate.LibcFlags(eggex_val.canonical_flags)

            # Walk through the string finding all matches of the compiled ere.
            # Then, collect unmatched substrings and substitutions into the
            # `parts` list.
            pos = 0
            n = len(string)
            parts = []  # type: List[str]
            replace_count = 0
            while pos < n:
                # Once the search resumes past offset 0, '^' must no longer
                # match at the resume point; otherwise a start-anchored
                # pattern is replaced again after every replacement.  This is
                # the same rule SearchMatch applies for pos != 0.
                eflags = 0 if pos == 0 else REG_NOTBOL
                indices = libc.regex_search(ere, cflags, string, eflags, pos)
                if indices is None:
                    break

                # Collect captures
                arg0 = None  # type: str
                argv = []  # type: List[str]
                named_vars = []  # type: List[Tuple[str, value_t]]
                num_groups = len(indices) / 2
                for group in xrange(num_groups):
                    start = indices[2 * group]
                    end = indices[2 * group + 1]
                    captured = string[start:end]
                    val = value.Str(captured)  # type: value_t

                    if len(eggex_val.convert_funcs) and group != 0:
                        convert_func = eggex_val.convert_funcs[group - 1]
                        convert_tok = eggex_val.convert_toks[group - 1]

                        if convert_func:
                            val = self.expr_ev.CallConvertFunc(
                                convert_func, val, convert_tok,
                                rd.LeftParenToken())

                    # $0, $1, $2 variables are argv values, which must be
                    # strings. Furthermore, they can only be used in string
                    # contexts
                    #   eg. "$[1]" != "$1".
                    val_str = val_ops.Stringify(val, rd.LeftParenToken())
                    if group == 0:
                        # $0 cannot be named
                        arg0 = val_str
                    else:
                        argv.append(val_str)

                        # Group N's name is looked up at index N - 1, the
                        # same offset used for convert_funcs above.  An
                        # offset of 2 would wrap around to the last entry
                        # for group 1.
                        name = eggex_val.capture_names[group - 1]
                        if name is not None:
                            named_vars.append((name, val))

                if subst_str is not None:
                    s = subst_str.s
                if subst_expr is not None:
                    with state.ctx_Eval(self.mem, arg0, argv, None):
                        with pure_ysh.ctx_Shvar(self.mem, named_vars):
                            s = self.EvalSubstExpr(subst_expr,
                                                   rd.LeftParenToken())
                assert s is not None

                start = indices[0]
                end = indices[1]
                parts.append(string[pos:start])  # Unmatched substring
                parts.append(s)  # Replacement

                if end == start:
                    # An empty match consumes nothing, so resuming at 'end'
                    # would find the same match forever.  Copy the next
                    # character through unchanged and resume after it.
                    # Indices are byte offsets, so UTF-8 continuation bytes
                    # (10xxxxxx) are skipped too, to avoid resuming in the
                    # middle of a character.
                    pos = end + 1
                    while pos < n and (ord(string[pos]) & 0xC0) == 0x80:
                        pos += 1
                    if end < n:
                        parts.append(string[end:pos])
                else:
                    pos = end  # Move to end of match

                replace_count += 1
                if count != -1 and replace_count == count:
                    break

            if pos < n:
                parts.append(string[pos:])  # Remaining unmatched substring

            return value.Str("".join(parts))

        raise AssertionError()