OILS / builtin / method_str.py View on Github | oilshell.org

483 lines, 285 significant
1"""YSH Str methods"""
2
3from __future__ import print_function
4
5from _devbuild.gen.syntax_asdl import loc_t, loc
6from _devbuild.gen.runtime_asdl import scope_e
7from _devbuild.gen.value_asdl import (value, value_e, value_t, eggex_ops,
8 eggex_ops_t, RegexMatch, LeftName)
9from builtin import pure_ysh
10from core import error
11from core import state
12from core import vm
13from frontend import typed_args
14from mycpp import mops
15from mycpp.mylib import log, tagswitch
16from osh import string_ops
17from ysh import expr_eval
18from ysh import regex_translate
19from ysh import val_ops
20
21import libc
22from libc import REG_NOTBOL
23
24from typing import cast, Any, List, Optional, Tuple
25
26_ = log
27
28
29def _StrMatchStart(s, p):
30 # type: (str, str) -> Tuple[bool, int, int]
31 """Returns the range of bytes in 's' that match string pattern `p`. the
32 pattern matches if 's' starts with all the characters in 'p'.
33
34 The returned match result is the tuple "(matched, begin, end)". 'matched'
35 is true if the pattern matched. 'begin' and 'end' give the half-open range
36 "[begin, end)" of byte indices from 's' for the match, and are a valid but
37 empty range if 'match' is false.
38
39 Used for shell functions like 'trimStart' when trimming a prefix string.
40 """
41 if s.startswith(p):
42 return (True, 0, len(p))
43 else:
44 return (False, 0, 0)
45
46
47def _StrMatchEnd(s, p):
48 # type: (str, str) -> Tuple[bool, int, int]
49 """Returns a match result for the bytes in 's' that match string pattern
50 `p`. the pattern matches if 's' ends with all the characters in 'p'.
51
52 The returned match result is the tuple "(matched, begin, end)". 'matched'
53 is true if the pattern matched. 'begin' and 'end' give the half-open range
54 "[begin, end)" of byte indices from 's' for the match, and are a valid but
55 empty range if 'match' is false.
56
57 Used for shell functions like 'trimEnd' when trimming a suffix string.
58 """
59 len_s = len(s)
60 if s.endswith(p):
61 return (True, len_s - len(p), len_s)
62 else:
63 return (False, len_s, len_s)
64
65
def _EggexMatchCommon(s, p, ere, empty_p):
    # type: (str, value.Eggex, str, int) -> Tuple[bool, int, int]
    """Search 's' with the (already anchored) regex 'ere'.

    The compile flags come from the Eggex 'p'; no execution flags are set.

    Returns "(matched, begin, end)".  On a match, 'begin' and 'end' are the
    first two indices reported by the search.  Otherwise both are 'empty_p',
    which the caller picks so that the range is empty but still valid.
    """
    found = libc.regex_search(ere,
                              regex_translate.LibcFlags(p.canonical_flags),
                              s, 0)
    if found is not None:
        return (True, found[0], found[1])
    return (False, empty_p, empty_p)
78
79
def _EggexMatchStart(s, p):
    # type: (str, value.Eggex) -> Tuple[bool, int, int]
    """Match the Eggex pattern 'p' against the beginning of 's'.

    The Eggex is translated to a POSIX ERE, and a '^' is prepended unless the
    ERE already begins with one.  Captures made by the pattern are ignored.

    The result is the tuple "(matched, begin, end)".  'matched' is true when
    the pattern matched, and "[begin, end)" is the half-open byte range of
    the match within 's'.  When there is no match, the range is the valid but
    empty range "[0, 0)".

    Used by 'startsWith' and by 'trimStart' / 'trim' when the pattern is an
    Eggex.
    """
    pat = regex_translate.AsPosixEre(p)
    already_anchored = pat.startswith('^')
    if not already_anchored:
        pat = '^' + pat
    return _EggexMatchCommon(s, p, pat, 0)
99
100
def _EggexMatchEnd(s, p):
    # type: (str, value.Eggex) -> Tuple[bool, int, int]
    """Match the Eggex pattern 'p' against the end of 's'.

    The Eggex is translated to a POSIX ERE, and a '$' anchor is appended
    unless the ERE already ends with an unescaped '$'.  Captures made by the
    pattern are ignored.

    The result is the tuple "(matched, begin, end)".  'matched' is true when
    the pattern matched, and "[begin, end)" is the half-open byte range of
    the match within 's'.  When there is no match, the range is the valid but
    empty range "[len(s), len(s))".

    Used by 'endsWith' and by 'trimEnd' / 'trim' when the pattern is an
    Eggex.
    """
    ere = regex_translate.AsPosixEre(p)

    # A trailing '$' is only an anchor when it is NOT escaped.  An ERE that
    # ends in '\$' matches a literal dollar sign, so it still needs the
    # anchor.  The '$' is escaped when an odd number of backslashes precedes
    # it.  If this check is ever wrong in the "escaped" direction, the only
    # effect is a redundant second '$' anchor.
    anchored = False
    if ere.endswith('$'):
        num_backslashes = 0
        i = len(ere) - 2
        while i >= 0 and ere[i] == '\\':
            num_backslashes += 1
            i -= 1
        anchored = (num_backslashes % 2 == 0)

    if not anchored:
        ere = ere + '$'

    # NOTE(review): if the ERE contains an unparenthesized top-level '|', the
    # appended '$' binds only to the last alternative -- confirm whether
    # regex_translate.AsPosixEre can produce such a pattern.
    return _EggexMatchCommon(s, p, ere, len(s))
110
111
# Anchor selectors, one bit each, so that both sides can be requested at once
# with START | END (as Trim does for `trim()`).
START = 1  # bit 0: operate on the beginning of the string
END = 2  # bit 1: operate on the end of the string
114
115
class HasAffix(vm._Callable):
    """ Implements `startsWith()`, `endsWith()`. """

    def __init__(self, anchor):
        # type: (int) -> None
        # START selects the startsWith() behavior, END selects endsWith().
        assert anchor in (START, END), ("Anchor must be START or END")
        self.anchor = anchor

    def Call(self, rd):
        # type: (typed_args.Reader) -> value_t
        """
        string => startsWith(pattern_str)   # => bool
        string => startsWith(pattern_eggex) # => bool
        string => endsWith(pattern_str)     # => bool
        string => endsWith(pattern_eggex)   # => bool

        Raises error.TypeErr when the pattern is neither a Str nor an Eggex.
        An error.Strict raised while matching is re-raised as error.Expr.
        """
        subject = rd.PosStr()
        pat = rd.PosValue()

        # Exactly one of these is set once the pattern type is known.
        str_pat = None  # type: str
        eggex_pat = None  # type: value.Eggex
        with tagswitch(pat) as case:
            if case(value_e.Str):
                str_pat = cast(value.Str, pat).s
            elif case(value_e.Eggex):
                eggex_pat = cast(value.Eggex, pat)
            else:
                raise error.TypeErr(pat,
                                    'expected pattern to be Eggex or Str',
                                    rd.LeftParenToken())
        rd.Done()

        at_start = (self.anchor & START) != 0

        found = False
        try:
            if str_pat is None:
                assert eggex_pat is not None
                if at_start:
                    found, _, _ = _EggexMatchStart(subject, eggex_pat)
                else:
                    found, _, _ = _EggexMatchEnd(subject, eggex_pat)
            elif at_start:
                found, _, _ = _StrMatchStart(subject, str_pat)
            else:
                found, _, _ = _StrMatchEnd(subject, str_pat)
        except error.Strict as e:
            raise error.Expr(e.msg, e.location)

        return value.Bool(found)
165
166
class Trim(vm._Callable):
    """ Implements `trimStart()`, `trimEnd()`, and `trim()` """

    def __init__(self, anchor):
        # type: (int) -> None
        # START trims the beginning, END trims the end, START | END both.
        assert anchor in (START, END, START
                          | END), ("Anchor must be START, END, or START|END")
        self.anchor = anchor

    def Call(self, rd):
        # type: (typed_args.Reader) -> value_t
        """
        string => trimStart()               # => Str
        string => trimEnd()                 # => Str
        string => trim()                    # => Str
        string => trimStart(pattern_str)    # => Str
        string => trimEnd(pattern_str)      # => Str
        string => trim(pattern_str)         # => Str
        string => trimStart(pattern_eggex)  # => Str
        string => trimEnd(pattern_eggex)    # => Str
        string => trim(pattern_eggex)       # => Str

        Without a pattern, whitespace is trimmed.  Raises error.TypeErr when
        a pattern is given that is neither a Str nor an Eggex.  An
        error.Strict raised while matching is re-raised as error.Expr.
        """
        subject = rd.PosStr()
        pat = rd.OptionalValue()

        # Both stay None when no pattern was passed (whitespace trimming).
        str_pat = None  # type: str
        eggex_pat = None  # type: value.Eggex
        if pat:
            with tagswitch(pat) as case:
                if case(value_e.Str):
                    str_pat = cast(value.Str, pat).s
                elif case(value_e.Eggex):
                    eggex_pat = cast(value.Eggex, pat)
                else:
                    raise error.TypeErr(pat,
                                        'expected pattern to be Eggex or Str',
                                        rd.LeftParenToken())
        rd.Done()

        trim_left = (self.anchor & START) != 0
        trim_right = (self.anchor & END) != 0

        # [lo, hi) is the byte range of 'subject' that is kept.  Each helper
        # returns an empty range on a failed match, which leaves the
        # corresponding bound at its initial value.
        lo = 0
        hi = len(subject)
        try:
            if str_pat is not None:
                if trim_left:
                    _, _, lo = _StrMatchStart(subject, str_pat)
                if trim_right:
                    _, hi, _ = _StrMatchEnd(subject, str_pat)
            elif eggex_pat is not None:
                if trim_left:
                    _, _, lo = _EggexMatchStart(subject, eggex_pat)
                if trim_right:
                    _, hi, _ = _EggexMatchEnd(subject, eggex_pat)
            else:
                if trim_left:
                    _, lo = string_ops.StartsWithWhitespaceByteRange(subject)
                if trim_right:
                    hi, _ = string_ops.EndsWithWhitespaceByteRange(subject)
        except error.Strict as e:
            raise error.Expr(e.msg, e.location)

        return value.Str(subject[lo:hi])
229
230
class Upper(vm._Callable):
    """Callable that upper-cases its single Str argument."""

    def __init__(self):
        # type: () -> None
        pass

    def Call(self, rd):
        # type: (typed_args.Reader) -> value_t
        """Return a Str holding the result of upper() on the argument."""
        text = rd.PosStr()
        rd.Done()

        # TODO: unicode support
        uppered = text.upper()
        return value.Str(uppered)
245
246
class Lower(vm._Callable):
    """Callable that lower-cases its single Str argument."""

    def __init__(self):
        # type: () -> None
        pass

    def Call(self, rd):
        # type: (typed_args.Reader) -> value_t
        """Return a Str holding the result of lower() on the argument."""
        text = rd.PosStr()
        rd.Done()

        # TODO: unicode support
        lowered = text.lower()
        return value.Str(lowered)
261
262
# Values for SearchMatch.which_method.
SEARCH = 0  # unanchored search
LEFT_MATCH = 1  # the pattern is anchored with '^' before searching
265
266
class SearchMatch(vm._Callable):
    """Regex search over a Str, selected by SEARCH or LEFT_MATCH."""

    def __init__(self, which_method):
        # type: (int) -> None
        self.which_method = which_method

    def Call(self, rd):
        # type: (typed_args.Reader) -> value_t
        """
        s => search(eggex, pos=0)

        The pattern may be an Eggex or a Str holding an ERE.  Returns a
        RegexMatch on success and value.Null when nothing matches.  Raises
        error.TypeErr for any other pattern type.
        """
        subject = rd.PosStr()

        pat = rd.PosValue()  # Eggex or ERE Str
        with tagswitch(pat) as case:
            if case(value_e.Eggex):
                eggex = cast(value.Eggex, pat)

                # lazily converts to ERE
                ere = regex_translate.AsPosixEre(eggex)
                cflags = regex_translate.LibcFlags(eggex.canonical_flags)
                capture = eggex_ops.Yes(
                    eggex.convert_funcs, eggex.convert_toks,
                    eggex.capture_names)  # type: eggex_ops_t

            elif case(value_e.Str):
                # A plain string is used as the ERE itself, with no flags and
                # no capture metadata.
                ere = cast(value.Str, pat).s
                cflags = 0
                capture = eggex_ops.No

            else:
                # TODO: add method name to this error
                raise error.TypeErr(pat, 'expected Eggex or Str',
                                    rd.LeftParenToken())

        # It's called 'pos', not 'start' like Python.  Python has 2 kinds of
        # 'start' in its regex API, which can be confusing.
        pos = mops.BigTruncate(rd.NamedInt('pos', 0))
        rd.Done()

        if self.which_method == LEFT_MATCH:
            # Make it anchored
            if not ere.startswith('^'):
                ere = '^' + ere
            eflags = 0  # ^ matches beginning even if pos=5
        elif pos == 0:
            eflags = 0
        else:
            eflags = REG_NOTBOL  # ^ only matches when pos=0

        indices = libc.regex_search(ere, cflags, subject, eflags, pos)
        if indices is None:
            return value.Null

        return RegexMatch(subject, indices, capture)
322
323
class Replace(vm._Callable):
    """Implements `replace()` with a Str or Eggex pattern."""

    def __init__(self, mem, expr_ev):
        # type: (state.Mem, expr_eval.ExprEvaluator) -> None
        self.mem = mem
        self.expr_ev = expr_ev

    def EvalSubstExpr(self, expr, blame_loc):
        # type: (value.Expr, loc_t) -> str
        """Evaluate a substitution expression, which must yield a Str.

        Raises error.TypeErr, blamed on blame_loc, for any other result type.
        """
        res = self.expr_ev.EvalExpr(expr.e, blame_loc)
        if res.tag() == value_e.Str:
            return cast(value.Str, res).s

        raise error.TypeErr(res, "expected expr to eval to a Str", blame_loc)

    def Call(self, rd):
        # type: (typed_args.Reader) -> value_t
        """
        s => replace(string_val, subst_str, count=-1)
        s => replace(string_val, subst_expr, count=-1)
        s => replace(eggex_val, subst_str, count=-1)
        s => replace(eggex_val, subst_expr, count=-1)

        For count in [0, MAX_INT], there will be no more than count
        replacements. Any negative count should read as unset, and replace will
        replace all occurrences of the pattern.

        Raises error.TypeErr when the pattern is not an Eggex or Str, or when
        the substitution is not a Str or Expr.
        """
        string = rd.PosStr()

        # One of string_val / eggex_val and one of subst_str / subst_expr is
        # set after argument parsing.
        string_val = None  # type: value.Str
        eggex_val = None  # type: value.Eggex
        subst_str = None  # type: value.Str
        subst_expr = None  # type: value.Expr

        pattern = rd.PosValue()
        with tagswitch(pattern) as case:
            if case(value_e.Eggex):
                # HACK: mycpp will otherwise generate:
                #  value::Eggex* eggex_val ...
                eggex_val_ = cast(value.Eggex, pattern)
                eggex_val = eggex_val_

            elif case(value_e.Str):
                string_val_ = cast(value.Str, pattern)
                string_val = string_val_

            else:
                raise error.TypeErr(pattern,
                                    'expected pattern to be Eggex or Str',
                                    rd.LeftParenToken())

        subst = rd.PosValue()
        with tagswitch(subst) as case:
            if case(value_e.Str):
                subst_str_ = cast(value.Str, subst)
                subst_str = subst_str_

            elif case(value_e.Expr):
                subst_expr_ = cast(value.Expr, subst)
                subst_expr = subst_expr_

            else:
                raise error.TypeErr(subst,
                                    'expected substitution to be Str or Expr',
                                    rd.LeftParenToken())

        count = mops.BigTruncate(rd.NamedInt("count", -1))
        rd.Done()

        # Nothing to replace; return the input unchanged.
        if count == 0:
            return value.Str(string)

        if string_val:
            if subst_str:
                s = subst_str.s
            if subst_expr:
                # Eval with $0 set to string_val (the matched substring)
                with state.ctx_Eval(self.mem, string_val.s, None, None):
                    s = self.EvalSubstExpr(subst_expr, rd.LeftParenToken())
            assert s is not None

            result = string.replace(string_val.s, s, count)

            return value.Str(result)

        if eggex_val:
            ere = regex_translate.AsPosixEre(eggex_val)
            cflags = regex_translate.LibcFlags(eggex_val.canonical_flags)

            # Walk through the string finding all matches of the compiled ere.
            # Then, collect unmatched substrings and substitutions into the
            # `parts` list.
            #
            # NOTE(review): 'pos' only advances to the end of each match, so
            # if regex_search can return a zero-width match at 'pos' this
            # loop would not make progress -- confirm and decide on the
            # intended semantics for empty matches.
            pos = 0
            parts = []  # type: List[str]
            replace_count = 0
            while pos < len(string):
                indices = libc.regex_search(ere, cflags, string, 0, pos)
                if indices is None:
                    break

                # Collect captures
                arg0 = None  # type: str
                argv = []  # type: List[str]
                named_vars = []  # type: List[Tuple[str, value_t]]
                num_groups = len(indices) / 2
                for group in xrange(num_groups):
                    start = indices[2 * group]
                    end = indices[2 * group + 1]
                    captured = string[start:end]
                    val = value.Str(captured)  # type: value_t

                    # Group 0 is the whole match; the per-capture lists
                    # (convert_funcs, convert_toks, capture_names) are
                    # indexed from capture group 1, hence 'group - 1'.
                    if len(eggex_val.convert_funcs) and group != 0:
                        convert_func = eggex_val.convert_funcs[group - 1]
                        convert_tok = eggex_val.convert_toks[group - 1]

                        if convert_func:
                            val = self.expr_ev.CallConvertFunc(
                                convert_func, val, convert_tok,
                                rd.LeftParenToken())

                    # $0, $1, $2 variables are argv values, which must be
                    # strings. Furthermore, they can only be used in string
                    # contexts
                    #   eg. "$[1]" != "$1".
                    val_str = val_ops.Stringify(val, rd.LeftParenToken())
                    if group == 0:
                        arg0 = val_str
                    else:
                        argv.append(val_str)

                    # $0 cannot be named
                    if group != 0:
                        # Use the same 'group - 1' offset as convert_funcs.
                        # An offset of 2 would make group 1 read index -1
                        # (the LAST name) and shift every later name by one.
                        name = eggex_val.capture_names[group - 1]
                        if name is not None:
                            named_vars.append((name, val))

                if subst_str:
                    s = subst_str.s
                if subst_expr:
                    # Expose $0, $1, ... and the named captures to the
                    # substitution expression.
                    with state.ctx_Eval(self.mem, arg0, argv, None):
                        with pure_ysh.ctx_Shvar(self.mem, named_vars):
                            s = self.EvalSubstExpr(subst_expr,
                                                   rd.LeftParenToken())
                assert s is not None

                start = indices[0]
                end = indices[1]
                parts.append(string[pos:start])  # Unmatched substring
                parts.append(s)  # Replacement
                pos = end  # Move to end of match

                replace_count += 1
                if count != -1 and replace_count == count:
                    break

            parts.append(string[pos:])  # Remaining unmatched substring

            return value.Str("".join(parts))

        raise AssertionError()