OILS / builtin / method_str.py View on Github | oilshell.org

498 lines, 296 significant
1"""YSH Str methods"""
2
3from __future__ import print_function
4
5from _devbuild.gen.syntax_asdl import loc_t, loc
6from _devbuild.gen.runtime_asdl import scope_e
7from _devbuild.gen.value_asdl import (value, value_e, value_t, eggex_ops,
8 eggex_ops_t, RegexMatch, LeftName)
9from builtin import pure_ysh
10from core import error
11from core import state
12from core import vm
13from frontend import typed_args
14from mycpp import mops
15from mycpp.mylib import log, tagswitch
16from osh import string_ops
17from ysh import expr_eval
18from ysh import regex_translate
19from ysh import val_ops
20
21import libc
22from libc import REG_NOTBOL
23
24from typing import cast, Any, List, Optional, Tuple
25
# Bind `log` to a throwaway name; presumably this keeps the `log` import
# counted as used when no debug logging is present -- TODO confirm.
_ = log
27
28
29def _StrMatchStart(s, p):
30 # type: (str, str) -> Tuple[bool, int, int]
31 """Returns the range of bytes in 's' that match string pattern `p`. the
32 pattern matches if 's' starts with all the characters in 'p'.
33
34 The returned match result is the tuple "(matched, begin, end)". 'matched'
35 is true if the pattern matched. 'begin' and 'end' give the half-open range
36 "[begin, end)" of byte indices from 's' for the match, and are a valid but
37 empty range if 'match' is false.
38
39 Used for shell functions like 'trimStart' when trimming a prefix string.
40 """
41 if s.startswith(p):
42 return (True, 0, len(p))
43 else:
44 return (False, 0, 0)
45
46
47def _StrMatchEnd(s, p):
48 # type: (str, str) -> Tuple[bool, int, int]
49 """Returns a match result for the bytes in 's' that match string pattern
50 `p`. the pattern matches if 's' ends with all the characters in 'p'.
51
52 The returned match result is the tuple "(matched, begin, end)". 'matched'
53 is true if the pattern matched. 'begin' and 'end' give the half-open range
54 "[begin, end)" of byte indices from 's' for the match, and are a valid but
55 empty range if 'match' is false.
56
57 Used for shell functions like 'trimEnd' when trimming a suffix string.
58 """
59 len_s = len(s)
60 if s.endswith(p):
61 return (True, len_s - len(p), len_s)
62 else:
63 return (False, len_s, len_s)
64
65
def _EggexMatchCommon(s, p, ere, empty_p):
    # type: (str, value.Eggex, str, int) -> Tuple[bool, int, int]
    """Search 's' for the regex text 'ere' and report the whole-match span.

    'p' contributes only its canonical_flags, which are converted to libc
    compile flags; 'ere' is the pattern text that is actually searched for.

    Returns "(matched, begin, end)".  When nothing matches, both offsets are
    'empty_p', so the caller still receives a valid, empty range.  Only the
    first two entries of the search result are read.
    """
    indices = libc.regex_search(ere,
                                regex_translate.LibcFlags(p.canonical_flags),
                                s, 0)
    if indices is None:
        return (False, empty_p, empty_p)

    return (True, indices[0], indices[1])
78
79
def _EggexMatchStart(s, p):
    # type: (str, value.Eggex) -> Tuple[bool, int, int]
    """Returns a match result for the bytes in 's' that match Eggex pattern
    `p` when constrained to match at the start of the string.

    Any capturing done by the Eggex pattern is ignored.

    The returned match result is the tuple "(matched, begin, end)". 'matched'
    is true if the pattern matched. 'begin' and 'end' give the half-open range
    "[begin, end)" of byte indices from 's' for the match, and are the valid
    but empty range (0, 0) if 'matched' is false.

    Used for methods like 'trimStart' when trimming with an Eggex pattern.
    """
    # Group the translated pattern before anchoring it.  In an ERE,
    # alternation binds more loosely than an anchor, so a bare '^' prefix
    # would turn 'a|b' into '^a|b', which still matches 'b' in the middle of
    # the string.  The extra group is harmless here: only the span of the
    # whole match is used.
    ere = '^(' + regex_translate.AsPosixEre(p) + ')'
    return _EggexMatchCommon(s, p, ere, 0)
99
100
def _EggexMatchEnd(s, p):
    # type: (str, value.Eggex) -> Tuple[bool, int, int]
    """Returns a match result for the bytes in 's' that match Eggex pattern
    `p` when constrained to match at the end of the string.

    Any capturing done by the Eggex pattern is ignored.

    The returned match result is the tuple "(matched, begin, end)", where
    "[begin, end)" is the half-open range of byte indices from 's' for the
    match.  If 'matched' is false, the range is the valid but empty
    (len(s), len(s)).

    Used for methods like 'trimEnd' when trimming with an Eggex pattern.
    """
    # Group the translated pattern before anchoring it.  A bare '$' suffix
    # would bind to the last alternative only ('a|b$'), and testing for an
    # existing trailing '$' would mistake an escaped literal '\$' for an
    # anchor.  The extra group is harmless here: only the span of the whole
    # match is used.
    ere = '(' + regex_translate.AsPosixEre(p) + ')$'
    return _EggexMatchCommon(s, p, ere, len(s))
110
111
# Anchor bit flags for HasAffix and Trim.  Each is a distinct bit, so Trim can
# also be constructed with START | END to work on both ends.
START = 0b01
END = 0b10
114
115
class HasAffix(vm._Callable):
    """ Implements `startsWith()`, `endsWith()`.

    The anchor passed to the constructor (START or END) selects which end of
    the string is tested.
    """

    def __init__(self, anchor):
        # type: (int) -> None
        assert anchor in (START, END), ("Anchor must be START or END")
        self.anchor = anchor

    def Call(self, rd):
        # type: (typed_args.Reader) -> value_t
        """
        string => startsWith(pattern_str)   # => bool
        string => startsWith(pattern_eggex) # => bool
        string => endsWith(pattern_str)     # => bool
        string => endsWith(pattern_eggex)   # => bool

        Raises error.TypeErr when the pattern is neither a Str nor an Eggex.
        """
        s = rd.PosStr()
        pat = rd.PosValue()

        # Exactly one of these is set by the type switch below
        str_pat = None  # type: str
        eggex_pat = None  # type: value.Eggex
        with tagswitch(pat) as case:
            if case(value_e.Str):
                str_pat = cast(value.Str, pat).s
            elif case(value_e.Eggex):
                eggex_pat = cast(value.Eggex, pat)
            else:
                raise error.TypeErr(pat,
                                    'expected pattern to be Eggex or Str',
                                    rd.LeftParenToken())
        rd.Done()

        at_start = (self.anchor & START) != 0

        found = False
        try:
            if str_pat is None:
                assert eggex_pat is not None
                if at_start:
                    found, _, _ = _EggexMatchStart(s, eggex_pat)
                else:
                    found, _, _ = _EggexMatchEnd(s, eggex_pat)
            else:
                if at_start:
                    found, _, _ = _StrMatchStart(s, str_pat)
                else:
                    found, _, _ = _StrMatchEnd(s, str_pat)
        except error.Strict as e:
            # Re-raise as an expression error, keeping message and location
            raise error.Expr(e.msg, e.location)

        return value.Bool(found)
165
166
class Trim(vm._Callable):
    """ Implements `trimStart()`, `trimEnd()`, and `trim()`

    The anchor passed to the constructor is START, END, or START | END, and
    selects which end(s) of the string are trimmed.
    """

    def __init__(self, anchor):
        # type: (int) -> None
        assert anchor in (START, END, START
                          | END), ("Anchor must be START, END, or START|END")
        self.anchor = anchor

    def Call(self, rd):
        # type: (typed_args.Reader) -> value_t
        """
        string => trimStart()               # => Str
        string => trimEnd()                 # => Str
        string => trim()                    # => Str
        string => trimStart(pattern_str)    # => Str
        string => trimEnd(pattern_str)      # => Str
        string => trim(pattern_str)         # => Str
        string => trimStart(pattern_eggex)  # => Str
        string => trimEnd(pattern_eggex)    # => Str
        string => trim(pattern_eggex)       # => Str

        Without a pattern, whitespace is trimmed.  Raises error.TypeErr when
        a pattern is given that is neither a Str nor an Eggex.
        """
        s = rd.PosStr()
        pat = rd.OptionalValue()

        # At most one of these is set; both stay None when no pattern is given
        str_pat = None  # type: str
        eggex_pat = None  # type: value.Eggex
        if pat:
            with tagswitch(pat) as case:
                if case(value_e.Str):
                    str_pat = cast(value.Str, pat).s
                elif case(value_e.Eggex):
                    eggex_pat = cast(value.Eggex, pat)
                else:
                    raise error.TypeErr(pat,
                                        'expected pattern to be Eggex or Str',
                                        rd.LeftParenToken())
        rd.Done()

        trim_left = (self.anchor & START) != 0
        trim_right = (self.anchor & END) != 0

        # [lo, hi) is the part of 's' that is kept.  A failed match reports an
        # empty range sitting at the respective end, so it leaves that bound
        # where it started.
        lo = 0
        hi = len(s)
        try:
            if str_pat is not None:
                if trim_left:
                    _, _, lo = _StrMatchStart(s, str_pat)
                if trim_right:
                    _, hi, _ = _StrMatchEnd(s, str_pat)
            elif eggex_pat is not None:
                if trim_left:
                    _, _, lo = _EggexMatchStart(s, eggex_pat)
                if trim_right:
                    _, hi, _ = _EggexMatchEnd(s, eggex_pat)
            else:
                if trim_left:
                    _, lo = string_ops.StartsWithWhitespaceByteRange(s)
                if trim_right:
                    hi, _ = string_ops.EndsWithWhitespaceByteRange(s)
        except error.Strict as e:
            # Re-raise as an expression error, keeping message and location
            raise error.Expr(e.msg, e.location)

        return value.Str(s[lo:hi])
229
230
class Upper(vm._Callable):
    """Callable that returns its Str argument converted with str.upper()."""

    def __init__(self):
        # type: () -> None
        pass

    def Call(self, rd):
        # type: (typed_args.Reader) -> value_t
        """Read the single positional Str and return its upper-cased copy."""
        s = rd.PosStr()
        rd.Done()
        return value.Str(s.upper())
245
246
# Modes for SearchMatch, chosen when it is constructed.
SEARCH = 0  # the pattern is searched for as given
LEFT_MATCH = 1  # a '^' is prepended so the pattern is anchored
249
250
class SearchMatch(vm._Callable):
    """Regex search over a Str, in the mode chosen at construction time.

    'which_method' is SEARCH or LEFT_MATCH.  LEFT_MATCH anchors the pattern
    with '^'; SEARCH uses the pattern as given.
    """

    def __init__(self, which_method):
        # type: (int) -> None
        self.which_method = which_method

    def Call(self, rd):
        # type: (typed_args.Reader) -> value_t
        """
        s => search(eggex, pos=0)

        The pattern is an Eggex, or a Str that is used directly as the regex
        text.  Returns a RegexMatch, or Null when nothing matches.  Raises
        error.TypeErr for any other pattern type.
        """
        string = rd.PosStr()

        pat_val = rd.PosValue()  # Eggex or ERE Str
        with tagswitch(pat_val) as case:
            if case(value_e.Eggex):
                eggex = cast(value.Eggex, pat_val)

                # lazily converts to ERE
                ere = regex_translate.AsPosixEre(eggex)
                cflags = regex_translate.LibcFlags(eggex.canonical_flags)
                capture = eggex_ops.Yes(
                    eggex.convert_funcs, eggex.convert_toks,
                    eggex.capture_names)  # type: eggex_ops_t

            elif case(value_e.Str):
                # Plain regex text: no compile flags, no capture metadata
                ere = cast(value.Str, pat_val).s
                cflags = 0
                capture = eggex_ops.No

            else:
                # TODO: add method name to this error
                raise error.TypeErr(pat_val, 'expected Eggex or Str',
                                    rd.LeftParenToken())

        # It's called 'pos', not 'start' like Python.  Python has 2 kinds of
        # 'start' in its regex API, which can be confusing.
        pos = mops.BigTruncate(rd.NamedInt('pos', 0))
        rd.Done()

        if self.which_method == LEFT_MATCH:
            # Make it anchored
            if not ere.startswith('^'):
                ere = '^' + ere
            # ^ matches beginning even if pos=5
            eflags = 0
        elif pos == 0:
            eflags = 0
        else:
            # ^ only matches when pos=0
            eflags = REG_NOTBOL

        indices = libc.regex_search(ere, cflags, string, eflags, pos)
        if indices is None:
            return value.Null

        return RegexMatch(string, indices, capture)
306
307
class ctx_EvalReplace(object):
    """For $0, $1, $2, $3, ... replacements in Str => replace()

    Construction binds "0" as a local name and, when 'argv' is given, pushes
    an argv frame; leaving the `with` block undoes both.
    """

    def __init__(self, mem, arg0, argv):
        # type: (state.Mem, str, Optional[List[str]]) -> None
        self.mem = mem

        # argv will be None for Str => replace(Str, Expr); no frame is pushed
        # in that case, and __exit__ must not pop one.
        self.pushed_argv = argv is not None
        if self.pushed_argv:
            mem.argv_stack.append(state._ArgFrame(argv))

        # $0 needs to have lexical scoping, so it is stored with the other
        # locals.  As "0" cannot be parsed as an lvalue, it is safe to store
        # arg0 under that name.
        assert mem.GetValue("0", scope_e.LocalOnly).tag() == value_e.Undef
        self.lval = LeftName("0", loc.Missing)
        mem.SetLocalName(self.lval, value.Str(arg0))

    def __enter__(self):
        # type: () -> None
        pass

    def __exit__(self, type, value_, traceback):
        # type: (Any, Any, Any) -> None
        # Reset "0" to Undef, then drop the argv frame if one was pushed
        self.mem.SetLocalName(self.lval, value.Undef)
        if self.pushed_argv:
            self.mem.argv_stack.pop()
337
338
class Replace(vm._Callable):
    """Implements Str => replace() for Str and Eggex patterns.

    The substitution is either a Str, or an Expr that is evaluated once per
    match with $0, $1, ... (and named captures) bound to the match.
    """

    def __init__(self, mem, expr_ev):
        # type: (state.Mem, expr_eval.ExprEvaluator) -> None
        self.mem = mem
        self.expr_ev = expr_ev

    def EvalSubstExpr(self, expr, blame_loc):
        # type: (value.Expr, loc_t) -> str
        """Evaluate a substitution expression and return its string result.

        Raises error.TypeErr, blamed on 'blame_loc', when the expression does
        not evaluate to a Str.
        """
        res = self.expr_ev.EvalExpr(expr.e, blame_loc)
        if res.tag() == value_e.Str:
            return cast(value.Str, res).s

        raise error.TypeErr(res, "expected expr to eval to a Str", blame_loc)

    def Call(self, rd):
        # type: (typed_args.Reader) -> value_t
        """
        s => replace(string_val, subst_str, count=-1)
        s => replace(string_val, subst_expr, count=-1)
        s => replace(eggex_val, subst_str, count=-1)
        s => replace(eggex_val, subst_expr, count=-1)

        For count in [0, MAX_INT], there will be no more than count
        replacements. Any negative count should read as unset, and replace will
        replace all occurrences of the pattern.

        Raises error.TypeErr when the pattern is not an Eggex or Str, or when
        the substitution is not a Str or Expr.
        """
        string = rd.PosStr()

        string_val = None  # type: value.Str
        eggex_val = None  # type: value.Eggex
        subst_str = None  # type: value.Str
        subst_expr = None  # type: value.Expr

        pattern = rd.PosValue()
        with tagswitch(pattern) as case:
            if case(value_e.Eggex):
                # HACK: mycpp will otherwise generate:
                #  value::Eggex* eggex_val ...
                eggex_val_ = cast(value.Eggex, pattern)
                eggex_val = eggex_val_

            elif case(value_e.Str):
                string_val_ = cast(value.Str, pattern)
                string_val = string_val_

            else:
                raise error.TypeErr(pattern,
                                    'expected pattern to be Eggex or Str',
                                    rd.LeftParenToken())

        subst = rd.PosValue()
        with tagswitch(subst) as case:
            if case(value_e.Str):
                subst_str_ = cast(value.Str, subst)
                subst_str = subst_str_

            elif case(value_e.Expr):
                subst_expr_ = cast(value.Expr, subst)
                subst_expr = subst_expr_

            else:
                raise error.TypeErr(subst,
                                    'expected substitution to be Str or Expr',
                                    rd.LeftParenToken())

        count = mops.BigTruncate(rd.NamedInt("count", -1))
        rd.Done()

        if count == 0:
            return value.Str(string)

        if string_val:
            if subst_str:
                s = subst_str.s
            if subst_expr:
                # Eval with $0 set to string_val (the matched substring)
                with ctx_EvalReplace(self.mem, string_val.s, None):
                    s = self.EvalSubstExpr(subst_expr, rd.LeftParenToken())
            assert s is not None

            result = string.replace(string_val.s, s, count)

            return value.Str(result)

        if eggex_val:
            ere = regex_translate.AsPosixEre(eggex_val)
            cflags = regex_translate.LibcFlags(eggex_val.canonical_flags)

            # Walk through the string finding all matches of the compiled ere.
            # Then, collect unmatched substrings and substitutions into the
            # `parts` list.
            #
            # NOTE(review): a match of length zero leaves `pos` unchanged, so
            # a pattern that can match the empty string does not make
            # progress here -- confirm how that case should be handled.
            pos = 0
            parts = []  # type: List[str]
            replace_count = 0
            while pos < len(string):
                # When resuming after an earlier match, `pos` is not the
                # start of the string, so '^' must not match there.
                eflags = 0 if pos == 0 else REG_NOTBOL
                indices = libc.regex_search(ere, cflags, string, eflags, pos)
                if indices is None:
                    break

                # Collect captures
                arg0 = None  # type: str
                argv = []  # type: List[str]
                named_vars = []  # type: List[Tuple[str, value_t]]
                num_groups = len(indices) / 2
                for group in xrange(num_groups):
                    start = indices[2 * group]
                    end = indices[2 * group + 1]
                    captured = string[start:end]
                    val = value.Str(captured)  # type: value_t

                    if len(eggex_val.convert_funcs) and group != 0:
                        convert_func = eggex_val.convert_funcs[group - 1]
                        convert_tok = eggex_val.convert_toks[group - 1]

                        if convert_func:
                            val = self.expr_ev.CallConvertFunc(
                                convert_func, val, convert_tok,
                                rd.LeftParenToken())

                    # $0, $1, $2 variables are argv values, which must be
                    # strings. Furthermore, they can only be used in string
                    # contexts
                    #   eg. "$[1]" != "$1".
                    val_str = val_ops.Stringify(val, rd.LeftParenToken())
                    if group == 0:
                        arg0 = val_str
                    else:
                        argv.append(val_str)

                    # $0 cannot be named
                    if group != 0:
                        # Same indexing as convert_funcs above: the entry for
                        # capture group N is at N - 1.  (Using N - 2 made
                        # group 1 wrap around to the last name.)
                        name = eggex_val.capture_names[group - 1]
                        if name is not None:
                            named_vars.append((name, val))

                if subst_str:
                    s = subst_str.s
                if subst_expr:
                    with ctx_EvalReplace(self.mem, arg0, argv):
                        with pure_ysh.ctx_Shvar(self.mem, named_vars):
                            s = self.EvalSubstExpr(subst_expr,
                                                   rd.LeftParenToken())
                assert s is not None

                start = indices[0]
                end = indices[1]
                parts.append(string[pos:start])  # Unmatched substring
                parts.append(s)  # Replacement
                pos = end  # Move to end of match

                replace_count += 1
                if count != -1 and replace_count == count:
                    break

            parts.append(string[pos:])  # Remaining unmatched substring

            return value.Str("".join(parts))

        raise AssertionError()