OILS / builtin / method_str.py View on Github | oilshell.org

515 lines, 302 significant
1"""YSH Str methods"""
2
3from __future__ import print_function
4
5from _devbuild.gen.syntax_asdl import loc_t, loc
6from _devbuild.gen.runtime_asdl import scope_e
7from _devbuild.gen.value_asdl import (value, value_e, value_t, eggex_ops,
8 eggex_ops_t, RegexMatch, LeftName)
9from builtin import pure_ysh
10from core import error
11from core import state
12from core import vm
13from frontend import typed_args
14from mycpp import mops
15from mycpp.mylib import log, tagswitch
16from osh import string_ops
17from ysh import expr_eval
18from ysh import regex_translate
19from ysh import val_ops
20
21import libc
22from libc import REG_NOTBOL
23
24from typing import cast, Any, List, Optional, Tuple
25
26_ = log
27
28
29def _StrMatchStart(s, p):
30 # type: (str, str) -> Tuple[bool, int, int]
31 """Returns the range of bytes in 's' that match string pattern `p`. the
32 pattern matches if 's' starts with all the characters in 'p'.
33
34 The returned match result is the tuple "(matched, begin, end)". 'matched'
35 is true if the pattern matched. 'begin' and 'end' give the half-open range
36 "[begin, end)" of byte indices from 's' for the match, and are a valid but
37 empty range if 'match' is false.
38
39 Used for shell functions like 'trimStart' when trimming a prefix string.
40 """
41 if s.startswith(p):
42 return (True, 0, len(p))
43 else:
44 return (False, 0, 0)
45
46
47def _StrMatchEnd(s, p):
48 # type: (str, str) -> Tuple[bool, int, int]
49 """Returns a match result for the bytes in 's' that match string pattern
50 `p`. the pattern matches if 's' ends with all the characters in 'p'.
51
52 The returned match result is the tuple "(matched, begin, end)". 'matched'
53 is true if the pattern matched. 'begin' and 'end' give the half-open range
54 "[begin, end)" of byte indices from 's' for the match, and are a valid but
55 empty range if 'match' is false.
56
57 Used for shell functions like 'trimEnd' when trimming a suffix string.
58 """
59 len_s = len(s)
60 if s.endswith(p):
61 return (True, len_s - len(p), len_s)
62 else:
63 return (False, len_s, len_s)
64
65
def _EggexMatchCommon(s, p, ere, empty_p):
    # type: (str, value.Eggex, str, int) -> Tuple[bool, int, int]
    """Search 's' with the regex string 'ere' and report "(matched, begin, end)".

    The compile flags come from the canonical flags of Eggex 'p'; no exec
    flags are passed.  On success, 'begin' and 'end' are the first two
    indices returned by libc.regex_search().  On failure the empty range
    "[empty_p, empty_p)" is returned, so the caller chooses where the empty
    range sits.
    """
    compile_flags = regex_translate.LibcFlags(p.canonical_flags)
    match_indices = libc.regex_search(ere, compile_flags, s, 0)

    if match_indices is not None:
        # Only the first index pair is reported; later pairs are ignored
        return (True, match_indices[0], match_indices[1])

    return (False, empty_p, empty_p)
78
79
def _EggexMatchStart(s, p):
    # type: (str, value.Eggex) -> Tuple[bool, int, int]
    """Match Eggex pattern 'p' against 's', anchored at the start of 's'.

    Any capturing done by the Eggex pattern is ignored.

    The result is the tuple "(matched, begin, end)".  'matched' is true if
    the pattern matched.  'begin' and 'end' give the half-open range
    "[begin, end)" of byte indices from 's' for the match, which is the
    valid but empty range "[0, 0)" if 'matched' is false.

    Used for shell functions like 'trimStart' when trimming with an Eggex
    pattern.
    """
    translated = regex_translate.AsPosixEre(p)

    # Prepend the start anchor unless the translated pattern already has one
    anchored = translated if translated.startswith('^') else '^' + translated

    return _EggexMatchCommon(s, p, anchored, 0)
99
100
101def _EggexMatchEnd(s, p):
102 # type: (str, value.Eggex) -> Tuple[bool, int, int]
103 """Like _EggexMatchStart, but matches against the end of the
104 string.
105 """
106 ere = regex_translate.AsPosixEre(p)
107 if not ere.endswith('$'):
108 ere = ere + '$'
109 return _EggexMatchCommon(s, p, ere, len(s))
110
111
# Anchor bit flags for HasAffix and Trim.  They may be OR'd together
# (START | END) to mean "both ends".
START = 0b01  # match / trim at the beginning of the string
END = 0b10  # match / trim at the end of the string
114
115
class HasAffix(vm._Callable):
    """Implements `startsWith()` and `endsWith()`.

    The 'anchor' given at construction (START or END) selects which end of
    the string is tested.
    """

    def __init__(self, anchor):
        # type: (int) -> None
        assert anchor in (START, END), ("Anchor must be START or END")
        self.anchor = anchor

    def Call(self, rd):
        # type: (typed_args.Reader) -> value_t
        """
        string => startsWith(pattern_str)     # => bool
        string => startsWith(pattern_eggex)   # => bool
        string => endsWith(pattern_str)       # => bool
        string => endsWith(pattern_eggex)     # => bool

        Raises error.TypeErr when the pattern is neither an Eggex nor a Str.
        An error.Strict raised while matching is re-raised as error.Expr.
        """
        subject = rd.PosStr()
        pat_val = rd.PosValue()

        # Exactly one of these is set by the type switch below
        str_pat = None  # type: str
        eggex_pat = None  # type: value.Eggex
        with tagswitch(pat_val) as case:
            if case(value_e.Eggex):
                eggex_pat = cast(value.Eggex, pat_val)
            elif case(value_e.Str):
                str_pat = cast(value.Str, pat_val).s
            else:
                raise error.TypeErr(pat_val,
                                    'expected pattern to be Eggex or Str',
                                    rd.LeftParenToken())
        rd.Done()

        at_start = (self.anchor & START) != 0

        found = False
        try:
            if str_pat is None:
                assert eggex_pat is not None
                if at_start:
                    found, _, _ = _EggexMatchStart(subject, eggex_pat)
                else:
                    found, _, _ = _EggexMatchEnd(subject, eggex_pat)
            else:
                if at_start:
                    found, _, _ = _StrMatchStart(subject, str_pat)
                else:
                    found, _, _ = _StrMatchEnd(subject, str_pat)
        except error.Strict as e:
            raise error.Expr(e.msg, e.location)

        return value.Bool(found)
165
166
class Trim(vm._Callable):
    """Implements `trimStart()`, `trimEnd()`, and `trim()`.

    The 'anchor' given at construction selects which end(s) are trimmed:
    START, END, or START|END.
    """

    def __init__(self, anchor):
        # type: (int) -> None
        assert anchor in (START, END, START
                          | END), ("Anchor must be START, END, or START|END")
        self.anchor = anchor

    def Call(self, rd):
        # type: (typed_args.Reader) -> value_t
        """
        string => trimStart()               # => Str
        string => trimEnd()                 # => Str
        string => trim()                    # => Str
        string => trimStart(pattern_str)    # => Str
        string => trimEnd(pattern_str)      # => Str
        string => trim(pattern_str)         # => Str
        string => trimStart(pattern_eggex)  # => Str
        string => trimEnd(pattern_eggex)    # => Str
        string => trim(pattern_eggex)       # => Str

        With no pattern, the byte ranges reported by
        string_ops.StartsWithWhitespaceByteRange() and
        string_ops.EndsWithWhitespaceByteRange() are trimmed.

        Raises error.TypeErr when a pattern is given that is neither an Eggex
        nor a Str.  An error.Strict raised while matching is re-raised as
        error.Expr.
        """
        subject = rd.PosStr()
        pat_val = rd.OptionalValue()

        # At most one of these is set; both stay None when no pattern is given
        str_pat = None  # type: str
        eggex_pat = None  # type: value.Eggex
        if pat_val:
            with tagswitch(pat_val) as case:
                if case(value_e.Eggex):
                    eggex_pat = cast(value.Eggex, pat_val)
                elif case(value_e.Str):
                    str_pat = cast(value.Str, pat_val).s
                else:
                    raise error.TypeErr(pat_val,
                                        'expected pattern to be Eggex or Str',
                                        rd.LeftParenToken())
        rd.Done()

        trim_left = (self.anchor & START) != 0
        trim_right = (self.anchor & END) != 0

        # [lo, hi) is the byte range of 'subject' that is kept
        lo = 0
        hi = len(subject)
        try:
            if trim_left:
                # The end of the leading match becomes the new start
                if str_pat is not None:
                    _, _, lo = _StrMatchStart(subject, str_pat)
                elif eggex_pat is not None:
                    _, _, lo = _EggexMatchStart(subject, eggex_pat)
                else:
                    _, lo = string_ops.StartsWithWhitespaceByteRange(subject)

            if trim_right:
                # The beginning of the trailing match becomes the new end
                if str_pat is not None:
                    _, hi, _ = _StrMatchEnd(subject, str_pat)
                elif eggex_pat is not None:
                    _, hi, _ = _EggexMatchEnd(subject, eggex_pat)
                else:
                    hi, _ = string_ops.EndsWithWhitespaceByteRange(subject)
        except error.Strict as e:
            raise error.Expr(e.msg, e.location)

        return value.Str(subject[lo:hi])
229
230
class Upper(vm._Callable):
    """Str method that returns the receiver converted with str.upper()."""

    def __init__(self):
        # type: () -> None
        pass

    def Call(self, rd):
        # type: (typed_args.Reader) -> value_t
        """Read the single Str argument and return its upper-cased copy."""
        subject = rd.PosStr()
        rd.Done()

        # TODO: unicode support
        uppercased = subject.upper()
        return value.Str(uppercased)
245
246
class Lower(vm._Callable):
    """Str method that returns the receiver converted with str.lower()."""

    def __init__(self):
        # type: () -> None
        pass

    def Call(self, rd):
        # type: (typed_args.Reader) -> value_t
        """Read the single Str argument and return its lower-cased copy."""
        subject = rd.PosStr()
        rd.Done()

        # TODO: unicode support
        lowercased = subject.lower()
        return value.Str(lowercased)
261
262
# Values for the 'which_method' argument of SearchMatch
SEARCH = 0  # pattern is used as given
LEFT_MATCH = 1  # pattern is forced to start with '^'
265
266
class SearchMatch(vm._Callable):
    """Regex search over a Str.

    'which_method' is SEARCH or LEFT_MATCH; with LEFT_MATCH the pattern is
    anchored with '^' so it must match at position 'pos'.
    """

    def __init__(self, which_method):
        # type: (int) -> None
        self.which_method = which_method

    def Call(self, rd):
        # type: (typed_args.Reader) -> value_t
        """
        s => search(eggex, pos=0)

        The pattern may be an Eggex or a Str holding an ERE.  Returns
        value.Null when nothing matches, otherwise a RegexMatch.  Raises
        error.TypeErr for any other pattern type.
        """
        subject = rd.PosStr()

        pat_val = rd.PosValue()  # Eggex or ERE Str
        with tagswitch(pat_val) as case:
            if case(value_e.Eggex):
                eggex = cast(value.Eggex, pat_val)

                # lazily converts to ERE
                ere = regex_translate.AsPosixEre(eggex)
                cflags = regex_translate.LibcFlags(eggex.canonical_flags)
                capture = eggex_ops.Yes(
                    eggex.convert_funcs, eggex.convert_toks,
                    eggex.capture_names)  # type: eggex_ops_t

            elif case(value_e.Str):
                # A plain Str is used as the ERE directly, with no flags and
                # no capture metadata
                ere = cast(value.Str, pat_val).s
                cflags = 0
                capture = eggex_ops.No

            else:
                # TODO: add method name to this error
                raise error.TypeErr(pat_val, 'expected Eggex or Str',
                                    rd.LeftParenToken())

        # It's called 'pos', not 'start' like Python.  Python has 2 kinds of
        # 'start' in its regex API, which can be confusing.
        pos = mops.BigTruncate(rd.NamedInt('pos', 0))
        rd.Done()

        if self.which_method == LEFT_MATCH:
            # Make it anchored
            if not ere.startswith('^'):
                ere = '^' + ere
            eflags = 0  # ^ matches beginning even if pos=5
        elif pos == 0:
            eflags = 0
        else:
            eflags = REG_NOTBOL  # ^ only matches when pos=0

        indices = libc.regex_search(ere, cflags, subject, eflags, pos)
        if indices is None:
            return value.Null

        return RegexMatch(subject, indices, capture)
322
323
324# TODO: replace this with state.ctx_Eval
class ctx_EvalReplace(object):
    """For $0, $1, $2, $3, ... replacements in Str => replace()

    Construction binds "0" as a local holding arg0 and, when argv is given,
    pushes an argv frame.  Leaving the `with` block resets "0" to Undef and
    pops the frame if one was pushed.
    """

    def __init__(self, mem, arg0, argv):
        # type: (state.Mem, str, Optional[List[str]]) -> None
        self.mem = mem

        # argv will be None for Str => replace(Str, Expr)
        if argv is not None:
            mem.argv_stack.append(state._ArgFrame(argv))
            self.pushed_argv = True
        else:
            self.pushed_argv = False

        # $0 needs to have lexical scoping.  So we store it with other locals.
        # As "0" cannot be parsed as an lvalue, we can safely store arg0 there.
        assert mem.GetValue("0", scope_e.LocalOnly).tag() == value_e.Undef
        self.lval = LeftName("0", loc.Missing)
        mem.SetLocalName(self.lval, value.Str(arg0))

    def __enter__(self):
        # type: () -> None
        pass

    def __exit__(self, type, value_, traceback):
        # type: (Any, Any, Any) -> None
        # Undo __init__: clear the "0" local, then drop the argv frame
        self.mem.SetLocalName(self.lval, value.Undef)
        if self.pushed_argv:
            self.mem.argv_stack.pop()
354
355
class Replace(vm._Callable):
    """Implements `Str => replace()` for Str and Eggex patterns."""

    def __init__(self, mem, expr_ev):
        # type: (state.Mem, expr_eval.ExprEvaluator) -> None
        self.mem = mem
        self.expr_ev = expr_ev

    def EvalSubstExpr(self, expr, blame_loc):
        # type: (value.Expr, loc_t) -> str
        """Evaluate a substitution expression, which must produce a Str.

        Raises error.TypeErr, blamed on blame_loc, for any other result type.
        """
        res = self.expr_ev.EvalExpr(expr.e, blame_loc)
        if res.tag() == value_e.Str:
            return cast(value.Str, res).s

        raise error.TypeErr(res, "expected expr to eval to a Str", blame_loc)

    def Call(self, rd):
        # type: (typed_args.Reader) -> value_t
        """
        s => replace(string_val, subst_str, count=-1)
        s => replace(string_val, subst_expr, count=-1)
        s => replace(eggex_val, subst_str, count=-1)
        s => replace(eggex_val, subst_expr, count=-1)

        For count in [0, MAX_INT], there will be no more than count
        replacements. Any negative count should read as unset, and replace will
        replace all occurrences of the pattern.

        Raises error.TypeErr when the pattern is not an Eggex or Str, or the
        substitution is not a Str or Expr.
        """
        string = rd.PosStr()

        string_val = None  # type: value.Str
        eggex_val = None  # type: value.Eggex
        subst_str = None  # type: value.Str
        subst_expr = None  # type: value.Expr

        pattern = rd.PosValue()
        with tagswitch(pattern) as case:
            if case(value_e.Eggex):
                # HACK: mycpp will otherwise generate:
                #  value::Eggex* eggex_val ...
                eggex_val_ = cast(value.Eggex, pattern)
                eggex_val = eggex_val_

            elif case(value_e.Str):
                string_val_ = cast(value.Str, pattern)
                string_val = string_val_

            else:
                raise error.TypeErr(pattern,
                                    'expected pattern to be Eggex or Str',
                                    rd.LeftParenToken())

        subst = rd.PosValue()
        with tagswitch(subst) as case:
            if case(value_e.Str):
                subst_str_ = cast(value.Str, subst)
                subst_str = subst_str_

            elif case(value_e.Expr):
                subst_expr_ = cast(value.Expr, subst)
                subst_expr = subst_expr_

            else:
                raise error.TypeErr(subst,
                                    'expected substitution to be Str or Expr',
                                    rd.LeftParenToken())

        count = mops.BigTruncate(rd.NamedInt("count", -1))
        rd.Done()

        if count == 0:
            return value.Str(string)

        if string_val:
            if subst_str:
                s = subst_str.s
            if subst_expr:
                # Eval with $0 set to string_val (the matched substring)
                with ctx_EvalReplace(self.mem, string_val.s, None):
                    s = self.EvalSubstExpr(subst_expr, rd.LeftParenToken())
            assert s is not None

            result = string.replace(string_val.s, s, count)

            return value.Str(result)

        if eggex_val:
            ere = regex_translate.AsPosixEre(eggex_val)
            cflags = regex_translate.LibcFlags(eggex_val.canonical_flags)

            # Walk through the string finding all matches of the compiled ere.
            # Then, collect unmatched substrings and substitutions into the
            # `parts` list.
            pos = 0
            parts = []  # type: List[str]
            replace_count = 0
            n = len(string)
            while pos < n:
                # When resuming after an earlier match, 'pos' is not the
                # beginning of the string, so ^ must not match there.
                eflags = 0 if pos == 0 else REG_NOTBOL
                indices = libc.regex_search(ere, cflags, string, eflags, pos)
                if indices is None:
                    break

                # Collect captures
                arg0 = None  # type: str
                argv = []  # type: List[str]
                named_vars = []  # type: List[Tuple[str, value_t]]
                num_groups = len(indices) / 2
                for group in xrange(num_groups):
                    start = indices[2 * group]
                    end = indices[2 * group + 1]
                    captured = string[start:end]
                    val = value.Str(captured)  # type: value_t

                    if len(eggex_val.convert_funcs) and group != 0:
                        convert_func = eggex_val.convert_funcs[group - 1]
                        convert_tok = eggex_val.convert_toks[group - 1]

                        if convert_func:
                            val = self.expr_ev.CallConvertFunc(
                                convert_func, val, convert_tok,
                                rd.LeftParenToken())

                    # $0, $1, $2 variables are argv values, which must be
                    # strings. Furthermore, they can only be used in string
                    # contexts
                    #   eg. "$[1]" != "$1".
                    val_str = val_ops.Stringify(val, rd.LeftParenToken())
                    if group == 0:
                        arg0 = val_str
                    else:
                        argv.append(val_str)

                    # $0 cannot be named
                    if group != 0:
                        # capture_names is indexed like convert_funcs above:
                        # entry 0 describes group 1
                        name = eggex_val.capture_names[group - 1]
                        if name is not None:
                            named_vars.append((name, val))

                if subst_str:
                    s = subst_str.s
                if subst_expr:
                    with ctx_EvalReplace(self.mem, arg0, argv):
                        with pure_ysh.ctx_Shvar(self.mem, named_vars):
                            s = self.EvalSubstExpr(subst_expr,
                                                   rd.LeftParenToken())
                assert s is not None

                start = indices[0]
                end = indices[1]
                parts.append(string[pos:start])  # Unmatched substring
                parts.append(s)  # Replacement

                if end == start and end < n:
                    # Empty match: resuming at 'end' would find the same
                    # empty match forever.  Copy one whole UTF-8 character
                    # (lead byte plus continuation bytes 0b10xxxxxx) and
                    # resume after it.
                    next_pos = end + 1
                    while (next_pos < n and
                           (ord(string[next_pos]) & 0xC0) == 0x80):
                        next_pos += 1
                    parts.append(string[end:next_pos])
                    pos = next_pos
                else:
                    pos = end  # Move to end of match

                replace_count += 1
                if count != -1 and replace_count == count:
                    break

            parts.append(string[pos:])  # Remaining unmatched substring

            return value.Str("".join(parts))

        raise AssertionError()