1 | """YSH Str methods"""
|
2 |
|
3 | from __future__ import print_function
|
4 |
|
5 | from _devbuild.gen.syntax_asdl import loc_t, loc
|
6 | from _devbuild.gen.runtime_asdl import scope_e
|
7 | from _devbuild.gen.value_asdl import (value, value_e, value_t, eggex_ops,
|
8 | eggex_ops_t, RegexMatch, LeftName)
|
9 | from builtin import pure_ysh
|
10 | from core import error
|
11 | from core import state
|
12 | from core import vm
|
13 | from frontend import typed_args
|
14 | from mycpp import mops
|
15 | from mycpp.mylib import log, tagswitch
|
16 | from osh import string_ops
|
17 | from ysh import expr_eval
|
18 | from ysh import regex_translate
|
19 | from ysh import val_ops
|
20 |
|
21 | import libc
|
22 | from libc import REG_NOTBOL
|
23 |
|
24 | from typing import cast, Any, List, Optional, Tuple
|
25 |
|
# NOTE(review): 'log' is not called anywhere else in the visible code;
# presumably this assignment only keeps the import referenced (handy for
# ad-hoc debugging) -- confirm before removing.
_ = log
|
27 |
|
28 |
|
29 | def _StrMatchStart(s, p):
|
30 | # type: (str, str) -> Tuple[bool, int, int]
|
31 | """Returns the range of bytes in 's' that match string pattern `p`. the
|
32 | pattern matches if 's' starts with all the characters in 'p'.
|
33 |
|
34 | The returned match result is the tuple "(matched, begin, end)". 'matched'
|
35 | is true if the pattern matched. 'begin' and 'end' give the half-open range
|
36 | "[begin, end)" of byte indices from 's' for the match, and are a valid but
|
37 | empty range if 'match' is false.
|
38 |
|
39 | Used for shell functions like 'trimStart' when trimming a prefix string.
|
40 | """
|
41 | if s.startswith(p):
|
42 | return (True, 0, len(p))
|
43 | else:
|
44 | return (False, 0, 0)
|
45 |
|
46 |
|
47 | def _StrMatchEnd(s, p):
|
48 | # type: (str, str) -> Tuple[bool, int, int]
|
49 | """Returns a match result for the bytes in 's' that match string pattern
|
50 | `p`. the pattern matches if 's' ends with all the characters in 'p'.
|
51 |
|
52 | The returned match result is the tuple "(matched, begin, end)". 'matched'
|
53 | is true if the pattern matched. 'begin' and 'end' give the half-open range
|
54 | "[begin, end)" of byte indices from 's' for the match, and are a valid but
|
55 | empty range if 'match' is false.
|
56 |
|
57 | Used for shell functions like 'trimEnd' when trimming a suffix string.
|
58 | """
|
59 | len_s = len(s)
|
60 | if s.endswith(p):
|
61 | return (True, len_s - len(p), len_s)
|
62 | else:
|
63 | return (False, len_s, len_s)
|
64 |
|
65 |
|
def _EggexMatchCommon(s, p, ere, empty_p):
    # type: (str, value.Eggex, str, int) -> Tuple[bool, int, int]
    """Search 's' with the ERE string 'ere' and report the overall match.

    The compile flags come from the canonical flags of Eggex 'p'; no exec
    flags are passed to the search.

    Returns "(matched, begin, end)". On success, begin/end are the first two
    indices reported by libc.regex_search(). When nothing matches, both are
    set to 'empty_p', so the caller chooses where the empty range sits.
    """
    found = libc.regex_search(ere,
                              regex_translate.LibcFlags(p.canonical_flags), s,
                              0)
    if found is not None:
        # Only the whole-match span is used; later entries are not read.
        return (True, found[0], found[1])

    return (False, empty_p, empty_p)
|
78 |
|
79 |
|
def _EggexMatchStart(s, p):
    # type: (str, value.Eggex) -> Tuple[bool, int, int]
    """Returns a match result for the bytes in 's' that match Eggex pattern
    `p` when constrained to match at the start of the string.

    Any capturing done by the Eggex pattern is ignored.

    The returned match result is the tuple "(matched, begin, end)". 'matched'
    is true if the pattern matched. 'begin' and 'end' give the half-open range
    "[begin, end)" of byte indices from 's' for the match, and are the valid
    but empty range "[0, 0)" if 'matched' is false.

    Used for shell functions like 'trimStart' when trimming with an Eggex
    pattern.
    """
    ere = regex_translate.AsPosixEre(p)

    # In a POSIX ERE, alternation binds more loosely than an anchor, so
    # prefixing a bare '^' to a pattern like 'a|b' would give '^a|b', which
    # lets 'b' match anywhere.  Grouping first makes the anchor apply to the
    # whole pattern.  The extra group is harmless because only the overall
    # match span is used here.  A '^' already present inside the pattern stays
    # valid inside the group.
    if len(ere) != 0:  # Avoid '()', which some regex libraries reject
        ere = '(' + ere + ')'

    return _EggexMatchCommon(s, p, '^' + ere, 0)
|
99 |
|
100 |
|
def _EggexMatchEnd(s, p):
    # type: (str, value.Eggex) -> Tuple[bool, int, int]
    """Returns a match result for the bytes in 's' that match Eggex pattern
    `p` when constrained to match at the end of the string.

    Any capturing done by the Eggex pattern is ignored.

    The returned match result is the tuple "(matched, begin, end)". 'matched'
    is true if the pattern matched. 'begin' and 'end' give the half-open range
    "[begin, end)" of byte indices from 's' for the match, and are the valid
    but empty range "[len(s), len(s))" if 'matched' is false.

    Used for shell functions like 'trimEnd' when trimming with an Eggex
    pattern.
    """
    ere = regex_translate.AsPosixEre(p)

    # Always group and then anchor, rather than testing ere.endswith('$'):
    #
    # - A trailing '$' may be an escaped literal ('\$'), in which case the
    #   pattern would otherwise be left unanchored.
    # - Alternation binds more loosely than an anchor, so 'a|b$' would only
    #   anchor the last branch.
    #
    # The extra group is harmless because only the overall match span is used
    # here, and a '$' already present inside the pattern stays valid inside
    # the group.
    if len(ere) != 0:  # Avoid '()', which some regex libraries reject
        ere = '(' + ere + ')'

    return _EggexMatchCommon(s, p, ere + '$', len(s))
|
110 |
|
111 |
|
# Anchor selectors for HasAffix and Trim.  They are distinct bits: Trim tests
# them with '&' and accepts START | END to mean "both ends".
START = 0b01
END = 0b10
|
114 |
|
115 |
|
class HasAffix(vm._Callable):
    """Callable behind the Str methods `startsWith()` and `endsWith()`.

    The anchor given at construction (START or END) selects which end of the
    string the pattern is tested against.
    """

    def __init__(self, anchor):
        # type: (int) -> None
        assert anchor == START or anchor == END, (
            "Anchor must be START or END")
        self.anchor = anchor

    def Call(self, rd):
        # type: (typed_args.Reader) -> value_t
        """
        string => startsWith(pattern_str)   # => bool
        string => startsWith(pattern_eggex) # => bool
        string => endsWith(pattern_str)     # => bool
        string => endsWith(pattern_eggex)   # => bool

        Raises error.TypeErr when the pattern is neither a Str nor an Eggex.
        An error.Strict raised while matching is re-raised as error.Expr with
        the same message and location.
        """
        subject = rd.PosStr()
        pat_val = rd.PosValue()

        # Exactly one of these is set by the switch below.
        str_pat = None  # type: str
        eggex_pat = None  # type: value.Eggex
        with tagswitch(pat_val) as case:
            if case(value_e.Eggex):
                eggex_pat = cast(value.Eggex, pat_val)
            elif case(value_e.Str):
                str_pat = cast(value.Str, pat_val).s
            else:
                raise error.TypeErr(pat_val,
                                    'expected pattern to be Eggex or Str',
                                    rd.LeftParenToken())
        rd.Done()

        at_start = (self.anchor & START) != 0

        found = False
        try:
            if str_pat is None:
                assert eggex_pat is not None
                if at_start:
                    found, _, _ = _EggexMatchStart(subject, eggex_pat)
                else:
                    found, _, _ = _EggexMatchEnd(subject, eggex_pat)
            else:
                if at_start:
                    found, _, _ = _StrMatchStart(subject, str_pat)
                else:
                    found, _, _ = _StrMatchEnd(subject, str_pat)
        except error.Strict as e:
            raise error.Expr(e.msg, e.location)

        return value.Bool(found)
|
165 |
|
166 |
|
class Trim(vm._Callable):
    """Callable behind the Str methods `trimStart()`, `trimEnd()`, `trim()`.

    The anchor given at construction is a combination of the START and END
    bits and selects which end(s) of the string get trimmed.
    """

    def __init__(self, anchor):
        # type: (int) -> None
        assert anchor in (START, END, START | END), (
            "Anchor must be START, END, or START|END")
        self.anchor = anchor

    def Call(self, rd):
        # type: (typed_args.Reader) -> value_t
        """
        string => trimStart()               # => Str
        string => trimEnd()                 # => Str
        string => trim()                    # => Str
        string => trimStart(pattern_str)    # => Str
        string => trimEnd(pattern_str)      # => Str
        string => trim(pattern_str)         # => Str
        string => trimStart(pattern_eggex)  # => Str
        string => trimEnd(pattern_eggex)    # => Str
        string => trim(pattern_eggex)       # => Str

        Without a pattern, the byte ranges reported by
        string_ops.StartsWithWhitespaceByteRange() and
        string_ops.EndsWithWhitespaceByteRange() are removed.

        Raises error.TypeErr when a pattern is given that is neither a Str
        nor an Eggex.  An error.Strict raised while matching is re-raised as
        error.Expr with the same message and location.
        """
        subject = rd.PosStr()
        pat_val = rd.OptionalValue()

        # At most one of these is set; both stay None when no pattern is given.
        str_pat = None  # type: str
        eggex_pat = None  # type: value.Eggex
        if pat_val:
            with tagswitch(pat_val) as case:
                if case(value_e.Eggex):
                    eggex_pat = cast(value.Eggex, pat_val)
                elif case(value_e.Str):
                    str_pat = cast(value.Str, pat_val).s
                else:
                    raise error.TypeErr(pat_val,
                                        'expected pattern to be Eggex or Str',
                                        rd.LeftParenToken())
        rd.Done()

        trim_left = (self.anchor & START) != 0
        trim_right = (self.anchor & END) != 0

        # The kept slice is [lo, hi).  A left match moves 'lo' to the END of
        # the matched prefix; a right match moves 'hi' to the BEGINNING of the
        # matched suffix.  Failed matches report empty ranges that leave the
        # respective bound where it was.
        lo = 0
        hi = len(subject)
        try:
            if str_pat is not None:
                if trim_left:
                    _, _, lo = _StrMatchStart(subject, str_pat)
                if trim_right:
                    _, hi, _ = _StrMatchEnd(subject, str_pat)
            elif eggex_pat is not None:
                if trim_left:
                    _, _, lo = _EggexMatchStart(subject, eggex_pat)
                if trim_right:
                    _, hi, _ = _EggexMatchEnd(subject, eggex_pat)
            else:
                if trim_left:
                    _, lo = string_ops.StartsWithWhitespaceByteRange(subject)
                if trim_right:
                    hi, _ = string_ops.EndsWithWhitespaceByteRange(subject)
        except error.Strict as e:
            raise error.Expr(e.msg, e.location)

        return value.Str(subject[lo:hi])
|
229 |
|
230 |
|
class Upper(vm._Callable):
    """Callable that returns its Str argument converted with str.upper()."""

    def __init__(self):
        # type: () -> None
        pass

    def Call(self, rd):
        # type: (typed_args.Reader) -> value_t
        """Read the single positional Str and return its upper-cased copy."""
        s = rd.PosStr()
        rd.Done()

        return value.Str(s.upper())
|
245 |
|
246 |
|
# Selectors for SearchMatch.which_method.  With LEFT_MATCH the pattern is
# anchored with '^' at the search position; any other value (SEARCH) leaves
# the pattern unanchored.
SEARCH = 0
LEFT_MATCH = 1
|
249 |
|
250 |
|
class SearchMatch(vm._Callable):
    """Regex search over a Str, returning a RegexMatch or Null.

    'which_method' is SEARCH or LEFT_MATCH; LEFT_MATCH forces the pattern to
    match at the search position.
    """

    def __init__(self, which_method):
        # type: (int) -> None
        self.which_method = which_method

    def Call(self, rd):
        # type: (typed_args.Reader) -> value_t
        """
        s => search(eggex, pos=0)

        The pattern is either an Eggex or a Str holding an ERE.  Raises
        error.TypeErr for any other pattern type.  Returns value.Null when
        nothing matches.
        """
        subject = rd.PosStr()

        pat_val = rd.PosValue()  # Eggex or ERE Str
        with tagswitch(pat_val) as case:
            if case(value_e.Eggex):
                eggex = cast(value.Eggex, pat_val)

                # lazily converts to ERE
                ere = regex_translate.AsPosixEre(eggex)
                cflags = regex_translate.LibcFlags(eggex.canonical_flags)
                capture = eggex_ops.Yes(
                    eggex.convert_funcs, eggex.convert_toks,
                    eggex.capture_names)  # type: eggex_ops_t

            elif case(value_e.Str):
                # A plain string is used as the ERE verbatim, with no flags
                # and no capture metadata.
                ere = cast(value.Str, pat_val).s
                cflags = 0
                capture = eggex_ops.No

            else:
                # TODO: add method name to this error
                raise error.TypeErr(pat_val, 'expected Eggex or Str',
                                    rd.LeftParenToken())

        # It's called 'pos', not 'start' like Python.  Python has 2 kinds of
        # 'start' in its regex API, which can be confusing.
        pos = mops.BigTruncate(rd.NamedInt('pos', 0))
        rd.Done()

        if self.which_method == LEFT_MATCH:
            # Make it anchored
            if not ere.startswith('^'):
                ere = '^' + ere
            eflags = 0  # ^ matches beginning even if pos=5
        elif pos == 0:
            eflags = 0
        else:
            eflags = REG_NOTBOL  # ^ only matches when pos=0

        indices = libc.regex_search(ere, cflags, subject, eflags, pos)
        if indices is None:
            return value.Null

        return RegexMatch(subject, indices, capture)
|
306 |
|
307 |
|
class ctx_EvalReplace(object):
    """For $0, $1, $2, $3, ... replacements in Str => replace()

    On construction, binds "0" to arg0 as a local and, when argv is given,
    pushes an argv frame.  On exit, resets "0" to Undef and pops the frame if
    one was pushed.
    """

    def __init__(self, mem, arg0, argv):
        # type: (state.Mem, str, Optional[List[str]]) -> None

        # argv will be None for Str => replace(Str, Expr)
        self.pushed_argv = argv is not None
        if self.pushed_argv:
            mem.argv_stack.append(state._ArgFrame(argv))

        # $0 needs to have lexical scoping. So we store it with other locals.
        # As "0" cannot be parsed as an lvalue, we can safely store arg0 there.
        assert mem.GetValue("0", scope_e.LocalOnly).tag() == value_e.Undef
        self.lval = LeftName("0", loc.Missing)
        mem.SetLocalName(self.lval, value.Str(arg0))

        self.mem = mem

    def __enter__(self):
        # type: () -> None
        pass

    def __exit__(self, type, value_, traceback):
        # type: (Any, Any, Any) -> None

        # Undo __init__: unbind $0 first, then drop the argv frame if any.
        self.mem.SetLocalName(self.lval, value.Undef)
        if self.pushed_argv:
            self.mem.argv_stack.pop()
|
337 |
|
338 |
|
class Replace(vm._Callable):
    """Implements `replace()` on Str, with a Str or Eggex pattern and a Str
    or Expr substitution."""

    def __init__(self, mem, expr_ev):
        # type: (state.Mem, expr_eval.ExprEvaluator) -> None
        self.mem = mem
        self.expr_ev = expr_ev

    def EvalSubstExpr(self, expr, blame_loc):
        # type: (value.Expr, loc_t) -> str
        """Evaluate a substitution expression and return its string result.

        Raises error.TypeErr, blamed on blame_loc, if the expression does not
        evaluate to a Str.
        """
        res = self.expr_ev.EvalExpr(expr.e, blame_loc)
        if res.tag() == value_e.Str:
            return cast(value.Str, res).s

        raise error.TypeErr(res, "expected expr to eval to a Str", blame_loc)

    def Call(self, rd):
        # type: (typed_args.Reader) -> value_t
        """
        s => replace(string_val, subst_str, count=-1)
        s => replace(string_val, subst_expr, count=-1)
        s => replace(eggex_val, subst_str, count=-1)
        s => replace(eggex_val, subst_expr, count=-1)

        For count in [0, MAX_INT], there will be no more than count
        replacements. Any negative count should read as unset, and replace will
        replace all occurrences of the pattern.

        Raises error.TypeErr when the pattern is not an Eggex or Str, when the
        substitution is not a Str or Expr, or when a substitution Expr does not
        evaluate to a Str.
        """
        string = rd.PosStr()

        string_val = None  # type: value.Str
        eggex_val = None  # type: value.Eggex
        subst_str = None  # type: value.Str
        subst_expr = None  # type: value.Expr

        pattern = rd.PosValue()
        with tagswitch(pattern) as case:
            if case(value_e.Eggex):
                # HACK: mycpp will otherwise generate:
                #  value::Eggex* eggex_val ...
                eggex_val_ = cast(value.Eggex, pattern)
                eggex_val = eggex_val_

            elif case(value_e.Str):
                string_val_ = cast(value.Str, pattern)
                string_val = string_val_

            else:
                raise error.TypeErr(pattern,
                                    'expected pattern to be Eggex or Str',
                                    rd.LeftParenToken())

        subst = rd.PosValue()
        with tagswitch(subst) as case:
            if case(value_e.Str):
                subst_str_ = cast(value.Str, subst)
                subst_str = subst_str_

            elif case(value_e.Expr):
                subst_expr_ = cast(value.Expr, subst)
                subst_expr = subst_expr_

            else:
                raise error.TypeErr(subst,
                                    'expected substitution to be Str or Expr',
                                    rd.LeftParenToken())

        count = mops.BigTruncate(rd.NamedInt("count", -1))
        rd.Done()

        if count == 0:
            return value.Str(string)

        if string_val:
            if subst_str:
                s = subst_str.s
            if subst_expr:
                # Eval with $0 set to string_val (the matched substring)
                with ctx_EvalReplace(self.mem, string_val.s, None):
                    s = self.EvalSubstExpr(subst_expr, rd.LeftParenToken())
            assert s is not None

            # str.replace() treats a negative count as "no limit"
            result = string.replace(string_val.s, s, count)

            return value.Str(result)

        if eggex_val:
            ere = regex_translate.AsPosixEre(eggex_val)
            cflags = regex_translate.LibcFlags(eggex_val.canonical_flags)

            # Walk through the string finding all matches of the compiled ere.
            # Then, collect unmatched substrings and substitutions into the
            # `parts` list.
            pos = 0
            parts = []  # type: List[str]
            replace_count = 0
            while pos < len(string):
                # When resuming after an earlier match, 'pos' is not the
                # beginning of the string, so '^' must not match there.  This
                # is the same rule SearchMatch applies for pos != 0.
                eflags = 0 if pos == 0 else REG_NOTBOL
                indices = libc.regex_search(ere, cflags, string, eflags, pos)
                if indices is None:
                    break

                # Collect captures
                arg0 = None  # type: str
                argv = []  # type: List[str]
                named_vars = []  # type: List[Tuple[str, value_t]]
                num_groups = len(indices) / 2
                for group in xrange(num_groups):
                    start = indices[2 * group]
                    end = indices[2 * group + 1]
                    captured = string[start:end]
                    val = value.Str(captured)  # type: value_t

                    if len(eggex_val.convert_funcs) and group != 0:
                        convert_func = eggex_val.convert_funcs[group - 1]
                        convert_tok = eggex_val.convert_toks[group - 1]

                        if convert_func:
                            val = self.expr_ev.CallConvertFunc(
                                convert_func, val, convert_tok,
                                rd.LeftParenToken())

                    # $0, $1, $2 variables are argv values, which must be
                    # strings. Furthermore, they can only be used in string
                    # contexts
                    #   eg. "$[1]" != "$1".
                    val_str = val_ops.Stringify(val, rd.LeftParenToken())
                    if group == 0:
                        arg0 = val_str
                    else:
                        argv.append(val_str)

                    # $0 cannot be named
                    if group != 0:
                        # Group N is looked up at index N - 1, like
                        # convert_funcs above.  The previous 'group - 2'
                        # evaluated to -1 for the first group and so picked
                        # the LAST name.
                        # NOTE(review): assumes capture_names is parallel to
                        # convert_funcs -- confirm against regex_translate.
                        name_index = group - 1
                        if name_index < len(eggex_val.capture_names):
                            name = eggex_val.capture_names[name_index]
                            if name is not None:
                                named_vars.append((name, val))

                if subst_str:
                    s = subst_str.s
                if subst_expr:
                    with ctx_EvalReplace(self.mem, arg0, argv):
                        with pure_ysh.ctx_Shvar(self.mem, named_vars):
                            s = self.EvalSubstExpr(subst_expr,
                                                   rd.LeftParenToken())
                assert s is not None

                start = indices[0]
                end = indices[1]
                parts.append(string[pos:start])  # Unmatched substring
                parts.append(s)  # Replacement
                pos = end  # Move to end of match

                if start == end and pos < len(string):
                    # An empty match consumes nothing, so searching again from
                    # the same position would find it forever.  Copy one
                    # character through unchanged and resume after it.
                    # Bytes of the form 10xxxxxx are UTF-8 continuation bytes;
                    # skip them so a multi-byte character is never split.
                    next_pos = pos + 1
                    while (next_pos < len(string) and
                           (ord(string[next_pos]) & 0xC0) == 0x80):
                        next_pos += 1
                    parts.append(string[pos:next_pos])
                    pos = next_pos

                replace_count += 1
                if count != -1 and replace_count == count:
                    break

            parts.append(string[pos:])  # Remaining unmatched substring

            return value.Str("".join(parts))

        raise AssertionError()
|