1 | """YSH Str methods"""
|
2 |
|
3 | from __future__ import print_function
|
4 |
|
5 | from _devbuild.gen.syntax_asdl import loc_t
|
6 | from _devbuild.gen.value_asdl import (value, value_e, value_t, eggex_ops,
|
7 | eggex_ops_t, RegexMatch)
|
8 | from builtin import pure_ysh
|
9 | from core import error
|
10 | from core import state
|
11 | from core import vm
|
12 | from frontend import typed_args
|
13 | from mycpp import mops
|
14 | from mycpp.mylib import log, tagswitch
|
15 | from osh import string_ops
|
16 | from ysh import expr_eval
|
17 | from ysh import regex_translate
|
18 | from ysh import val_ops
|
19 |
|
20 | import libc
|
21 | from libc import REG_NOTBOL
|
22 |
|
23 | from typing import cast, List, Tuple
|
24 |
|
# NOTE(review): presumably this keeps the imported `log` helper referenced
# while nothing else in this module calls it -- confirm before removing.
_ = log
|
26 |
|
27 |
|
28 | def _StrMatchStart(s, p):
|
29 | # type: (str, str) -> Tuple[bool, int, int]
|
30 | """Returns the range of bytes in 's' that match string pattern `p`. the
|
31 | pattern matches if 's' starts with all the characters in 'p'.
|
32 |
|
33 | The returned match result is the tuple "(matched, begin, end)". 'matched'
|
34 | is true if the pattern matched. 'begin' and 'end' give the half-open range
|
35 | "[begin, end)" of byte indices from 's' for the match, and are a valid but
|
36 | empty range if 'match' is false.
|
37 |
|
38 | Used for shell functions like 'trimStart' when trimming a prefix string.
|
39 | """
|
40 | if s.startswith(p):
|
41 | return (True, 0, len(p))
|
42 | else:
|
43 | return (False, 0, 0)
|
44 |
|
45 |
|
46 | def _StrMatchEnd(s, p):
|
47 | # type: (str, str) -> Tuple[bool, int, int]
|
48 | """Returns a match result for the bytes in 's' that match string pattern
|
49 | `p`. the pattern matches if 's' ends with all the characters in 'p'.
|
50 |
|
51 | The returned match result is the tuple "(matched, begin, end)". 'matched'
|
52 | is true if the pattern matched. 'begin' and 'end' give the half-open range
|
53 | "[begin, end)" of byte indices from 's' for the match, and are a valid but
|
54 | empty range if 'match' is false.
|
55 |
|
56 | Used for shell functions like 'trimEnd' when trimming a suffix string.
|
57 | """
|
58 | len_s = len(s)
|
59 | if s.endswith(p):
|
60 | return (True, len_s - len(p), len_s)
|
61 | else:
|
62 | return (False, len_s, len_s)
|
63 |
|
64 |
|
def _EggexMatchCommon(s, p, ere, empty_p):
    # type: (str, value.Eggex, str, int) -> Tuple[bool, int, int]
    """Search 's' for the ERE string 'ere' and report the overall match.

    The regex compile flags are derived from 'p.canonical_flags', while 'ere'
    is the pattern text that is actually searched for; callers pass a
    (possibly modified) translation of 'p'.

    Returns the tuple "(matched, begin, end)".  On a match, 'begin' and 'end'
    are the first two entries of the index list returned by
    libc.regex_search().  When nothing matches, the result is
    "(False, empty_p, empty_p)", i.e. an empty range located at 'empty_p'.
    """
    span = libc.regex_search(ere,
                             regex_translate.LibcFlags(p.canonical_flags), s,
                             0)
    if span is not None:
        return (True, span[0], span[1])
    return (False, empty_p, empty_p)
|
77 |
|
78 |
|
def _EggexMatchStart(s, p):
    # type: (str, value.Eggex) -> Tuple[bool, int, int]
    """Returns a match result for the bytes in 's' that match Eggex pattern
    `p` when constrained to match at the start of the string.

    Any capturing done by the Eggex pattern is ignored.

    The returned match result is the tuple "(matched, begin, end)". 'matched'
    is true if the pattern matched. 'begin' and 'end' give the half-open range
    "[begin, end)" of byte indices from 's' for the match, and are the valid
    but empty range "[0, 0)" if 'matched' is false.

    Used for shell functions like 'trimStart' when trimming with an Eggex
    pattern.
    """
    ere = regex_translate.AsPosixEre(p)
    if not ere.startswith('^'):
        if len(ere):
            # In an ERE, anchoring binds tighter than '|', so a bare
            # '^' + 'a|b' would anchor only 'a' and let 'b' match anywhere in
            # the string.  Group the whole pattern so the anchor applies to
            # every alternative.  The extra group is harmless here because
            # only the overall match range is used.
            ere = '^(' + ere + ')'
        else:
            # Nothing to group, and an empty '()' is not portable.
            ere = '^'
    return _EggexMatchCommon(s, p, ere, 0)
|
98 |
|
99 |
|
def _EggexMatchEnd(s, p):
    # type: (str, value.Eggex) -> Tuple[bool, int, int]
    """Returns a match result for the bytes in 's' that match Eggex pattern
    `p` when constrained to match at the end of the string.

    Any capturing done by the Eggex pattern is ignored.

    The returned match result is the tuple "(matched, begin, end)". 'matched'
    is true if the pattern matched. 'begin' and 'end' give the half-open range
    "[begin, end)" of byte indices from 's' for the match, and are the valid
    but empty range "[len(s), len(s))" if 'matched' is false.

    Used for shell functions like 'trimEnd' when trimming with an Eggex
    pattern.
    """
    ere = regex_translate.AsPosixEre(p)

    # A trailing '$' is only an anchor when it is not escaped: '\$' is a
    # literal dollar sign.  A pattern ending in an escaped backslash followed
    # by a real '$' is also treated as unanchored below; adding a second
    # anchor in that case is redundant but harmless.
    anchored = ere.endswith('$') and not ere.endswith('\\$')
    if not anchored:
        if len(ere):
            # Anchoring binds tighter than '|' in an ERE, so 'a|b' + '$'
            # would anchor only 'b'.  Group the whole pattern so the anchor
            # applies to every alternative.  The extra group is harmless
            # here because only the overall match range is used.
            ere = '(' + ere + ')$'
        else:
            # Nothing to group, and an empty '()' is not portable.
            ere = '$'
    return _EggexMatchCommon(s, p, ere, len(s))
|
109 |
|
110 |
|
# Anchor bit flags accepted by HasAffix and Trim.  Both classes test them with
# '&'; Trim additionally accepts the combination START | END.
START = 0b01
END = 0b10
|
113 |
|
114 |
|
class HasAffix(vm._Callable):
    """Implements `startsWith()` and `endsWith()`.

    The 'anchor' passed to the constructor (START or END) selects which end
    of the string the pattern is tested against.
    """

    def __init__(self, anchor):
        # type: (int) -> None
        assert anchor in (START, END), ("Anchor must be START or END")
        self.anchor = anchor

    def Call(self, rd):
        # type: (typed_args.Reader) -> value_t
        """
        string => startsWith(pattern_str)     # => bool
        string => startsWith(pattern_eggex)   # => bool
        string => endsWith(pattern_str)       # => bool
        string => endsWith(pattern_eggex)     # => bool

        Raises error.TypeErr when the pattern is neither a Str nor an Eggex.
        An error.Strict raised while matching is re-raised as error.Expr.
        """
        string = rd.PosStr()
        pattern_val = rd.PosValue()

        # Exactly one of these is set by the type switch below.
        str_pat = None  # type: str
        eggex_pat = None  # type: value.Eggex
        with tagswitch(pattern_val) as case:
            if case(value_e.Str):
                str_pat = cast(value.Str, pattern_val).s
            elif case(value_e.Eggex):
                eggex_pat = cast(value.Eggex, pattern_val)
            else:
                raise error.TypeErr(pattern_val,
                                    'expected pattern to be Eggex or Str',
                                    rd.LeftParenToken())
        rd.Done()

        at_start = (self.anchor & START) != 0

        found = False
        try:
            if str_pat is None:
                assert eggex_pat is not None
                if at_start:
                    found, _, _ = _EggexMatchStart(string, eggex_pat)
                else:
                    found, _, _ = _EggexMatchEnd(string, eggex_pat)
            else:
                if at_start:
                    found, _, _ = _StrMatchStart(string, str_pat)
                else:
                    found, _, _ = _StrMatchEnd(string, str_pat)
        except error.Strict as e:
            raise error.Expr(e.msg, e.location)

        return value.Bool(found)
|
164 |
|
165 |
|
class Trim(vm._Callable):
    """Implements `trimStart()`, `trimEnd()`, and `trim()`.

    The 'anchor' passed to the constructor is START, END, or START | END and
    selects which end(s) of the string get trimmed.
    """

    def __init__(self, anchor):
        # type: (int) -> None
        assert anchor in (START, END, START | END), (
            "Anchor must be START, END, or START|END")
        self.anchor = anchor

    def Call(self, rd):
        # type: (typed_args.Reader) -> value_t
        """
        string => trimStart()               # => Str
        string => trimEnd()                 # => Str
        string => trim()                    # => Str
        string => trimStart(pattern_str)    # => Str
        string => trimEnd(pattern_str)      # => Str
        string => trim(pattern_str)         # => Str
        string => trimStart(pattern_eggex)  # => Str
        string => trimEnd(pattern_eggex)    # => Str
        string => trim(pattern_eggex)       # => Str

        With no pattern, the byte ranges reported by
        string_ops.StartsWithWhitespaceByteRange() and
        string_ops.EndsWithWhitespaceByteRange() are removed.

        Raises error.TypeErr when a pattern is given that is neither a Str nor
        an Eggex.  An error.Strict raised while matching is re-raised as
        error.Expr.
        """
        string = rd.PosStr()
        pattern_val = rd.OptionalValue()

        # At most one of these is set; both stay None when no pattern is given.
        str_pat = None  # type: str
        eggex_pat = None  # type: value.Eggex
        if pattern_val:
            with tagswitch(pattern_val) as case:
                if case(value_e.Str):
                    str_pat = cast(value.Str, pattern_val).s
                elif case(value_e.Eggex):
                    eggex_pat = cast(value.Eggex, pattern_val)
                else:
                    raise error.TypeErr(pattern_val,
                                        'expected pattern to be Eggex or Str',
                                        rd.LeftParenToken())
        rd.Done()

        trim_left = (self.anchor & START) != 0
        trim_right = (self.anchor & END) != 0

        # The kept slice is string[lo:hi].  'lo' becomes the end of the leading
        # match and 'hi' the beginning of the trailing match.
        lo = 0
        hi = len(string)
        try:
            if str_pat is not None:
                if trim_left:
                    _, _, lo = _StrMatchStart(string, str_pat)
                if trim_right:
                    _, hi, _ = _StrMatchEnd(string, str_pat)
            elif eggex_pat is not None:
                if trim_left:
                    _, _, lo = _EggexMatchStart(string, eggex_pat)
                if trim_right:
                    _, hi, _ = _EggexMatchEnd(string, eggex_pat)
            else:
                if trim_left:
                    _, lo = string_ops.StartsWithWhitespaceByteRange(string)
                if trim_right:
                    hi, _ = string_ops.EndsWithWhitespaceByteRange(string)
        except error.Strict as e:
            raise error.Expr(e.msg, e.location)

        return value.Str(string[lo:hi])
|
228 |
|
229 |
|
class Upper(vm._Callable):
    """Callable that returns its Str argument converted with str.upper()."""

    def __init__(self):
        # type: () -> None
        pass

    def Call(self, rd):
        # type: (typed_args.Reader) -> value_t
        """Read one positional Str and return it upper-cased as a value.Str."""
        text = rd.PosStr()
        rd.Done()

        # TODO: unicode support
        upper_text = text.upper()
        return value.Str(upper_text)
|
244 |
|
245 |
|
class Lower(vm._Callable):
    """Callable that returns its Str argument converted with str.lower()."""

    def __init__(self):
        # type: () -> None
        pass

    def Call(self, rd):
        # type: (typed_args.Reader) -> value_t
        """Read one positional Str and return it lower-cased as a value.Str."""
        text = rd.PosStr()
        rd.Done()

        # TODO: unicode support
        lower_text = text.lower()
        return value.Str(lower_text)
|
260 |
|
261 |
|
# Values for SearchMatch's 'which_method' argument.  SearchMatch compares it
# against LEFT_MATCH: in that mode the pattern is anchored with '^'; any other
# value leaves the pattern unanchored.
SEARCH = 0
LEFT_MATCH = 1
|
264 |
|
265 |
|
class SearchMatch(vm._Callable):
    """Regex search over a Str, using an Eggex or an ERE string as pattern.

    'which_method' is SEARCH or LEFT_MATCH; LEFT_MATCH anchors the pattern
    with '^'.
    """

    def __init__(self, which_method):
        # type: (int) -> None
        self.which_method = which_method

    def Call(self, rd):
        # type: (typed_args.Reader) -> value_t
        """
        s => search(eggex, pos=0)

        Returns a RegexMatch on success and value.Null when nothing matches.
        Raises error.TypeErr when the pattern is neither an Eggex nor a Str.
        """
        string = rd.PosStr()

        pat_val = rd.PosValue()  # Eggex or ERE Str
        with tagswitch(pat_val) as case:
            if case(value_e.Eggex):
                eggex_val = cast(value.Eggex, pat_val)

                # lazily converts to ERE
                ere = regex_translate.AsPosixEre(eggex_val)
                cflags = regex_translate.LibcFlags(eggex_val.canonical_flags)
                capture = eggex_ops.Yes(
                    eggex_val.convert_funcs, eggex_val.convert_toks,
                    eggex_val.capture_names)  # type: eggex_ops_t

            elif case(value_e.Str):
                ere = cast(value.Str, pat_val).s
                cflags = 0
                capture = eggex_ops.No

            else:
                # TODO: add method name to this error
                raise error.TypeErr(pat_val, 'expected Eggex or Str',
                                    rd.LeftParenToken())

        # It's called 'pos', not 'start' like Python.  Python has 2 kinds of
        # 'start' in its regex API, which can be confusing.
        pos = mops.BigTruncate(rd.NamedInt('pos', 0))
        rd.Done()

        if self.which_method == LEFT_MATCH:
            # Make it anchored
            if not ere.startswith('^'):
                ere = '^' + ere
            eflags = 0  # ^ matches beginning even if pos=5
        elif pos == 0:
            eflags = 0
        else:
            eflags = REG_NOTBOL  # ^ only matches when pos=0

        found = libc.regex_search(ere, cflags, string, eflags, pos)
        if found is not None:
            return RegexMatch(string, found, capture)

        return value.Null
|
321 |
|
322 |
|
class Replace(vm._Callable):
    """Implements the Str `replace()` method.

    The pattern is a Str or an Eggex, and the substitution is a Str or an
    Expr that is evaluated to a Str for each replacement.
    """

    def __init__(self, mem, expr_ev):
        # type: (state.Mem, expr_eval.ExprEvaluator) -> None
        self.mem = mem
        self.expr_ev = expr_ev

    def EvalSubstExpr(self, expr, blame_loc):
        # type: (value.Expr, loc_t) -> str
        """Evaluate a substitution expression and return its string result.

        Raises error.TypeErr (blamed on 'blame_loc') when the expression does
        not evaluate to a Str.
        """
        res = self.expr_ev.EvalExpr(expr.e, blame_loc)
        if res.tag() == value_e.Str:
            return cast(value.Str, res).s

        raise error.TypeErr(res, "expected expr to eval to a Str", blame_loc)

    def Call(self, rd):
        # type: (typed_args.Reader) -> value_t
        """
        s => replace(string_val, subst_str, count=-1)
        s => replace(string_val, subst_expr, count=-1)
        s => replace(eggex_val, subst_str, count=-1)
        s => replace(eggex_val, subst_expr, count=-1)

        For count in [0, MAX_INT], there will be no more than count
        replacements. Any negative count should read as unset, and replace will
        replace all occurrences of the pattern.

        Raises error.TypeErr when the pattern is not an Eggex or Str, when the
        substitution is not a Str or Expr, or when a substitution Expr does
        not evaluate to a Str.
        """
        string = rd.PosStr()

        string_val = None  # type: value.Str
        eggex_val = None  # type: value.Eggex
        subst_str = None  # type: value.Str
        subst_expr = None  # type: value.Expr

        pattern = rd.PosValue()
        with tagswitch(pattern) as case:
            if case(value_e.Eggex):
                # HACK: mycpp will otherwise generate:
                #  value::Eggex* eggex_val ...
                eggex_val_ = cast(value.Eggex, pattern)
                eggex_val = eggex_val_

            elif case(value_e.Str):
                string_val_ = cast(value.Str, pattern)
                string_val = string_val_

            else:
                raise error.TypeErr(pattern,
                                    'expected pattern to be Eggex or Str',
                                    rd.LeftParenToken())

        subst = rd.PosValue()
        with tagswitch(subst) as case:
            if case(value_e.Str):
                subst_str_ = cast(value.Str, subst)
                subst_str = subst_str_

            elif case(value_e.Expr):
                subst_expr_ = cast(value.Expr, subst)
                subst_expr = subst_expr_

            else:
                raise error.TypeErr(subst,
                                    'expected substitution to be Str or Expr',
                                    rd.LeftParenToken())

        count = mops.BigTruncate(rd.NamedInt("count", -1))
        rd.Done()

        # Zero replacements requested: the input is returned unchanged.
        if count == 0:
            return value.Str(string)

        if string_val:
            if subst_str:
                s = subst_str.s
            if subst_expr:
                # Eval with $0 set to string_val (the matched substring)
                with state.ctx_Eval(self.mem, string_val.s, None, None):
                    s = self.EvalSubstExpr(subst_expr, rd.LeftParenToken())
            assert s is not None

            result = string.replace(string_val.s, s, count)

            return value.Str(result)

        if eggex_val:
            ere = regex_translate.AsPosixEre(eggex_val)
            cflags = regex_translate.LibcFlags(eggex_val.canonical_flags)

            # Walk through the string finding all matches of the compiled ere.
            # Then, collect unmatched substrings and substitutions into the
            # `parts` list.
            pos = 0
            parts = []  # type: List[str]
            replace_count = 0
            while pos < len(string):
                indices = libc.regex_search(ere, cflags, string, 0, pos)
                if indices is None:
                    break

                # Collect captures.  'indices' holds a (start, end) pair per
                # group, with group 0 being the whole match.
                arg0 = None  # type: str
                argv = []  # type: List[str]
                named_vars = []  # type: List[Tuple[str, value_t]]
                num_groups = len(indices) / 2
                for group in xrange(num_groups):
                    start = indices[2 * group]
                    end = indices[2 * group + 1]
                    captured = string[start:end]
                    val = value.Str(captured)  # type: value_t

                    if len(eggex_val.convert_funcs) and group != 0:
                        convert_func = eggex_val.convert_funcs[group - 1]
                        convert_tok = eggex_val.convert_toks[group - 1]

                        if convert_func:
                            val = self.expr_ev.CallConvertFunc(
                                convert_func, val, convert_tok,
                                rd.LeftParenToken())

                    # $0, $1, $2 variables are argv values, which must be
                    # strings. Furthermore, they can only be used in string
                    # contexts
                    #   eg. "$[1]" != "$1".
                    val_str = val_ops.Stringify(val, rd.LeftParenToken())
                    if group == 0:
                        arg0 = val_str
                    else:
                        argv.append(val_str)

                    # $0 cannot be named
                    if group != 0:
                        # capture_names is indexed like convert_funcs and
                        # convert_toks above: group N lives at index N - 1.
                        name = eggex_val.capture_names[group - 1]
                        if name is not None:
                            named_vars.append((name, val))

                if subst_str:
                    s = subst_str.s
                if subst_expr:
                    with state.ctx_Eval(self.mem, arg0, argv, None):
                        with pure_ysh.ctx_Shvar(self.mem, named_vars):
                            s = self.EvalSubstExpr(subst_expr,
                                                   rd.LeftParenToken())
                assert s is not None

                start = indices[0]
                end = indices[1]
                parts.append(string[pos:start])  # Unmatched substring
                parts.append(s)  # Replacement
                pos = end  # Move to end of match

                if end == start and end < len(string):
                    # An empty match consumes nothing, so searching again from
                    # 'end' would find the same match forever.  Copy one
                    # character through unchanged and continue after it.
                    # Bytes of the form 0b10xxxxxx are skipped as well so the
                    # scan does not restart inside a multi-byte character
                    # (assumes UTF-8 encoded text).
                    pos = end + 1
                    while (pos < len(string) and
                           (ord(string[pos]) & 0xC0) == 0x80):
                        pos += 1
                    parts.append(string[end:pos])

                replace_count += 1
                if count != -1 and replace_count == count:
                    break

            parts.append(string[pos:])  # Remaining unmatched substring

            return value.Str("".join(parts))

        raise AssertionError()
|