builtin/method

OILS / builtin / method_str.py View on Github | oilshell.org

479 lines, 281 significant

1	"""YSH Str methods"""
2
3	from __future__ import print_function
4
5	from _devbuild.gen.syntax_asdl import loc_t
6	from _devbuild.gen.value_asdl import (value, value_e, value_t, eggex_ops,
7	eggex_ops_t, RegexMatch)
8	from core import error
9	from core import state
10	from core import vm
11	from frontend import typed_args
12	from mycpp import mops
13	from mycpp.mylib import log, tagswitch
14	from osh import string_ops
15	from ysh import expr_eval
16	from ysh import regex_translate
17	from ysh import val_ops
18
19	import libc
20	from libc import REG_NOTBOL
21
22	from typing import cast, Dict, List, Tuple
23
24	_ = log
25
26
27	def _StrMatchStart(s, p):
28	# type: (str, str) -> Tuple[bool, int, int]
29	"""Returns the range of bytes in 's' that match string pattern `p`. the
30	pattern matches if 's' starts with all the characters in 'p'.
31
32	The returned match result is the tuple "(matched, begin, end)". 'matched'
33	is true if the pattern matched. 'begin' and 'end' give the half-open range
34	"[begin, end)" of byte indices from 's' for the match, and are a valid but
35	empty range if 'match' is false.
36
37	Used for shell functions like 'trimStart' when trimming a prefix string.
38	"""
39	if s.startswith(p):
40	return (True, 0, len(p))
41	else:
42	return (False, 0, 0)
43
44
45	def _StrMatchEnd(s, p):
46	# type: (str, str) -> Tuple[bool, int, int]
47	"""Returns a match result for the bytes in 's' that match string pattern
48	`p`. the pattern matches if 's' ends with all the characters in 'p'.
49
50	The returned match result is the tuple "(matched, begin, end)". 'matched'
51	is true if the pattern matched. 'begin' and 'end' give the half-open range
52	"[begin, end)" of byte indices from 's' for the match, and are a valid but
53	empty range if 'match' is false.
54
55	Used for shell functions like 'trimEnd' when trimming a suffix string.
56	"""
57	len_s = len(s)
58	if s.endswith(p):
59	return (True, len_s - len(p), len_s)
60	else:
61	return (False, len_s, len_s)
62
63
def _EggexMatchCommon(s, p, ere, empty_p):
    # type: (str, value.Eggex, str, int) -> Tuple[bool, int, int]
    """Search 's' with the already-anchored ERE string 'ere'.

    The compile flags are derived from the canonical flags of Eggex 'p';
    'ere' is the pattern text that is actually searched for.

    Returns the tuple "(matched, begin, end)".  On a match, 'begin' and
    'end' are the first two indices reported by libc.regex_search().  When
    nothing matches, the empty range "[empty_p, empty_p)" is returned.
    """
    cflags = regex_translate.LibcFlags(p.canonical_flags)
    indices = libc.regex_search(ere, cflags, s, 0)
    if indices is not None:
        # Only the first index pair is used; any further pairs are ignored.
        return (True, indices[0], indices[1])

    return (False, empty_p, empty_p)
76
77
def _EggexMatchStart(s, p):
    # type: (str, value.Eggex) -> Tuple[bool, int, int]
    """Returns a match result for the bytes in 's' that match Eggex pattern
    `p` when constrained to match at the start of the string.

    Any capturing done by the Eggex pattern is ignored.

    The returned match result is the tuple "(matched, begin, end)". 'matched'
    is true if the pattern matched. 'begin' and 'end' give the half-open range
    "[begin, end)" of byte indices from 's' for the match, and are a valid but
    empty range if 'matched' is false.

    Used for shell functions like 'trimStart' when trimming with an Eggex
    pattern.
    """
    ere = regex_translate.AsPosixEre(p)
    if len(ere) == 0:
        # Don't emit an empty group "()": it is not portable across regcomp()
        # implementations.  A bare anchor is what was used before for this
        # case.
        ere = '^'
    else:
        # Alternation has the lowest precedence in an ERE, so a bare '^'
        # prefix would anchor only the first branch: '^a|b' still matches
        # 'b' anywhere in the string.  Group the whole pattern so the anchor
        # applies to every branch.  The extra group only shifts capture
        # numbers, and captures are ignored here (only the overall match
        # range is used).  A pattern that is already anchored stays
        # correct: '^(^a)' is equivalent to '^a'.
        ere = '^(' + ere + ')'
    return _EggexMatchCommon(s, p, ere, 0)
97
98
def _EggexMatchEnd(s, p):
    # type: (str, value.Eggex) -> Tuple[bool, int, int]
    """Returns a match result for the bytes in 's' that match Eggex pattern
    `p` when constrained to match at the end of the string.

    Any capturing done by the Eggex pattern is ignored.

    The returned match result is the tuple "(matched, begin, end)". 'matched'
    is true if the pattern matched. 'begin' and 'end' give the half-open range
    "[begin, end)" of byte indices from 's' for the match, and are the valid
    but empty range "[len(s), len(s))" if 'matched' is false.

    Used for shell functions like 'trimEnd' when trimming with an Eggex
    pattern.
    """
    ere = regex_translate.AsPosixEre(p)
    if len(ere) == 0:
        # Don't emit an empty group "()": it is not portable across regcomp()
        # implementations.
        ere = '$'
    else:
        # Always group and anchor, rather than testing whether the text
        # already ends with '$':
        # - a trailing '$' character may be an escaped literal ('\$'), in
        #   which case the pattern is NOT anchored yet;
        # - alternation has the lowest precedence in an ERE, so 'a|b$'
        #   anchors only the last branch.
        # The extra group only shifts capture numbers, and captures are
        # ignored here.  An already anchored pattern stays correct:
        # '(a$)$' is equivalent to 'a$'.
        ere = '(' + ere + ')$'
    return _EggexMatchCommon(s, p, ere, len(s))
108
109
# Anchor bit flags saying which end(s) of a string an affix operation applies
# to.  They are tested with '&' and may be combined with '|': START | END
# means "both ends".
START = 1  # binary 01
END = 2  # binary 10
112
113
class HasAffix(vm._Callable):
    """ Implements `startsWith()`, `endsWith()`. """

    def __init__(self, anchor):
        # type: (int) -> None
        """'anchor' selects the end of the string to test: START or END."""
        assert anchor in (START, END), ("Anchor must be START or END")
        self.anchor = anchor

    def Call(self, rd):
        # type: (typed_args.Reader) -> value_t
        """
        string => startsWith(pattern_str)     # => bool
        string => startsWith(pattern_eggex)   # => bool
        string => endsWith(pattern_str)       # => bool
        string => endsWith(pattern_eggex)     # => bool

        Raises error.TypeErr when the pattern is neither an Eggex nor a Str.
        An error.Strict raised while matching is re-raised as error.Expr.
        """
        string = rd.PosStr()
        pattern_val = rd.PosValue()

        # Exactly one of these is set by the type switch below.
        pattern_str = None  # type: str
        pattern_eggex = None  # type: value.Eggex
        with tagswitch(pattern_val) as case:
            if case(value_e.Eggex):
                pattern_eggex = cast(value.Eggex, pattern_val)
            elif case(value_e.Str):
                pattern_str = cast(value.Str, pattern_val).s
            else:
                raise error.TypeErr(pattern_val,
                                    'expected pattern to be Eggex or Str',
                                    rd.LeftParenToken())
        rd.Done()

        at_start = (self.anchor & START) != 0

        matched = False
        try:
            if pattern_str is None:
                assert pattern_eggex is not None
                if at_start:
                    matched, _, _ = _EggexMatchStart(string, pattern_eggex)
                else:
                    matched, _, _ = _EggexMatchEnd(string, pattern_eggex)
            elif at_start:
                matched, _, _ = _StrMatchStart(string, pattern_str)
            else:
                matched, _, _ = _StrMatchEnd(string, pattern_str)
        except error.Strict as e:
            raise error.Expr(e.msg, e.location)

        return value.Bool(matched)
163
164
class Trim(vm._Callable):
    """ Implements `trimStart()`, `trimEnd()`, and `trim()` """

    def __init__(self, anchor):
        # type: (int) -> None
        """'anchor' selects what to trim: START, END, or START | END."""
        assert anchor in (START, END, START
                          | END), ("Anchor must be START, END, or START|END")
        self.anchor = anchor

    def Call(self, rd):
        # type: (typed_args.Reader) -> value_t
        """
        string => trimStart()               # => Str
        string => trimEnd()                 # => Str
        string => trim()                    # => Str
        string => trimStart(pattern_str)    # => Str
        string => trimEnd(pattern_str)      # => Str
        string => trim(pattern_str)         # => Str
        string => trimStart(pattern_eggex)  # => Str
        string => trimEnd(pattern_eggex)    # => Str
        string => trim(pattern_eggex)       # => Str

        Without a pattern, whitespace is trimmed.  Raises error.TypeErr when
        a pattern is given that is neither an Eggex nor a Str.  An
        error.Strict raised while matching is re-raised as error.Expr.
        """
        string = rd.PosStr()
        pattern_val = rd.OptionalValue()

        # At most one of these is set; both stay None when no pattern is given.
        pattern_str = None  # type: str
        pattern_eggex = None  # type: value.Eggex
        if pattern_val:
            with tagswitch(pattern_val) as case:
                if case(value_e.Eggex):
                    pattern_eggex = cast(value.Eggex, pattern_val)
                elif case(value_e.Str):
                    pattern_str = cast(value.Str, pattern_val).s
                else:
                    raise error.TypeErr(pattern_val,
                                        'expected pattern to be Eggex or Str',
                                        rd.LeftParenToken())
        rd.Done()

        trim_start = (self.anchor & START) != 0
        trim_end = (self.anchor & END) != 0

        # [lo, hi) is the part of 'string' that survives the trim.  It starts
        # out as the whole string and is narrowed from either side.
        lo = 0
        hi = len(string)
        try:
            if pattern_str is not None:
                # The kept text begins where the prefix match ends ...
                if trim_start:
                    _, _, lo = _StrMatchStart(string, pattern_str)
                # ... and ends where the suffix match begins.
                if trim_end:
                    _, hi, _ = _StrMatchEnd(string, pattern_str)
            elif pattern_eggex is not None:
                if trim_start:
                    _, _, lo = _EggexMatchStart(string, pattern_eggex)
                if trim_end:
                    _, hi, _ = _EggexMatchEnd(string, pattern_eggex)
            else:
                # No pattern: trim whitespace.
                if trim_start:
                    _, lo = string_ops.StartsWithWhitespaceByteRange(string)
                if trim_end:
                    hi, _ = string_ops.EndsWithWhitespaceByteRange(string)
        except error.Strict as e:
            raise error.Expr(e.msg, e.location)

        return value.Str(string[lo:hi])
227
228
class Upper(vm._Callable):
    """Callable that converts its Str argument with str.upper()."""

    def __init__(self):
        # type: () -> None
        pass

    def Call(self, rd):
        # type: (typed_args.Reader) -> value_t
        """Read one positional Str and return it upper-cased as a value.Str."""
        s = rd.PosStr()
        rd.Done()

        # TODO: unicode support
        uppered = s.upper()
        return value.Str(uppered)
243
244
class Lower(vm._Callable):
    """Callable that converts its Str argument with str.lower()."""

    def __init__(self):
        # type: () -> None
        pass

    def Call(self, rd):
        # type: (typed_args.Reader) -> value_t
        """Read one positional Str and return it lower-cased as a value.Str."""
        s = rd.PosStr()
        rd.Done()

        # TODO: unicode support
        lowered = s.lower()
        return value.Str(lowered)
259
260
# Modes for SearchMatch, passed as 'which_method'.
SEARCH = 0
LEFT_MATCH = 1


class SearchMatch(vm._Callable):
    """Regex search on a Str, in one of two modes.

    SEARCH looks for the pattern anywhere at or after 'pos'.  LEFT_MATCH
    additionally anchors the pattern with '^' so it must match at 'pos'.
    """

    def __init__(self, which_method):
        # type: (int) -> None
        """'which_method' is SEARCH or LEFT_MATCH."""
        self.which_method = which_method

    def Call(self, rd):
        # type: (typed_args.Reader) -> value_t
        """
        s => search(eggex, pos=0)

        The pattern may be an Eggex or a Str holding an ERE.  Returns a
        RegexMatch on success and value.Null when nothing matches.  Raises
        error.TypeErr for any other pattern type.
        """
        string = rd.PosStr()

        pattern = rd.PosValue()  # Eggex or ERE Str
        with tagswitch(pattern) as case:
            if case(value_e.Eggex):
                eggex_val = cast(value.Eggex, pattern)

                # lazily converts to ERE
                ere = regex_translate.AsPosixEre(eggex_val)
                cflags = regex_translate.LibcFlags(eggex_val.canonical_flags)
                capture = eggex_ops.Yes(
                    eggex_val.convert_funcs, eggex_val.convert_toks,
                    eggex_val.capture_names)  # type: eggex_ops_t

            elif case(value_e.Str):
                # A plain string is used as the ERE as-is, with no flags and
                # no capture metadata.
                ere = cast(value.Str, pattern).s
                cflags = 0
                capture = eggex_ops.No

            else:
                # TODO: add method name to this error
                raise error.TypeErr(pattern, 'expected Eggex or Str',
                                    rd.LeftParenToken())

        # It's called 'pos', not 'start' like Python.  Python has 2 kinds of
        # 'start' in its regex API, which can be confusing.
        pos = mops.BigTruncate(rd.NamedInt('pos', 0))
        rd.Done()

        if self.which_method == LEFT_MATCH:
            # Make it anchored
            # NOTE(review): if 'ere' has a top-level alternation ('a|b'), the
            # prepended '^' binds to the first branch only - confirm whether
            # that is intended.
            if not ere.startswith('^'):
                ere = '^' + ere
            eflags = 0  # ^ matches beginning even if pos=5
        elif pos == 0:
            eflags = 0
        else:
            eflags = REG_NOTBOL  # ^ only matches when pos=0

        indices = libc.regex_search(ere, cflags, string, eflags, pos)
        if indices is not None:
            return RegexMatch(string, indices, capture)

        return value.Null
320
321
class Replace(vm._Callable):
    """Implements `replace()` on Str, with a Str or Eggex pattern and a Str or
    Expr substitution."""

    def __init__(self, mem, expr_ev):
        # type: (state.Mem, expr_eval.ExprEvaluator) -> None
        """
        mem: passed to state.ctx_Eval while a substitution expression runs
        expr_ev: evaluates substitution expressions and calls capture
                 conversion functions
        """
        self.mem = mem
        self.expr_ev = expr_ev

    def EvalSubstExpr(self, expr, blame_loc):
        # type: (value.Expr, loc_t) -> str
        """Evaluate a substitution expression and return its string result.

        Raises error.TypeErr (blamed on 'blame_loc') when the expression does
        not evaluate to a Str.
        """
        res = self.expr_ev.EvalExpr(expr.e, blame_loc)
        if res.tag() == value_e.Str:
            return cast(value.Str, res).s

        raise error.TypeErr(res, "expected expr to eval to a Str", blame_loc)

    def Call(self, rd):
        # type: (typed_args.Reader) -> value_t
        """
        s => replace(string_val, subst_str, count=-1)
        s => replace(string_val, subst_expr, count=-1)
        s => replace(eggex_val, subst_str, count=-1)
        s => replace(eggex_val, subst_expr, count=-1)

        For count in [0, MAX_INT], there will be no more than count
        replacements. Any negative count should read as unset, and replace will
        replace all occurrences of the pattern.

        Raises error.TypeErr when the pattern is not an Eggex or Str, when the
        substitution is not a Str or Expr, or when a substitution expression
        does not evaluate to a Str.
        """
        string = rd.PosStr()

        # Exactly one pattern variable and one subst variable end up set.
        string_val = None  # type: value.Str
        eggex_val = None  # type: value.Eggex
        subst_str = None  # type: value.Str
        subst_expr = None  # type: value.Expr

        pattern = rd.PosValue()
        with tagswitch(pattern) as case:
            if case(value_e.Eggex):
                # HACK: mycpp will otherwise generate:
                #  value::Eggex* eggex_val ...
                eggex_val_ = cast(value.Eggex, pattern)
                eggex_val = eggex_val_

            elif case(value_e.Str):
                string_val_ = cast(value.Str, pattern)
                string_val = string_val_

            else:
                raise error.TypeErr(pattern,
                                    'expected pattern to be Eggex or Str',
                                    rd.LeftParenToken())

        subst = rd.PosValue()
        with tagswitch(subst) as case:
            if case(value_e.Str):
                subst_str_ = cast(value.Str, subst)
                subst_str = subst_str_

            elif case(value_e.Expr):
                subst_expr_ = cast(value.Expr, subst)
                subst_expr = subst_expr_

            else:
                raise error.TypeErr(subst,
                                    'expected substitution to be Str or Expr',
                                    rd.LeftParenToken())

        count = mops.BigTruncate(rd.NamedInt("count", -1))
        rd.Done()

        # Zero replacements requested: the input is returned unchanged.
        if count == 0:
            return value.Str(string)

        if string_val:
            if subst_str:
                s = subst_str.s
            if subst_expr:
                # Eval with $0 set to string_val (the matched substring)
                with state.ctx_Eval(self.mem, string_val.s, None, None):
                    s = self.EvalSubstExpr(subst_expr, rd.LeftParenToken())
            assert s is not None

            # str.replace() treats a negative count as "replace all".
            result = string.replace(string_val.s, s, count)

            return value.Str(result)

        if eggex_val:
            ere = regex_translate.AsPosixEre(eggex_val)
            cflags = regex_translate.LibcFlags(eggex_val.canonical_flags)

            # Walk through the string finding all matches of the compiled ere.
            # Then, collect unmatched substrings and substitutions into the
            # `parts` list.
            #
            # NOTE(review): a zero-length match leaves `pos` unchanged, so a
            # pattern that can match the empty string keeps matching at the
            # same position until `count` is reached (forever when count is
            # negative).  Advancing safely needs character-aware stepping -
            # confirm the intended semantics before changing this.
            pos = 0
            parts = []  # type: List[str]
            replace_count = 0
            while pos < len(string):
                # When resuming after an earlier match, '^' must not match at
                # the resume position - it should only match at the real
                # start of the string.  SearchMatch applies the same rule
                # when pos != 0.
                eflags = 0 if pos == 0 else REG_NOTBOL
                indices = libc.regex_search(ere, cflags, string, eflags, pos)
                if indices is None:
                    break

                # Collect captures
                arg0 = None  # type: str
                argv = []  # type: List[str]
                named_vars = {}  # type: Dict[str, value_t]
                num_groups = len(indices) / 2
                for group in xrange(num_groups):
                    start = indices[2 * group]
                    end = indices[2 * group + 1]
                    captured = string[start:end]
                    val = value.Str(captured)  # type: value_t

                    if len(eggex_val.convert_funcs) and group != 0:
                        convert_func = eggex_val.convert_funcs[group - 1]
                        convert_tok = eggex_val.convert_toks[group - 1]

                        if convert_func:
                            val = self.expr_ev.CallConvertFunc(
                                convert_func, val, convert_tok,
                                rd.LeftParenToken())

                    # $0, $1, $2 variables are argv values, which must be
                    # strings. Furthermore, they can only be used in string
                    # contexts
                    #   eg. "$[1]" != "$1".
                    val_str = val_ops.Stringify(val, rd.LeftParenToken())
                    if group == 0:
                        arg0 = val_str
                    else:
                        argv.append(val_str)

                    # $0 cannot be named
                    if group != 0:
                        # The per-capture lists hold no entry for group 0, so
                        # group N lives at index N - 1 - the same offset used
                        # for convert_funcs / convert_toks above.
                        name = eggex_val.capture_names[group - 1]
                        if name is not None:
                            named_vars[name] = val

                if subst_str:
                    s = subst_str.s
                if subst_expr:
                    with state.ctx_Eval(self.mem, arg0, argv, named_vars):
                        s = self.EvalSubstExpr(subst_expr, rd.LeftParenToken())
                assert s is not None

                start = indices[0]
                end = indices[1]
                parts.append(string[pos:start])  # Unmatched substring
                parts.append(s)  # Replacement
                pos = end  # Move to end of match

                replace_count += 1
                if count != -1 and replace_count == count:
                    break

            parts.append(string[pos:])  # Remaining unmatched substring

            return value.Str("".join(parts))

        # Unreachable: the pattern switch above sets string_val or eggex_val,
        # or raises.
        raise AssertionError()