osh/string_ops.py

OILS / osh / string_ops.py View on Github | oilshell.org

601 lines, 272 significant

1	"""
2	string_ops.py - String library functions that can be exposed with a saner syntax.
3
4	OSH:
5
6	local y=${x//a*/b}
7
8	YSH:
9
10	var y = x => sub('a*', 'b', :ALL)
11
12	Pass x => sub('a*', 'b', :ALL) => var y
13	"""
14
15	from _devbuild.gen.id_kind_asdl import Id
16	from _devbuild.gen.syntax_asdl import loc, Token, suffix_op
17	from core import pyutil
18	from core import ui
19	from core import error
20	from core.error import e_die, e_strict
21	from mycpp.mylib import log
22	from osh import glob_
23
24	import libc
25
26	from typing import List, Tuple
27
28	_ = log
29
# Error messages shared by the UTF-8 helpers in this module.
# TODO: Add details of the invalid character/byte here?
INCOMPLETE_CHAR = 'Incomplete UTF-8 character'
INVALID_CONT = 'Invalid UTF-8 continuation byte'
INVALID_START = 'Invalid start of UTF-8 character'
35
36
37	def _CheckContinuationByte(byte):
38	# type: (str) -> None
39	if (ord(byte) >> 6) != 0b10:
40	e_strict(INVALID_CONT, loc.Missing)
41
42
43	def _Utf8CharLen(starting_byte):
44	# type: (int) -> int
45	if (starting_byte >> 7) == 0b0:
46	return 1
47	elif (starting_byte >> 5) == 0b110:
48	return 2
49	elif (starting_byte >> 4) == 0b1110:
50	return 3
51	elif (starting_byte >> 3) == 0b11110:
52	return 4
53	else:
54	e_strict(INVALID_START, loc.Missing)
55
56
57	def _ReadOneUnit(s, cursor):
58	# type: (str, int) -> Tuple[int, int]
59	"""Helper for DecodeUtf8Char"""
60	if cursor >= len(s):
61	raise error.Expr(INCOMPLETE_CHAR, loc.Missing)
62
63	b = ord(s[cursor])
64	cursor += 1
65
66	if b & 0b11000000 != 0b10000000:
67	raise error.Expr(INVALID_CONT, loc.Missing)
68
69	return b & 0b00111111, cursor
70
71
def DecodeUtf8Char(s, start):
    # type: (str, int) -> int
    """Decode the code point whose encoding begins at byte offset 'start'.

    'start' is a byte position and should be found using a function like
    NextUtf8Char or PreviousUtf8Char.

    If the encoding is invalid, we raise an `error.Expr`.  (This is different
    from {Next,Previous}Utf8Char, which raise an `error.Strict` on encoding
    errors.)

    Known Issues:
    - Doesn't raise issue on surrogate pairs
    - Doesn't raise issue on non-shortest form encodings
    - Isn't very performant and allocates one-byte-strings for each byte
    """
    # The bit layouts come from table 3.6 (reproduced below) of [0].  Table 3.6
    # alone is not sufficient for validating UTF-8, since it allows surrogate
    # pairs and non-shortest form encodings.  A correct decoder should follow
    # the encodings in table 3.7 of [0].
    #
    # | Scalar Value               | 1st Byte | 2nd Byte | 3rd Byte | 4th Byte |
    # +----------------------------+----------+----------+----------+----------+
    # | 00000000 0xxxxxxx          | 0xxxxxxx |          |          |          |
    # | 00000yyy yyxxxxxx          | 110yyyyy | 10xxxxxx |          |          |
    # | zzzzyyyy yyxxxxxx          | 1110zzzz | 10yyyyyy | 10xxxxxx |          |
    # | 000uuuuu zzzzyyyy yyxxxxxx | 11110uuu | 10uuzzzz | 10yyyyyy | 10xxxxxx |
    #
    # [0] https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf
    assert 0 <= start < len(s)

    lead = ord(s[start])
    pos = start + 1  # offset of the first continuation byte, if any

    # 1 byte: 0xxxxxxx
    if (lead & 0b10000000) == 0:
        return lead & 0b01111111

    # 2 bytes: 110yyyyy 10xxxxxx
    if (lead & 0b11100000) == 0b11000000:
        y = (lead & 0b00011111) << 6
        x, pos = _ReadOneUnit(s, pos)
        return y | x

    # 3 bytes: 1110zzzz 10yyyyyy 10xxxxxx
    if (lead & 0b11110000) == 0b11100000:
        z = (lead & 0b00001111) << 12
        y, pos = _ReadOneUnit(s, pos)
        x, pos = _ReadOneUnit(s, pos)
        return z | (y << 6) | x

    # 4 bytes: 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
    if (lead & 0b11111000) == 0b11110000:
        u = (lead & 0b00000111) << 18
        z, pos = _ReadOneUnit(s, pos)
        y, pos = _ReadOneUnit(s, pos)
        x, pos = _ReadOneUnit(s, pos)
        return u | (z << 12) | (y << 6) | x

    # Not a valid leading byte (a stray continuation byte, or 11111xxx).
    raise error.Expr(INVALID_START, loc.Missing)
142
143
def NextUtf8Char(s, i):
    # type: (str, int) -> int
    """Return the byte position just past the character that starts at 'i'.

    Usually that is where the next character begins; for the last character
    in the string it is the position just past the end of the string.

    Validates UTF-8: the leading byte must be a valid start byte, and every
    continuation byte it calls for must be present and well-formed.
    """
    n = len(s)
    assert i < n, i  # should always be in range

    length = _Utf8CharLen(ord(s[i]))
    end = i + length

    # Check each continuation byte the leading byte promised.
    j = i + 1
    while j < end:
        if j >= n:
            e_strict(INCOMPLETE_CHAR, loc.Missing)
        _CheckContinuationByte(s[j])
        j += 1

    return end
163
164
def PreviousUtf8Char(s, i):
    # type: (str, int) -> int
    """Return the byte position of the character that ends just before 'i'.

    To start (find the first byte of the last character), pass len(s) for
    the initial value of i.

    Validates UTF-8.
    """
    # All bytes in a valid UTF-8 string have one of the following formats:
    #
    #   0xxxxxxx (1-byte char)
    #   110xxxxx (start of 2-byte char)
    #   1110xxxx (start of 3-byte char)
    #   11110xxx (start of 4-byte char)
    #   10xxxxxx (continuation byte)
    #
    # Any byte that starts with 10... MUST be a continuation byte, otherwise
    # it must be the start of a character (or just invalid data).
    #
    # Walking backward, we stop at the first non-continuation byte found.  We
    # try to interpret it as a valid UTF-8 character starting byte, and check
    # that it indicates the correct length, based on how far we've moved from
    # the original byte.  Possible problems:
    #   * byte we stopped on does not have a valid value (e.g., 11111111)
    #   * start byte indicates more or fewer continuation bytes than we've seen
    #   * no start byte at beginning of array
    #
    # Note that because we are going backward, on malformed input, we won't
    # error out in the same place as when parsing the string forwards as
    # normal.
    pos = i

    while pos > 0:
        pos -= 1
        byte_as_int = ord(s[pos])

        if (byte_as_int >> 6) == 0b10:
            continue  # continuation byte: keep walking backward

        distance = i - pos
        if distance != _Utf8CharLen(byte_as_int):
            # Leaving a generic error for now, but if we want to, it's not
            # hard to calculate the position where things go wrong.  Note
            # that the distance might be more than 4, for an invalid utf-8
            # string.
            e_strict(INVALID_START, loc.Missing)
        return pos

    # Ran off the front of the string without finding a start byte.
    e_strict(INVALID_START, loc.Missing)
211
212
def CountUtf8Chars(s):
    # type: (str) -> int
    """Return the number of UTF-8 characters in the byte string 's'.

    The string is walked one character at a time with NextUtf8Char, which
    validates the encoding as it goes.

    TODO: Decide how invalid UTF-8 should be reported to the shell user, so
    that the exit code of the command can be set to 1, e.g.

        $ echo ${#bad}
        Invalid utf-8 at index 3 of string 'bad': 'ab\\xffd'
        $ echo $?
        1
    """
    total_bytes = len(s)
    count = 0
    pos = 0
    while pos < total_bytes:
        pos = NextUtf8Char(s, pos)
        count += 1
    return count
232
233
def AdvanceUtf8Chars(s, num_chars, byte_offset):
    # type: (str, int, int) -> int
    """Starting from 'byte_offset', advance by 'num_chars' UTF-8 runes.

    Returns a byte offset.  If the end of the string is reached first, the
    offset reached so far is returned instead of raising.

    Used for shell slicing.
    """
    total_bytes = len(s)
    pos = byte_offset  # current byte position
    remaining = num_chars

    while remaining > 0:
        # Neither bash or zsh checks out of bounds for slicing.  Either begin
        # or length.
        if pos >= total_bytes:
            return pos

        pos = NextUtf8Char(s, pos)
        remaining -= 1

    return pos
255
256
# Code points that Oils treats as whitespace.
#
# This is deliberately a short, fixed list.  Oils intentionally does not
# include the characters from <USP>, as that set depends on the version of
# the Unicode standard used.
#
# Discussion on the original pull request which added this list:
#
#     https://github.com/oilshell/oil/pull/1836#issuecomment-1942173520
#
# See also the Mozilla JavaScript documentation, and its note on how changes
# to the standard affected JavaScript:
#
#     https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Lexical_grammar#white_space

SPACES = [
    0x0009,  # Horizontal tab (\t)
    0x000A,  # Newline (\n)
    0x000B,  # Vertical tab (\v)
    0x000C,  # Form feed (\f)
    0x000D,  # Carriage return (\r)
    0x0020,  # Normal space
    0x00A0,  # No-break space <NBSP>
    0xFEFF,  # Zero-width no-break space <ZWNBSP>
]
280
281
def _IsSpace(codepoint):
    # type: (int) -> bool
    """Return True if 'codepoint' is one of the code points in SPACES."""
    for space in SPACES:
        if codepoint == space:
            return True
    return False
285
286
def StartsWithWhitespaceByteRange(s):
    # type: (str) -> Tuple[int, int]
    """Return the byte range of the leading whitespace in 's'.

    The range is a tuple of byte positions forming the half-open interval
    "[start, end)".  If 's' has no leading whitespace, a valid but empty
    range is returned.

    Used for shell functions like 'trimStart' to match then trim whitespace.
    """
    total_bytes = len(s)
    pos = 0
    while pos < total_bytes:
        if not _IsSpace(DecodeUtf8Char(s, pos)):
            break

        # The character at 'pos' was just decoded, so stepping over it is not
        # expected to find an encoding error.
        try:
            pos = NextUtf8Char(s, pos)
        except error.Strict:
            assert False, "DecodeUtf8Char should have caught any encoding errors"

    return (0, pos)
313
314
def EndsWithWhitespaceByteRange(s):
    # type: (str) -> Tuple[int, int]
    """Return the byte range of the trailing whitespace in 's'.

    The range is a tuple of byte positions forming the half-open interval
    "[start, end)".  If 's' has no trailing whitespace, a valid but empty
    range is returned.

    Used for shell functions like 'trimEnd' to match then trim whitespace.
    """
    total_bytes = len(s)
    boundary = total_bytes  # everything from here to the end is whitespace
    while boundary > 0:
        # TODO: Gracefully handle surrogate pairs and overlong encodings when
        # finding the start of each character.
        char_start = PreviousUtf8Char(s, boundary)

        if not _IsSpace(DecodeUtf8Char(s, char_start)):
            break

        boundary = char_start

    return (boundary, total_bytes)
342
343
344	# Implementation without Python regex:
345	#
346	# (1) PatSub: I think we fill in GlobToExtendedRegex, then use regcomp and
347	# regexec in a loop. fnmatch() does NOT give positions of matches.
348	#
349	# (2) Strip -- % %% # ## -
350	#
351	# a. Fast path for constant strings.
352	# b. Convert to POSIX extended regex, to see if it matches at ALL. If it
353	# doesn't match, short circuit out? We can't do this with fnmatch.
354	# c. If it does match, call fnmatch() iteratively over prefixes / suffixes.
355	#
356	# - # shortest prefix - [:1], [:2], [:3] until it matches
357	# - ## longest prefix - [:-1] [:-2], [:-3]. Works because fnmatch does not
358	# match prefixes, it matches EXACTLY.
359	# - % shortest suffix - [-1:] [-2:] [-3:] ...
360	# - %% longest suffix - [1:] [2:] [3:]
361	#
362	# See remove_pattern() in subst.c for bash, and trimsub() in eval.c for
363	# mksh. Dash doesn't implement it.
364
365	# TODO:
366	# - Unicode support: Convert both pattern, string, and replacement to unicode,
367	# then the result back at the end.
368	# - Compile time errors for [[:space:]] ?
369
370
def DoUnarySuffixOp(s, op_tok, arg, is_extglob):
    # type: (str, Token, str, bool) -> str
    """Helper for ${x#prefix} and family.

    Args:
      s: the string being operated on
      op_tok: the operator token; its 'id' selects the operation
      arg: the operator's argument, in glob-escaped form
      is_extglob: when true, the constant-string fast path is skipped

    Returns:
      The transformed string, or 's' unchanged when nothing matches.
    """
    op_id = op_tok.id

    # Fast path for constant strings.
    # TODO: Should be LooksLikeExtendedGlob!
    if not (is_extglob or glob_.LooksLikeGlob(arg)):
        # It doesn't look like a glob, but we glob-escaped it (e.g. [ -> \[).
        # So reverse it.  NOTE: We also do this check in Globber.Expand().  It
        # would be nice to somehow store the original string rather than
        # escaping/unescaping.
        literal = glob_.GlobUnescape(arg)

        # Explicit length check (len for mycpp): an empty argument never
        # strips anything.
        has_literal = len(literal) != 0

        if op_id == Id.VOp1_Pound or op_id == Id.VOp1_DPound:  # const prefix
            if has_literal and s.startswith(literal):
                return s[len(literal):]
            return s

        if (op_id == Id.VOp1_Percent or
                op_id == Id.VOp1_DPercent):  # const suffix
            if has_literal and s.endswith(literal):
                return s[:-len(literal)]
            return s

        # The case operators , ,, ^ ^^ take glob arguments in principle; we
        # don't implement that obscure case.
        if (op_id == Id.VOp1_Comma or op_id == Id.VOp1_DComma or
                op_id == Id.VOp1_Caret or op_id == Id.VOp1_DCaret):
            if literal != '':
                e_die("%s can't have an argument" % ui.PrettyId(op_id),
                      op_tok)

            if op_id == Id.VOp1_DComma:  # lowercase everything
                return s.lower()
            if op_id == Id.VOp1_DCaret:  # uppercase everything
                return s.upper()

            if len(s) == 0:
                return s
            if op_id == Id.VOp1_Comma:  # only lowercase the first letter
                return s[0].lower() + s[1:]
            return s[0].upper() + s[1:]  # only uppercase the first letter

        # No other operator is expected on the fast path.
        raise AssertionError(op_id)

    # For patterns, do fnmatch() in a loop.
    #
    # TODO:
    # - Another potential fast path:
    #   v=aabbccdd
    #   echo ${v#*b}  # strip shortest prefix
    #
    #   If the whole thing doesn't match 'b', then no test can succeed.  So we
    #   can fail early.  Conversely echo ${v%%c} and 'c*'.
    #
    # (Although honestly this whole construct is nuts and should be
    # deprecated.)

    n = len(s)

    if op_id == Id.VOp1_Pound:  # shortest prefix
        # 'abcd': try '', 'a', 'ab', 'abc', ...
        cut = 0
        while True:
            assert cut <= n
            if libc.fnmatch(arg, s[:cut]):
                return s[cut:]
            if cut >= n:
                break
            cut = NextUtf8Char(s, cut)
        return s

    if op_id == Id.VOp1_DPound:  # longest prefix
        # 'abcd': try 'abcd', 'abc', 'ab', 'a', ...
        cut = n
        while True:
            assert cut >= 0
            if libc.fnmatch(arg, s[:cut]):
                return s[cut:]
            if cut == 0:
                break
            cut = PreviousUtf8Char(s, cut)
        return s

    if op_id == Id.VOp1_Percent:  # shortest suffix
        # 'abcd': try '', 'd', 'cd', 'bcd', ...
        cut = n
        while True:
            assert cut >= 0
            if libc.fnmatch(arg, s[cut:]):
                return s[:cut]
            if cut == 0:
                break
            cut = PreviousUtf8Char(s, cut)
        return s

    if op_id == Id.VOp1_DPercent:  # longest suffix
        # 'abcd': try 'abcd', 'bcd', 'cd', 'd', ...
        cut = 0
        while True:
            assert cut <= n
            if libc.fnmatch(arg, s[cut:]):
                return s[:cut]
            if cut >= n:
                break
            cut = NextUtf8Char(s, cut)
        return s

    raise NotImplementedError(ui.PrettyId(op_id))
498
499
500	def _AllMatchPositions(s, regex):
501	# type: (str, str) -> List[Tuple[int, int]]
502	"""Returns a list of all (start, end) match positions of the regex against
503	s.
504
505	(If there are no matches, it returns the empty list.)
506	"""
507	matches = [] # type: List[Tuple[int, int]]
508	pos = 0
509	n = len(s)
510	while pos < n: # needed to prevent infinite loop in (.*) case
511	m = libc.regex_first_group_match(regex, s, pos)
512	if m is None:
513	break
514	matches.append(m)
515	start, end = m
516	pos = end # advance position
517	return matches
518
519
def _PatSubAll(s, regex, replace_str):
    # type: (str, str, str) -> str
    """Return 's' with every match of 'regex' replaced by 'replace_str'.

    The match positions come from _AllMatchPositions; the text between
    matches is copied through unchanged.
    """
    pieces = []  # type: List[str]
    cursor = 0  # end of the previous match
    for match_start, match_end in _AllMatchPositions(s, regex):
        pieces.append(s[cursor:match_start])  # unmatched text before it
        pieces.append(replace_str)
        cursor = match_end
    pieces.append(s[cursor:])  # whatever follows the last match
    return ''.join(pieces)
530
531
class GlobReplacer(object):
    """Replaces regex matches in a string, as directed by a suffix_op.PatSub.

    Holds the regex (as a string), the replacement text, and the token used
    as the location for error messages.
    """

    def __init__(self, regex, replace_str, slash_tok):
        # type: (str, str, Token) -> None

        # TODO: It would be nice to cache the compilation of the regex here,
        # instead of just the string.  That would require more sophisticated
        # use of the Python/C API in libc.c, which we might want to avoid.
        self.regex = regex
        self.replace_str = replace_str
        self.slash_tok = slash_tok

    def __repr__(self):
        # type: () -> str
        return '<_GlobReplacer regex %r r %r>' % (self.regex, self.replace_str)

    def Replace(self, s, op):
        # type: (str, suffix_op.PatSub) -> str
        """Return 's' with matches of the regex replaced.

        op.replace_mode selects the behavior:
          Id.Lit_Slash    replace every match
          Id.Lit_Pound    replace a match anchored at the start ('^')
          Id.Lit_Percent  replace a match anchored at the end ('$')
          anything else   replace the first match only

        A RuntimeError from the regex engine is reported with e_die() at
        self.slash_tok, in every mode.
        """
        regex = '(%s)' % self.regex  # make it a group

        try:
            if op.replace_mode == Id.Lit_Slash:
                # Avoid infinite loop when replacing all copies of empty
                # string
                if len(self.regex) == 0:
                    return s

                return _PatSubAll(s, regex,
                                  self.replace_str)  # loop over matches

            if op.replace_mode == Id.Lit_Pound:
                regex = '^' + regex
            elif op.replace_mode == Id.Lit_Percent:
                regex = regex + '$'

            m = libc.regex_first_group_match(regex, s, 0)
            #log('regex = %r, s = %r, match = %r', regex, s, m)
        except RuntimeError as e:
            # Not sure if this is possible since we convert from glob:
            # libc.regex_first_group_match raises RuntimeError on regex syntax
            # error.  Both the replace-all and the single-match paths are
            # covered, so the error is reported the same way in every mode.
            msg = e.message  # type: str
            e_die('Error matching regex %r: %s' % (regex, msg),
                  self.slash_tok)

        if m is None:
            return s
        start, end = m
        return s[:start] + self.replace_str + s[end:]
580
581
def ShellQuoteB(s):
    # type: (str) -> str
    """Quote by adding backslashes.

    Used for autocompletion, so it's friendlier for display on the command
    line.
    """
    # There's no way to escape a newline!  Bash prints ^J for some reason, but
    # we're more explicit.  This will happen if there's a newline on a file
    # system or a completion plugin returns a newline.
    #
    # NOTE: tabs CAN be escaped with \.
    visible = s.replace('\r', '<INVALID CR>')
    visible = visible.replace('\n', '<INVALID NEWLINE>')

    # Characters that get a backslash:
    #   ~ for home dir
    #   ! for history
    #   * [] ? for glob
    #   {} for brace expansion
    #   space because it separates words
    meta_chars = ' `~!$&*()[]{}\\|;\'"<>?'
    return pyutil.BackslashEscape(visible, meta_chars)