OILS / osh / string_ops.py View on Github | oilshell.org

601 lines, 272 significant
1"""
2string_ops.py - String library functions that can be exposed with a saner syntax.
3
4OSH:
5
6 local y=${x//a*/b}
7
8YSH:
9
10 var y = x => sub('a*', 'b', :ALL)
11
12 Pass x => sub('a*', 'b', :ALL) => var y
13"""
14
15from _devbuild.gen.id_kind_asdl import Id
16from _devbuild.gen.syntax_asdl import loc, Token, suffix_op
17from core import pyutil
18from core import ui
19from core import error
20from core.error import e_die, e_strict
21from mycpp.mylib import log
22from osh import glob_
23
24import libc
25
26from typing import List, Tuple
27
28_ = log
29
30# TODO: Add details of the invalid character/byte here?
31
32INCOMPLETE_CHAR = 'Incomplete UTF-8 character'
33INVALID_CONT = 'Invalid UTF-8 continuation byte'
34INVALID_START = 'Invalid start of UTF-8 character'
35
36
37def _CheckContinuationByte(byte):
38 # type: (str) -> None
39 if (ord(byte) >> 6) != 0b10:
40 e_strict(INVALID_CONT, loc.Missing)
41
42
43def _Utf8CharLen(starting_byte):
44 # type: (int) -> int
45 if (starting_byte >> 7) == 0b0:
46 return 1
47 elif (starting_byte >> 5) == 0b110:
48 return 2
49 elif (starting_byte >> 4) == 0b1110:
50 return 3
51 elif (starting_byte >> 3) == 0b11110:
52 return 4
53 else:
54 e_strict(INVALID_START, loc.Missing)
55
56
57def _ReadOneUnit(s, cursor):
58 # type: (str, int) -> Tuple[int, int]
59 """Helper for DecodeUtf8Char"""
60 if cursor >= len(s):
61 raise error.Expr(INCOMPLETE_CHAR, loc.Missing)
62
63 b = ord(s[cursor])
64 cursor += 1
65
66 if b & 0b11000000 != 0b10000000:
67 raise error.Expr(INVALID_CONT, loc.Missing)
68
69 return b & 0b00111111, cursor
70
71
72def DecodeUtf8Char(s, start):
73 # type: (str, int) -> int
74 """Given a string and start index, decode the Unicode char immediately
75 following the start index. The start location is in bytes and should be
76 found using a function like NextUtf8Char or PreviousUtf8Char.
77
78 If the codepoint in invalid, we raise an `error.Expr`. (This is different
79 from {Next,Previous}Utf8Char which raises an `error.Strict` on encoding
80 errors.)
81
82 Known Issues:
83 - Doesn't raise issue on surrogate pairs
84 - Doesn't raise issue on non-shortest form encodings
85 - Isn't very performant and allocates one-byte-strings for each byte
86 """
87 # We use table 3.6 (reproduced below) from [0]. Note that table 3.6 is not
88 # sufficient for validating UTF-8 as it allows surrogate pairs and
89 # non-shortest form encodings. A correct decoder should follow the
90 # encodings in table 3.7 from [0].
91 #
92 # | Scalar Value | 1st Byte | 2nd Byte | 3rd Byte | 4th Byte |
93 # +----------------------------+----------+----------+----------+----------+
94 # | 00000000 0xxxxxxx | 0xxxxxxx | | | |
95 # | 00000yyy yyxxxxxx | 110yyyyy | 10xxxxxx | | |
96 # | zzzzyyyy yyxxxxxx | 1110zzzz | 10yyyyyy | 10xxxxxx | |
97 # | 000uuuuu zzzzyyyy yyxxxxxx | 11110uuu | 10uuzzzz | 10yyyyyy | 10xxxxxx |
98 #
99 # [0] https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf
100 assert 0 <= start < len(s)
101
102 b = ord(s[start])
103 cursor = start + 1
104
105 if b & 0b10000000 == 0:
106 return b & 0b01111111
107
108 if b & 0b11100000 == 0b11000000:
109 y = b & 0b00011111
110 y <<= 6
111
112 x, cursor = _ReadOneUnit(s, cursor)
113
114 return y | x
115
116 if b & 0b11110000 == 0b11100000:
117 z = b & 0b00001111
118 z <<= 12
119
120 y, cursor = _ReadOneUnit(s, cursor)
121 y <<= 6
122
123 x, cursor = _ReadOneUnit(s, cursor)
124
125 return z | x | y
126
127 if b & 0b11111000 == 0b11110000:
128 u = b & 0b00000111
129 u <<= 18
130
131 z, cursor = _ReadOneUnit(s, cursor)
132 z <<= 12
133
134 y, cursor = _ReadOneUnit(s, cursor)
135 y <<= 6
136
137 x, cursor = _ReadOneUnit(s, cursor)
138
139 return u | z | x | y
140
141 raise error.Expr(INVALID_START, loc.Missing)
142
143
144def NextUtf8Char(s, i):
145 # type: (str, int) -> int
146 """Given a string and a byte offset, returns the byte position after the
147 character at this position. Usually this is the position of the next
148 character, but for the last character in the string, it's the position just
149 past the end of the string.
150
151 Validates UTF-8.
152 """
153 n = len(s)
154 assert i < n, i # should always be in range
155 byte_as_int = ord(s[i])
156 length = _Utf8CharLen(byte_as_int)
157 for j in xrange(i + 1, i + length):
158 if j >= n:
159 e_strict(INCOMPLETE_CHAR, loc.Missing)
160 _CheckContinuationByte(s[j])
161
162 return i + length
163
164
165def PreviousUtf8Char(s, i):
166 # type: (str, int) -> int
167 """Given a string and a byte offset, returns the position of the character
168 before that offset. To start (find the first byte of the last character),
169 pass len(s) for the initial value of i.
170
171 Validates UTF-8.
172 """
173 # All bytes in a valid UTF-8 string have one of the following formats:
174 #
175 # 0xxxxxxx (1-byte char)
176 # 110xxxxx (start of 2-byte char)
177 # 1110xxxx (start of 3-byte char)
178 # 11110xxx (start of 4-byte char)
179 # 10xxxxxx (continuation byte)
180 #
181 # Any byte that starts with 10... MUST be a continuation byte,
182 # otherwise it must be the start of a character (or just invalid
183 # data).
184 #
185 # Walking backward, we stop at the first non-continuaton byte
186 # found. We try to interpret it as a valid UTF-8 character starting
187 # byte, and check that it indicates the correct length, based on how
188 # far we've moved from the original byte. Possible problems:
189 # * byte we stopped on does not have a valid value (e.g., 11111111)
190 # * start byte indicates more or fewer continuation bytes than we've seen
191 # * no start byte at beginning of array
192 #
193 # Note that because we are going backward, on malformed input, we
194 # won't error out in the same place as when parsing the string
195 # forwards as normal.
196 orig_i = i
197
198 while i > 0:
199 i -= 1
200 byte_as_int = ord(s[i])
201 if (byte_as_int >> 6) != 0b10:
202 offset = orig_i - i
203 if offset != _Utf8CharLen(byte_as_int):
204 # Leaving a generic error for now, but if we want to, it's not
205 # hard to calculate the position where things go wrong. Note
206 # that offset might be more than 4, for an invalid utf-8 string.
207 e_strict(INVALID_START, loc.Missing)
208 return i
209
210 e_strict(INVALID_START, loc.Missing)
211
212
213def CountUtf8Chars(s):
214 # type: (str) -> int
215 """Returns the number of utf-8 characters in the byte string 's'.
216
217 TODO: Raise exception rather than returning a string, so we can set the exit
218 code of the command to 1 ?
219
220 $ echo ${#bad}
221 Invalid utf-8 at index 3 of string 'bad': 'ab\xffd'
222 $ echo $?
223 1
224 """
225 num_chars = 0
226 num_bytes = len(s)
227 i = 0
228 while i < num_bytes:
229 i = NextUtf8Char(s, i)
230 num_chars += 1
231 return num_chars
232
233
234def AdvanceUtf8Chars(s, num_chars, byte_offset):
235 # type: (str, int, int) -> int
236 """Starting from byte offset, advance by N UTF-8 runes
237
238 Returns a byte offset.
239
240 Used for shell slicing.
241 """
242 num_bytes = len(s)
243 i = byte_offset # current byte position
244
245 for _ in xrange(num_chars):
246 # Neither bash or zsh checks out of bounds for slicing. Either begin or
247 # length.
248 if i >= num_bytes:
249 return i
250 #raise RuntimeError('Out of bounds')
251
252 i = NextUtf8Char(s, i)
253
254 return i
255
256
257# Limited Unicode codepoints for whitespace characters.
258# Oils intentionally does not include characters from <USP>, as that set
259# depends on the version of the Unicode standard used.
260#
261# See discussion on the original pull request which added this list here:
262#
263# https://github.com/oilshell/oil/pull/1836#issuecomment-1942173520
264#
265# See also the Mozilla Javascript documentation, and the note on how
266# changes to the standard affected Javascript:
267#
268# https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Lexical_grammar#white_space
269
270SPACES = [
271 0x0009, # Horizontal tab (\t)
272 0x000A, # Newline (\n)
273 0x000B, # Vertical tab (\v)
274 0x000C, # Form feed (\f)
275 0x000D, # Carriage return (\r)
276 0x0020, # Normal space
277 0x00A0, # No-break space <NBSP>
278 0xFEFF, # Zero-width no-break space <ZWNBSP>
279]
280
281
282def _IsSpace(codepoint):
283 # type: (int) -> bool
284 return codepoint in SPACES
285
286
287def StartsWithWhitespaceByteRange(s):
288 # type: (str) -> Tuple[int, int]
289 """Returns the range of 's' which has leading whitespace characters.
290
291 If 's' has no leading whitespace, an valid but empty range is returned.
292
293 The returned range is given as byte positions, and is a half-open range
294 "[start, end)" which is returned as a tuple.
295
296 Used for shell functions like 'trimStart' to match then trim whitespace.
297 """
298 len_s = len(s)
299 i = 0
300 while i < len_s:
301 codepoint = DecodeUtf8Char(s, i)
302 if not _IsSpace(codepoint):
303 break
304
305 try:
306 i = NextUtf8Char(s, i)
307 except error.Strict:
308 assert False, "DecodeUtf8Char should have caught any encoding errors"
309
310 start = 0
311 end = i
312 return (start, end)
313
314
315def EndsWithWhitespaceByteRange(s):
316 # type: (str) -> Tuple[int, int]
317 """Returns the range of 's' which has trailing whitespace characters.
318
319 If 's' has no leading whitespace, an valid but empty range is returned.
320
321 The returned range is given as byte positions, and is a half-open range
322 "[start, end)" which is returned as a tuple.
323
324 Used for shell functions like 'trimEnd' to match then trim whitespace.
325 """
326 len_s = len(s)
327 i = len_s
328 while i > 0:
329 # TODO: Gracefully handle surrogate pairs and overlong encodings when
330 # finding the start of each character.
331 prev = PreviousUtf8Char(s, i)
332
333 codepoint = DecodeUtf8Char(s, prev)
334 if not _IsSpace(codepoint):
335 break
336
337 i = prev
338
339 start = i
340 end = len_s
341 return (start, end)
342
343
344# Implementation without Python regex:
345#
346# (1) PatSub: I think we fill in GlobToExtendedRegex, then use regcomp and
347# regexec. in a loop. fnmatch() does NOT given positions of matches.
348#
349# (2) Strip -- % %% # ## -
350#
351# a. Fast path for constant strings.
352# b. Convert to POSIX extended regex, to see if it matches at ALL. If it
353# doesn't match, short circuit out? We can't do this with fnmatch.
354# c. If it does match, call fnmatch() iteratively over prefixes / suffixes.
355#
356# - # shortest prefix - [:1], [:2], [:3] until it matches
357# - ## longest prefix - [:-1] [:-2], [:3]. Works because fnmatch does not
358# match prefixes, it matches EXACTLY.
359# - % shortest suffix - [-1:] [-2:] [-3:] ...
360# - %% longest suffix - [1:] [2:] [3:]
361#
362# See remove_pattern() in subst.c for bash, and trimsub() in eval.c for
363# mksh. Dash doesn't implement it.
364
365# TODO:
366# - Unicode support: Convert both pattern, string, and replacement to unicode,
367# then the result back at the end.
368# - Compile time errors for [[:space:]] ?
369
370
371def DoUnarySuffixOp(s, op_tok, arg, is_extglob):
372 # type: (str, Token, str, bool) -> str
373 """Helper for ${x#prefix} and family."""
374
375 id_ = op_tok.id
376
377 # Fast path for constant strings.
378 # TODO: Should be LooksLikeExtendedGlob!
379 if not is_extglob and not glob_.LooksLikeGlob(arg):
380 # It doesn't look like a glob, but we glob-escaped it (e.g. [ -> \[). So
381 # reverse it. NOTE: We also do this check in Globber.Expand(). It would
382 # be nice to somehow store the original string rather than
383 # escaping/unescaping.
384 arg = glob_.GlobUnescape(arg)
385
386 if id_ in (Id.VOp1_Pound, Id.VOp1_DPound): # const prefix
387 # explicit check for non-empty arg (len for mycpp)
388 if len(arg) and s.startswith(arg):
389 return s[len(arg):]
390 else:
391 return s
392
393 elif id_ in (Id.VOp1_Percent, Id.VOp1_DPercent): # const suffix
394 # need explicit check for non-empty arg (len for mycpp)
395 if len(arg) and s.endswith(arg):
396 return s[:-len(arg)]
397 else:
398 return s
399
400 # These operators take glob arguments, we don't implement that obscure case.
401 elif id_ == Id.VOp1_Comma: # Only lowercase the first letter
402 if arg != '':
403 e_die("%s can't have an argument" % ui.PrettyId(id_), op_tok)
404 if len(s):
405 return s[0].lower() + s[1:]
406 else:
407 return s
408
409 elif id_ == Id.VOp1_DComma:
410 if arg != '':
411 e_die("%s can't have an argument" % ui.PrettyId(id_), op_tok)
412 return s.lower()
413
414 elif id_ == Id.VOp1_Caret: # Only uppercase the first letter
415 if arg != '':
416 e_die("%s can't have an argument" % ui.PrettyId(id_), op_tok)
417 if len(s):
418 return s[0].upper() + s[1:]
419 else:
420 return s
421
422 elif id_ == Id.VOp1_DCaret:
423 if arg != '':
424 e_die("%s can't have an argument" % ui.PrettyId(id_), op_tok)
425 return s.upper()
426
427 else: # e.g. ^ ^^ , ,,
428 raise AssertionError(id_)
429
430 # For patterns, do fnmatch() in a loop.
431 #
432 # TODO:
433 # - Another potential fast path:
434 # v=aabbccdd
435 # echo ${v#*b} # strip shortest prefix
436 #
437 # If the whole thing doesn't match '*b*', then no test can succeed. So we
438 # can fail early. Conversely echo ${v%%c*} and '*c*'.
439 #
440 # (Although honestly this whole construct is nuts and should be deprecated.)
441
442 n = len(s)
443
444 if id_ == Id.VOp1_Pound: # shortest prefix
445 # 'abcd': match '', 'a', 'ab', 'abc', ...
446 i = 0
447 while True:
448 assert i <= n
449 #log('Matching pattern %r with %r', arg, s[:i])
450 if libc.fnmatch(arg, s[:i]):
451 return s[i:]
452 if i >= n:
453 break
454 i = NextUtf8Char(s, i)
455 return s
456
457 elif id_ == Id.VOp1_DPound: # longest prefix
458 # 'abcd': match 'abc', 'ab', 'a'
459 i = n
460 while True:
461 assert i >= 0
462 #log('Matching pattern %r with %r', arg, s[:i])
463 if libc.fnmatch(arg, s[:i]):
464 return s[i:]
465 if i == 0:
466 break
467 i = PreviousUtf8Char(s, i)
468 return s
469
470 elif id_ == Id.VOp1_Percent: # shortest suffix
471 # 'abcd': match 'abcd', 'abc', 'ab', 'a'
472 i = n
473 while True:
474 assert i >= 0
475 #log('Matching pattern %r with %r', arg, s[:i])
476 if libc.fnmatch(arg, s[i:]):
477 return s[:i]
478 if i == 0:
479 break
480 i = PreviousUtf8Char(s, i)
481 return s
482
483 elif id_ == Id.VOp1_DPercent: # longest suffix
484 # 'abcd': match 'abc', 'bc', 'c', ...
485 i = 0
486 while True:
487 assert i <= n
488 #log('Matching pattern %r with %r', arg, s[:i])
489 if libc.fnmatch(arg, s[i:]):
490 return s[:i]
491 if i >= n:
492 break
493 i = NextUtf8Char(s, i)
494 return s
495
496 else:
497 raise NotImplementedError(ui.PrettyId(id_))
498
499
500def _AllMatchPositions(s, regex):
501 # type: (str, str) -> List[Tuple[int, int]]
502 """Returns a list of all (start, end) match positions of the regex against
503 s.
504
505 (If there are no matches, it returns the empty list.)
506 """
507 matches = [] # type: List[Tuple[int, int]]
508 pos = 0
509 n = len(s)
510 while pos < n: # needed to prevent infinite loop in (.*) case
511 m = libc.regex_first_group_match(regex, s, pos)
512 if m is None:
513 break
514 matches.append(m)
515 start, end = m
516 pos = end # advance position
517 return matches
518
519
520def _PatSubAll(s, regex, replace_str):
521 # type: (str, str, str) -> str
522 parts = [] # type: List[str]
523 prev_end = 0
524 for start, end in _AllMatchPositions(s, regex):
525 parts.append(s[prev_end:start])
526 parts.append(replace_str)
527 prev_end = end
528 parts.append(s[prev_end:])
529 return ''.join(parts)
530
531
532class GlobReplacer(object):
533
534 def __init__(self, regex, replace_str, slash_tok):
535 # type: (str, str, Token) -> None
536
537 # TODO: It would be nice to cache the compilation of the regex here,
538 # instead of just the string. That would require more sophisticated use of
539 # the Python/C API in libc.c, which we might want to avoid.
540 self.regex = regex
541 self.replace_str = replace_str
542 self.slash_tok = slash_tok
543
544 def __repr__(self):
545 # type: () -> str
546 return '<_GlobReplacer regex %r r %r>' % (self.regex, self.replace_str)
547
548 def Replace(self, s, op):
549 # type: (str, suffix_op.PatSub) -> str
550
551 regex = '(%s)' % self.regex # make it a group
552
553 if op.replace_mode == Id.Lit_Slash:
554 # Avoid infinite loop when replacing all copies of empty string
555 if len(self.regex) == 0:
556 return s
557
558 try:
559 return _PatSubAll(s, regex,
560 self.replace_str) # loop over matches
561 except RuntimeError as e:
562 # Not sure if this is possible since we convert from glob:
563 # libc.regex_first_group_match raises RuntimeError on regex syntax
564 # error.
565 msg = e.message # type: str
566 e_die('Error matching regex %r: %s' % (regex, msg),
567 self.slash_tok)
568
569 if op.replace_mode == Id.Lit_Pound:
570 regex = '^' + regex
571 elif op.replace_mode == Id.Lit_Percent:
572 regex = regex + '$'
573
574 m = libc.regex_first_group_match(regex, s, 0)
575 #log('regex = %r, s = %r, match = %r', regex, s, m)
576 if m is None:
577 return s
578 start, end = m
579 return s[:start] + self.replace_str + s[end:]
580
581
582def ShellQuoteB(s):
583 # type: (str) -> str
584 """Quote by adding backslashes.
585
586 Used for autocompletion, so it's friendlier for display on the
587 command line. We use the strategy above for other use cases.
588 """
589 # There's no way to escape a newline! Bash prints ^J for some reason, but
590 # we're more explicit. This will happen if there's a newline on a file
591 # system or a completion plugin returns a newline.
592
593 # NOTE: tabs CAN be escaped with \.
594 s = s.replace('\r', '<INVALID CR>').replace('\n', '<INVALID NEWLINE>')
595
596 # ~ for home dir
597 # ! for history
598 # * [] ? for glob
599 # {} for brace expansion
600 # space because it separates words
601 return pyutil.BackslashEscape(s, ' `~!$&*()[]{}\\|;\'"<>?')