OILS / osh / string_ops_test.py View on Github | oilshell.org

190 lines, 143 significant
1#!/usr/bin/env python2
2"""
3string_ops_test.py: Tests for string_ops.py
4"""
5from __future__ import print_function
6
7import unittest
8
9from core import error
10from osh import string_ops # module under test
11
12
13class LibStrTest(unittest.TestCase):
14
15 def test_NextUtf8Char(self):
16 CASES = [
17 ([1, 3, 6, 10], '\x24\xC2\xA2\xE0\xA4\xB9\xF0\x90\x8D\x88'),
18 ([1, 3,
19 'Invalid UTF-8 continuation byte'], '\x24\xC2\xA2\xE0\xE0\xA4'),
20 ([1, 3, 6, 'Invalid start of UTF-8 character'],
21 '\x24\xC2\xA2\xE0\xA4\xA4\xB9'),
22 ([1, 3, 'Invalid start of UTF-8 character'], '\x24\xC2\xA2\xFF'),
23 ([1, 'Incomplete UTF-8 character'], '\x24\xF0\x90\x8D'),
24 ]
25 for expected_indexes, input_str in CASES:
26 print()
27 print('NextUtf8Char case %r %r' % (expected_indexes, input_str))
28 i = 0
29 actual_indexes = []
30 while True:
31 try:
32 i = string_ops.NextUtf8Char(input_str, i)
33 actual_indexes.append(i)
34 if i >= len(input_str):
35 break
36 except error.Strict as e:
37 actual_indexes.append(e.msg)
38 break
39 self.assertEqual(expected_indexes, actual_indexes)
40
41 def test_DecodeNextUtf8Char(self):
42 s = '\x61\xC3\x8A\xE1\x82\xA0\xF0\x93\x80\x80'
43 codepoints = [0x61, 0xCA, 0x10A0, 0x13000]
44 start = 0
45 for codepoint in codepoints:
46 end = string_ops.NextUtf8Char(s, start)
47 codepoint = string_ops.DecodeUtf8Char(s, start)
48 self.assertEqual(codepoint, codepoint)
49 start = end
50
51 def test_DecodePrevUtf8Char(self):
52 s = '\x61\xC3\x8A\xE1\x82\xA0\xF0\x93\x80\x80'
53 codepoints = [0x61, 0xCA, 0x10A0, 0x13000]
54 end = len(s)
55 for codepoint in reversed(codepoints):
56 start = string_ops.PreviousUtf8Char(s, end)
57 codepoint = string_ops.DecodeUtf8Char(s, start)
58 self.assertEqual(codepoint, codepoint)
59 end = start
60
61 def test_DecodeUtf8CharError(self):
62 CASES = [
63 ('Incomplete UTF-8 character', '\xC0'),
64 ('Invalid UTF-8 continuation byte', '\xC0\x01'),
65 ('Invalid start of UTF-8 character', '\xff'),
66 ]
67 for msg, input in CASES:
68 with self.assertRaises(error.Expr) as ctx:
69 string_ops.DecodeUtf8Char(input, 0)
70 self.assertEqual(ctx.exception.msg, msg)
71
72 def test_PreviousUtf8Char(self):
73 # The error messages could probably be improved for more consistency
74 # with NextUtf8Char, at the expense of more complexity.
75 CASES = [
76 ([6, 3, 1, 0], '\x24\xC2\xA2\xE0\xA4\xB9\xF0\x90\x8D\x88'),
77 ([6, 3, 1, 'Invalid start of UTF-8 character'],
78 '\xA2\xC2\xA2\xE0\xA4\xB9\xF0\x90\x8D\x88'),
79 ([10, 'Invalid start of UTF-8 character'],
80 '\xF0\x90\x8D\x88\x90\x8D\x88\x90\x8D\x88\x24'),
81 ([3, 'Invalid start of UTF-8 character'], '\xF0\x90\x8D\x24'),
82 ]
83 for expected_indexes, input_str in CASES:
84 print()
85 print('PreviousUtf8Char case %r %r' %
86 (expected_indexes, input_str))
87 i = len(input_str)
88 actual_indexes = []
89 while True:
90 try:
91 i = string_ops.PreviousUtf8Char(input_str, i)
92 actual_indexes.append(i)
93 if i == 0:
94 break
95 except error.Strict as e:
96 actual_indexes.append(e.msg)
97 break
98 self.assertEqual(expected_indexes, actual_indexes)
99
100 # The UTF-8 encoding of all the characters from string_ops.SPACES.
101 # See comments there about why that set of characters was chosen.
102 #
103 # Generated by evaluating this Python3 fragment:
104 #
105 # ```
106 # print('\u0009\u000a\u000b\u000c\u000d\u0020\u00a0\ufeff'.encode('utf-8'))
107 # ```
108 ALL_WHITESPACES_UTF8 = '\t\n\x0b\x0c\r \xc2\xa0\xef\xbb\xbf'
109
110 def test_StartsWithWhitespaceByteRange(self):
111 CASES = [
112 ((0, 0), ''),
113 ((0, 0), 'x'),
114 ((0, 1), ' x'),
115 ((0, 1), ' x '),
116 ((0, 2), '\t x '),
117 ((0, 11), LibStrTest.ALL_WHITESPACES_UTF8),
118 ]
119 for expected, input_str in CASES:
120 print()
121 print('StartsWithWhitespaceByteRange case %r %r' %
122 (expected, input_str))
123 self.assertEqual(
124 expected, string_ops.StartsWithWhitespaceByteRange(input_str))
125
126 def test_EndsWithWhitespaceByteRange(self):
127 CASES = [
128 ((0, 0), ''),
129 ((1, 1), 'x'),
130 ((2, 2), ' x'),
131 ((2, 3), ' x '),
132 ((2, 4), ' x \t'),
133 ((0, 11), LibStrTest.ALL_WHITESPACES_UTF8),
134 ]
135
136 for expected, input_str in CASES:
137 print()
138 print('EndsWithWhitespaceByteRange case %r %r' %
139 (expected, input_str))
140 self.assertEqual(expected,
141 string_ops.EndsWithWhitespaceByteRange(input_str))
142
143 def testUnarySuffixOpDemo(self):
144 print(string_ops)
145
146 s = 'abcd'
147 n = len(s)
148
149 # All of these loops test exactly 4.
150 # NOTE: These are manually copied into DoUnarySuffixOp
151
152 print('## shortest prefix')
153 for i in xrange(1, n + 1):
154 print('%d test %06r return %06r' % (i, s[:i], s[i:]))
155 print()
156
157 print('# longest prefix')
158 for i in xrange(n, 0, -1):
159 print('%d test %06r return %06r' % (i, s[:i], s[i:]))
160 print()
161
162 print('% shortest suffix')
163 for i in xrange(n - 1, -1, -1):
164 print('%d test %06r return %06r' % (i, s[i:], s[:i]))
165 print()
166
167 print('%% longest suffix')
168 for i in xrange(0, n):
169 print('%d test %06r return %06r' % (i, s[i:], s[:i]))
170 print()
171
172 def testPatSubAllMatches(self):
173 s = 'oXooXoooX'
174
175 # Match positions
176 self.assertEqual([(1, 3), (4, 6)],
177 string_ops._AllMatchPositions(s, '(X.)'))
178
179 # No match
180 self.assertEqual([], string_ops._AllMatchPositions(s, '(z)'))
181
182 # Replacement
183 self.assertEqual('o_o_ooX', string_ops._PatSubAll(s, '(X.)', '_'))
184
185 # Replacement with no match
186 self.assertEqual(s, string_ops._PatSubAll(s, '(z)', '_'))
187
188
if __name__ == '__main__':
    # Run every TestCase in this module when the file is executed directly.
    unittest.main()