1 | #!/usr/bin/env python2
|
2 | """
|
3 | string_ops_test.py: Tests for string_ops.py
|
4 | """
|
5 | from __future__ import print_function
|
6 |
|
7 | import unittest
|
8 |
|
9 | from core import error
|
10 | from osh import string_ops # module under test
|
11 |
|
12 |
|
class LibStrTest(unittest.TestCase):
    """Unit tests for the UTF-8, whitespace and pattern helpers in string_ops."""

    def test_NextUtf8Char(self):
        """Walk each input forward with NextUtf8Char and record every offset.

        Each case is (expected, input_str).  `expected` lists the byte
        offsets returned by successive NextUtf8Char calls; when the walk is
        expected to fail, its last element is the message of the
        error.Strict that was raised.
        """
        CASES = [
            ([1, 3, 6, 10], '\x24\xC2\xA2\xE0\xA4\xB9\xF0\x90\x8D\x88'),
            ([1, 3,
              'Invalid UTF-8 continuation byte'], '\x24\xC2\xA2\xE0\xE0\xA4'),
            ([1, 3, 6, 'Invalid start of UTF-8 character'],
             '\x24\xC2\xA2\xE0\xA4\xA4\xB9'),
            ([1, 3, 'Invalid start of UTF-8 character'], '\x24\xC2\xA2\xFF'),
            ([1, 'Incomplete UTF-8 character'], '\x24\xF0\x90\x8D'),
        ]
        for expected_indexes, input_str in CASES:
            print()
            print('NextUtf8Char case %r %r' % (expected_indexes, input_str))
            i = 0
            actual_indexes = []
            while True:
                try:
                    i = string_ops.NextUtf8Char(input_str, i)
                    actual_indexes.append(i)
                    # Stop once the walk has consumed the whole string.
                    if i >= len(input_str):
                        break
                except error.Strict as e:
                    # Record the message in place of an offset and stop.
                    actual_indexes.append(e.msg)
                    break
            self.assertEqual(expected_indexes, actual_indexes)

    def test_DecodeNextUtf8Char(self):
        """Decode a 1-, 2-, 3- and 4-byte character while walking forward."""
        s = '\x61\xC3\x8A\xE1\x82\xA0\xF0\x93\x80\x80'
        codepoints = [0x61, 0xCA, 0x10A0, 0x13000]
        start = 0
        for expected in codepoints:
            end = string_ops.NextUtf8Char(s, start)
            # Keep the decoded value separate from the expected one: reusing
            # a single name would turn the assertion into a tautology.
            actual = string_ops.DecodeUtf8Char(s, start)
            self.assertEqual(expected, actual)
            start = end
        # The four characters must account for every byte of the input.
        self.assertEqual(len(s), start)

    def test_DecodePrevUtf8Char(self):
        """Decode the same characters while walking backward from the end."""
        s = '\x61\xC3\x8A\xE1\x82\xA0\xF0\x93\x80\x80'
        codepoints = [0x61, 0xCA, 0x10A0, 0x13000]
        end = len(s)
        for expected in reversed(codepoints):
            start = string_ops.PreviousUtf8Char(s, end)
            # Keep the decoded value separate from the expected one: reusing
            # a single name would turn the assertion into a tautology.
            actual = string_ops.DecodeUtf8Char(s, start)
            self.assertEqual(expected, actual)
            end = start
        # The backward walk must finish at the beginning of the input.
        self.assertEqual(0, end)

    def test_DecodeUtf8CharError(self):
        """DecodeUtf8Char raises error.Expr with a specific message.

        Each case is (expected message, input_str); decoding always starts
        at byte offset 0.
        """
        CASES = [
            ('Incomplete UTF-8 character', '\xC0'),
            ('Invalid UTF-8 continuation byte', '\xC0\x01'),
            ('Invalid start of UTF-8 character', '\xff'),
        ]
        for msg, input_str in CASES:
            with self.assertRaises(error.Expr) as ctx:
                string_ops.DecodeUtf8Char(input_str, 0)

            # Must run after the `with` block: inside it, this line would be
            # skipped as soon as the call above raises.
            self.assertEqual(ctx.exception.msg, msg)

    def test_PreviousUtf8Char(self):
        """Walk each input backward with PreviousUtf8Char, recording offsets.

        Each case is (expected, input_str), in the same format as in
        test_NextUtf8Char, except that the walk starts at len(input_str) and
        stops when offset 0 is reached.
        """
        # The error messages could probably be improved for more consistency
        # with NextUtf8Char, at the expense of more complexity.
        CASES = [
            ([6, 3, 1, 0], '\x24\xC2\xA2\xE0\xA4\xB9\xF0\x90\x8D\x88'),
            ([6, 3, 1, 'Invalid start of UTF-8 character'],
             '\xA2\xC2\xA2\xE0\xA4\xB9\xF0\x90\x8D\x88'),
            ([10, 'Invalid start of UTF-8 character'],
             '\xF0\x90\x8D\x88\x90\x8D\x88\x90\x8D\x88\x24'),
            ([3, 'Invalid start of UTF-8 character'], '\xF0\x90\x8D\x24'),
        ]
        for expected_indexes, input_str in CASES:
            print()
            print('PreviousUtf8Char case %r %r' %
                  (expected_indexes, input_str))
            i = len(input_str)
            actual_indexes = []
            while True:
                try:
                    i = string_ops.PreviousUtf8Char(input_str, i)
                    actual_indexes.append(i)
                    # Stop once the walk has reached the start of the string.
                    if i == 0:
                        break
                except error.Strict as e:
                    # Record the message in place of an offset and stop.
                    actual_indexes.append(e.msg)
                    break
            self.assertEqual(expected_indexes, actual_indexes)

    # The UTF-8 encoding of all the characters from string_ops.SPACES.
    # See comments there about why that set of characters was chosen.
    #
    # Generated by evaluating this Python3 fragment:
    #
    # ```
    # print('\u0009\u000a\u000b\u000c\u000d\u0020\u00a0\ufeff'.encode('utf-8'))
    # ```
    ALL_WHITESPACES_UTF8 = '\t\n\x0b\x0c\r \xc2\xa0\xef\xbb\xbf'

    def test_StartsWithWhitespaceByteRange(self):
        """Check the pair returned for whitespace at the start of a string.

        Each case is (expected, input_str); `expected` is compared for
        equality with the return value.  Presumably the pair is the
        (start, end) byte range of the leading whitespace -- confirm against
        string_ops.
        """
        CASES = [
            ((0, 0), ''),
            ((0, 0), 'x'),
            ((0, 1), ' x'),
            ((0, 1), ' x '),
            ((0, 2), '\t x '),
            # 11 is the byte length of ALL_WHITESPACES_UTF8.
            ((0, 11), LibStrTest.ALL_WHITESPACES_UTF8),
        ]
        for expected, input_str in CASES:
            print()
            print('StartsWithWhitespaceByteRange case %r %r' %
                  (expected, input_str))
            self.assertEqual(
                expected, string_ops.StartsWithWhitespaceByteRange(input_str))

    def test_EndsWithWhitespaceByteRange(self):
        """Check the pair returned for whitespace at the end of a string.

        Each case is (expected, input_str); `expected` is compared for
        equality with the return value.  Presumably the pair is the
        (start, end) byte range of the trailing whitespace -- confirm against
        string_ops.
        """
        CASES = [
            ((0, 0), ''),
            ((1, 1), 'x'),
            ((2, 2), ' x'),
            ((2, 3), ' x '),
            ((2, 4), ' x \t'),
            # 11 is the byte length of ALL_WHITESPACES_UTF8.
            ((0, 11), LibStrTest.ALL_WHITESPACES_UTF8),
        ]

        for expected, input_str in CASES:
            print()
            print('EndsWithWhitespaceByteRange case %r %r' %
                  (expected, input_str))
            self.assertEqual(expected,
                             string_ops.EndsWithWhitespaceByteRange(input_str))

    def testUnarySuffixOpDemo(self):
        """Print the candidate slices for each prefix/suffix strip operator.

        This is a demonstration only: it prints and makes no assertions.
        """
        print(string_ops)

        s = 'abcd'
        n = len(s)

        # Each loop below runs exactly n (= 4) times.
        # NOTE: These are manually copied into DoUnarySuffixOp

        # '#': candidate prefixes grow from 1 character to the whole string.
        print('# shortest prefix')
        for i in xrange(1, n + 1):
            print('%d test %06r return %06r' % (i, s[:i], s[i:]))
        print()

        # '##': candidate prefixes shrink from the whole string to 1 character.
        print('## longest prefix')
        for i in xrange(n, 0, -1):
            print('%d test %06r return %06r' % (i, s[:i], s[i:]))
        print()

        # '%': candidate suffixes grow from 1 character to the whole string.
        print('% shortest suffix')
        for i in xrange(n - 1, -1, -1):
            print('%d test %06r return %06r' % (i, s[i:], s[:i]))
        print()

        # '%%': candidate suffixes shrink from the whole string to 1 character.
        print('%% longest suffix')
        for i in xrange(0, n):
            print('%d test %06r return %06r' % (i, s[i:], s[:i]))
        print()

    def testPatSubAllMatches(self):
        """Check match positions and replace-all on a small fixed string."""
        s = 'oXooXoooX'

        # Match positions
        # The final 'X' has no character after it, so '(X.)' cannot match it.
        self.assertEqual([(1, 3), (4, 6)],
                         string_ops._AllMatchPositions(s, '(X.)'))

        # No match
        self.assertEqual([], string_ops._AllMatchPositions(s, '(z)'))

        # Replacement
        self.assertEqual('o_o_ooX', string_ops._PatSubAll(s, '(X.)', '_'))

        # Replacement with no match
        self.assertEqual(s, string_ops._PatSubAll(s, '(z)', '_'))
|
187 |
|
188 |
|
# Run every test in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()
|