osh/string_ops

OILS / osh / string_ops_test.py View on Github | oilshell.org

190 lines, 143 significant

1	#!/usr/bin/env python2
2	"""
3	string_ops_test.py: Tests for string_ops.py
4	"""
5	from __future__ import print_function
6
7	import unittest
8
9	from core import error
10	from osh import string_ops # module under test
11
12
class LibStrTest(unittest.TestCase):
    """Unit tests for the UTF-8 and string helpers in osh/string_ops.py."""

    def test_NextUtf8Char(self):
        """Step forward through each input and compare the index trace.

        Each case is (expected trace, input).  The trace holds every index
        returned by NextUtf8Char, and ends with the error message instead if
        error.Strict was raised before the end of the string was reached.
        """
        CASES = [
            ([1, 3, 6, 10], '\x24\xC2\xA2\xE0\xA4\xB9\xF0\x90\x8D\x88'),
            ([1, 3,
              'Invalid UTF-8 continuation byte'], '\x24\xC2\xA2\xE0\xE0\xA4'),
            ([1, 3, 6, 'Invalid start of UTF-8 character'],
             '\x24\xC2\xA2\xE0\xA4\xA4\xB9'),
            ([1, 3, 'Invalid start of UTF-8 character'], '\x24\xC2\xA2\xFF'),
            ([1, 'Incomplete UTF-8 character'], '\x24\xF0\x90\x8D'),
        ]
        for expected_indexes, input_str in CASES:
            print()
            print('NextUtf8Char case %r %r' % (expected_indexes, input_str))
            i = 0
            actual_indexes = []
            while True:
                try:
                    i = string_ops.NextUtf8Char(input_str, i)
                    actual_indexes.append(i)
                    if i >= len(input_str):
                        break
                except error.Strict as e:
                    # Record the message in place of an index and stop.
                    actual_indexes.append(e.msg)
                    break
            self.assertEqual(expected_indexes, actual_indexes)

    def test_DecodeNextUtf8Char(self):
        """Decode each character while walking the string front to back."""
        s = '\x61\xC3\x8A\xE1\x82\xA0\xF0\x93\x80\x80'
        codepoints = [0x61, 0xCA, 0x10A0, 0x13000]
        start = 0
        # Keep the expected and decoded values in separate names.  Reusing
        # one name for both makes the assertion compare a value with itself.
        for expected in codepoints:
            end = string_ops.NextUtf8Char(s, start)
            actual = string_ops.DecodeUtf8Char(s, start)
            self.assertEqual(expected, actual)
            start = end
        # Every byte of the input must have been consumed.
        self.assertEqual(len(s), start)

    def test_DecodePrevUtf8Char(self):
        """Decode each character while walking the string back to front."""
        s = '\x61\xC3\x8A\xE1\x82\xA0\xF0\x93\x80\x80'
        codepoints = [0x61, 0xCA, 0x10A0, 0x13000]
        end = len(s)
        # Separate names for expected and decoded values, as above.
        for expected in reversed(codepoints):
            start = string_ops.PreviousUtf8Char(s, end)
            actual = string_ops.DecodeUtf8Char(s, start)
            self.assertEqual(expected, actual)
            end = start
        # Walking backwards must finish at the start of the string.
        self.assertEqual(0, end)

    def test_DecodeUtf8CharError(self):
        """DecodeUtf8Char raises error.Expr with a specific message."""
        CASES = [
            ('Incomplete UTF-8 character', '\xC0'),
            ('Invalid UTF-8 continuation byte', '\xC0\x01'),
            ('Invalid start of UTF-8 character', '\xff'),
        ]
        for msg, input_str in CASES:
            with self.assertRaises(error.Expr) as ctx:
                string_ops.DecodeUtf8Char(input_str, 0)
            # Checked after the 'with' block: code following the raising call
            # inside the block would never run.
            self.assertEqual(ctx.exception.msg, msg)

    def test_PreviousUtf8Char(self):
        """Step backward through each input and compare the index trace.

        Each case is (expected trace, input).  The trace holds every index
        returned by PreviousUtf8Char, and ends with the error message instead
        if error.Strict was raised before index 0 was reached.
        """
        # The error messages could probably be improved for more consistency
        # with NextUtf8Char, at the expense of more complexity.
        CASES = [
            ([6, 3, 1, 0], '\x24\xC2\xA2\xE0\xA4\xB9\xF0\x90\x8D\x88'),
            ([6, 3, 1, 'Invalid start of UTF-8 character'],
             '\xA2\xC2\xA2\xE0\xA4\xB9\xF0\x90\x8D\x88'),
            ([10, 'Invalid start of UTF-8 character'],
             '\xF0\x90\x8D\x88\x90\x8D\x88\x90\x8D\x88\x24'),
            ([3, 'Invalid start of UTF-8 character'], '\xF0\x90\x8D\x24'),
        ]
        for expected_indexes, input_str in CASES:
            print()
            print('PreviousUtf8Char case %r %r' %
                  (expected_indexes, input_str))
            i = len(input_str)
            actual_indexes = []
            while True:
                try:
                    i = string_ops.PreviousUtf8Char(input_str, i)
                    actual_indexes.append(i)
                    if i == 0:
                        break
                except error.Strict as e:
                    # Record the message in place of an index and stop.
                    actual_indexes.append(e.msg)
                    break
            self.assertEqual(expected_indexes, actual_indexes)

    # The UTF-8 encoding of all the characters from string_ops.SPACES.
    # See comments there about why that set of characters was chosen.
    #
    # Generated by evaluating this Python3 fragment:
    #
    # ```
    # print('\u0009\u000a\u000b\u000c\u000d\u0020\u00a0\ufeff'.encode('utf-8'))
    # ```
    ALL_WHITESPACES_UTF8 = '\t\n\x0b\x0c\r \xc2\xa0\xef\xbb\xbf'

    def test_StartsWithWhitespaceByteRange(self):
        """Each case is (expected (start, end) pair, input string)."""
        CASES = [
            ((0, 0), ''),
            ((0, 0), 'x'),
            ((0, 1), ' x'),
            ((0, 1), ' x '),
            ((0, 2), '\t x '),
            ((0, 11), LibStrTest.ALL_WHITESPACES_UTF8),
        ]
        for expected, input_str in CASES:
            print()
            print('StartsWithWhitespaceByteRange case %r %r' %
                  (expected, input_str))
            self.assertEqual(
                expected, string_ops.StartsWithWhitespaceByteRange(input_str))

    def test_EndsWithWhitespaceByteRange(self):
        """Each case is (expected (start, end) pair, input string)."""
        CASES = [
            ((0, 0), ''),
            ((1, 1), 'x'),
            ((2, 2), ' x'),
            ((2, 3), ' x '),
            ((2, 4), ' x \t'),
            ((0, 11), LibStrTest.ALL_WHITESPACES_UTF8),
        ]

        for expected, input_str in CASES:
            print()
            print('EndsWithWhitespaceByteRange case %r %r' %
                  (expected, input_str))
            self.assertEqual(expected,
                             string_ops.EndsWithWhitespaceByteRange(input_str))

    def testUnarySuffixOpDemo(self):
        """Print the slices tried for the '#', '##', '%' and '%%' operators.

        This is a demo with no assertions: each loop prints the candidate
        slice to test and the remainder that would be returned on a match.
        """
        print(string_ops)

        s = 'abcd'
        n = len(s)

        # Each of the four loops below runs exactly n == 4 iterations.
        # NOTE: These are manually copied into DoUnarySuffixOp

        # Headings use the shell operator names: '#' strips the shortest
        # matching prefix and '##' the longest, so the shortest-first loop is
        # labelled '#' and the longest-first loop '##'.
        print('# shortest prefix')
        for i in xrange(1, n + 1):
            print('%d test %06r return %06r' % (i, s[:i], s[i:]))
        print()

        print('## longest prefix')
        for i in xrange(n, 0, -1):
            print('%d test %06r return %06r' % (i, s[:i], s[i:]))
        print()

        print('% shortest suffix')
        for i in xrange(n - 1, -1, -1):
            print('%d test %06r return %06r' % (i, s[i:], s[:i]))
        print()

        print('%% longest suffix')
        for i in xrange(0, n):
            print('%d test %06r return %06r' % (i, s[i:], s[:i]))
        print()

    def testPatSubAllMatches(self):
        """Check _AllMatchPositions and _PatSubAll with and without matches."""
        s = 'oXooXoooX'

        # Match positions
        self.assertEqual([(1, 3), (4, 6)],
                         string_ops._AllMatchPositions(s, '(X.)'))

        # No match
        self.assertEqual([], string_ops._AllMatchPositions(s, '(z)'))

        # Replacement
        self.assertEqual('o_o_ooX', string_ops._PatSubAll(s, '(X.)', '_'))

        # Replacement with no match
        self.assertEqual(s, string_ops._PatSubAll(s, '(z)', '_'))
188
if __name__ == '__main__':
    # Executed as a script: discover and run every test in this module.
    unittest.main()