data_lang/utf8

OILS / data_lang / utf8_test.cc View on Github | oilshell.org

349 lines, 167 significant

1	#include <inttypes.h>
2
3	#include "data_lang/utf8_impls/bjoern_dfa.h"
4	#include "data_lang/utf8_impls/utf8_decode.h"
5
6	//#include "mycpp/runtime.h"
7	#include "vendor/greatest.h"
8
9	// Copied from UTF-8 proc
10	// https://github.com/JuliaStrings/utf8proc/blob/master/utf8proc.c#L177
11
12	int utf8proc_encode_char(uint32_t uc, uint8_t* dst) {
13	if (uc < 0x80) {
14	dst[0] = (uint8_t)uc;
15	return 1;
16	} else if (uc < 0x800) {
17	dst[0] = (uint8_t)(0xC0 + (uc >> 6));
18	dst[1] = (uint8_t)(0x80 + (uc & 0x3F));
19	return 2;
20	// Note: we allow encoding 0xd800-0xdfff here, so as not to change
21	// the API, however, these are actually invalid in UTF-8
22	} else if (uc < 0x10000) {
23	dst[0] = (uint8_t)(0xE0 + (uc >> 12));
24	dst[1] = (uint8_t)(0x80 + ((uc >> 6) & 0x3F));
25	dst[2] = (uint8_t)(0x80 + (uc & 0x3F));
26	return 3;
27	} else if (uc < 0x110000) {
28	dst[0] = (uint8_t)(0xF0 + (uc >> 18));
29	dst[1] = (uint8_t)(0x80 + ((uc >> 12) & 0x3F));
30	dst[2] = (uint8_t)(0x80 + ((uc >> 6) & 0x3F));
31	dst[3] = (uint8_t)(0x80 + (uc & 0x3F));
32	return 4;
33	} else
34	return 0;
35	}
36
37	void printCodePoints(const uint8_t* s) {
38	uint32_t codepoint;
39	uint32_t state = 0;
40
41	for (; *s; ++s)
42	if (!decode(&state, &codepoint, *s)) {
43	printf("U+%04X\n", codepoint);
44	}
45
46	if (state != UTF8_ACCEPT) printf("The string is not well-formed\n");
47	}
48
49	TEST dfa_test() {
50	const char* s = "hello\u03bc";
51	printCodePoints(reinterpret_cast<const uint8_t*>(s));
52
53	PASS();
54	}
55
56	uint32_t num_code_points = 17 * 1 << 16;
57
58	// uint32_t num_code_points = 1 << 21; // DFA has 983,040 disagreements
59
60	TEST decode_all_test() {
61	int num_disagree = 0;
62	int num_reject = 0;
63
64	// 17 "planes" of 2^16 - https://en.wikipedia.org/wiki/Unicode
65	// UTF-8 can represent more numbers (up to 21 bits), but this state machine
66	// doesn't need to handle them.
67
68	uint64_t sum_codepoint = 0;
69
70	for (uint32_t i = 0; i < num_code_points; ++i) {
71	uint8_t bytes[4] = {0};
72	int num_bytes = utf8proc_encode_char(i, bytes);
73	// printf("i = %d, num_bytes = %d\n", i, num_bytes);
74
75	uint32_t state = 0;
76	uint32_t codepoint = 0;
77	for (int j = 0; j < num_bytes; ++j) {
78	decode(&state, &codepoint, bytes[j]);
79	}
80	if (state == UTF8_ACCEPT) {
81	ASSERT_EQ_FMT(i, codepoint, "%d");
82	if (i != codepoint) {
83	num_disagree += 1;
84	}
85	} else {
86	num_reject += 1;
87	}
88
89	sum_codepoint += i;
90	}
91
92	printf("sum_codepoint = %ld\n", sum_codepoint);
93	printf("num_disagree = %d\n", num_disagree);
94	printf("num_reject = %d\n", num_reject);
95
96	PASS();
97	}
98
99	// Exhaustive DFA testing strategies
100	//
101	// All code points: 1,114,112 of them
102	//
103	// - Encode them all
104	// - Decode each byte
105	// - end state should be UTF8_ACCEPT, except for 2048 in surrogate range
106
107	// - code point should match
108	//
109	// All byte sequences
110	//
111	// 1. 256 1-byte sequences
112	// 2. 63,536 2-byte sequences
113	// - (ASCII, ASCII) will be 25% of these
114	// 3. 2**24 = 24 Mi 3-byte sequences
115	// overlapping cases
116	// 1 + 1 + 1
117	// 1 + 2
118	// 2 + 1
119	//
120	// 4. All 2**32 = 4 Gi 4-byte sequences? Possible but slightly slow
121	// maybe explore this in a smarter way
122
123	// Generating errors
124	//
125	// - enumerate all out of range
126	// - UTF-8 can go up to 2*21 - which is 2,097,152
127	// - enumerate ALL overlong encodings?
128	//
129	// - enumerate ALL sequences that correspond to code points in surrogate range
130	// - I think they're all 3 bytes long?
131
132	TEST exhaustive_test() {
133	long sum = 0;
134
135	for (int i = 0; i < (1 << 21); ++i) {
136	sum += i;
137	// Almost half these are out of range
138
139	// 2048 of them are surrogates!
140	}
141	printf("code points %ld\n", sum);
142
143	// How many overlong sequences?
144	// The key is making the leading byte 0 I think
145	// You can also have 2 or 3 leading zero bytes
146	//
147	// So you can test all these forms:
148	//
149	// [0 42 43 44] - should be [42 43 44]
150	//
151	// [0 0 42 43] - should be [42 43]
152	// [0 42 43]
153	//
154	// [0 0 0 42] - should be [42]
155	// [0 0 42]
156	// [0 42]
157	//
158	// Each continuation byte has 2*6 == 64 values.
159	// 2 byte sequences have 2*5 initial choicse
160	// 3 byte sequences have 2*4 initial choices
161	// 4 byte sequences have 2*3 initial choices
162
163	for (int i = 0; i < (1 << 8); ++i) {
164	sum += i;
165	}
166	printf("1 byte %ld\n", sum);
167
168	for (int i = 0; i < (1 << 16); ++i) {
169	sum += i;
170	}
171	printf("2 byte %ld\n", sum);
172
173	for (int i = 0; i < (1 << 24); ++i) {
174	sum += i;
175	}
176	printf("3 byte %ld\n", sum);
177
178	PASS();
179	}
180
181	TEST enumerate_utf8_test() {
182	// Valid digits, and some of them are overlong
183
184	// [a b c d] - 21 bits of info
185	int n = 0;
186	int overlong = 0;
187	for (int a = 0; a < (1 << 3); ++a) {
188	for (int b = 0; b < (1 << 6); ++b) {
189	for (int c = 0; c < (1 << 6); ++c) {
190	for (int d = 0; d < (1 << 6); ++d) {
191	n++;
192	if (a == 0) {
193	overlong++;
194	}
195	}
196	}
197	}
198	}
199	printf("4 byte: n = %10d, overlong = %10d, valid = %10d\n", n, overlong,
200	n - overlong);
201
202	// [a b c] - 4 + 6 + 6 = 16 bits of info
203	n = 0;
204	overlong = 0;
205	for (int a = 0; a < (1 << 4); ++a) {
206	for (int b = 0; b < (1 << 6); ++b) {
207	for (int c = 0; c < (1 << 6); ++c) {
208	n++;
209	if (a == 0) {
210	overlong++;
211	}
212	}
213	}
214	}
215	printf("3 byte: n = %10d, overlong = %10d, valid = %10d\n", n, overlong,
216	n - overlong);
217
218	// [a b] - 5 + 6 = 11 bits of info
219	n = 0;
220	overlong = 0;
221	for (int a = 0; a < (1 << 5); ++a) {
222	for (int b = 0; b < (1 << 6); ++b) {
223	n++;
224	if (a == 0) {
225	overlong++;
226	}
227	}
228	}
229	printf("2 byte: n = %10d, overlong = %10d, valid = %10d\n", n, overlong,
230	n - overlong);
231
232	// Types of errors
233	// Surrogate Range
234	// Overlong
235	//
236	// Not covered by this:
237	// Invalid sequence (start byte, continuation byte) - we are leaving this off
238
239	n = 0;
240	for (int a = 0; a < (1 << 8); ++a) {
241	n++;
242	}
243	printf("1 byte: n = %10d\n", n);
244
245	PASS();
246	}
247
248	TEST crockford_test() {
249	char* s = const_cast<char*>("hello\u03bc");
250	utf8_decode_init(s, strlen(s));
251
252	int c;
253	bool done = false;
254	while (!done) {
255	c = utf8_decode_next();
256
257	switch (c) {
258	case UTF8_END:
259	done = true;
260	break;
261
262	case UTF8_ERROR:
263	printf("decode error");
264	break;
265
266	default:
267	printf("U+%04X\n", c);
268	break;
269	}
270	}
271
272	PASS();
273	}
274
275	TEST loop_all_test() {
276	int sum = 0;
277	int sum_codepoint = 0;
278
279	int num_errors = 0;
280
281	// Takes 1.4 seconds to iterate through all possibilities
282	for (uint32_t i = 0;; ++i) {
283	/*
284	uint32_t state = 0;
285	uint32_t codepoint = 0;
286
287	uint8_t* bytes = reinterpret_cast<uint8_t*>(&i);
288	int result = decode(&state, &codepoint, bytes[0]);
289
290	if (result == UTF8_REJECT) {
291	num_errors += 1;
292	} else {
293	sum_codepoint += codepoint;
294	}
295	*/
296	sum += i;
297
298	if (i == 0xFFFFFFFF) { // Can't test this condition in the loop
299	break;
300	}
301	}
302	printf("sum %d\n", sum);
303	printf("sum_codepoint %d\n", sum_codepoint);
304	printf("num_errors %d\n", num_errors);
305	printf("2 ^21 = %d\n", 1 << 21);
306
307	PASS();
308	}
309
310	void PrintStates(const uint8_t* s) {
311	uint32_t codepoint;
312	uint32_t state = UTF8_ACCEPT;
313
314	int n = strlen(reinterpret_cast<const char*>(s));
315	for (int i = 0; i < n; ++i) {
316	decode(&state, &codepoint, s[i]);
317	printf("state %d = %d\n", i, state);
318	}
319	}
320
321	TEST surrogate_test() {
322	// \ud83e
323	printf(" surrogate in utf-8\n");
324	const uint8_t* s = reinterpret_cast<const uint8_t*>("\xed\xa0\xbe");
325	PrintStates(s);
326
327	// mu
328	printf(" mu in utf-8\n");
329	const uint8_t* t = reinterpret_cast<const uint8_t*>("\xce\xbc");
330	PrintStates(t);
331
332	PASS();
333	}
334
335	GREATEST_MAIN_DEFS();
336
337	int main(int argc, char** argv) {
338	GREATEST_MAIN_BEGIN();
339
340	RUN_TEST(dfa_test);
341	RUN_TEST(decode_all_test);
342	RUN_TEST(exhaustive_test);
343	RUN_TEST(enumerate_utf8_test);
344	RUN_TEST(crockford_test);
345	RUN_TEST(surrogate_test);
346
347	GREATEST_MAIN_END();
348	return 0;
349	}