data_lang/utf8

OILS / data_lang / utf8_test.cc View on Github | oilshell.org

448 lines, 230 significant

1	#include <inttypes.h>
2
3	#include "data_lang/utf8_impls/bjoern_dfa.h"
4	#include "data_lang/utf8_impls/utf8_decode.h"
5	#include "data_lang/utf8.h"
6
7	//#include "mycpp/runtime.h"
8	#include "vendor/greatest.h"
9
10	// Copied from UTF-8 proc
11	// https://github.com/JuliaStrings/utf8proc/blob/master/utf8proc.c#L177
12
13	int utf8proc_encode_char(uint32_t uc, uint8_t* dst) {
14	if (uc < 0x80) {
15	dst[0] = (uint8_t)uc;
16	return 1;
17	} else if (uc < 0x800) {
18	dst[0] = (uint8_t)(0xC0 + (uc >> 6));
19	dst[1] = (uint8_t)(0x80 + (uc & 0x3F));
20	return 2;
21	// Note: we allow encoding 0xd800-0xdfff here, so as not to change
22	// the API, however, these are actually invalid in UTF-8
23	} else if (uc < 0x10000) {
24	dst[0] = (uint8_t)(0xE0 + (uc >> 12));
25	dst[1] = (uint8_t)(0x80 + ((uc >> 6) & 0x3F));
26	dst[2] = (uint8_t)(0x80 + (uc & 0x3F));
27	return 3;
28	} else if (uc < 0x110000) {
29	dst[0] = (uint8_t)(0xF0 + (uc >> 18));
30	dst[1] = (uint8_t)(0x80 + ((uc >> 12) & 0x3F));
31	dst[2] = (uint8_t)(0x80 + ((uc >> 6) & 0x3F));
32	dst[3] = (uint8_t)(0x80 + (uc & 0x3F));
33	return 4;
34	} else
35	return 0;
36	}
37
38	void printCodePoints(const uint8_t* s) {
39	uint32_t codepoint;
40	uint32_t state = 0;
41
42	for (; *s; ++s)
43	if (!decode(&state, &codepoint, *s)) {
44	printf("U+%04X\n", codepoint);
45	}
46
47	if (state != UTF8_ACCEPT) printf("The string is not well-formed\n");
48	}
49
50	TEST dfa_test() {
51	const char* s = "hello\u03bc";
52	printCodePoints(reinterpret_cast<const uint8_t*>(s));
53
54	PASS();
55	}
56
57	uint32_t num_code_points = 17 * 1 << 16;
58
59	// uint32_t num_code_points = 1 << 21; // DFA has 983,040 disagreements
60
61	TEST decode_all_test() {
62	int num_disagree = 0;
63	int num_reject = 0;
64
65	// 17 "planes" of 2^16 - https://en.wikipedia.org/wiki/Unicode
66	// UTF-8 can represent more numbers (up to 21 bits), but this state machine
67	// doesn't need to handle them.
68
69	uint64_t sum_codepoint = 0;
70
71	for (uint32_t i = 0; i < num_code_points; ++i) {
72	uint8_t bytes[4] = {0};
73	int num_bytes = utf8proc_encode_char(i, bytes);
74	// printf("i = %d, num_bytes = %d\n", i, num_bytes);
75
76	uint32_t state = 0;
77	uint32_t codepoint = 0;
78	for (int j = 0; j < num_bytes; ++j) {
79	decode(&state, &codepoint, bytes[j]);
80	}
81	if (state == UTF8_ACCEPT) {
82	ASSERT_EQ_FMT(i, codepoint, "%d");
83	if (i != codepoint) {
84	num_disagree += 1;
85	}
86	} else {
87	num_reject += 1;
88	}
89
90	sum_codepoint += i;
91	}
92
93	printf("sum_codepoint = %ld\n", sum_codepoint);
94	printf("num_disagree = %d\n", num_disagree);
95	printf("num_reject = %d\n", num_reject);
96
97	PASS();
98	}
99
100	// Exhaustive DFA testing strategies
101	//
102	// All code points: 1,114,112 of them
103	//
104	// - Encode them all
105	// - Decode each byte
106	// - end state should be UTF8_ACCEPT, except for 2048 in surrogate range
107
108	// - code point should match
109	//
110	// All byte sequences
111	//
112	// 1. 256 1-byte sequences
113	// 2. 63,536 2-byte sequences
114	// - (ASCII, ASCII) will be 25% of these
115	// 3. 2**24 = 24 Mi 3-byte sequences
116	// overlapping cases
117	// 1 + 1 + 1
118	// 1 + 2
119	// 2 + 1
120	//
121	// 4. All 2**32 = 4 Gi 4-byte sequences? Possible but slightly slow
122	// maybe explore this in a smarter way
123
124	// Generating errors
125	//
126	// - enumerate all out of range
127	// - UTF-8 can go up to 2*21 - which is 2,097,152
128	// - enumerate ALL overlong encodings?
129	//
130	// - enumerate ALL sequences that correspond to code points in surrogate range
131	// - I think they're all 3 bytes long?
132
133	TEST exhaustive_test() {
134	long sum = 0;
135
136	for (int i = 0; i < (1 << 21); ++i) {
137	sum += i;
138	// Almost half these are out of range
139
140	// 2048 of them are surrogates!
141	}
142	printf("code points %ld\n", sum);
143
144	// How many overlong sequences?
145	// The key is making the leading byte 0 I think
146	// You can also have 2 or 3 leading zero bytes
147	//
148	// So you can test all these forms:
149	//
150	// [0 42 43 44] - should be [42 43 44]
151	//
152	// [0 0 42 43] - should be [42 43]
153	// [0 42 43]
154	//
155	// [0 0 0 42] - should be [42]
156	// [0 0 42]
157	// [0 42]
158	//
159	// Each continuation byte has 2*6 == 64 values.
160	// 2 byte sequences have 2*5 initial choicse
161	// 3 byte sequences have 2*4 initial choices
162	// 4 byte sequences have 2*3 initial choices
163
164	for (int i = 0; i < (1 << 8); ++i) {
165	sum += i;
166	}
167	printf("1 byte %ld\n", sum);
168
169	for (int i = 0; i < (1 << 16); ++i) {
170	sum += i;
171	}
172	printf("2 byte %ld\n", sum);
173
174	for (int i = 0; i < (1 << 24); ++i) {
175	sum += i;
176	}
177	printf("3 byte %ld\n", sum);
178
179	PASS();
180	}
181
182	TEST enumerate_utf8_test() {
183	// Valid digits, and some of them are overlong
184
185	// [a b c d] - 21 bits of info
186	int n = 0;
187	int overlong = 0;
188	for (int a = 0; a < (1 << 3); ++a) {
189	for (int b = 0; b < (1 << 6); ++b) {
190	for (int c = 0; c < (1 << 6); ++c) {
191	for (int d = 0; d < (1 << 6); ++d) {
192	n++;
193	if (a == 0) {
194	overlong++;
195	}
196	}
197	}
198	}
199	}
200	printf("4 byte: n = %10d, overlong = %10d, valid = %10d\n", n, overlong,
201	n - overlong);
202
203	// [a b c] - 4 + 6 + 6 = 16 bits of info
204	n = 0;
205	overlong = 0;
206	for (int a = 0; a < (1 << 4); ++a) {
207	for (int b = 0; b < (1 << 6); ++b) {
208	for (int c = 0; c < (1 << 6); ++c) {
209	n++;
210	if (a == 0) {
211	overlong++;
212	}
213	}
214	}
215	}
216	printf("3 byte: n = %10d, overlong = %10d, valid = %10d\n", n, overlong,
217	n - overlong);
218
219	// [a b] - 5 + 6 = 11 bits of info
220	n = 0;
221	overlong = 0;
222	for (int a = 0; a < (1 << 5); ++a) {
223	for (int b = 0; b < (1 << 6); ++b) {
224	n++;
225	if (a == 0) {
226	overlong++;
227	}
228	}
229	}
230	printf("2 byte: n = %10d, overlong = %10d, valid = %10d\n", n, overlong,
231	n - overlong);
232
233	// Types of errors
234	// Surrogate Range
235	// Overlong
236	//
237	// Not covered by this:
238	// Invalid sequence (start byte, continuation byte) - we are leaving this off
239
240	n = 0;
241	for (int a = 0; a < (1 << 8); ++a) {
242	n++;
243	}
244	printf("1 byte: n = %10d\n", n);
245
246	PASS();
247	}
248
249	TEST crockford_test() {
250	char* s = const_cast<char*>("hello\u03bc");
251	utf8_decode_init(s, strlen(s));
252
253	int c;
254	bool done = false;
255	while (!done) {
256	c = utf8_decode_next();
257
258	switch (c) {
259	case UTF8_END:
260	done = true;
261	break;
262
263	case UTF8_ERROR:
264	printf("decode error");
265	break;
266
267	default:
268	printf("U+%04X\n", c);
269	break;
270	}
271	}
272
273	PASS();
274	}
275
276	TEST loop_all_test() {
277	int sum = 0;
278	int sum_codepoint = 0;
279
280	int num_errors = 0;
281
282	// Takes 1.4 seconds to iterate through all possibilities
283	for (uint32_t i = 0;; ++i) {
284	/*
285	uint32_t state = 0;
286	uint32_t codepoint = 0;
287
288	uint8_t* bytes = reinterpret_cast<uint8_t*>(&i);
289	int result = decode(&state, &codepoint, bytes[0]);
290
291	if (result == UTF8_REJECT) {
292	num_errors += 1;
293	} else {
294	sum_codepoint += codepoint;
295	}
296	*/
297	sum += i;
298
299	if (i == 0xFFFFFFFF) { // Can't test this condition in the loop
300	break;
301	}
302	}
303	printf("sum %d\n", sum);
304	printf("sum_codepoint %d\n", sum_codepoint);
305	printf("num_errors %d\n", num_errors);
306	printf("2 ^21 = %d\n", 1 << 21);
307
308	PASS();
309	}
310
311	void PrintStates(const uint8_t* s) {
312	uint32_t codepoint;
313	uint32_t state = UTF8_ACCEPT;
314
315	int n = strlen(reinterpret_cast<const char*>(s));
316	for (int i = 0; i < n; ++i) {
317	decode(&state, &codepoint, s[i]);
318	printf("state %d = %d\n", i, state);
319	}
320	}
321
322	TEST surrogate_test() {
323	// \ud83e
324	printf(" surrogate in utf-8\n");
325	const uint8_t* s = reinterpret_cast<const uint8_t*>("\xed\xa0\xbe");
326	PrintStates(s);
327
328	// mu
329	printf(" mu in utf-8\n");
330	const uint8_t* t = reinterpret_cast<const uint8_t*>("\xce\xbc");
331	PrintStates(t);
332
333	PASS();
334	}
335
336	TEST identity_test() {
337	// check that decode(encode(x)) = x for all code-points (and surrogates)
338	uint8_t buf[5] = {0};
339	for (uint32_t cp = 1; cp < 0x10FFFF; ++cp) {
340	int len = utf8proc_encode_char(cp, buf);
341	Utf8Result result;
342	utf8_decode(buf, &result);
343
344	if (cp < 0xD800 \|\| cp > 0xDFFF) {
345	ASSERT_EQ(result.error, UTF8_OK);
346	} else {
347	ASSERT_EQ(result.error, UTF8_ERR_SURROGATE);
348	}
349	ASSERT_EQ(result.codepoint, cp);
350	ASSERT_EQ(result.bytes_read, static_cast<size_t>(len));
351	}
352
353	PASS();
354	}
355
356	TEST overlong_test() {
357	// All encode U+41 ('A')
358	Utf8Result ok, overlong2, overlong3, overlong4;
359	utf8_decode((unsigned char*)"\x41", &ok);
360	utf8_decode((unsigned char*)"\xC1\x81", &overlong2);
361	utf8_decode((unsigned char*)"\xE0\x81\x81", &overlong3);
362	utf8_decode((unsigned char*)"\xF0\x80\x81\x81", &overlong4);
363
364	ASSERT_EQ(ok.error, UTF8_OK);
365	ASSERT_EQ(overlong2.error, UTF8_ERR_OVERLONG);
366	ASSERT_EQ(overlong3.error, UTF8_ERR_OVERLONG);
367	ASSERT_EQ(overlong4.error, UTF8_ERR_OVERLONG);
368
369	ASSERT_EQ(ok.codepoint, 0x41);
370	ASSERT_EQ(overlong2.codepoint, 0x41);
371	ASSERT_EQ(overlong3.codepoint, 0x41);
372	ASSERT_EQ(overlong4.codepoint, 0x41);
373
374	ASSERT_EQ(ok.bytes_read, 1);
375	ASSERT_EQ(overlong2.bytes_read, 2);
376	ASSERT_EQ(overlong3.bytes_read, 3);
377	ASSERT_EQ(overlong4.bytes_read, 4);
378
379	PASS();
380	}
381
382	TEST too_large_test() {
383	// Encoding of 0x111111 (via Table 3-6)
384	// = 00010001 00010001 00010001
385	// uuuuu -> 10001
386	// zzzz -> 00001
387	// yyyyyy -> 000100
388	// xxxxxx -> 010001
389	//
390	// -> 11110100 10010001 10000100 10010001
391	// = F4 91 84 91
392	Utf8Result result;
393	utf8_decode((unsigned char*)"\xF4\x91\x84\x91", &result);
394
395	ASSERT_EQ(result.error, UTF8_ERR_TOO_LARGE);
396	ASSERT_EQ(result.codepoint, 0x111111);
397	ASSERT_EQ(result.bytes_read, 4);
398
399	PASS();
400	}
401
402	TEST truncated_test() {
403	Utf8Result result;
404
405	constexpr const int NUM_INPUTS = 6;
406	const char *inputs[NUM_INPUTS] = {
407	"\xC5",
408	"\xED",
409	"\xED\x9F",
410	"\xF4",
411	"\xF4\x80",
412	"\xF4\x80\x80",
413	};
414
415	for (int i = 0; i < NUM_INPUTS; i++) {
416	utf8_decode((unsigned char*)inputs[i], &result);
417	ASSERT_EQ(result.error, UTF8_ERR_TRUNCATED_BYTES);
418	ASSERT_EQ(result.bytes_read, strlen(inputs[i]));
419	}
420
421	// End of stream is separate from truncated
422	utf8_decode((unsigned char*)"", &result);
423	ASSERT_EQ(result.error, UTF8_ERR_END_OF_STREAM);
424	ASSERT_EQ(result.bytes_read, 0);
425
426	PASS();
427	}
428
429	GREATEST_MAIN_DEFS();
430
431	int main(int argc, char** argv) {
432	GREATEST_MAIN_BEGIN();
433
434	RUN_TEST(dfa_test);
435	RUN_TEST(decode_all_test);
436	RUN_TEST(exhaustive_test);
437	RUN_TEST(enumerate_utf8_test);
438	RUN_TEST(crockford_test);
439	RUN_TEST(surrogate_test);
440
441	RUN_TEST(identity_test);
442	RUN_TEST(overlong_test);
443	RUN_TEST(too_large_test);
444	RUN_TEST(truncated_test);
445
446	GREATEST_MAIN_END();
447	return 0;
448	}