OILS / data_lang / utf8_test.cc View on Github | oilshell.org

448 lines, 230 significant
1#include <inttypes.h>
2
3#include "data_lang/utf8_impls/bjoern_dfa.h"
4#include "data_lang/utf8_impls/utf8_decode.h"
5#include "data_lang/utf8.h"
6
7//#include "mycpp/runtime.h"
8#include "vendor/greatest.h"
9
10// Copied from UTF-8 proc
11// https://github.com/JuliaStrings/utf8proc/blob/master/utf8proc.c#L177
12
13int utf8proc_encode_char(uint32_t uc, uint8_t* dst) {
14 if (uc < 0x80) {
15 dst[0] = (uint8_t)uc;
16 return 1;
17 } else if (uc < 0x800) {
18 dst[0] = (uint8_t)(0xC0 + (uc >> 6));
19 dst[1] = (uint8_t)(0x80 + (uc & 0x3F));
20 return 2;
21 // Note: we allow encoding 0xd800-0xdfff here, so as not to change
22 // the API, however, these are actually invalid in UTF-8
23 } else if (uc < 0x10000) {
24 dst[0] = (uint8_t)(0xE0 + (uc >> 12));
25 dst[1] = (uint8_t)(0x80 + ((uc >> 6) & 0x3F));
26 dst[2] = (uint8_t)(0x80 + (uc & 0x3F));
27 return 3;
28 } else if (uc < 0x110000) {
29 dst[0] = (uint8_t)(0xF0 + (uc >> 18));
30 dst[1] = (uint8_t)(0x80 + ((uc >> 12) & 0x3F));
31 dst[2] = (uint8_t)(0x80 + ((uc >> 6) & 0x3F));
32 dst[3] = (uint8_t)(0x80 + (uc & 0x3F));
33 return 4;
34 } else
35 return 0;
36}
37
38void printCodePoints(const uint8_t* s) {
39 uint32_t codepoint;
40 uint32_t state = 0;
41
42 for (; *s; ++s)
43 if (!decode(&state, &codepoint, *s)) {
44 printf("U+%04X\n", codepoint);
45 }
46
47 if (state != UTF8_ACCEPT) printf("The string is not well-formed\n");
48}
49
50TEST dfa_test() {
51 const char* s = "hello\u03bc";
52 printCodePoints(reinterpret_cast<const uint8_t*>(s));
53
54 PASS();
55}
56
57uint32_t num_code_points = 17 * 1 << 16;
58
59// uint32_t num_code_points = 1 << 21; // DFA has 983,040 disagreements
60
61TEST decode_all_test() {
62 int num_disagree = 0;
63 int num_reject = 0;
64
65 // 17 "planes" of 2^16 - https://en.wikipedia.org/wiki/Unicode
66 // UTF-8 can represent more numbers (up to 21 bits), but this state machine
67 // doesn't need to handle them.
68
69 uint64_t sum_codepoint = 0;
70
71 for (uint32_t i = 0; i < num_code_points; ++i) {
72 uint8_t bytes[4] = {0};
73 int num_bytes = utf8proc_encode_char(i, bytes);
74 // printf("i = %d, num_bytes = %d\n", i, num_bytes);
75
76 uint32_t state = 0;
77 uint32_t codepoint = 0;
78 for (int j = 0; j < num_bytes; ++j) {
79 decode(&state, &codepoint, bytes[j]);
80 }
81 if (state == UTF8_ACCEPT) {
82 ASSERT_EQ_FMT(i, codepoint, "%d");
83 if (i != codepoint) {
84 num_disagree += 1;
85 }
86 } else {
87 num_reject += 1;
88 }
89
90 sum_codepoint += i;
91 }
92
93 printf("sum_codepoint = %ld\n", sum_codepoint);
94 printf("num_disagree = %d\n", num_disagree);
95 printf("num_reject = %d\n", num_reject);
96
97 PASS();
98}
99
100// Exhaustive DFA testing strategies
101//
102// All code points: 1,114,112 of them
103//
104// - Encode them all
105// - Decode each byte
106// - end state should be UTF8_ACCEPT, except for 2048 in surrogate range
107
108// - code point should match
109//
110// All byte sequences
111//
112// 1. 256 1-byte sequences
113// 2. 63,536 2-byte sequences
114// - (ASCII, ASCII) will be 25% of these
115// 3. 2**24 = 24 Mi 3-byte sequences
116// overlapping cases
117// 1 + 1 + 1
118// 1 + 2
119// 2 + 1
120//
121// 4. All 2**32 = 4 Gi 4-byte sequences? Possible but slightly slow
122// maybe explore this in a smarter way
123
124// Generating errors
125//
126// - enumerate all out of range
127// - UTF-8 can go up to 2*21 - which is 2,097,152
128// - enumerate ALL overlong encodings?
129//
130// - enumerate ALL sequences that correspond to code points in surrogate range
131// - I think they're all 3 bytes long?
132
133TEST exhaustive_test() {
134 long sum = 0;
135
136 for (int i = 0; i < (1 << 21); ++i) {
137 sum += i;
138 // Almost half these are out of range
139
140 // 2048 of them are surrogates!
141 }
142 printf("code points %ld\n", sum);
143
144 // How many overlong sequences?
145 // The key is making the leading byte 0 I think
146 // You can also have 2 or 3 leading zero bytes
147 //
148 // So you can test all these forms:
149 //
150 // [0 42 43 44] - should be [42 43 44]
151 //
152 // [0 0 42 43] - should be [42 43]
153 // [0 42 43]
154 //
155 // [0 0 0 42] - should be [42]
156 // [0 0 42]
157 // [0 42]
158 //
159 // Each continuation byte has 2*6 == 64 values.
160 // 2 byte sequences have 2*5 initial choicse
161 // 3 byte sequences have 2*4 initial choices
162 // 4 byte sequences have 2*3 initial choices
163
164 for (int i = 0; i < (1 << 8); ++i) {
165 sum += i;
166 }
167 printf("1 byte %ld\n", sum);
168
169 for (int i = 0; i < (1 << 16); ++i) {
170 sum += i;
171 }
172 printf("2 byte %ld\n", sum);
173
174 for (int i = 0; i < (1 << 24); ++i) {
175 sum += i;
176 }
177 printf("3 byte %ld\n", sum);
178
179 PASS();
180}
181
182TEST enumerate_utf8_test() {
183 // Valid digits, and some of them are overlong
184
185 // [a b c d] - 21 bits of info
186 int n = 0;
187 int overlong = 0;
188 for (int a = 0; a < (1 << 3); ++a) {
189 for (int b = 0; b < (1 << 6); ++b) {
190 for (int c = 0; c < (1 << 6); ++c) {
191 for (int d = 0; d < (1 << 6); ++d) {
192 n++;
193 if (a == 0) {
194 overlong++;
195 }
196 }
197 }
198 }
199 }
200 printf("4 byte: n = %10d, overlong = %10d, valid = %10d\n", n, overlong,
201 n - overlong);
202
203 // [a b c] - 4 + 6 + 6 = 16 bits of info
204 n = 0;
205 overlong = 0;
206 for (int a = 0; a < (1 << 4); ++a) {
207 for (int b = 0; b < (1 << 6); ++b) {
208 for (int c = 0; c < (1 << 6); ++c) {
209 n++;
210 if (a == 0) {
211 overlong++;
212 }
213 }
214 }
215 }
216 printf("3 byte: n = %10d, overlong = %10d, valid = %10d\n", n, overlong,
217 n - overlong);
218
219 // [a b] - 5 + 6 = 11 bits of info
220 n = 0;
221 overlong = 0;
222 for (int a = 0; a < (1 << 5); ++a) {
223 for (int b = 0; b < (1 << 6); ++b) {
224 n++;
225 if (a == 0) {
226 overlong++;
227 }
228 }
229 }
230 printf("2 byte: n = %10d, overlong = %10d, valid = %10d\n", n, overlong,
231 n - overlong);
232
233 // Types of errors
234 // Surrogate Range
235 // Overlong
236 //
237 // Not covered by this:
238 // Invalid sequence (start byte, continuation byte) - we are leaving this off
239
240 n = 0;
241 for (int a = 0; a < (1 << 8); ++a) {
242 n++;
243 }
244 printf("1 byte: n = %10d\n", n);
245
246 PASS();
247}
248
249TEST crockford_test() {
250 char* s = const_cast<char*>("hello\u03bc");
251 utf8_decode_init(s, strlen(s));
252
253 int c;
254 bool done = false;
255 while (!done) {
256 c = utf8_decode_next();
257
258 switch (c) {
259 case UTF8_END:
260 done = true;
261 break;
262
263 case UTF8_ERROR:
264 printf("decode error");
265 break;
266
267 default:
268 printf("U+%04X\n", c);
269 break;
270 }
271 }
272
273 PASS();
274}
275
276TEST loop_all_test() {
277 int sum = 0;
278 int sum_codepoint = 0;
279
280 int num_errors = 0;
281
282 // Takes 1.4 seconds to iterate through all possibilities
283 for (uint32_t i = 0;; ++i) {
284 /*
285 uint32_t state = 0;
286 uint32_t codepoint = 0;
287
288 uint8_t* bytes = reinterpret_cast<uint8_t*>(&i);
289 int result = decode(&state, &codepoint, bytes[0]);
290
291 if (result == UTF8_REJECT) {
292 num_errors += 1;
293 } else {
294 sum_codepoint += codepoint;
295 }
296 */
297 sum += i;
298
299 if (i == 0xFFFFFFFF) { // Can't test this condition in the loop
300 break;
301 }
302 }
303 printf("sum %d\n", sum);
304 printf("sum_codepoint %d\n", sum_codepoint);
305 printf("num_errors %d\n", num_errors);
306 printf("2 ^21 = %d\n", 1 << 21);
307
308 PASS();
309}
310
311void PrintStates(const uint8_t* s) {
312 uint32_t codepoint;
313 uint32_t state = UTF8_ACCEPT;
314
315 int n = strlen(reinterpret_cast<const char*>(s));
316 for (int i = 0; i < n; ++i) {
317 decode(&state, &codepoint, s[i]);
318 printf("state %d = %d\n", i, state);
319 }
320}
321
322TEST surrogate_test() {
323 // \ud83e
324 printf(" surrogate in utf-8\n");
325 const uint8_t* s = reinterpret_cast<const uint8_t*>("\xed\xa0\xbe");
326 PrintStates(s);
327
328 // mu
329 printf(" mu in utf-8\n");
330 const uint8_t* t = reinterpret_cast<const uint8_t*>("\xce\xbc");
331 PrintStates(t);
332
333 PASS();
334}
335
336TEST identity_test() {
337 // check that decode(encode(x)) = x for all code-points (and surrogates)
338 uint8_t buf[5] = {0};
339 for (uint32_t cp = 1; cp < 0x10FFFF; ++cp) {
340 int len = utf8proc_encode_char(cp, buf);
341 Utf8Result result;
342 utf8_decode(buf, &result);
343
344 if (cp < 0xD800 || cp > 0xDFFF) {
345 ASSERT_EQ(result.error, UTF8_OK);
346 } else {
347 ASSERT_EQ(result.error, UTF8_ERR_SURROGATE);
348 }
349 ASSERT_EQ(result.codepoint, cp);
350 ASSERT_EQ(result.bytes_read, static_cast<size_t>(len));
351 }
352
353 PASS();
354}
355
356TEST overlong_test() {
357 // All encode U+41 ('A')
358 Utf8Result ok, overlong2, overlong3, overlong4;
359 utf8_decode((unsigned char*)"\x41", &ok);
360 utf8_decode((unsigned char*)"\xC1\x81", &overlong2);
361 utf8_decode((unsigned char*)"\xE0\x81\x81", &overlong3);
362 utf8_decode((unsigned char*)"\xF0\x80\x81\x81", &overlong4);
363
364 ASSERT_EQ(ok.error, UTF8_OK);
365 ASSERT_EQ(overlong2.error, UTF8_ERR_OVERLONG);
366 ASSERT_EQ(overlong3.error, UTF8_ERR_OVERLONG);
367 ASSERT_EQ(overlong4.error, UTF8_ERR_OVERLONG);
368
369 ASSERT_EQ(ok.codepoint, 0x41);
370 ASSERT_EQ(overlong2.codepoint, 0x41);
371 ASSERT_EQ(overlong3.codepoint, 0x41);
372 ASSERT_EQ(overlong4.codepoint, 0x41);
373
374 ASSERT_EQ(ok.bytes_read, 1);
375 ASSERT_EQ(overlong2.bytes_read, 2);
376 ASSERT_EQ(overlong3.bytes_read, 3);
377 ASSERT_EQ(overlong4.bytes_read, 4);
378
379 PASS();
380}
381
382TEST too_large_test() {
383 // Encoding of 0x111111 (via Table 3-6)
384 // = 00010001 00010001 00010001
385 // uuuuu -> 10001
386 // zzzz -> 00001
387 // yyyyyy -> 000100
388 // xxxxxx -> 010001
389 //
390 // -> 11110100 10010001 10000100 10010001
391 // = F4 91 84 91
392 Utf8Result result;
393 utf8_decode((unsigned char*)"\xF4\x91\x84\x91", &result);
394
395 ASSERT_EQ(result.error, UTF8_ERR_TOO_LARGE);
396 ASSERT_EQ(result.codepoint, 0x111111);
397 ASSERT_EQ(result.bytes_read, 4);
398
399 PASS();
400}
401
402TEST truncated_test() {
403 Utf8Result result;
404
405 constexpr const int NUM_INPUTS = 6;
406 const char *inputs[NUM_INPUTS] = {
407 "\xC5",
408 "\xED",
409 "\xED\x9F",
410 "\xF4",
411 "\xF4\x80",
412 "\xF4\x80\x80",
413 };
414
415 for (int i = 0; i < NUM_INPUTS; i++) {
416 utf8_decode((unsigned char*)inputs[i], &result);
417 ASSERT_EQ(result.error, UTF8_ERR_TRUNCATED_BYTES);
418 ASSERT_EQ(result.bytes_read, strlen(inputs[i]));
419 }
420
421 // End of stream is separate from truncated
422 utf8_decode((unsigned char*)"", &result);
423 ASSERT_EQ(result.error, UTF8_ERR_END_OF_STREAM);
424 ASSERT_EQ(result.bytes_read, 0);
425
426 PASS();
427}
428
429GREATEST_MAIN_DEFS();
430
431int main(int argc, char** argv) {
432 GREATEST_MAIN_BEGIN();
433
434 RUN_TEST(dfa_test);
435 RUN_TEST(decode_all_test);
436 RUN_TEST(exhaustive_test);
437 RUN_TEST(enumerate_utf8_test);
438 RUN_TEST(crockford_test);
439 RUN_TEST(surrogate_test);
440
441 RUN_TEST(identity_test);
442 RUN_TEST(overlong_test);
443 RUN_TEST(too_large_test);
444 RUN_TEST(truncated_test);
445
446 GREATEST_MAIN_END();
447 return 0;
448}