OILS / data_lang / utf8_test.cc View on Github | oilshell.org

349 lines, 167 significant
1#include <inttypes.h>
2
3#include "data_lang/utf8_impls/bjoern_dfa.h"
4#include "data_lang/utf8_impls/utf8_decode.h"
5
6//#include "mycpp/runtime.h"
7#include "vendor/greatest.h"
8
9// Copied from UTF-8 proc
10// https://github.com/JuliaStrings/utf8proc/blob/master/utf8proc.c#L177
11
12int utf8proc_encode_char(uint32_t uc, uint8_t* dst) {
13 if (uc < 0x80) {
14 dst[0] = (uint8_t)uc;
15 return 1;
16 } else if (uc < 0x800) {
17 dst[0] = (uint8_t)(0xC0 + (uc >> 6));
18 dst[1] = (uint8_t)(0x80 + (uc & 0x3F));
19 return 2;
20 // Note: we allow encoding 0xd800-0xdfff here, so as not to change
21 // the API, however, these are actually invalid in UTF-8
22 } else if (uc < 0x10000) {
23 dst[0] = (uint8_t)(0xE0 + (uc >> 12));
24 dst[1] = (uint8_t)(0x80 + ((uc >> 6) & 0x3F));
25 dst[2] = (uint8_t)(0x80 + (uc & 0x3F));
26 return 3;
27 } else if (uc < 0x110000) {
28 dst[0] = (uint8_t)(0xF0 + (uc >> 18));
29 dst[1] = (uint8_t)(0x80 + ((uc >> 12) & 0x3F));
30 dst[2] = (uint8_t)(0x80 + ((uc >> 6) & 0x3F));
31 dst[3] = (uint8_t)(0x80 + (uc & 0x3F));
32 return 4;
33 } else
34 return 0;
35}
36
37void printCodePoints(const uint8_t* s) {
38 uint32_t codepoint;
39 uint32_t state = 0;
40
41 for (; *s; ++s)
42 if (!decode(&state, &codepoint, *s)) {
43 printf("U+%04X\n", codepoint);
44 }
45
46 if (state != UTF8_ACCEPT) printf("The string is not well-formed\n");
47}
48
49TEST dfa_test() {
50 const char* s = "hello\u03bc";
51 printCodePoints(reinterpret_cast<const uint8_t*>(s));
52
53 PASS();
54}
55
56uint32_t num_code_points = 17 * 1 << 16;
57
58// uint32_t num_code_points = 1 << 21; // DFA has 983,040 disagreements
59
60TEST decode_all_test() {
61 int num_disagree = 0;
62 int num_reject = 0;
63
64 // 17 "planes" of 2^16 - https://en.wikipedia.org/wiki/Unicode
65 // UTF-8 can represent more numbers (up to 21 bits), but this state machine
66 // doesn't need to handle them.
67
68 uint64_t sum_codepoint = 0;
69
70 for (uint32_t i = 0; i < num_code_points; ++i) {
71 uint8_t bytes[4] = {0};
72 int num_bytes = utf8proc_encode_char(i, bytes);
73 // printf("i = %d, num_bytes = %d\n", i, num_bytes);
74
75 uint32_t state = 0;
76 uint32_t codepoint = 0;
77 for (int j = 0; j < num_bytes; ++j) {
78 decode(&state, &codepoint, bytes[j]);
79 }
80 if (state == UTF8_ACCEPT) {
81 ASSERT_EQ_FMT(i, codepoint, "%d");
82 if (i != codepoint) {
83 num_disagree += 1;
84 }
85 } else {
86 num_reject += 1;
87 }
88
89 sum_codepoint += i;
90 }
91
92 printf("sum_codepoint = %ld\n", sum_codepoint);
93 printf("num_disagree = %d\n", num_disagree);
94 printf("num_reject = %d\n", num_reject);
95
96 PASS();
97}
98
99// Exhaustive DFA testing strategies
100//
101// All code points: 1,114,112 of them
102//
103// - Encode them all
104// - Decode each byte
105// - end state should be UTF8_ACCEPT, except for 2048 in surrogate range
106
107// - code point should match
108//
109// All byte sequences
110//
111// 1. 256 1-byte sequences
112// 2. 63,536 2-byte sequences
113// - (ASCII, ASCII) will be 25% of these
114// 3. 2**24 = 24 Mi 3-byte sequences
115// overlapping cases
116// 1 + 1 + 1
117// 1 + 2
118// 2 + 1
119//
120// 4. All 2**32 = 4 Gi 4-byte sequences? Possible but slightly slow
121// maybe explore this in a smarter way
122
123// Generating errors
124//
125// - enumerate all out of range
126// - UTF-8 can go up to 2*21 - which is 2,097,152
127// - enumerate ALL overlong encodings?
128//
129// - enumerate ALL sequences that correspond to code points in surrogate range
130// - I think they're all 3 bytes long?
131
132TEST exhaustive_test() {
133 long sum = 0;
134
135 for (int i = 0; i < (1 << 21); ++i) {
136 sum += i;
137 // Almost half these are out of range
138
139 // 2048 of them are surrogates!
140 }
141 printf("code points %ld\n", sum);
142
143 // How many overlong sequences?
144 // The key is making the leading byte 0 I think
145 // You can also have 2 or 3 leading zero bytes
146 //
147 // So you can test all these forms:
148 //
149 // [0 42 43 44] - should be [42 43 44]
150 //
151 // [0 0 42 43] - should be [42 43]
152 // [0 42 43]
153 //
154 // [0 0 0 42] - should be [42]
155 // [0 0 42]
156 // [0 42]
157 //
158 // Each continuation byte has 2*6 == 64 values.
159 // 2 byte sequences have 2*5 initial choicse
160 // 3 byte sequences have 2*4 initial choices
161 // 4 byte sequences have 2*3 initial choices
162
163 for (int i = 0; i < (1 << 8); ++i) {
164 sum += i;
165 }
166 printf("1 byte %ld\n", sum);
167
168 for (int i = 0; i < (1 << 16); ++i) {
169 sum += i;
170 }
171 printf("2 byte %ld\n", sum);
172
173 for (int i = 0; i < (1 << 24); ++i) {
174 sum += i;
175 }
176 printf("3 byte %ld\n", sum);
177
178 PASS();
179}
180
181TEST enumerate_utf8_test() {
182 // Valid digits, and some of them are overlong
183
184 // [a b c d] - 21 bits of info
185 int n = 0;
186 int overlong = 0;
187 for (int a = 0; a < (1 << 3); ++a) {
188 for (int b = 0; b < (1 << 6); ++b) {
189 for (int c = 0; c < (1 << 6); ++c) {
190 for (int d = 0; d < (1 << 6); ++d) {
191 n++;
192 if (a == 0) {
193 overlong++;
194 }
195 }
196 }
197 }
198 }
199 printf("4 byte: n = %10d, overlong = %10d, valid = %10d\n", n, overlong,
200 n - overlong);
201
202 // [a b c] - 4 + 6 + 6 = 16 bits of info
203 n = 0;
204 overlong = 0;
205 for (int a = 0; a < (1 << 4); ++a) {
206 for (int b = 0; b < (1 << 6); ++b) {
207 for (int c = 0; c < (1 << 6); ++c) {
208 n++;
209 if (a == 0) {
210 overlong++;
211 }
212 }
213 }
214 }
215 printf("3 byte: n = %10d, overlong = %10d, valid = %10d\n", n, overlong,
216 n - overlong);
217
218 // [a b] - 5 + 6 = 11 bits of info
219 n = 0;
220 overlong = 0;
221 for (int a = 0; a < (1 << 5); ++a) {
222 for (int b = 0; b < (1 << 6); ++b) {
223 n++;
224 if (a == 0) {
225 overlong++;
226 }
227 }
228 }
229 printf("2 byte: n = %10d, overlong = %10d, valid = %10d\n", n, overlong,
230 n - overlong);
231
232 // Types of errors
233 // Surrogate Range
234 // Overlong
235 //
236 // Not covered by this:
237 // Invalid sequence (start byte, continuation byte) - we are leaving this off
238
239 n = 0;
240 for (int a = 0; a < (1 << 8); ++a) {
241 n++;
242 }
243 printf("1 byte: n = %10d\n", n);
244
245 PASS();
246}
247
248TEST crockford_test() {
249 char* s = const_cast<char*>("hello\u03bc");
250 utf8_decode_init(s, strlen(s));
251
252 int c;
253 bool done = false;
254 while (!done) {
255 c = utf8_decode_next();
256
257 switch (c) {
258 case UTF8_END:
259 done = true;
260 break;
261
262 case UTF8_ERROR:
263 printf("decode error");
264 break;
265
266 default:
267 printf("U+%04X\n", c);
268 break;
269 }
270 }
271
272 PASS();
273}
274
275TEST loop_all_test() {
276 int sum = 0;
277 int sum_codepoint = 0;
278
279 int num_errors = 0;
280
281 // Takes 1.4 seconds to iterate through all possibilities
282 for (uint32_t i = 0;; ++i) {
283 /*
284 uint32_t state = 0;
285 uint32_t codepoint = 0;
286
287 uint8_t* bytes = reinterpret_cast<uint8_t*>(&i);
288 int result = decode(&state, &codepoint, bytes[0]);
289
290 if (result == UTF8_REJECT) {
291 num_errors += 1;
292 } else {
293 sum_codepoint += codepoint;
294 }
295 */
296 sum += i;
297
298 if (i == 0xFFFFFFFF) { // Can't test this condition in the loop
299 break;
300 }
301 }
302 printf("sum %d\n", sum);
303 printf("sum_codepoint %d\n", sum_codepoint);
304 printf("num_errors %d\n", num_errors);
305 printf("2 ^21 = %d\n", 1 << 21);
306
307 PASS();
308}
309
310void PrintStates(const uint8_t* s) {
311 uint32_t codepoint;
312 uint32_t state = UTF8_ACCEPT;
313
314 int n = strlen(reinterpret_cast<const char*>(s));
315 for (int i = 0; i < n; ++i) {
316 decode(&state, &codepoint, s[i]);
317 printf("state %d = %d\n", i, state);
318 }
319}
320
321TEST surrogate_test() {
322 // \ud83e
323 printf(" surrogate in utf-8\n");
324 const uint8_t* s = reinterpret_cast<const uint8_t*>("\xed\xa0\xbe");
325 PrintStates(s);
326
327 // mu
328 printf(" mu in utf-8\n");
329 const uint8_t* t = reinterpret_cast<const uint8_t*>("\xce\xbc");
330 PrintStates(t);
331
332 PASS();
333}
334
335GREATEST_MAIN_DEFS();
336
337int main(int argc, char** argv) {
338 GREATEST_MAIN_BEGIN();
339
340 RUN_TEST(dfa_test);
341 RUN_TEST(decode_all_test);
342 RUN_TEST(exhaustive_test);
343 RUN_TEST(enumerate_utf8_test);
344 RUN_TEST(crockford_test);
345 RUN_TEST(surrogate_test);
346
347 GREATEST_MAIN_END();
348 return 0;
349}