| 1 | #include <inttypes.h>
|
| 2 |
|
| 3 | #include "data_lang/utf8_impls/bjoern_dfa.h"
|
| 4 | #include "data_lang/utf8_impls/utf8_decode.h"
|
| 5 | #include "data_lang/utf8.h"
|
| 6 |
|
| 7 | //#include "mycpp/runtime.h"
|
| 8 | #include "vendor/greatest.h"
|
| 9 |
|
| 10 | // Copied from UTF-8 proc
|
| 11 | // https://github.com/JuliaStrings/utf8proc/blob/master/utf8proc.c#L177
|
| 12 |
|
| 13 | int utf8proc_encode_char(uint32_t uc, uint8_t* dst) {
|
| 14 | if (uc < 0x80) {
|
| 15 | dst[0] = (uint8_t)uc;
|
| 16 | return 1;
|
| 17 | } else if (uc < 0x800) {
|
| 18 | dst[0] = (uint8_t)(0xC0 + (uc >> 6));
|
| 19 | dst[1] = (uint8_t)(0x80 + (uc & 0x3F));
|
| 20 | return 2;
|
| 21 | // Note: we allow encoding 0xd800-0xdfff here, so as not to change
|
| 22 | // the API, however, these are actually invalid in UTF-8
|
| 23 | } else if (uc < 0x10000) {
|
| 24 | dst[0] = (uint8_t)(0xE0 + (uc >> 12));
|
| 25 | dst[1] = (uint8_t)(0x80 + ((uc >> 6) & 0x3F));
|
| 26 | dst[2] = (uint8_t)(0x80 + (uc & 0x3F));
|
| 27 | return 3;
|
| 28 | } else if (uc < 0x110000) {
|
| 29 | dst[0] = (uint8_t)(0xF0 + (uc >> 18));
|
| 30 | dst[1] = (uint8_t)(0x80 + ((uc >> 12) & 0x3F));
|
| 31 | dst[2] = (uint8_t)(0x80 + ((uc >> 6) & 0x3F));
|
| 32 | dst[3] = (uint8_t)(0x80 + (uc & 0x3F));
|
| 33 | return 4;
|
| 34 | } else
|
| 35 | return 0;
|
| 36 | }
|
| 37 |
|
| 38 | void printCodePoints(const uint8_t* s) {
|
| 39 | uint32_t codepoint;
|
| 40 | uint32_t state = 0;
|
| 41 |
|
| 42 | for (; *s; ++s)
|
| 43 | if (!decode(&state, &codepoint, *s)) {
|
| 44 | printf("U+%04X\n", codepoint);
|
| 45 | }
|
| 46 |
|
| 47 | if (state != UTF8_ACCEPT) printf("The string is not well-formed\n");
|
| 48 | }
|
| 49 |
|
| 50 | TEST dfa_test() {
|
| 51 | const char* s = "hello\u03bc";
|
| 52 | printCodePoints(reinterpret_cast<const uint8_t*>(s));
|
| 53 |
|
| 54 | PASS();
|
| 55 | }
|
| 56 |
|
| 57 | uint32_t num_code_points = 17 * 1 << 16;
|
| 58 |
|
| 59 | // uint32_t num_code_points = 1 << 21; // DFA has 983,040 disagreements
|
| 60 |
|
| 61 | TEST decode_all_test() {
|
| 62 | int num_disagree = 0;
|
| 63 | int num_reject = 0;
|
| 64 |
|
| 65 | // 17 "planes" of 2^16 - https://en.wikipedia.org/wiki/Unicode
|
| 66 | // UTF-8 can represent more numbers (up to 21 bits), but this state machine
|
| 67 | // doesn't need to handle them.
|
| 68 |
|
| 69 | uint64_t sum_codepoint = 0;
|
| 70 |
|
| 71 | for (uint32_t i = 0; i < num_code_points; ++i) {
|
| 72 | uint8_t bytes[4] = {0};
|
| 73 | int num_bytes = utf8proc_encode_char(i, bytes);
|
| 74 | // printf("i = %d, num_bytes = %d\n", i, num_bytes);
|
| 75 |
|
| 76 | uint32_t state = 0;
|
| 77 | uint32_t codepoint = 0;
|
| 78 | for (int j = 0; j < num_bytes; ++j) {
|
| 79 | decode(&state, &codepoint, bytes[j]);
|
| 80 | }
|
| 81 | if (state == UTF8_ACCEPT) {
|
| 82 | ASSERT_EQ_FMT(i, codepoint, "%d");
|
| 83 | if (i != codepoint) {
|
| 84 | num_disagree += 1;
|
| 85 | }
|
| 86 | } else {
|
| 87 | num_reject += 1;
|
| 88 | }
|
| 89 |
|
| 90 | sum_codepoint += i;
|
| 91 | }
|
| 92 |
|
| 93 | printf("sum_codepoint = %ld\n", sum_codepoint);
|
| 94 | printf("num_disagree = %d\n", num_disagree);
|
| 95 | printf("num_reject = %d\n", num_reject);
|
| 96 |
|
| 97 | PASS();
|
| 98 | }
|
| 99 |
|
| 100 | // Exhaustive DFA testing strategies
|
| 101 | //
|
| 102 | // All code points: 1,114,112 of them
|
| 103 | //
|
| 104 | // - Encode them all
|
| 105 | // - Decode each byte
|
| 106 | // - end state should be UTF8_ACCEPT, except for 2048 in surrogate range
|
| 107 |
|
| 108 | // - code point should match
|
| 109 | //
|
| 110 | // All byte sequences
|
| 111 | //
|
| 112 | // 1. 256 1-byte sequences
|
| 113 | // 2. 63,536 2-byte sequences
|
| 114 | // - (ASCII, ASCII) will be 25% of these
|
| 115 | // 3. 2**24 = 24 Mi 3-byte sequences
|
| 116 | // overlapping cases
|
| 117 | // 1 + 1 + 1
|
| 118 | // 1 + 2
|
| 119 | // 2 + 1
|
| 120 | //
|
| 121 | // 4. All 2**32 = 4 Gi 4-byte sequences? Possible but slightly slow
|
| 122 | // maybe explore this in a smarter way
|
| 123 |
|
| 124 | // Generating errors
|
| 125 | //
|
| 126 | // - enumerate all out of range
|
| 127 | // - UTF-8 can go up to 2*21 - which is 2,097,152
|
| 128 | // - enumerate ALL overlong encodings?
|
| 129 | //
|
| 130 | // - enumerate ALL sequences that correspond to code points in surrogate range
|
| 131 | // - I think they're all 3 bytes long?
|
| 132 |
|
| 133 | TEST exhaustive_test() {
|
| 134 | long sum = 0;
|
| 135 |
|
| 136 | for (int i = 0; i < (1 << 21); ++i) {
|
| 137 | sum += i;
|
| 138 | // Almost half these are out of range
|
| 139 |
|
| 140 | // 2048 of them are surrogates!
|
| 141 | }
|
| 142 | printf("code points %ld\n", sum);
|
| 143 |
|
| 144 | // How many overlong sequences?
|
| 145 | // The key is making the leading byte 0 I think
|
| 146 | // You can also have 2 or 3 leading zero bytes
|
| 147 | //
|
| 148 | // So you can test all these forms:
|
| 149 | //
|
| 150 | // [0 42 43 44] - should be [42 43 44]
|
| 151 | //
|
| 152 | // [0 0 42 43] - should be [42 43]
|
| 153 | // [0 42 43]
|
| 154 | //
|
| 155 | // [0 0 0 42] - should be [42]
|
| 156 | // [0 0 42]
|
| 157 | // [0 42]
|
| 158 | //
|
| 159 | // Each continuation byte has 2*6 == 64 values.
|
| 160 | // 2 byte sequences have 2*5 initial choicse
|
| 161 | // 3 byte sequences have 2*4 initial choices
|
| 162 | // 4 byte sequences have 2*3 initial choices
|
| 163 |
|
| 164 | for (int i = 0; i < (1 << 8); ++i) {
|
| 165 | sum += i;
|
| 166 | }
|
| 167 | printf("1 byte %ld\n", sum);
|
| 168 |
|
| 169 | for (int i = 0; i < (1 << 16); ++i) {
|
| 170 | sum += i;
|
| 171 | }
|
| 172 | printf("2 byte %ld\n", sum);
|
| 173 |
|
| 174 | for (int i = 0; i < (1 << 24); ++i) {
|
| 175 | sum += i;
|
| 176 | }
|
| 177 | printf("3 byte %ld\n", sum);
|
| 178 |
|
| 179 | PASS();
|
| 180 | }
|
| 181 |
|
| 182 | TEST enumerate_utf8_test() {
|
| 183 | // Valid digits, and some of them are overlong
|
| 184 |
|
| 185 | // [a b c d] - 21 bits of info
|
| 186 | int n = 0;
|
| 187 | int overlong = 0;
|
| 188 | for (int a = 0; a < (1 << 3); ++a) {
|
| 189 | for (int b = 0; b < (1 << 6); ++b) {
|
| 190 | for (int c = 0; c < (1 << 6); ++c) {
|
| 191 | for (int d = 0; d < (1 << 6); ++d) {
|
| 192 | n++;
|
| 193 | if (a == 0) {
|
| 194 | overlong++;
|
| 195 | }
|
| 196 | }
|
| 197 | }
|
| 198 | }
|
| 199 | }
|
| 200 | printf("4 byte: n = %10d, overlong = %10d, valid = %10d\n", n, overlong,
|
| 201 | n - overlong);
|
| 202 |
|
| 203 | // [a b c] - 4 + 6 + 6 = 16 bits of info
|
| 204 | n = 0;
|
| 205 | overlong = 0;
|
| 206 | for (int a = 0; a < (1 << 4); ++a) {
|
| 207 | for (int b = 0; b < (1 << 6); ++b) {
|
| 208 | for (int c = 0; c < (1 << 6); ++c) {
|
| 209 | n++;
|
| 210 | if (a == 0) {
|
| 211 | overlong++;
|
| 212 | }
|
| 213 | }
|
| 214 | }
|
| 215 | }
|
| 216 | printf("3 byte: n = %10d, overlong = %10d, valid = %10d\n", n, overlong,
|
| 217 | n - overlong);
|
| 218 |
|
| 219 | // [a b] - 5 + 6 = 11 bits of info
|
| 220 | n = 0;
|
| 221 | overlong = 0;
|
| 222 | for (int a = 0; a < (1 << 5); ++a) {
|
| 223 | for (int b = 0; b < (1 << 6); ++b) {
|
| 224 | n++;
|
| 225 | if (a == 0) {
|
| 226 | overlong++;
|
| 227 | }
|
| 228 | }
|
| 229 | }
|
| 230 | printf("2 byte: n = %10d, overlong = %10d, valid = %10d\n", n, overlong,
|
| 231 | n - overlong);
|
| 232 |
|
| 233 | // Types of errors
|
| 234 | // Surrogate Range
|
| 235 | // Overlong
|
| 236 | //
|
| 237 | // Not covered by this:
|
| 238 | // Invalid sequence (start byte, continuation byte) - we are leaving this off
|
| 239 |
|
| 240 | n = 0;
|
| 241 | for (int a = 0; a < (1 << 8); ++a) {
|
| 242 | n++;
|
| 243 | }
|
| 244 | printf("1 byte: n = %10d\n", n);
|
| 245 |
|
| 246 | PASS();
|
| 247 | }
|
| 248 |
|
| 249 | TEST crockford_test() {
|
| 250 | char* s = const_cast<char*>("hello\u03bc");
|
| 251 | utf8_decode_init(s, strlen(s));
|
| 252 |
|
| 253 | int c;
|
| 254 | bool done = false;
|
| 255 | while (!done) {
|
| 256 | c = utf8_decode_next();
|
| 257 |
|
| 258 | switch (c) {
|
| 259 | case UTF8_END:
|
| 260 | done = true;
|
| 261 | break;
|
| 262 |
|
| 263 | case UTF8_ERROR:
|
| 264 | printf("decode error");
|
| 265 | break;
|
| 266 |
|
| 267 | default:
|
| 268 | printf("U+%04X\n", c);
|
| 269 | break;
|
| 270 | }
|
| 271 | }
|
| 272 |
|
| 273 | PASS();
|
| 274 | }
|
| 275 |
|
| 276 | TEST loop_all_test() {
|
| 277 | int sum = 0;
|
| 278 | int sum_codepoint = 0;
|
| 279 |
|
| 280 | int num_errors = 0;
|
| 281 |
|
| 282 | // Takes 1.4 seconds to iterate through all possibilities
|
| 283 | for (uint32_t i = 0;; ++i) {
|
| 284 | /*
|
| 285 | uint32_t state = 0;
|
| 286 | uint32_t codepoint = 0;
|
| 287 |
|
| 288 | uint8_t* bytes = reinterpret_cast<uint8_t*>(&i);
|
| 289 | int result = decode(&state, &codepoint, bytes[0]);
|
| 290 |
|
| 291 | if (result == UTF8_REJECT) {
|
| 292 | num_errors += 1;
|
| 293 | } else {
|
| 294 | sum_codepoint += codepoint;
|
| 295 | }
|
| 296 | */
|
| 297 | sum += i;
|
| 298 |
|
| 299 | if (i == 0xFFFFFFFF) { // Can't test this condition in the loop
|
| 300 | break;
|
| 301 | }
|
| 302 | }
|
| 303 | printf("sum %d\n", sum);
|
| 304 | printf("sum_codepoint %d\n", sum_codepoint);
|
| 305 | printf("num_errors %d\n", num_errors);
|
| 306 | printf("2 ^21 = %d\n", 1 << 21);
|
| 307 |
|
| 308 | PASS();
|
| 309 | }
|
| 310 |
|
| 311 | void PrintStates(const uint8_t* s) {
|
| 312 | uint32_t codepoint;
|
| 313 | uint32_t state = UTF8_ACCEPT;
|
| 314 |
|
| 315 | int n = strlen(reinterpret_cast<const char*>(s));
|
| 316 | for (int i = 0; i < n; ++i) {
|
| 317 | decode(&state, &codepoint, s[i]);
|
| 318 | printf("state %d = %d\n", i, state);
|
| 319 | }
|
| 320 | }
|
| 321 |
|
| 322 | TEST surrogate_test() {
|
| 323 | // \ud83e
|
| 324 | printf(" surrogate in utf-8\n");
|
| 325 | const uint8_t* s = reinterpret_cast<const uint8_t*>("\xed\xa0\xbe");
|
| 326 | PrintStates(s);
|
| 327 |
|
| 328 | // mu
|
| 329 | printf(" mu in utf-8\n");
|
| 330 | const uint8_t* t = reinterpret_cast<const uint8_t*>("\xce\xbc");
|
| 331 | PrintStates(t);
|
| 332 |
|
| 333 | PASS();
|
| 334 | }
|
| 335 |
|
| 336 | TEST identity_test() {
|
| 337 | // check that decode(encode(x)) = x for all code-points (and surrogates)
|
| 338 | uint8_t buf[5] = {0};
|
| 339 | for (uint32_t cp = 1; cp < 0x10FFFF; ++cp) {
|
| 340 | int len = utf8proc_encode_char(cp, buf);
|
| 341 | Utf8Result result;
|
| 342 | utf8_decode(buf, &result);
|
| 343 |
|
| 344 | if (cp < 0xD800 || cp > 0xDFFF) {
|
| 345 | ASSERT_EQ(result.error, UTF8_OK);
|
| 346 | } else {
|
| 347 | ASSERT_EQ(result.error, UTF8_ERR_SURROGATE);
|
| 348 | }
|
| 349 | ASSERT_EQ(result.codepoint, cp);
|
| 350 | ASSERT_EQ(result.bytes_read, static_cast<size_t>(len));
|
| 351 | }
|
| 352 |
|
| 353 | PASS();
|
| 354 | }
|
| 355 |
|
| 356 | TEST overlong_test() {
|
| 357 | // All encode U+41 ('A')
|
| 358 | Utf8Result ok, overlong2, overlong3, overlong4;
|
| 359 | utf8_decode((unsigned char*)"\x41", &ok);
|
| 360 | utf8_decode((unsigned char*)"\xC1\x81", &overlong2);
|
| 361 | utf8_decode((unsigned char*)"\xE0\x81\x81", &overlong3);
|
| 362 | utf8_decode((unsigned char*)"\xF0\x80\x81\x81", &overlong4);
|
| 363 |
|
| 364 | ASSERT_EQ(ok.error, UTF8_OK);
|
| 365 | ASSERT_EQ(overlong2.error, UTF8_ERR_OVERLONG);
|
| 366 | ASSERT_EQ(overlong3.error, UTF8_ERR_OVERLONG);
|
| 367 | ASSERT_EQ(overlong4.error, UTF8_ERR_OVERLONG);
|
| 368 |
|
| 369 | ASSERT_EQ(ok.codepoint, 0x41);
|
| 370 | ASSERT_EQ(overlong2.codepoint, 0x41);
|
| 371 | ASSERT_EQ(overlong3.codepoint, 0x41);
|
| 372 | ASSERT_EQ(overlong4.codepoint, 0x41);
|
| 373 |
|
| 374 | ASSERT_EQ(ok.bytes_read, 1);
|
| 375 | ASSERT_EQ(overlong2.bytes_read, 2);
|
| 376 | ASSERT_EQ(overlong3.bytes_read, 3);
|
| 377 | ASSERT_EQ(overlong4.bytes_read, 4);
|
| 378 |
|
| 379 | PASS();
|
| 380 | }
|
| 381 |
|
| 382 | TEST too_large_test() {
|
| 383 | // Encoding of 0x111111 (via Table 3-6)
|
| 384 | // = 00010001 00010001 00010001
|
| 385 | // uuuuu -> 10001
|
| 386 | // zzzz -> 00001
|
| 387 | // yyyyyy -> 000100
|
| 388 | // xxxxxx -> 010001
|
| 389 | //
|
| 390 | // -> 11110100 10010001 10000100 10010001
|
| 391 | // = F4 91 84 91
|
| 392 | Utf8Result result;
|
| 393 | utf8_decode((unsigned char*)"\xF4\x91\x84\x91", &result);
|
| 394 |
|
| 395 | ASSERT_EQ(result.error, UTF8_ERR_TOO_LARGE);
|
| 396 | ASSERT_EQ(result.codepoint, 0x111111);
|
| 397 | ASSERT_EQ(result.bytes_read, 4);
|
| 398 |
|
| 399 | PASS();
|
| 400 | }
|
| 401 |
|
| 402 | TEST truncated_test() {
|
| 403 | Utf8Result result;
|
| 404 |
|
| 405 | constexpr const int NUM_INPUTS = 6;
|
| 406 | const char *inputs[NUM_INPUTS] = {
|
| 407 | "\xC5",
|
| 408 | "\xED",
|
| 409 | "\xED\x9F",
|
| 410 | "\xF4",
|
| 411 | "\xF4\x80",
|
| 412 | "\xF4\x80\x80",
|
| 413 | };
|
| 414 |
|
| 415 | for (int i = 0; i < NUM_INPUTS; i++) {
|
| 416 | utf8_decode((unsigned char*)inputs[i], &result);
|
| 417 | ASSERT_EQ(result.error, UTF8_ERR_TRUNCATED_BYTES);
|
| 418 | ASSERT_EQ(result.bytes_read, strlen(inputs[i]));
|
| 419 | }
|
| 420 |
|
| 421 | // End of stream is separate from truncated
|
| 422 | utf8_decode((unsigned char*)"", &result);
|
| 423 | ASSERT_EQ(result.error, UTF8_ERR_END_OF_STREAM);
|
| 424 | ASSERT_EQ(result.bytes_read, 0);
|
| 425 |
|
| 426 | PASS();
|
| 427 | }
|
| 428 |
|
| 429 | GREATEST_MAIN_DEFS();
|
| 430 |
|
| 431 | int main(int argc, char** argv) {
|
| 432 | GREATEST_MAIN_BEGIN();
|
| 433 |
|
| 434 | RUN_TEST(dfa_test);
|
| 435 | RUN_TEST(decode_all_test);
|
| 436 | RUN_TEST(exhaustive_test);
|
| 437 | RUN_TEST(enumerate_utf8_test);
|
| 438 | RUN_TEST(crockford_test);
|
| 439 | RUN_TEST(surrogate_test);
|
| 440 |
|
| 441 | RUN_TEST(identity_test);
|
| 442 | RUN_TEST(overlong_test);
|
| 443 | RUN_TEST(too_large_test);
|
| 444 | RUN_TEST(truncated_test);
|
| 445 |
|
| 446 | GREATEST_MAIN_END();
|
| 447 | return 0;
|
| 448 | }
|