| 1 | #ifndef BJOERN_DFA
|
| 2 | #define BJOERN_DFA
|
| 3 |
|
| 4 | // Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
|
| 5 | // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
|
| 6 |
|
| 7 | #include "stdint.h"
|
| 8 |
|
| 9 | #if 0
|
| 10 |
|
#define UTF8_ACCEPT 0
#define UTF8_REJECT 1

// Lookup table for the legacy variant of the UTF-8 decoding automaton.
//
//   utf8d[0..255]   maps every byte value to a character class (0..11).
//   utf8d[256..399] is the transition table: 9 states (s0..s8), 16 entries
//                   per state, indexed as utf8d[256 + state*16 + class].
//
// s0 is UTF8_ACCEPT and s1 is UTF8_REJECT. Columns 12..15 of each state row
// are never selected by a class from the first part (classes stop at 11).
static const uint8_t utf8d[] = {
  // Byte -> character class.
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 00..0f
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 10..1f
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 20..2f
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 30..3f
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 40..4f
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 50..5f
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 60..6f
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 70..7f
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 80..8f
  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, // 90..9f
  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // a0..af
  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // b0..bf
  8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // c0..cf
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // d0..df
  10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, // e0..ef
  11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, // f0..ff

  // (state, class) -> next state; one row per state.
  0, 1, 2, 3, 5, 8, 7, 1, 1, 1, 4, 6, 1, 1, 1, 1, // s0
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // s1
  1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, // s2
  1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, // s3
  1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, // s4
  1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, // s5
  1, 1, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, // s6
  1, 3, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1, // s7
  1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // s8
};
|
| 31 |
|
| 32 | uint32_t inline
|
| 33 | decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
|
| 34 | uint32_t type = utf8d[byte];
|
| 35 |
|
| 36 | *codep = (*state != UTF8_ACCEPT) ?
|
| 37 | (byte & 0x3fu) | (*codep << 6) :
|
| 38 | (0xff >> type) & (byte);
|
| 39 |
|
| 40 | *state = utf8d[256 + *state*16 + type];
|
| 41 | return *state;
|
| 42 | }
|
| 43 |
|
| 44 | #else
|
| 45 |
|
| 46 | // Newer version, lower on the page
|
| 47 |
|
#define UTF8_ACCEPT 0
#define UTF8_REJECT 12

// Lookup table for the UTF-8 decoding automaton.
//
//   utf8d[0..255]   maps every byte value to a character class (0..11).
//                   Grouping bytes into classes reduces the size of the
//                   transition table, and the class number is also used
//                   to create the bitmask applied to a leading byte.
//   utf8d[256..363] is the transition table: 9 rows of 12 entries. A state
//                   is stored as the offset of its row (a multiple of 12),
//                   so the next state is utf8d[256 + state + class].
static const uint8_t utf8d[] = {
  // Byte -> character class.
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 00..0f
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 10..1f
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 20..2f
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 30..3f
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 40..4f
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 50..5f
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 60..6f
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 70..7f
  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 80..8f
  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, // 90..9f
  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // a0..af
  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, // b0..bf
  8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // c0..cf
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // d0..df
  10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3, // e0..ef
  11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, // f0..ff

  // (state, class) -> next state; one row per state, columns are
  // classes 0..11.
   0, 12, 24, 36, 60, 96, 84, 12, 12, 12, 48, 72, // state  0 (UTF8_ACCEPT)
  12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, // state 12 (UTF8_REJECT)
  12,  0, 12, 12, 12, 12, 12,  0, 12,  0, 12, 12, // state 24
  12, 24, 12, 12, 12, 12, 12, 24, 12, 24, 12, 12, // state 36
  12, 12, 12, 12, 12, 12, 12, 24, 12, 12, 12, 12, // state 48
  12, 24, 12, 12, 12, 12, 12, 12, 12, 24, 12, 12, // state 60
  12, 12, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12, // state 72
  12, 36, 12, 12, 12, 12, 12, 36, 12, 36, 12, 12, // state 84
  12, 36, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, // state 96
};
|
| 71 |
|
| 72 | static inline
|
| 73 | uint32_t decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
|
| 74 | uint32_t type = utf8d[byte];
|
| 75 |
|
| 76 | *codep = (*state != UTF8_ACCEPT) ?
|
| 77 | (byte & 0x3fu) | (*codep << 6) :
|
| 78 | (0xff >> type) & (byte);
|
| 79 |
|
| 80 | *state = utf8d[256 + *state + type];
|
| 81 | return *state;
|
| 82 | }
|
| 83 |
|
| 84 | #endif
|
| 85 |
|
| 86 | #endif // BJOERN_DFA
|