1 | #ifndef BJOERN_DFA
|
2 | #define BJOERN_DFA
|
3 |
|
4 | // Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
|
5 | // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
|
6 |
|
7 | #include "stdint.h"
|
8 |
|
9 | #if 0
|
10 |
|
// Decoder states of the older table layout. States are plain row numbers
// (0..8) into the transition table; decode() multiplies them by 16.
#define UTF8_ACCEPT 0  // start state; re-entered after each complete sequence
#define UTF8_REJECT 1  // invalid input; every transition out of it stays here

// utf8d[0..255]  : maps each byte value to a character class (0..11).
// utf8d[256.. ]  : transition table, 9 rows (states s0..s8) of 16 columns,
//                  indexed by state*16 + class. Only columns 0..11 are ever
//                  selected because no byte has a class above 0xb.
static const uint8_t utf8d[] = {
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
  8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
  0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
  0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff

  0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
  1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
  1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
  1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
};

// Feeds one byte of a UTF-8 stream into the decoder.
//
//   state  in/out: current automaton state. Must be UTF8_ACCEPT before the
//          first byte and otherwise a value previously stored by decode().
//   codep  in/out: code point accumulator. Restarted from the lead byte's
//          payload bits whenever *state is UTF8_ACCEPT on entry, otherwise
//          extended by the low 6 bits of the byte.
//   byte   next input byte; only the low 8 bits are used.
//
// Returns the new *state: UTF8_ACCEPT when *codep holds a finished code
// point, UTF8_REJECT when the input is invalid, any other value when more
// bytes are required. The caller has to reset *state to UTF8_ACCEPT to
// continue after a reject.
//
// `static inline` (rather than a bare `inline`) gives every including
// translation unit its own definition, so a non-inlined call still links.
static inline uint32_t
decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
  // A sign-extended `char` (or any value above 0xff) would otherwise index
  // past the 256 class entries of utf8d.
  byte &= 0xffu;

  uint32_t type = utf8d[byte];

  // The class doubles as a shift count: 0xff >> type keeps exactly the
  // payload bits of a lead byte.
  *codep = (*state != UTF8_ACCEPT) ?
    (byte & 0x3fu) | (*codep << 6) :
    (0xffu >> type) & (byte);

  *state = utf8d[256 + *state*16 + type];
  return *state;
}
|
43 |
|
44 | #else
|
45 |
|
46 | // Newer version, lower on the page
|
47 |
|
// Decoder states. A state is the offset of its 12-entry row inside the
// transition table, so the states in use are the multiples of 12 from 0 to 96.
#define UTF8_ACCEPT 0   // start state; re-entered after each complete sequence
#define UTF8_REJECT 12  // invalid input; every transition out of it stays here

static const uint8_t utf8d[] = {
  // The first part of the table maps bytes to character classes, which
  // reduces the size of the transition table and is used to create bitmasks.
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
   8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
  10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, // e0..ff

  // The second part is a transition table that maps a combination
  // of a state of the automaton and a character class to a state.
  // Each row holds 12 entries (one per class); two rows per source line.
   0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, // states  0, 12
  12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, // states 24, 36
  12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, // states 48, 60
  12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, // states 72, 84
  12,36,12,12,12,12,12,12,12,12,12,12,                                      // state  96
};

// Feeds one byte of a UTF-8 stream into the decoder.
//
//   state  in/out: current automaton state. Must be UTF8_ACCEPT before the
//          first byte and otherwise a value previously stored by decode().
//   codep  in/out: code point accumulator. Restarted from the lead byte's
//          payload bits whenever *state is UTF8_ACCEPT on entry, otherwise
//          extended by the low 6 bits of the byte.
//   byte   next input byte; only the low 8 bits are used.
//
// Returns the new *state: UTF8_ACCEPT when *codep holds a finished code
// point, UTF8_REJECT when the input is invalid, any other value when more
// bytes are required. The caller has to reset *state to UTF8_ACCEPT to
// continue after a reject.
static inline
uint32_t decode(uint32_t* state, uint32_t* codep, uint32_t byte) {
  // A sign-extended `char` (or any value above 0xff) would otherwise index
  // past the 256 class entries of utf8d and could then shift by >= 32 below.
  byte &= 0xffu;

  uint32_t type = utf8d[byte];

  // The class doubles as a shift count: 0xff >> type keeps exactly the
  // payload bits of a lead byte.
  *codep = (*state != UTF8_ACCEPT) ?
    (byte & 0x3fu) | (*codep << 6) :
    (0xffu >> type) & (byte);

  // States are pre-multiplied row offsets, so no multiplication is needed.
  *state = utf8d[256 + *state + type];
  return *state;
}
|
83 |
|
84 | #endif
|
85 |
|
86 | #endif // BJOERN_DFA
|