OILS / data_lang / utf8.h View on Github | oilshell.org

170 lines, 74 significant
1#ifndef DATA_LANG_UTF8_H
2#define DATA_LANG_UTF8_H
3
4#include <stdio.h>
5
6#include <stdint.h> // uint32_t
7#include <stddef.h> // size_t
8
9/**
10 * ---- Quick reference about the encoding ----
11 *
12 * First, all valid UTF-8 sequences follow one of four bit "patterns" (Table 3-6). The
13 * first byte determines the length of the sequence and then the next 0-3 bytes
14 * are "continuation bytes."
15 *
16 * +----------------------------+----------+----------+----------+----------+
17 * | Scalar Value               | 1st Byte | 2nd Byte | 3rd Byte | 4th Byte |
18 * +----------------------------+----------+----------+----------+----------+
19 * | 00000000 0xxxxxxx          | 0xxxxxxx |          |          |          |
20 * | 00000yyy yyxxxxxx          | 110yyyyy | 10xxxxxx |          |          |
21 * | zzzzyyyy yyxxxxxx          | 1110zzzz | 10yyyyyy | 10xxxxxx |          |
22 * | 000uuuuu zzzzyyyy yyxxxxxx | 11110uuu | 10uuzzzz | 10yyyyyy | 10xxxxxx |
23 * +----------------------------+----------+----------+----------+----------+
24 *
25 * Table 3-6 from Unicode Standard 15.0.0 Ch3. UTF-8 bit patterns
26 *
27 * There are 3 further restrictions which make some valid bit patterns
28 * *invalid*:
29 * 1. Overlongs: eg, <0x41> and <0xC1 0x81> both store U+41, but the second
30 * sequence is longer and thus an error.
31 * 2. Surrogates: Any codepoint between U+D800 and U+DFFF (inclusive) is a
32 * surrogate. It is an error to encode surrogates in UTF-8.
33 * 3. Too Large: Any decoded value over 0x10FFFF is not a Unicode codepoint,
34 * and must be rejected as an error.
35 *
36 * See https://aolsen.ca/writings/everything-about-utf8 for more details about
37 * the encoding.
38 */
39
/**
 * Status codes produced by the decoder.
 *
 * UTF8_OK is zero, so an error field can be tested directly as a truth value.
 */
enum Utf8Error {
  // No error.
  UTF8_OK = 0,

  // The codepoint was stored using more bytes than it needs.
  UTF8_ERR_OVERLONG = 1,

  // The decoded value lies in the surrogate range (0xD800 to 0xDFFF,
  // inclusive).
  UTF8_ERR_SURROGATE = 2,

  // The decoded value is above U+10FFFF, the largest codepoint.
  UTF8_ERR_TOO_LARGE = 3,

  // The bytes do not match any of the UTF-8 bit patterns.
  UTF8_ERR_BAD_ENCODING = 4,

  // A multi-byte sequence was started, but the string ended before all of
  // its bytes were seen.
  UTF8_ERR_TRUNCATED_BYTES = 5,

  // Nothing left to decode: the input is at the end of the string.
  UTF8_ERR_END_OF_STREAM = 6,
};

typedef enum Utf8Error Utf8Error_t;
61
62typedef struct Utf8Result {
63 Utf8Error_t error;
64 uint32_t codepoint;
65 size_t bytes_read;
66} Utf8Result_t;
67
68static inline void _cont(const unsigned char *input, Utf8Result_t *result) {
69 if (result->error) return;
70
71 int byte = input[result->bytes_read];
72 if (byte == '\0') {
73 result->error = UTF8_ERR_TRUNCATED_BYTES;
74 return;
75 }
76 result->bytes_read += 1;
77
78 // Continuation bytes follow the bit pattern 10xx_xxxx. We need to a)
79 // validate the pattern and b) remove the leading '10'.
80 if ((byte & 0xC0) == 0x80) {
81 result->codepoint <<= 6;
82 result->codepoint |= byte & 0x3F;
83 } else {
84 result->error = UTF8_ERR_BAD_ENCODING;
85 }
86}
87
88/**
89 * Given a nul-terminated string `input`, try to decode the next codepoint from
90 * that string.
91 *
92 * If there was a surrogate, overlong or codepoint to large error then
93 * `result.codepoint` will contain the recovered value.
94 */
95static inline void utf8_decode(const unsigned char *input, Utf8Result_t *result) {
96 result->error = UTF8_OK;
97 result->codepoint = 0;
98 result->bytes_read = 0;
99
100 int first = *input;
101 if (first == '\0') {
102 result->error = UTF8_ERR_END_OF_STREAM;
103 return;
104 }
105 result->bytes_read = 1;
106
107 if ((first & 0x80) == 0) {
108 // 1-byte long (ASCII subset)
109 result->codepoint = first;
110 return;
111 }
112
113 if ((first & 0xE0) == 0xC0) {
114 // 2-bytes long
115 result->codepoint = first & 0x1F;
116
117 _cont(input, result);
118 if (result->error) return;
119
120 if (result->codepoint < 0x80) {
121 result->error = UTF8_ERR_OVERLONG;
122 }
123
124 return;
125 }
126
127 if ((first & 0xF0) == 0xE0) {
128 // 3-bytes long
129 result->codepoint = first & 0x0F;
130
131 _cont(input, result);
132 _cont(input, result);
133 if (result->error) return;
134
135 if (result->codepoint < 0x800) {
136 result->error = UTF8_ERR_OVERLONG;
137 }
138
139 if (0xD800 <= result->codepoint && result->codepoint <= 0xDFFF) {
140 result->error = UTF8_ERR_SURROGATE;
141 }
142
143 return;
144 }
145
146 if ((first & 0xF8) == 0xF0) {
147 // 4-bytes long
148 result->codepoint = first & 0x07;
149
150 _cont(input, result);
151 _cont(input, result);
152 _cont(input, result);
153 if (result->error) return;
154
155 if (result->codepoint < 0x10000) {
156 result->error = UTF8_ERR_OVERLONG;
157 }
158
159 if (result->codepoint > 0x10FFFF) {
160 result->error = UTF8_ERR_TOO_LARGE;
161 }
162
163 return;
164 }
165
166 result->error = UTF8_ERR_BAD_ENCODING;
167 return;
168}
169
170#endif // DATA_LANG_UTF8_H