data_lang/utf8.h

OILS / data_lang / utf8.h View on Github | oilshell.org

170 lines, 74 significant

1	#ifndef DATA_LANG_UTF8_H
2	#define DATA_LANG_UTF8_H
3
4	#include <stdio.h>
5
6	#include <stdint.h> // uint32_t
7	#include <stddef.h> // size_t
8
/**
 * ---- Quick reference about the encoding ----
 *
 * First, all valid UTF-8 sequences follow one of four bit "patterns"
 * (Table 3-6). The first byte determines the length of the sequence and then
 * the next 0-3 bytes are "continuation bytes."
 *
 * +----------------------------+----------+----------+----------+----------+
 * | Scalar Value               | 1st Byte | 2nd Byte | 3rd Byte | 4th Byte |
 * +----------------------------+----------+----------+----------+----------+
 * | 00000000 0xxxxxxx          | 0xxxxxxx |          |          |          |
 * | 00000yyy yyxxxxxx          | 110yyyyy | 10xxxxxx |          |          |
 * | zzzzyyyy yyxxxxxx          | 1110zzzz | 10yyyyyy | 10xxxxxx |          |
 * | 000uuuuu zzzzyyyy yyxxxxxx | 11110uuu | 10uuzzzz | 10yyyyyy | 10xxxxxx |
 * +----------------------------+----------+----------+----------+----------+
 *
 *    Table 3-6 from Unicode Standard 15.0.0 Ch3. UTF-8 bit patterns
 *
 * There are 3 further restrictions which make some valid bit patterns
 * invalid:
 *  1. Overlongs: e.g., <0x41> and <0xC1 0x81> both store U+41, but the second
 *     sequence is longer and thus an error.
 *  2. Surrogates: Any codepoint between U+D800 and U+DFFF (inclusive) is a
 *     surrogate. It is an error to encode surrogates in UTF-8.
 *  3. Too Large: Any decoded value over 0x10FFFF is not a Unicode codepoint,
 *     and must be rejected as an error.
 *
 * See https://aolsen.ca/writings/everything-about-utf8 for more details about
 * the encoding.
 */
39
/**
 * Status codes stored in the `error` field of a decode result.
 *
 * UTF8_OK is zero, so a result can be tested for failure with a plain
 * truthiness check on `error`. The order of the enumerators fixes their
 * numeric values; append new codes at the end.
 */
typedef enum Utf8Error {
  UTF8_OK = 0,

  // Encodes a codepoint in more bytes than necessary.
  UTF8_ERR_OVERLONG,

  // Encodes a codepoint in the surrogate range (0xD800 to 0xDFFF, inclusive).
  UTF8_ERR_SURROGATE,

  // Encodes a value greater than the max codepoint U+10FFFF.
  UTF8_ERR_TOO_LARGE,

  // Encoding doesn't conform to the UTF-8 bit patterns.
  UTF8_ERR_BAD_ENCODING,

  // It looks like there is another codepoint, but it has been truncated.
  UTF8_ERR_TRUNCATED_BYTES,

  // We are at the end of the string. (input_len = 0)
  UTF8_ERR_END_OF_STREAM,
} Utf8Error_t;
61
62	typedef struct Utf8Result {
63	Utf8Error_t error;
64	uint32_t codepoint;
65	size_t bytes_read;
66	} Utf8Result_t;
67
68	static inline void _cont(const unsigned char input, Utf8Result_t result) {
69	if (result->error) return;
70
71	int byte = input[result->bytes_read];
72	if (byte == '\0') {
73	result->error = UTF8_ERR_TRUNCATED_BYTES;
74	return;
75	}
76	result->bytes_read += 1;
77
78	// Continuation bytes follow the bit pattern 10xx_xxxx. We need to a)
79	// validate the pattern and b) remove the leading '10'.
80	if ((byte & 0xC0) == 0x80) {
81	result->codepoint <<= 6;
82	result->codepoint \|= byte & 0x3F;
83	} else {
84	result->error = UTF8_ERR_BAD_ENCODING;
85	}
86	}
87
88	/**
89	* Given a nul-terminated string `input`, try to decode the next codepoint from
90	* that string.
91	*
92	* If there was a surrogate, overlong or codepoint to large error then
93	* `result.codepoint` will contain the recovered value.
94	*/
95	static inline void utf8_decode(const unsigned char input, Utf8Result_t result) {
96	result->error = UTF8_OK;
97	result->codepoint = 0;
98	result->bytes_read = 0;
99
100	int first = *input;
101	if (first == '\0') {
102	result->error = UTF8_ERR_END_OF_STREAM;
103	return;
104	}
105	result->bytes_read = 1;
106
107	if ((first & 0x80) == 0) {
108	// 1-byte long (ASCII subset)
109	result->codepoint = first;
110	return;
111	}
112
113	if ((first & 0xE0) == 0xC0) {
114	// 2-bytes long
115	result->codepoint = first & 0x1F;
116
117	_cont(input, result);
118	if (result->error) return;
119
120	if (result->codepoint < 0x80) {
121	result->error = UTF8_ERR_OVERLONG;
122	}
123
124	return;
125	}
126
127	if ((first & 0xF0) == 0xE0) {
128	// 3-bytes long
129	result->codepoint = first & 0x0F;
130
131	_cont(input, result);
132	_cont(input, result);
133	if (result->error) return;
134
135	if (result->codepoint < 0x800) {
136	result->error = UTF8_ERR_OVERLONG;
137	}
138
139	if (0xD800 <= result->codepoint && result->codepoint <= 0xDFFF) {
140	result->error = UTF8_ERR_SURROGATE;
141	}
142
143	return;
144	}
145
146	if ((first & 0xF8) == 0xF0) {
147	// 4-bytes long
148	result->codepoint = first & 0x07;
149
150	_cont(input, result);
151	_cont(input, result);
152	_cont(input, result);
153	if (result->error) return;
154
155	if (result->codepoint < 0x10000) {
156	result->error = UTF8_ERR_OVERLONG;
157	}
158
159	if (result->codepoint > 0x10FFFF) {
160	result->error = UTF8_ERR_TOO_LARGE;
161	}
162
163	return;
164	}
165
166	result->error = UTF8_ERR_BAD_ENCODING;
167	return;
168	}
169
170	#endif // DATA_LANG_UTF8_H