OILS / data_lang / utf8_impls / utf8_decode.c View on Github | oilshell.org

184 lines, 65 significant
1/* utf8_decode.c */
2
3/* 2016-04-05 */
4
5/*
6Copyright (c) 2005 JSON.org
7
8Permission is hereby granted, free of charge, to any person obtaining a copy
9of this software and associated documentation files (the "Software"), to deal
10in the Software without restriction, including without limitation the rights
11to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12copies of the Software, and to permit persons to whom the Software is
13furnished to do so, subject to the following conditions:
14
15The above copyright notice and this permission notice shall be included in all
16copies or substantial portions of the Software.
17
18The Software shall be used for Good, not Evil.
19
20THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26SOFTWARE.
27*/
28
29#include "utf8_decode.h"
30
/*
    Very Strict UTF-8 Decoder

    UTF-8 is a multibyte character encoding of Unicode. A character can be
    represented by 1-4 bytes. The bit pattern of the first byte indicates the
    number of continuation bytes.

    Most UTF-8 decoders tend to be lenient, attempting to recover as much
    information as possible, even from badly encoded input. This UTF-8
    decoder is not lenient. It will reject input which does not include
    proper continuation bytes. It will reject aliases (or suboptimal
    codings). It will reject surrogates. (Surrogate encoding should only be
    used with UTF-16.)

    Code     Continuation Minimum Maximum
    0xxxxxxx            0       0     127
    10xxxxxx        error
    110xxxxx            1     128    2047
    1110xxxx            2    2048   65535 excluding 55296 - 57343
    11110xxx            3   65536 1114111
    11111xxx        error
*/
53
54
/* Input buffer handed to utf8_decode_init, and the number of bytes in it. */
static char *the_input;
static int the_length = 0;

/* Position in the_input of the next byte that get() will read. */
static int the_index = 0;

/*
    Offsets kept for error reporting: the_byte is the position at which the
    most recently requested character starts, and the_char counts how many
    characters have been requested since initialization.
*/
static int the_byte = 0;
static int the_char = 0;
60
61
62/*
63 Get the next byte. It returns UTF8_END if there are no more bytes.
64*/
65static int get() {
66 int c;
67 if (the_index >= the_length) {
68 return UTF8_END;
69 }
70 c = the_input[the_index] & 0xFF;
71 the_index += 1;
72 return c;
73}
74
75
76/*
77 Get the 6-bit payload of the next continuation byte.
78 Return UTF8_ERROR if it is not a contination byte.
79*/
80static int cont() {
81 int c = get();
82 return ((c & 0xC0) == 0x80)
83 ? (c & 0x3F)
84 : UTF8_ERROR;
85}
86
87
88/*
89 Initialize the UTF-8 decoder. The decoder is not reentrant,
90*/
91void utf8_decode_init(char p[], int length) {
92 the_index = 0;
93 the_input = p;
94 the_length = length;
95 the_char = 0;
96 the_byte = 0;
97}
98
99
100/*
101 Get the current byte offset. This is generally used in error reporting.
102*/
103int utf8_decode_at_byte() {
104 return the_byte;
105}
106
107
108/*
109 Get the current character offset. This is generally used in error reporting.
110 The character offset matches the byte offset if the text is strictly ASCII.
111*/
112int utf8_decode_at_character() {
113 return (the_char > 0)
114 ? the_char - 1
115 : 0;
116}
117
118
119/*
120 Extract the next character.
121 Returns: the character (between 0 and 1114111)
122 or UTF8_END (the end)
123 or UTF8_ERROR (error)
124*/
125int utf8_decode_next() {
126 int c; /* the first byte of the character */
127 int c1; /* the first continuation character */
128 int c2; /* the second continuation character */
129 int c3; /* the third continuation character */
130 int r; /* the result */
131
132 if (the_index >= the_length) {
133 return the_index == the_length ? UTF8_END : UTF8_ERROR;
134 }
135 the_byte = the_index;
136 the_char += 1;
137 c = get();
138/*
139 Zero continuation (0 to 127)
140*/
141 if ((c & 0x80) == 0) {
142 return c;
143 }
144/*
145 One continuation (128 to 2047)
146*/
147 if ((c & 0xE0) == 0xC0) {
148 c1 = cont();
149 if (c1 >= 0) {
150 r = ((c & 0x1F) << 6) | c1;
151 if (r >= 128) {
152 return r;
153 }
154 }
155
156/*
157 Two continuations (2048 to 55295 and 57344 to 65535)
158*/
159 } else if ((c & 0xF0) == 0xE0) {
160 c1 = cont();
161 c2 = cont();
162 if ((c1 | c2) >= 0) {
163 r = ((c & 0x0F) << 12) | (c1 << 6) | c2;
164 if (r >= 2048 && (r < 55296 || r > 57343)) {
165 return r;
166 }
167 }
168
169/*
170 Three continuations (65536 to 1114111)
171*/
172 } else if ((c & 0xF8) == 0xF0) {
173 c1 = cont();
174 c2 = cont();
175 c3 = cont();
176 if ((c1 | c2 | c3) >= 0) {
177 r = ((c & 0x07) << 18) | (c1 << 12) | (c2 << 6) | c3;
178 if (r >= 65536 && r <= 1114111) {
179 return r;
180 }
181 }
182 }
183 return UTF8_ERROR;
184}