1 | /* utf8_decode.c */
|
2 |
|
3 | /* 2016-04-05 */
|
4 |
|
5 | /*
|
6 | Copyright (c) 2005 JSON.org
|
7 |
|
8 | Permission is hereby granted, free of charge, to any person obtaining a copy
|
9 | of this software and associated documentation files (the "Software"), to deal
|
10 | in the Software without restriction, including without limitation the rights
|
11 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
12 | copies of the Software, and to permit persons to whom the Software is
|
13 | furnished to do so, subject to the following conditions:
|
14 |
|
15 | The above copyright notice and this permission notice shall be included in all
|
16 | copies or substantial portions of the Software.
|
17 |
|
18 | The Software shall be used for Good, not Evil.
|
19 |
|
20 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
21 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
22 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
23 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
24 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
25 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
26 | SOFTWARE.
|
27 | */
|
28 |
|
29 | #include "utf8_decode.h"
|
30 |
|
31 | /*
|
32 | Very Strict UTF-8 Decoder
|
33 |
|
34 | UTF-8 is a multibyte character encoding of Unicode. A character can be
|
35 | represented by 1-4 bytes. The bit pattern of the first byte indicates the
|
36 | number of continuation bytes.
|
37 |
|
38 | Most UTF-8 decoders tend to be lenient, attempting to recover as much
|
39 | information as possible, even from badly encoded input. This UTF-8
|
40 | decoder is not lenient. It will reject input which does not include
|
41 | proper continuation bytes. It will reject aliases (or suboptimal
|
42 | codings). It will reject surrogates. (Surrogate encoding should only be
|
43 | used with UTF-16.)
|
44 |
|
45 | Code Continuation Minimum Maximum
|
46 | 0xxxxxxx 0 0 127
|
47 | 10xxxxxx error
|
48 | 110xxxxx 1 128 2047
|
49 | 1110xxxx 2 2048 65535 excluding 55296 - 57343
|
50 | 11110xxx 3 65536 1114111
|
51 | 11111xxx error
|
52 | */
|
53 |
|
54 |
|
/*
    Decoder state. It is shared by every function in this file and is reset
    by utf8_decode_init.
*/
static char *the_input;     /* the bytes being decoded */
static int the_length = 0;  /* number of bytes available in the_input */
static int the_index = 0;   /* offset of the next byte to be read */
static int the_byte = 0;    /* offset of the first byte of the latest character */
static int the_char = 0;    /* number of characters started so far */
|
60 |
|
61 |
|
62 | /*
|
63 | Get the next byte. It returns UTF8_END if there are no more bytes.
|
64 | */
|
65 | static int get() {
|
66 | int c;
|
67 | if (the_index >= the_length) {
|
68 | return UTF8_END;
|
69 | }
|
70 | c = the_input[the_index] & 0xFF;
|
71 | the_index += 1;
|
72 | return c;
|
73 | }
|
74 |
|
75 |
|
76 | /*
|
77 | Get the 6-bit payload of the next continuation byte.
|
78 | Return UTF8_ERROR if it is not a contination byte.
|
79 | */
|
80 | static int cont() {
|
81 | int c = get();
|
82 | return ((c & 0xC0) == 0x80)
|
83 | ? (c & 0x3F)
|
84 | : UTF8_ERROR;
|
85 | }
|
86 |
|
87 |
|
88 | /*
|
89 | Initialize the UTF-8 decoder. The decoder is not reentrant,
|
90 | */
|
91 | void utf8_decode_init(char p[], int length) {
|
92 | the_index = 0;
|
93 | the_input = p;
|
94 | the_length = length;
|
95 | the_char = 0;
|
96 | the_byte = 0;
|
97 | }
|
98 |
|
99 |
|
100 | /*
|
101 | Get the current byte offset. This is generally used in error reporting.
|
102 | */
|
103 | int utf8_decode_at_byte() {
|
104 | return the_byte;
|
105 | }
|
106 |
|
107 |
|
108 | /*
|
109 | Get the current character offset. This is generally used in error reporting.
|
110 | The character offset matches the byte offset if the text is strictly ASCII.
|
111 | */
|
112 | int utf8_decode_at_character() {
|
113 | return (the_char > 0)
|
114 | ? the_char - 1
|
115 | : 0;
|
116 | }
|
117 |
|
118 |
|
119 | /*
|
120 | Extract the next character.
|
121 | Returns: the character (between 0 and 1114111)
|
122 | or UTF8_END (the end)
|
123 | or UTF8_ERROR (error)
|
124 | */
|
125 | int utf8_decode_next() {
|
126 | int c; /* the first byte of the character */
|
127 | int c1; /* the first continuation character */
|
128 | int c2; /* the second continuation character */
|
129 | int c3; /* the third continuation character */
|
130 | int r; /* the result */
|
131 |
|
132 | if (the_index >= the_length) {
|
133 | return the_index == the_length ? UTF8_END : UTF8_ERROR;
|
134 | }
|
135 | the_byte = the_index;
|
136 | the_char += 1;
|
137 | c = get();
|
138 | /*
|
139 | Zero continuation (0 to 127)
|
140 | */
|
141 | if ((c & 0x80) == 0) {
|
142 | return c;
|
143 | }
|
144 | /*
|
145 | One continuation (128 to 2047)
|
146 | */
|
147 | if ((c & 0xE0) == 0xC0) {
|
148 | c1 = cont();
|
149 | if (c1 >= 0) {
|
150 | r = ((c & 0x1F) << 6) | c1;
|
151 | if (r >= 128) {
|
152 | return r;
|
153 | }
|
154 | }
|
155 |
|
156 | /*
|
157 | Two continuations (2048 to 55295 and 57344 to 65535)
|
158 | */
|
159 | } else if ((c & 0xF0) == 0xE0) {
|
160 | c1 = cont();
|
161 | c2 = cont();
|
162 | if ((c1 | c2) >= 0) {
|
163 | r = ((c & 0x0F) << 12) | (c1 << 6) | c2;
|
164 | if (r >= 2048 && (r < 55296 || r > 57343)) {
|
165 | return r;
|
166 | }
|
167 | }
|
168 |
|
169 | /*
|
170 | Three continuations (65536 to 1114111)
|
171 | */
|
172 | } else if ((c & 0xF8) == 0xF0) {
|
173 | c1 = cont();
|
174 | c2 = cont();
|
175 | c3 = cont();
|
176 | if ((c1 | c2 | c3) >= 0) {
|
177 | r = ((c & 0x07) << 18) | (c1 << 12) | (c2 << 6) | c3;
|
178 | if (r >= 65536 && r <= 1114111) {
|
179 | return r;
|
180 | }
|
181 | }
|
182 | }
|
183 | return UTF8_ERROR;
|
184 | }
|