| 1 | /* utf8_decode.c */
|
| 2 |
|
| 3 | /* 2016-04-05 */
|
| 4 |
|
| 5 | /*
|
| 6 | Copyright (c) 2005 JSON.org
|
| 7 |
|
| 8 | Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 9 | of this software and associated documentation files (the "Software"), to deal
|
| 10 | in the Software without restriction, including without limitation the rights
|
| 11 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 12 | copies of the Software, and to permit persons to whom the Software is
|
| 13 | furnished to do so, subject to the following conditions:
|
| 14 |
|
| 15 | The above copyright notice and this permission notice shall be included in all
|
| 16 | copies or substantial portions of the Software.
|
| 17 |
|
| 18 | The Software shall be used for Good, not Evil.
|
| 19 |
|
| 20 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 21 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 22 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 23 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 24 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 25 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 26 | SOFTWARE.
|
| 27 | */
|
| 28 |
|
| 29 | #include "utf8_decode.h"
|
| 30 |
|
| 31 | /*
|
| 32 | Very Strict UTF-8 Decoder
|
| 33 |
|
| 34 | UTF-8 is a multibyte character encoding of Unicode. A character can be
|
| 35 | represented by 1-4 bytes. The bit pattern of the first byte indicates the
|
| 36 | number of continuation bytes.
|
| 37 |
|
| 38 | Most UTF-8 decoders tend to be lenient, attempting to recover as much
|
| 39 | information as possible, even from badly encoded input. This UTF-8
|
| 40 | decoder is not lenient. It will reject input which does not include
|
| 41 | proper continuation bytes. It will reject aliases (or suboptimal
|
| 42 | codings). It will reject surrogates. (Surrogate encoding should only be
|
| 43 | used with UTF-16.)
|
| 44 |
|
| 45 | Code Continuation Minimum Maximum
|
| 46 | 0xxxxxxx 0 0 127
|
| 47 | 10xxxxxx error
|
| 48 | 110xxxxx 1 128 2047
|
| 49 | 1110xxxx 2 2048 65535 excluding 55296 - 57343
|
| 50 | 11110xxx 3 65536 1114111
|
| 51 | 11111xxx error
|
| 52 | */
|
| 53 |
|
| 54 |
|
/*
    Decoder state shared by every function in this file. It lives at file
    scope, so only one decoding session can be in progress at a time.
*/
static char *the_input;     /* buffer being decoded; set by utf8_decode_init */
static int the_length = 0;  /* number of bytes available in the_input */
static int the_index = 0;   /* offset of the next byte to be read */
static int the_byte = 0;    /* offset at which the latest character began */
static int the_char = 0;    /* number of characters attempted so far */
|
| 60 |
|
| 61 |
|
| 62 | /*
|
| 63 | Get the next byte. It returns UTF8_END if there are no more bytes.
|
| 64 | */
|
| 65 | static int get() {
|
| 66 | int c;
|
| 67 | if (the_index >= the_length) {
|
| 68 | return UTF8_END;
|
| 69 | }
|
| 70 | c = the_input[the_index] & 0xFF;
|
| 71 | the_index += 1;
|
| 72 | return c;
|
| 73 | }
|
| 74 |
|
| 75 |
|
| 76 | /*
|
| 77 | Get the 6-bit payload of the next continuation byte.
|
| 78 | Return UTF8_ERROR if it is not a contination byte.
|
| 79 | */
|
| 80 | static int cont() {
|
| 81 | int c = get();
|
| 82 | return ((c & 0xC0) == 0x80)
|
| 83 | ? (c & 0x3F)
|
| 84 | : UTF8_ERROR;
|
| 85 | }
|
| 86 |
|
| 87 |
|
| 88 | /*
|
| 89 | Initialize the UTF-8 decoder. The decoder is not reentrant,
|
| 90 | */
|
| 91 | void utf8_decode_init(char p[], int length) {
|
| 92 | the_index = 0;
|
| 93 | the_input = p;
|
| 94 | the_length = length;
|
| 95 | the_char = 0;
|
| 96 | the_byte = 0;
|
| 97 | }
|
| 98 |
|
| 99 |
|
| 100 | /*
|
| 101 | Get the current byte offset. This is generally used in error reporting.
|
| 102 | */
|
| 103 | int utf8_decode_at_byte() {
|
| 104 | return the_byte;
|
| 105 | }
|
| 106 |
|
| 107 |
|
| 108 | /*
|
| 109 | Get the current character offset. This is generally used in error reporting.
|
| 110 | The character offset matches the byte offset if the text is strictly ASCII.
|
| 111 | */
|
| 112 | int utf8_decode_at_character() {
|
| 113 | return (the_char > 0)
|
| 114 | ? the_char - 1
|
| 115 | : 0;
|
| 116 | }
|
| 117 |
|
| 118 |
|
| 119 | /*
|
| 120 | Extract the next character.
|
| 121 | Returns: the character (between 0 and 1114111)
|
| 122 | or UTF8_END (the end)
|
| 123 | or UTF8_ERROR (error)
|
| 124 | */
|
| 125 | int utf8_decode_next() {
|
| 126 | int c; /* the first byte of the character */
|
| 127 | int c1; /* the first continuation character */
|
| 128 | int c2; /* the second continuation character */
|
| 129 | int c3; /* the third continuation character */
|
| 130 | int r; /* the result */
|
| 131 |
|
| 132 | if (the_index >= the_length) {
|
| 133 | return the_index == the_length ? UTF8_END : UTF8_ERROR;
|
| 134 | }
|
| 135 | the_byte = the_index;
|
| 136 | the_char += 1;
|
| 137 | c = get();
|
| 138 | /*
|
| 139 | Zero continuation (0 to 127)
|
| 140 | */
|
| 141 | if ((c & 0x80) == 0) {
|
| 142 | return c;
|
| 143 | }
|
| 144 | /*
|
| 145 | One continuation (128 to 2047)
|
| 146 | */
|
| 147 | if ((c & 0xE0) == 0xC0) {
|
| 148 | c1 = cont();
|
| 149 | if (c1 >= 0) {
|
| 150 | r = ((c & 0x1F) << 6) | c1;
|
| 151 | if (r >= 128) {
|
| 152 | return r;
|
| 153 | }
|
| 154 | }
|
| 155 |
|
| 156 | /*
|
| 157 | Two continuations (2048 to 55295 and 57344 to 65535)
|
| 158 | */
|
| 159 | } else if ((c & 0xF0) == 0xE0) {
|
| 160 | c1 = cont();
|
| 161 | c2 = cont();
|
| 162 | if ((c1 | c2) >= 0) {
|
| 163 | r = ((c & 0x0F) << 12) | (c1 << 6) | c2;
|
| 164 | if (r >= 2048 && (r < 55296 || r > 57343)) {
|
| 165 | return r;
|
| 166 | }
|
| 167 | }
|
| 168 |
|
| 169 | /*
|
| 170 | Three continuations (65536 to 1114111)
|
| 171 | */
|
| 172 | } else if ((c & 0xF8) == 0xF0) {
|
| 173 | c1 = cont();
|
| 174 | c2 = cont();
|
| 175 | c3 = cont();
|
| 176 | if ((c1 | c2 | c3) >= 0) {
|
| 177 | r = ((c & 0x07) << 18) | (c1 << 12) | (c2 << 6) | c3;
|
| 178 | if (r >= 65536 && r <= 1114111) {
|
| 179 | return r;
|
| 180 | }
|
| 181 | }
|
| 182 | }
|
| 183 | return UTF8_ERROR;
|
| 184 | }
|