/home/uke/oil/data_lang/j8.h
Line | Count | Source (jump to first uncovered line) |
1 | | #ifndef DATA_LANG_J8_H |
2 | | #define DATA_LANG_J8_H |
3 | | |
4 | | #include <stdio.h> // sprintf |
5 | | #include <string.h> // memcmp, memcpy, strlen |
6 | | |
7 | | #include "data_lang/utf8.h" |
8 | | |
// Append one byte to the output and advance the output cursor.
//
// Uses a variable named `p_out` (pointer to the output cursor) from the
// calling scope.  The body is wrapped in do { } while (0) so the macro expands
// to exactly ONE statement: without it, `if (cond) J8_OUT(c);` would store the
// byte conditionally but advance the cursor unconditionally.
#define J8_OUT(ch)  \
  do {              \
    **p_out = (ch); \
    (*p_out)++;     \
  } while (0)
12 | | |
13 | | static inline int J8EncodeOne(unsigned char** p_in, unsigned char** p_out, |
14 | 1.01k | int j8_escape) { |
15 | | // We use a slightly weird double pointer style because |
16 | | // *p_in may be advanced by 1 to 4 bytes (depending on whether it's UTF-8) |
17 | | // *p_out may be advanced by 1 to 6 bytes (depending on escaping) |
18 | | |
19 | | // IMPORTANT: J8EncodeOne(), BourneShellEncodeOne(), BashDollarEncodeOne() all |
20 | | // call utf8_decode() which require that p_in MUST have a NUL terminator. This |
21 | | // is so INCOMPLETE UTF-8 sequences are terminated with an INVALID byte, and |
22 | | // 0x00 can only be ITSELF, never part of a sequence. An alternative would be |
23 | | // to do more bounds checks in these functions. |
24 | | |
25 | | // CALLER MUST CHECK that we are able to write up to 6 bytes! |
26 | | // Because the longest output is \u001f or \u{1f} for control chars, since |
27 | | // we don't emit escapes like \u{1f926} right now |
28 | | // |
29 | | // j8_escape: Whether to use j8 escapes, i.e. LOSSLESS encoding of data |
30 | | // \yff instead of Unicode replacement char |
31 | | // \u{1} instead of \u0001 for unprintable low chars |
32 | | |
33 | | // Returns: |
34 | | // 0 wrote valid UTF-8 (encoded or not) |
35 | | // 1 wrote byte that's invalid UTF-8 |
36 | | |
37 | 1.01k | unsigned char ch = **p_in; |
38 | | |
39 | | // |
40 | | // Handle \\ \b \f \n \r \t |
41 | | // |
42 | | |
43 | | // clang-format off |
44 | 1.01k | switch (ch) { |
45 | 12 | case '\\': J8_OUT('\\'); J8_OUT('\\'); (*p_in)++; return 0; |
46 | 15 | case '\b': J8_OUT('\\'); J8_OUT('b'); (*p_in)++; return 0; |
47 | 15 | case '\f': J8_OUT('\\'); J8_OUT('f'); (*p_in)++; return 0; |
48 | 15 | case '\n': J8_OUT('\\'); J8_OUT('n'); (*p_in)++; return 0; |
49 | 15 | case '\r': J8_OUT('\\'); J8_OUT('r'); (*p_in)++; return 0; |
50 | 15 | case '\t': J8_OUT('\\'); J8_OUT('t'); (*p_in)++; return 0; |
51 | 1.01k | } |
52 | | // clang-format on |
53 | | |
54 | | // |
55 | | // Conditionally handle \' and \" |
56 | | // |
57 | 927 | if (ch == '\'' && j8_escape) { // J8-style strings \' |
58 | 0 | J8_OUT('\\'); |
59 | 0 | J8_OUT('\''); |
60 | 0 | (*p_in)++; |
61 | 0 | return 0; |
62 | 0 | } |
63 | 927 | if (ch == '"' && !j8_escape) { // JSON-style strings \" |
64 | 0 | J8_OUT('\\'); |
65 | 0 | J8_OUT('"'); |
66 | 0 | (*p_in)++; |
67 | 0 | return 0; |
68 | 0 | } |
69 | | |
70 | | // |
71 | | // Unprintable ASCII control codes |
72 | | // |
73 | 927 | if (ch < 0x20) { |
74 | 189 | if (j8_escape) { |
75 | | // printf("Writing for %04x %p\n", ch, *p_out); |
76 | 75 | int n = sprintf((char*)*p_out, "\\u{%x}", ch); |
77 | | // printf("! Wrote %d bytes for %04x\n", n, ch); |
78 | 75 | *p_out += n; |
79 | 114 | } else { |
80 | | // printf("Writing for %04x %p\n", ch, *p_out); |
81 | 114 | int n = sprintf((char*)*p_out, "\\u%04x", ch); |
82 | 114 | *p_out += n; |
83 | | // printf("Wrote %d bytes for %04x\n", n, ch); |
84 | 114 | } |
85 | 189 | (*p_in)++; |
86 | 189 | return 0; |
87 | 189 | } |
88 | | |
89 | | // |
90 | | // UTF-8 encoded runes and invalid bytes |
91 | | // |
92 | 738 | Utf8Result_t result; |
93 | 738 | utf8_decode(*p_in, &result); |
94 | | |
95 | 738 | if (result.error == UTF8_OK) { |
96 | 660 | memcpy(*p_out, *p_in, result.bytes_read); |
97 | 660 | *p_in += result.bytes_read; |
98 | 660 | *p_out += result.bytes_read; |
99 | 660 | return 0; |
100 | 660 | } |
101 | | |
102 | | // We have a UTF-8 decoding error. This is handled one of three ways: |
103 | | // 1. Losslessly encode as J8 byte literals (only applicable in J8) |
104 | | // 2. Try to encode a lone surrogate |
105 | | // 3. Insert a Unicode replacement char |
106 | | |
107 | 78 | if (j8_escape) { |
108 | 30 | int n = sprintf((char*)*p_out, "\\y%02x", ch); |
109 | 30 | *p_in += 1; |
110 | 30 | *p_out += n; |
111 | 48 | } else if (result.error == UTF8_ERR_SURROGATE) { |
112 | 0 | int n = sprintf((char*)*p_out, "\\u%04x", result.codepoint); |
113 | 0 | *p_in += result.bytes_read; |
114 | 0 | *p_out += n; |
115 | 0 | return 1; |
116 | 48 | } else { |
117 | | // Unicode replacement char is U+FFFD, so write encoded form |
118 | | // >>> '\ufffd'.encode('utf-8') |
119 | | // b'\xef\xbf\xbd' |
120 | 48 | J8_OUT('\xef'); |
121 | 48 | J8_OUT('\xbf'); |
122 | 48 | J8_OUT('\xbd'); |
123 | 48 | *p_in += 1; // Advance past the byte we wrote |
124 | 48 | } |
125 | | |
126 | 78 | return 1; |
127 | 78 | } data_lang.cc:_ZL11J8EncodeOnePPhS0_i Line | Count | Source | 14 | 676 | int j8_escape) { | 15 | | // We use a slightly weird double pointer style because | 16 | | // *p_in may be advanced by 1 to 4 bytes (depending on whether it's UTF-8) | 17 | | // *p_out may be advanced by 1 to 6 bytes (depending on escaping) | 18 | | | 19 | | // IMPORTANT: J8EncodeOne(), BourneShellEncodeOne(), BashDollarEncodeOne() all | 20 | | // call utf8_decode() which require that p_in MUST have a NUL terminator. This | 21 | | // is so INCOMPLETE UTF-8 sequences are terminated with an INVALID byte, and | 22 | | // 0x00 can only be ITSELF, never part of a sequence. An alternative would be | 23 | | // to do more bounds checks in these functions. | 24 | | | 25 | | // CALLER MUST CHECK that we are able to write up to 6 bytes! | 26 | | // Because the longest output is \u001f or \u{1f} for control chars, since | 27 | | // we don't emit escapes like \u{1f926} right now | 28 | | // | 29 | | // j8_escape: Whether to use j8 escapes, i.e. 
LOSSLESS encoding of data | 30 | | // \yff instead of Unicode replacement char | 31 | | // \u{1} instead of \u0001 for unprintable low chars | 32 | | | 33 | | // Returns: | 34 | | // 0 wrote valid UTF-8 (encoded or not) | 35 | | // 1 wrote byte that's invalid UTF-8 | 36 | | | 37 | 676 | unsigned char ch = **p_in; | 38 | | | 39 | | // | 40 | | // Handle \\ \b \f \n \r \t | 41 | | // | 42 | | | 43 | | // clang-format off | 44 | 676 | switch (ch) { | 45 | 8 | case '\\': J8_OUT('\\'); J8_OUT('\\'); (*p_in)++; return 0; | 46 | 10 | case '\b': J8_OUT('\\'); J8_OUT('b'); (*p_in)++; return 0; | 47 | 10 | case '\f': J8_OUT('\\'); J8_OUT('f'); (*p_in)++; return 0; | 48 | 10 | case '\n': J8_OUT('\\'); J8_OUT('n'); (*p_in)++; return 0; | 49 | 10 | case '\r': J8_OUT('\\'); J8_OUT('r'); (*p_in)++; return 0; | 50 | 10 | case '\t': J8_OUT('\\'); J8_OUT('t'); (*p_in)++; return 0; | 51 | 676 | } | 52 | | // clang-format on | 53 | | | 54 | | // | 55 | | // Conditionally handle \' and \" | 56 | | // | 57 | 618 | if (ch == '\'' && j8_escape) { // J8-style strings \' | 58 | 0 | J8_OUT('\\'); | 59 | 0 | J8_OUT('\''); | 60 | 0 | (*p_in)++; | 61 | 0 | return 0; | 62 | 0 | } | 63 | 618 | if (ch == '"' && !j8_escape) { // JSON-style strings \" | 64 | 0 | J8_OUT('\\'); | 65 | 0 | J8_OUT('"'); | 66 | 0 | (*p_in)++; | 67 | 0 | return 0; | 68 | 0 | } | 69 | | | 70 | | // | 71 | | // Unprintable ASCII control codes | 72 | | // | 73 | 618 | if (ch < 0x20) { | 74 | 126 | if (j8_escape) { | 75 | | // printf("Writing for %04x %p\n", ch, *p_out); | 76 | 50 | int n = sprintf((char*)*p_out, "\\u{%x}", ch); | 77 | | // printf("! 
Wrote %d bytes for %04x\n", n, ch); | 78 | 50 | *p_out += n; | 79 | 76 | } else { | 80 | | // printf("Writing for %04x %p\n", ch, *p_out); | 81 | 76 | int n = sprintf((char*)*p_out, "\\u%04x", ch); | 82 | 76 | *p_out += n; | 83 | | // printf("Wrote %d bytes for %04x\n", n, ch); | 84 | 76 | } | 85 | 126 | (*p_in)++; | 86 | 126 | return 0; | 87 | 126 | } | 88 | | | 89 | | // | 90 | | // UTF-8 encoded runes and invalid bytes | 91 | | // | 92 | 492 | Utf8Result_t result; | 93 | 492 | utf8_decode(*p_in, &result); | 94 | | | 95 | 492 | if (result.error == UTF8_OK) { | 96 | 440 | memcpy(*p_out, *p_in, result.bytes_read); | 97 | 440 | *p_in += result.bytes_read; | 98 | 440 | *p_out += result.bytes_read; | 99 | 440 | return 0; | 100 | 440 | } | 101 | | | 102 | | // We have a UTF-8 decoding error. This is handled one of three ways: | 103 | | // 1. Losslessly encode as J8 byte literals (only applicable in J8) | 104 | | // 2. Try to encode a lone surrogate | 105 | | // 3. Insert a Unicode replacement char | 106 | | | 107 | 52 | if (j8_escape) { | 108 | 20 | int n = sprintf((char*)*p_out, "\\y%02x", ch); | 109 | 20 | *p_in += 1; | 110 | 20 | *p_out += n; | 111 | 32 | } else if (result.error == UTF8_ERR_SURROGATE) { | 112 | 0 | int n = sprintf((char*)*p_out, "\\u%04x", result.codepoint); | 113 | 0 | *p_in += result.bytes_read; | 114 | 0 | *p_out += n; | 115 | 0 | return 1; | 116 | 32 | } else { | 117 | | // Unicode replacement char is U+FFFD, so write encoded form | 118 | | // >>> '\ufffd'.encode('utf-8') | 119 | | // b'\xef\xbf\xbd' | 120 | 32 | J8_OUT('\xef'); | 121 | 32 | J8_OUT('\xbf'); | 122 | 32 | J8_OUT('\xbd'); | 123 | 32 | *p_in += 1; // Advance past the byte we wrote | 124 | 32 | } | 125 | | | 126 | 52 | return 1; | 127 | 52 | } |
j8_libc.c:_ZL11J8EncodeOnePPhS0_i Line | Count | Source | 14 | 338 | int j8_escape) { | 15 | | // We use a slightly weird double pointer style because | 16 | | // *p_in may be advanced by 1 to 4 bytes (depending on whether it's UTF-8) | 17 | | // *p_out may be advanced by 1 to 6 bytes (depending on escaping) | 18 | | | 19 | | // IMPORTANT: J8EncodeOne(), BourneShellEncodeOne(), BashDollarEncodeOne() all | 20 | | // call utf8_decode() which require that p_in MUST have a NUL terminator. This | 21 | | // is so INCOMPLETE UTF-8 sequences are terminated with an INVALID byte, and | 22 | | // 0x00 can only be ITSELF, never part of a sequence. An alternative would be | 23 | | // to do more bounds checks in these functions. | 24 | | | 25 | | // CALLER MUST CHECK that we are able to write up to 6 bytes! | 26 | | // Because the longest output is \u001f or \u{1f} for control chars, since | 27 | | // we don't emit escapes like \u{1f926} right now | 28 | | // | 29 | | // j8_escape: Whether to use j8 escapes, i.e. 
LOSSLESS encoding of data | 30 | | // \yff instead of Unicode replacement char | 31 | | // \u{1} instead of \u0001 for unprintable low chars | 32 | | | 33 | | // Returns: | 34 | | // 0 wrote valid UTF-8 (encoded or not) | 35 | | // 1 wrote byte that's invalid UTF-8 | 36 | | | 37 | 338 | unsigned char ch = **p_in; | 38 | | | 39 | | // | 40 | | // Handle \\ \b \f \n \r \t | 41 | | // | 42 | | | 43 | | // clang-format off | 44 | 338 | switch (ch) { | 45 | 4 | case '\\': J8_OUT('\\'); J8_OUT('\\'); (*p_in)++; return 0; | 46 | 5 | case '\b': J8_OUT('\\'); J8_OUT('b'); (*p_in)++; return 0; | 47 | 5 | case '\f': J8_OUT('\\'); J8_OUT('f'); (*p_in)++; return 0; | 48 | 5 | case '\n': J8_OUT('\\'); J8_OUT('n'); (*p_in)++; return 0; | 49 | 5 | case '\r': J8_OUT('\\'); J8_OUT('r'); (*p_in)++; return 0; | 50 | 5 | case '\t': J8_OUT('\\'); J8_OUT('t'); (*p_in)++; return 0; | 51 | 338 | } | 52 | | // clang-format on | 53 | | | 54 | | // | 55 | | // Conditionally handle \' and \" | 56 | | // | 57 | 309 | if (ch == '\'' && j8_escape) { // J8-style strings \' | 58 | 0 | J8_OUT('\\'); | 59 | 0 | J8_OUT('\''); | 60 | 0 | (*p_in)++; | 61 | 0 | return 0; | 62 | 0 | } | 63 | 309 | if (ch == '"' && !j8_escape) { // JSON-style strings \" | 64 | 0 | J8_OUT('\\'); | 65 | 0 | J8_OUT('"'); | 66 | 0 | (*p_in)++; | 67 | 0 | return 0; | 68 | 0 | } | 69 | | | 70 | | // | 71 | | // Unprintable ASCII control codes | 72 | | // | 73 | 309 | if (ch < 0x20) { | 74 | 63 | if (j8_escape) { | 75 | | // printf("Writing for %04x %p\n", ch, *p_out); | 76 | 25 | int n = sprintf((char*)*p_out, "\\u{%x}", ch); | 77 | | // printf("! 
Wrote %d bytes for %04x\n", n, ch); | 78 | 25 | *p_out += n; | 79 | 38 | } else { | 80 | | // printf("Writing for %04x %p\n", ch, *p_out); | 81 | 38 | int n = sprintf((char*)*p_out, "\\u%04x", ch); | 82 | 38 | *p_out += n; | 83 | | // printf("Wrote %d bytes for %04x\n", n, ch); | 84 | 38 | } | 85 | 63 | (*p_in)++; | 86 | 63 | return 0; | 87 | 63 | } | 88 | | | 89 | | // | 90 | | // UTF-8 encoded runes and invalid bytes | 91 | | // | 92 | 246 | Utf8Result_t result; | 93 | 246 | utf8_decode(*p_in, &result); | 94 | | | 95 | 246 | if (result.error == UTF8_OK) { | 96 | 220 | memcpy(*p_out, *p_in, result.bytes_read); | 97 | 220 | *p_in += result.bytes_read; | 98 | 220 | *p_out += result.bytes_read; | 99 | 220 | return 0; | 100 | 220 | } | 101 | | | 102 | | // We have a UTF-8 decoding error. This is handled one of three ways: | 103 | | // 1. Losslessly encode as J8 byte literals (only applicable in J8) | 104 | | // 2. Try to encode a lone surrogate | 105 | | // 3. Insert a Unicode replacement char | 106 | | | 107 | 26 | if (j8_escape) { | 108 | 10 | int n = sprintf((char*)*p_out, "\\y%02x", ch); | 109 | 10 | *p_in += 1; | 110 | 10 | *p_out += n; | 111 | 16 | } else if (result.error == UTF8_ERR_SURROGATE) { | 112 | 0 | int n = sprintf((char*)*p_out, "\\u%04x", result.codepoint); | 113 | 0 | *p_in += result.bytes_read; | 114 | 0 | *p_out += n; | 115 | 0 | return 1; | 116 | 16 | } else { | 117 | | // Unicode replacement char is U+FFFD, so write encoded form | 118 | | // >>> '\ufffd'.encode('utf-8') | 119 | | // b'\xef\xbf\xbd' | 120 | 16 | J8_OUT('\xef'); | 121 | 16 | J8_OUT('\xbf'); | 122 | 16 | J8_OUT('\xbd'); | 123 | 16 | *p_in += 1; // Advance past the byte we wrote | 124 | 16 | } | 125 | | | 126 | 26 | return 1; | 127 | 26 | } |
|
128 | | |
129 | | // Like the above, but |
130 | | // |
131 | | // \xff instead of \yff |
132 | | // \u001f always, never \u{1f} |
133 | | // No JSON vs. J8 |
134 | | // No \" escape ever |
135 | | // No errors -- it can encode everything |
136 | | |
137 | | static inline void BashDollarEncodeOne(unsigned char** p_in, |
138 | 0 | unsigned char** p_out) { |
139 | 0 | unsigned char ch = **p_in; |
140 | | |
141 | | // |
142 | | // Handle \\ \b \f \n \r \t \' |
143 | | // |
144 | | |
145 | | // clang-format off |
146 | 0 | switch (ch) { |
147 | 0 | case '\\': J8_OUT('\\'); J8_OUT('\\'); (*p_in)++; return; |
148 | 0 | case '\b': J8_OUT('\\'); J8_OUT('b'); (*p_in)++; return; |
149 | 0 | case '\f': J8_OUT('\\'); J8_OUT('f'); (*p_in)++; return; |
150 | 0 | case '\n': J8_OUT('\\'); J8_OUT('n'); (*p_in)++; return; |
151 | 0 | case '\r': J8_OUT('\\'); J8_OUT('r'); (*p_in)++; return; |
152 | 0 | case '\t': J8_OUT('\\'); J8_OUT('t'); (*p_in)++; return; |
153 | 0 | case '\'': J8_OUT('\\'); J8_OUT('\''); (*p_in)++; return; |
154 | 0 | } |
155 | | // clang-format on |
156 | | |
157 | | // |
158 | | // Unprintable ASCII control codes |
159 | | // |
160 | 0 | if (ch < 0x20) { |
161 | | // printf("Writing for %04x %p\n", ch, *p_out); |
162 | 0 | int n = sprintf((char*)*p_out, "\\u%04x", ch); |
163 | 0 | *p_out += n; |
164 | | // printf("Wrote %d bytes for %04x\n", n, ch); |
165 | 0 | (*p_in)++; |
166 | 0 | return; |
167 | 0 | } |
168 | | |
169 | | // |
170 | | // UTF-8 encoded runes and invalid bytes |
171 | | // |
172 | 0 | Utf8Result_t result; |
173 | 0 | utf8_decode(*p_in, &result); |
174 | 0 | if (result.error == UTF8_OK) { |
175 | 0 | memcpy(*p_out, *p_in, result.bytes_read); |
176 | 0 | *p_in += result.bytes_read; |
177 | 0 | *p_out += result.bytes_read; |
178 | 0 | } else { |
179 | | // If not a valid UTF-8 byte sequence, losslessly encode the bad bytes |
180 | 0 | int n = sprintf((char*)*p_out, "\\x%02x", **p_in); |
181 | 0 | *p_out += n; |
182 | 0 | *p_in += 1; // Advance past the byte we wrote |
183 | 0 | } |
184 | 0 | } Unexecuted instantiation: data_lang.cc:_ZL19BashDollarEncodeOnePPhS0_ Unexecuted instantiation: j8_libc.c:_ZL19BashDollarEncodeOnePPhS0_ |
185 | | |
186 | | // BourneShellEncodeOne rules: |
187 | | // |
188 | | // must be valid UTF-8 |
189 | | // no control chars |
190 | | // no ' is required |
191 | | // no \ -- not required, but avoids ambiguous '\n' |
192 | | // |
193 | | // For example we write $'\\' or b'\\' not '\' |
194 | | // The latter should be written r'\', but we're not outputing |
195 | | |
196 | | static inline int BourneShellEncodeOne(unsigned char** p_in, |
197 | 0 | unsigned char** p_out) { |
198 | 0 | unsigned char ch = **p_in; |
199 | |
|
200 | 0 | if (ch == '\'' || ch == '\\') { // can't encode these in Bourne shell '' |
201 | 0 | return 1; |
202 | 0 | } |
203 | 0 | if (ch < 0x20) { // Unprintable ASCII control codes |
204 | 0 | return 1; |
205 | 0 | } |
206 | | |
207 | | // UTF-8 encoded runes and invalid bytes |
208 | 0 | Utf8Result_t result; |
209 | 0 | utf8_decode(*p_in, &result); |
210 | 0 | if (result.error == UTF8_OK) { |
211 | 0 | memcpy(*p_out, *p_in, result.bytes_read); |
212 | 0 | *p_in += result.bytes_read; |
213 | 0 | *p_out += result.bytes_read; |
214 | 0 | return 0; |
215 | 0 | } else { |
216 | 0 | return 1; |
217 | 0 | } |
218 | 0 | } Unexecuted instantiation: data_lang.cc:_ZL20BourneShellEncodeOnePPhS0_ Unexecuted instantiation: j8_libc.c:_ZL20BourneShellEncodeOnePPhS0_ |
219 | | |
// Output space that must be free before one more character is encoded.
//
// Right now \u001f and \u{1f} are the longest output sequences (6 bytes), and
// sprintf() writes a NUL terminator after them, hence 6 + 1.  (We don't
// technically need the NUL, but it is written regardless.)
//
// NOTE(review): an earlier note here said up to 16 bytes may be needed, for
// \yaa\yaa\yaa\yaa.  J8EncodeOne() consumes a single input byte per \yXX
// escape, so that case looks like 4 + 1 bytes per call -- confirm before
// changing this value.
#define J8_MAX_BYTES_PER_INPUT_BYTE 7

// Minimum output buffer capacity.  It must be more than
// J8_MAX_BYTES_PER_INPUT_BYTE: the *EncodeChunk() loops only encode while that
// much room is left, so a smaller buffer can never make progress (presumably
// the infinite loop the earlier note warned about).
// TODO: Tune this for our allocator?  We call buf->EnsureMoreSpace(capacity);
#define J8_MIN_CAPACITY 16

// Enforce the invariant at compile time.  A preprocessor check is used because
// this header is included from both C and C++ translation units.
#if J8_MIN_CAPACITY <= J8_MAX_BYTES_PER_INPUT_BYTE
#error "J8_MIN_CAPACITY must be greater than J8_MAX_BYTES_PER_INPUT_BYTE"
#endif
233 | | |
234 | | static inline int J8EncodeChunk(unsigned char** p_in, unsigned char* in_end, |
235 | | unsigned char** p_out, unsigned char* out_end, |
236 | 152 | int j8_escape) { |
237 | 1.11k | while (*p_in < in_end && (*p_out + J8_MAX_BYTES_PER_INPUT_BYTE) <= out_end) { |
238 | | // printf("iter %d %p < %p \n", i++, *p_out, out_end); |
239 | 1.01k | int invalid_utf8 = J8EncodeOne(p_in, p_out, j8_escape); |
240 | 1.01k | if (invalid_utf8 && !j8_escape) { // first JSON pass got binary data? |
241 | 48 | return invalid_utf8; // early return |
242 | 48 | } |
243 | 1.01k | } |
244 | 104 | return 0; |
245 | 152 | } data_lang.cc:_ZL13J8EncodeChunkPPhS_S0_S_i Line | Count | Source | 236 | 96 | int j8_escape) { | 237 | 740 | while (*p_in < in_end && (*p_out + J8_MAX_BYTES_PER_INPUT_BYTE) <= out_end) { | 238 | | // printf("iter %d %p < %p \n", i++, *p_out, out_end); | 239 | 676 | int invalid_utf8 = J8EncodeOne(p_in, p_out, j8_escape); | 240 | 676 | if (invalid_utf8 && !j8_escape) { // first JSON pass got binary data? | 241 | 32 | return invalid_utf8; // early return | 242 | 32 | } | 243 | 676 | } | 244 | 64 | return 0; | 245 | 96 | } |
j8_libc.c:_ZL13J8EncodeChunkPPhS_S0_S_i Line | Count | Source | 236 | 56 | int j8_escape) { | 237 | 378 | while (*p_in < in_end && (*p_out + J8_MAX_BYTES_PER_INPUT_BYTE) <= out_end) { | 238 | | // printf("iter %d %p < %p \n", i++, *p_out, out_end); | 239 | 338 | int invalid_utf8 = J8EncodeOne(p_in, p_out, j8_escape); | 240 | 338 | if (invalid_utf8 && !j8_escape) { // first JSON pass got binary data? | 241 | 16 | return invalid_utf8; // early return | 242 | 16 | } | 243 | 338 | } | 244 | 40 | return 0; | 245 | 56 | } |
|
246 | | |
247 | | static inline int BashDollarEncodeChunk(unsigned char** p_in, |
248 | | unsigned char* in_end, |
249 | | unsigned char** p_out, |
250 | 0 | unsigned char* out_end) { |
251 | 0 | while (*p_in < in_end && (*p_out + J8_MAX_BYTES_PER_INPUT_BYTE) <= out_end) { |
252 | 0 | BashDollarEncodeOne(p_in, p_out); |
253 | 0 | } |
254 | 0 | return 0; |
255 | 0 | } Unexecuted instantiation: data_lang.cc:_ZL21BashDollarEncodeChunkPPhS_S0_S_ Unexecuted instantiation: j8_libc.c:_ZL21BashDollarEncodeChunkPPhS_S0_S_ |
256 | | |
257 | | static inline int BourneShellEncodeChunk(unsigned char** p_in, |
258 | | unsigned char* in_end, |
259 | | unsigned char** p_out, |
260 | 0 | unsigned char* out_end) { |
261 | 0 | while (*p_in < in_end && (*p_out + J8_MAX_BYTES_PER_INPUT_BYTE) <= out_end) { |
262 | 0 | int cannot_encode = BourneShellEncodeOne(p_in, p_out); |
263 | 0 | if (cannot_encode) { // we need escaping, e.g. \u0001 or \' |
264 | 0 | return cannot_encode; // early return |
265 | 0 | } |
266 | 0 | } |
267 | 0 | return 0; |
268 | 0 | } Unexecuted instantiation: data_lang.cc:_ZL22BourneShellEncodeChunkPPhS_S0_S_ Unexecuted instantiation: j8_libc.c:_ZL22BourneShellEncodeChunkPPhS_S0_S_ |
269 | | |
// Decide whether the len bytes at s can be written without quotes.
//
// Quotes may be omitted only when the string is non-empty, is not one of the
// keywords null / true / false, and every byte matches [a-zA-Z0-9./_-].
//
// UTF-8 is deliberately not accepted, because it can contain chars that look
// like spaces or quotes.
//
// s is only read, never written.  A len of zero or less is treated as the
// empty string, which has to be quoted.
//
// Returns 1 if quotes can be omitted, 0 if they are required.
static inline int CanOmitQuotes(const unsigned char* s, int len) {
  if (len <= 0) {  // empty string has to be quoted; negative len is invalid
    return 0;
  }

  // 3 special case keywords
  if (len == 4 &&
      (memcmp(s, "null", 4) == 0 || memcmp(s, "true", 4) == 0)) {
    return 0;
  }
  if (len == 5 && memcmp(s, "false", 5) == 0) {
    return 0;
  }

  for (int i = 0; i < len; ++i) {
    unsigned char ch = s[i];

    // Corresponds to regex [a-zA-Z0-9./_-]
    int is_lower = ('a' <= ch && ch <= 'z');
    int is_upper = ('A' <= ch && ch <= 'Z');
    int is_digit = ('0' <= ch && ch <= '9');
    int is_punct = (ch == '.' || ch == '/' || ch == '_' || ch == '-');

    if (!(is_lower || is_upper || is_digit || is_punct)) {
      return 0;  // some byte requires quotes
    }
  }
  return 1;  // everything OK
}
313 | | |
314 | | #endif // DATA_LANG_J8_H |