OILS / data_lang / j8.h View on Github | oilshell.org

361 lines, 173 significant
1#ifndef DATA_LANG_J8_H
2#define DATA_LANG_J8_H
3
4#include <stdio.h> // sprintf
5#include <string.h> // memcmp, memcpy, strlen
6
7#include "data_lang/utf8_impls/bjoern_dfa.h"
8#include "data_lang/utf8.h"
9
// J8_OUT(ch): append one byte to the output and advance the write cursor.
//
// Requires a variable `unsigned char** p_out` to be in scope at the point of
// use. `ch` is evaluated exactly once.
//
// The body is wrapped in do { } while (0) so that the macro expands to a
// single statement. Without the wrapper, `if (cond) J8_OUT(x);` would guard
// only the store and advance the cursor unconditionally.
#define J8_OUT(ch)  \
  do {              \
    **p_out = (ch); \
    (*p_out)++;     \
  } while (0)
13
14static inline int J8EncodeOne(unsigned char** p_in, unsigned char** p_out,
15 int j8_escape) {
16 // We use a slightly weird double pointer style because
17 // *p_in may be advanced by 1 to 4 bytes (depending on whether it's UTF-8)
18 // *p_out may be advanced by 1 to 6 bytes (depending on escaping)
19
20 // IMPORTANT: J8EncodeOne(), BourneShellEncodeOne(), BashDollarEncodeOne()
21 // all call Bjoern DFA decode(), and there's a subtle issue where p_in MUST
22 // have a NUL terminator is required. This is so INCOMPLETE UTF-8 sequences
23 // are terminated with an INVALID byte that the state machine can accept, and
24 // 0x00 can only be ITSELF, never part of a sequence. An alternative would be
25 // to do more bounds checks in these functions.
26
27 // CALLER MUST CHECK that we are able to write up to 6 bytes!
28 // Because the longest output is \u001f or \u{1f} for control chars, since
29 // we don't escapes like \u{1f926} right now
30 //
31 // j8_escape: Whether to use j8 escapes, i.e. LOSSLESS encoding of data
32 // \yff instead of Unicode replacement char
33 // \u{1} instead of \u0001 for unprintable low chars
34
35 // Returns:
36 // 0 wrote valid UTF-8 (encoded or not)
37 // 1 wrote byte that's invalid UTF-8
38
39 unsigned char ch = **p_in;
40
41 //
42 // Handle \\ \b \f \n \r \t
43 //
44
45 // clang-format off
46 switch (ch) {
47 case '\\': J8_OUT('\\'); J8_OUT('\\'); (*p_in)++; return 0;
48 case '\b': J8_OUT('\\'); J8_OUT('b'); (*p_in)++; return 0;
49 case '\f': J8_OUT('\\'); J8_OUT('f'); (*p_in)++; return 0;
50 case '\n': J8_OUT('\\'); J8_OUT('n'); (*p_in)++; return 0;
51 case '\r': J8_OUT('\\'); J8_OUT('r'); (*p_in)++; return 0;
52 case '\t': J8_OUT('\\'); J8_OUT('t'); (*p_in)++; return 0;
53 }
54 // clang-format on
55
56 //
57 // Conditionally handle \' and \"
58 //
59 if (ch == '\'' && j8_escape) { // J8-style strings \'
60 J8_OUT('\\');
61 J8_OUT('\'');
62 (*p_in)++;
63 return 0;
64 }
65 if (ch == '"' && !j8_escape) { // JSON-style strings \"
66 J8_OUT('\\');
67 J8_OUT('"');
68 (*p_in)++;
69 return 0;
70 }
71
72 //
73 // Unprintable ASCII control codes
74 //
75 if (ch < 0x20) {
76 if (j8_escape) {
77 // printf("Writing for %04x %p\n", ch, *p_out);
78 int n = sprintf((char*)*p_out, "\\u{%x}", ch);
79 // printf("! Wrote %d bytes for %04x\n", n, ch);
80 *p_out += n;
81 } else {
82 // printf("Writing for %04x %p\n", ch, *p_out);
83 int n = sprintf((char*)*p_out, "\\u%04x", ch);
84 *p_out += n;
85 // printf("Wrote %d bytes for %04x\n", n, ch);
86 }
87 (*p_in)++;
88 return 0;
89 }
90
91 //
92 // UTF-8 encoded runes and invalid bytes
93 //
94 Utf8Result_t result;
95 utf8_decode(*p_in, &result);
96
97 if (result.error == UTF8_OK) {
98 memcpy(*p_out, *p_in, result.bytes_read);
99 *p_in += result.bytes_read;
100 *p_out += result.bytes_read;
101 return 0;
102 }
103
104 // We have a UTF-8 decoding error. This is handled one of three ways:
105 // 1. Losslessly encode as J8 byte literals (only applicable in J8)
106 // 2. Try to encode a lone surrogate
107 // 3. Insert a Unicode replacement char
108
109 if (j8_escape) {
110 int n = sprintf((char*)*p_out, "\\y%02x", ch);
111 *p_in += 1;
112 *p_out += n;
113 } else if (result.error == UTF8_ERR_SURROGATE) {
114 int n = sprintf((char*)*p_out, "\\u%04x", result.codepoint);
115 *p_in += result.bytes_read;
116 *p_out += n;
117 return 1;
118 } else {
119 // Unicode replacement char is U+FFFD, so write encoded form
120 // >>> '\ufffd'.encode('utf-8')
121 // b'\xef\xbf\xbd'
122 J8_OUT('\xef');
123 J8_OUT('\xbf');
124 J8_OUT('\xbd');
125 *p_in += 1; // Advance past the byte we wrote
126 }
127
128 return 1;
129}
130
131// Like the above, but
132//
133// \xff instead of \yff
134// \u001f always, never \u{1f}
135// No JSON vs. J8
136// No \" escape ever
137// No errors -- it can encode everything
138
139static inline void BashDollarEncodeOne(unsigned char** p_in,
140 unsigned char** p_out) {
141 unsigned char ch = **p_in;
142
143 //
144 // Handle \\ \b \f \n \r \t \'
145 //
146
147 // clang-format off
148 switch (ch) {
149 case '\\': J8_OUT('\\'); J8_OUT('\\'); (*p_in)++; return;
150 case '\b': J8_OUT('\\'); J8_OUT('b'); (*p_in)++; return;
151 case '\f': J8_OUT('\\'); J8_OUT('f'); (*p_in)++; return;
152 case '\n': J8_OUT('\\'); J8_OUT('n'); (*p_in)++; return;
153 case '\r': J8_OUT('\\'); J8_OUT('r'); (*p_in)++; return;
154 case '\t': J8_OUT('\\'); J8_OUT('t'); (*p_in)++; return;
155 case '\'': J8_OUT('\\'); J8_OUT('\''); (*p_in)++; return;
156 }
157 // clang-format on
158
159 //
160 // Unprintable ASCII control codes
161 //
162 if (ch < 0x20) {
163 // printf("Writing for %04x %p\n", ch, *p_out);
164 int n = sprintf((char*)*p_out, "\\u%04x", ch);
165 *p_out += n;
166 // printf("Wrote %d bytes for %04x\n", n, ch);
167 (*p_in)++;
168 return;
169 }
170
171 //
172 // UTF-8 encoded runes and invalid bytes
173 //
174 unsigned char* start = *p_in; // save start position
175 uint32_t codepoint = 0;
176 uint32_t state = UTF8_ACCEPT;
177
178 while (1) {
179 // unsigned char byte = **p_in;
180 decode(&state, &codepoint, ch);
181 // printf(" state %d ch %d\n", state, ch);
182 switch (state) {
183 // BUG: we don't reject IMMEDIATELY
184 //
185 // We could be in another state for up to 4 chars
186 // And then we hit REJECT
187 // And then we need to output \yff\yff\yff\yff
188 // OK that's actually SIXTEEN at once?
189
190 case UTF8_REJECT: {
191 int n = sprintf((char*)*p_out, "\\x%02x", *start);
192 *p_out += n;
193 (*p_in) = start; // REWIND because we might have consumed NUL terminator!
194 (*p_in)++; // Advance past the byte we wrote
195 return;
196 }
197 case UTF8_ACCEPT: {
198 (*p_in)++;
199 // printf("start %p p_in %p\n", start, *p_in);
200 while (start < *p_in) {
201 J8_OUT(*start);
202 start++;
203 }
204 return;
205 }
206 default:
207 (*p_in)++; // advance, next UTF8_ACCEPT will write it
208 ch = **p_in;
209 // printf(" => ch %d\n", ch);
210 break;
211 }
212 }
213 // Unreachable
214}
215
216// BourneShellEncodeOne rules:
217//
218// must be valid UTF-8
219// no control chars
220// no ' is required
221// no \ -- not required, but avoids ambiguous '\n'
222//
223// For example we write $'\\' or b'\\' not '\'
224// The latter should be written r'\', but we're not outputing
225
226static inline int BourneShellEncodeOne(unsigned char** p_in,
227 unsigned char** p_out) {
228 unsigned char ch = **p_in;
229
230 if (ch == '\'' || ch == '\\') { // can't encode these in Bourne shell ''
231 return 1;
232 }
233 if (ch < 0x20) { // Unprintable ASCII control codes
234 return 1;
235 }
236
237 // UTF-8 encoded runes and invalid bytes
238 unsigned char* start = *p_in; // save start position
239 uint32_t codepoint = 0;
240 uint32_t state = UTF8_ACCEPT;
241
242 while (1) {
243 decode(&state, &codepoint, ch);
244 // printf(" state %d\n", state);
245 switch (state) {
246 case UTF8_REJECT: {
247 return 1;
248 }
249 case UTF8_ACCEPT: {
250 (*p_in)++;
251 // printf("start %p p_in %p\n", start, *p_in);
252 while (start < *p_in) {
253 J8_OUT(*start);
254 start++;
255 }
256 return 0;
257 }
258 default:
259 (*p_in)++; // advance, next UTF8_ACCEPT will write it
260 ch = **p_in;
261 break;
262 }
263 }
264 // Unreachable
265}
266
// Right now \u001f and \u{1f} are the longest output sequences for a byte
// (6 bytes). We need 6 + 1 for the NUL terminator that sprintf() writes, even
// though we don't technically need the terminator itself.
//
// Open concern from the original notes: we may need up to 16 bytes:
// \yaa\yaa\yaa\yaa
// If this is too small, we would enter an infinite loop.
// +1 for NUL terminator

#define J8_MAX_BYTES_PER_INPUT_BYTE 7

// The minimum capacity must be more than the number above.
// TODO: Tune this for our allocator? We call buf->EnsureMoreSpace(capacity);
#define J8_MIN_CAPACITY 16

// Enforce the relationship stated above at compile time. The *EncodeChunk()
// loops only run while at least J8_MAX_BYTES_PER_INPUT_BYTE bytes of output
// space remain, so a smaller capacity could never make progress.
#if J8_MIN_CAPACITY <= J8_MAX_BYTES_PER_INPUT_BYTE
#error "J8_MIN_CAPACITY must be greater than J8_MAX_BYTES_PER_INPUT_BYTE"
#endif
280
281static inline int J8EncodeChunk(unsigned char** p_in, unsigned char* in_end,
282 unsigned char** p_out, unsigned char* out_end,
283 int j8_escape) {
284 while (*p_in < in_end && (*p_out + J8_MAX_BYTES_PER_INPUT_BYTE) <= out_end) {
285 // printf("iter %d %p < %p \n", i++, *p_out, out_end);
286 int invalid_utf8 = J8EncodeOne(p_in, p_out, j8_escape);
287 if (invalid_utf8 && !j8_escape) { // first JSON pass got binary data?
288 return invalid_utf8; // early return
289 }
290 }
291 return 0;
292}
293
294static inline int BashDollarEncodeChunk(unsigned char** p_in,
295 unsigned char* in_end,
296 unsigned char** p_out,
297 unsigned char* out_end) {
298 while (*p_in < in_end && (*p_out + J8_MAX_BYTES_PER_INPUT_BYTE) <= out_end) {
299 BashDollarEncodeOne(p_in, p_out);
300 }
301 return 0;
302}
303
304static inline int BourneShellEncodeChunk(unsigned char** p_in,
305 unsigned char* in_end,
306 unsigned char** p_out,
307 unsigned char* out_end) {
308 while (*p_in < in_end && (*p_out + J8_MAX_BYTES_PER_INPUT_BYTE) <= out_end) {
309 int cannot_encode = BourneShellEncodeOne(p_in, p_out);
310 if (cannot_encode) { // we need escaping, e.g. \u0001 or \'
311 return cannot_encode; // early return
312 }
313 }
314 return 0;
315}
316
// CanOmitQuotes: decide whether the `len` bytes at `s` may be written bare.
//
// Returns 1 only when every byte matches the regex [a-zA-Z0-9./_-] and the
// string is not one of the keywords null, true, false. Returns 0 for the
// empty string and for anything else.
//
// UTF-8 is deliberately not allowed here because it can have chars that look
// like space or quotes.
static inline int CanOmitQuotes(unsigned char* s, int len) {
  // The empty string has to be quoted.
  if (len == 0) {
    return 0;
  }

  // 3 special case keywords, distinguished by length first.
  switch (len) {
    case 4:
      if (memcmp(s, "null", 4) == 0 || memcmp(s, "true", 4) == 0) {
        return 0;
      }
      break;
    case 5:
      if (memcmp(s, "false", 5) == 0) {
        return 0;
      }
      break;
    default:
      break;
  }

  for (int pos = 0; pos < len; pos++) {
    unsigned char c = s[pos];

    int is_lower = (c >= 'a' && c <= 'z');
    int is_upper = (c >= 'A' && c <= 'Z');
    int is_digit = (c >= '0' && c <= '9');
    int is_punct = (c == '.' || c == '/' || c == '_' || c == '-');

    if (!(is_lower || is_upper || is_digit || is_punct)) {
      return 0;  // some byte requires quotes
    }
  }
  return 1;  // everything OK
}
360
361#endif // DATA_LANG_J8_H