OILS / data_lang / j8.h View on Github | oilshell.org

365 lines, 177 significant
1#ifndef DATA_LANG_J8_H
2#define DATA_LANG_J8_H
3
4#include <stdio.h> // sprintf
5#include <string.h> // memcmp
6
7#include "data_lang/utf8_impls/bjoern_dfa.h"
8
// J8_OUT(ch): store one byte at the output cursor, then advance the cursor.
//
// Relies on a pointer-to-pointer named `p_out` being in scope at the point of
// use; `ch` is evaluated exactly once.
//
// The body is wrapped in do { } while (0) so that the macro expands to a
// single statement.  Without the wrapper, something like
//
//     if (cond) J8_OUT(x);
//
// would guard only the store and advance the cursor unconditionally.
#define J8_OUT(ch)  \
  do {              \
    **p_out = (ch); \
    (*p_out)++;     \
  } while (0)
12
13static inline int J8EncodeOne(unsigned char** p_in, unsigned char** p_out,
14 int j8_escape) {
15 // We use a slightly weird double pointer style because
16 // *p_in may be advanced by 1 to 4 bytes (depending on whether it's UTF-8)
17 // *p_out may be advanced by 1 to 6 bytes (depending on escaping)
18
19 // IMPORTANT: J8EncodeOne(), BourneShellEncodeOne(), BashDollarEncodeOne()
20 // all call Bjoern DFA decode(), and there's a subtle issue where p_in MUST
21 // have a NUL terminator is required. This is so INCOMPLETE UTF-8 sequences
22 // are terminated with an INVALID byte that the state machine can accept, and
23 // 0x00 can only be ITSELF, never part of a sequence. An alternative would be
24 // to do more bounds checks in these functions.
25
26 // CALLER MUST CHECK that we are able to write up to 6 bytes!
27 // Because the longest output is \u001f or \u{1f} for control chars, since
28 // we don't escapes like \u{1f926} right now
29 //
30 // j8_escape: Whether to use j8 escapes, i.e. LOSSLESS encoding of data
31 // \yff instead of Unicode replacement char
32 // \u{1} instead of \u0001 for unprintable low chars
33
34 // Returns:
35 // 0 wrote valid UTF-8 (encoded or not)
36 // 1 wrote byte that's invalid UTF-8
37
38 unsigned char ch = **p_in;
39
40 //
41 // Handle \\ \b \f \n \r \t
42 //
43
44 // clang-format off
45 switch (ch) {
46 case '\\': J8_OUT('\\'); J8_OUT('\\'); (*p_in)++; return 0;
47 case '\b': J8_OUT('\\'); J8_OUT('b'); (*p_in)++; return 0;
48 case '\f': J8_OUT('\\'); J8_OUT('f'); (*p_in)++; return 0;
49 case '\n': J8_OUT('\\'); J8_OUT('n'); (*p_in)++; return 0;
50 case '\r': J8_OUT('\\'); J8_OUT('r'); (*p_in)++; return 0;
51 case '\t': J8_OUT('\\'); J8_OUT('t'); (*p_in)++; return 0;
52 }
53 // clang-format on
54
55 //
56 // Conditionally handle \' and \"
57 //
58 if (ch == '\'' && j8_escape) { // J8-style strings \'
59 J8_OUT('\\');
60 J8_OUT('\'');
61 (*p_in)++;
62 return 0;
63 }
64 if (ch == '"' && !j8_escape) { // JSON-style strings \"
65 J8_OUT('\\');
66 J8_OUT('"');
67 (*p_in)++;
68 return 0;
69 }
70
71 //
72 // Unprintable ASCII control codes
73 //
74 if (ch < 0x20) {
75 if (j8_escape) {
76 // printf("Writing for %04x %p\n", ch, *p_out);
77 int n = sprintf((char*)*p_out, "\\u{%x}", ch);
78 // printf("! Wrote %d bytes for %04x\n", n, ch);
79 *p_out += n;
80 } else {
81 // printf("Writing for %04x %p\n", ch, *p_out);
82 int n = sprintf((char*)*p_out, "\\u%04x", ch);
83 *p_out += n;
84 // printf("Wrote %d bytes for %04x\n", n, ch);
85 }
86 (*p_in)++;
87 return 0;
88 }
89
90 //
91 // UTF-8 encoded runes and invalid bytes
92 //
93 unsigned char* start = *p_in; // save start position
94 uint32_t codepoint = 0;
95 uint32_t state = UTF8_ACCEPT;
96
97 while (1) {
98 decode(&state, &codepoint, ch);
99 // printf(" state %d\n", state);
100 switch (state) {
101 case UTF8_REJECT: {
102 if (j8_escape) {
103 int n = sprintf((char*)*p_out, "\\y%02x", *start);
104 *p_out += n;
105 } else {
106 // Unicode replacement char is U+FFFD, so write encoded form
107 // >>> '\ufffd'.encode('utf-8')
108 // b'\xef\xbf\xbd'
109 J8_OUT('\xef');
110 J8_OUT('\xbf');
111 J8_OUT('\xbd');
112 }
113 (*p_in) = start; // REWIND because we might have consumed NUL terminator!
114 (*p_in)++; // Advance past the byte we wrote
115 return 1;
116 }
117 case UTF8_ACCEPT: {
118 (*p_in)++;
119 // printf("start %p p_in %p\n", start, *p_in);
120 while (start < *p_in) {
121 J8_OUT(*start);
122 start++;
123 }
124 return 0;
125 }
126 default:
127 (*p_in)++; // advance, next UTF8_ACCEPT will write it
128 ch = **p_in;
129 break;
130 }
131 }
132 // Unreachable
133}
134
135// Like the above, but
136//
137// \xff instead of \yff
138// \u001f always, never \u{1f}
139// No JSON vs. J8
140// No \" escape ever
141// No errors -- it can encode everything
142
143static inline void BashDollarEncodeOne(unsigned char** p_in,
144 unsigned char** p_out) {
145 unsigned char ch = **p_in;
146
147 //
148 // Handle \\ \b \f \n \r \t \'
149 //
150
151 // clang-format off
152 switch (ch) {
153 case '\\': J8_OUT('\\'); J8_OUT('\\'); (*p_in)++; return;
154 case '\b': J8_OUT('\\'); J8_OUT('b'); (*p_in)++; return;
155 case '\f': J8_OUT('\\'); J8_OUT('f'); (*p_in)++; return;
156 case '\n': J8_OUT('\\'); J8_OUT('n'); (*p_in)++; return;
157 case '\r': J8_OUT('\\'); J8_OUT('r'); (*p_in)++; return;
158 case '\t': J8_OUT('\\'); J8_OUT('t'); (*p_in)++; return;
159 case '\'': J8_OUT('\\'); J8_OUT('\''); (*p_in)++; return;
160 }
161 // clang-format off
162
163 //
164 // Unprintable ASCII control codes
165 //
166 if (ch < 0x20) {
167 // printf("Writing for %04x %p\n", ch, *p_out);
168 int n = sprintf((char*)*p_out, "\\u%04x", ch);
169 *p_out += n;
170 // printf("Wrote %d bytes for %04x\n", n, ch);
171 (*p_in)++;
172 return;
173 }
174
175 //
176 // UTF-8 encoded runes and invalid bytes
177 //
178 unsigned char* start = *p_in; // save start position
179 uint32_t codepoint = 0;
180 uint32_t state = UTF8_ACCEPT;
181
182 while (1) {
183 // unsigned char byte = **p_in;
184 decode(&state, &codepoint, ch);
185 // printf(" state %d ch %d\n", state, ch);
186 switch (state) {
187 // BUG: we don't reject IMMEDIATELY
188 //
189 // We could be in another state for up to 4 chars
190 // And then we hit REJECT
191 // And then we need to output \yff\yff\yff\yff
192 // OK that's actually SIXTEEN at once?
193
194 case UTF8_REJECT: {
195 int n = sprintf((char*)*p_out, "\\x%02x", *start);
196 *p_out += n;
197 (*p_in) = start; // REWIND because we might have consumed NUL terminator!
198 (*p_in)++; // Advance past the byte we wrote
199 return;
200 }
201 case UTF8_ACCEPT: {
202 (*p_in)++;
203 // printf("start %p p_in %p\n", start, *p_in);
204 while (start < *p_in) {
205 J8_OUT(*start);
206 start++;
207 }
208 return;
209 }
210 default:
211 (*p_in)++; // advance, next UTF8_ACCEPT will write it
212 ch = **p_in;
213 // printf(" => ch %d\n", ch);
214 break;
215 }
216 }
217 // Unreachable
218}
219
220// BourneShellEncodeOne rules:
221//
222// must be valid UTF-8
223// no control chars
224// no ' is required
225// no \ -- not required, but avoids ambiguous '\n'
226//
227// For example we write $'\\' or b'\\' not '\'
228// The latter should be written r'\', but we're not outputing
229
230static inline int BourneShellEncodeOne(unsigned char** p_in,
231 unsigned char** p_out) {
232 unsigned char ch = **p_in;
233
234 if (ch == '\'' || ch == '\\') { // can't encode these in Bourne shell ''
235 return 1;
236 }
237 if (ch < 0x20) { // Unprintable ASCII control codes
238 return 1;
239 }
240
241 // UTF-8 encoded runes and invalid bytes
242 unsigned char* start = *p_in; // save start position
243 uint32_t codepoint = 0;
244 uint32_t state = UTF8_ACCEPT;
245
246 while (1) {
247 decode(&state, &codepoint, ch);
248 // printf(" state %d\n", state);
249 switch (state) {
250 case UTF8_REJECT: {
251 return 1;
252 }
253 case UTF8_ACCEPT: {
254 (*p_in)++;
255 // printf("start %p p_in %p\n", start, *p_in);
256 while (start < *p_in) {
257 J8_OUT(*start);
258 start++;
259 }
260 return 0;
261 }
262 default:
263 (*p_in)++; // advance, next UTF8_ACCEPT will write it
264 ch = **p_in;
265 break;
266 }
267 }
268 // Unreachable
269}
270
// Output space the *EncodeChunk() functions require before they encode one
// more input unit.
//
// Right now \u001f and \u{1f} are the longest output sequences for a byte,
// i.e. 6 bytes.  The value is 6 + 1: one extra byte of slack for the NUL
// terminator that sprintf() appends when it is used to format an escape (even
// though we don't technically need the terminator).
//
// NOTE(review): an earlier note here worried that an invalid multi-byte
// sequence might need up to 16 bytes at once (\yaa\yaa\yaa\yaa).  The
// *EncodeOne() functions in this file escape only ONE byte per call when the
// decoder rejects, so that case looks covered -- confirm.
//
// NOTE(review): if this were larger than the capacity the caller provides,
// the chunk loops would never make progress; presumably the caller would then
// loop forever -- verify against the callers.

#define J8_MAX_BYTES_PER_INPUT_BYTE 7

// The minimum capacity must be more than the number above.
// TODO: Tune this for our allocator?  We call buf->EnsureMoreSpace(capacity);
#define J8_MIN_CAPACITY 16
284
285static inline int J8EncodeChunk(unsigned char** p_in, unsigned char* in_end,
286 unsigned char** p_out, unsigned char* out_end,
287 int j8_escape) {
288 while (*p_in < in_end && (*p_out + J8_MAX_BYTES_PER_INPUT_BYTE) <= out_end) {
289 // printf("iter %d %p < %p \n", i++, *p_out, out_end);
290 int invalid_utf8 = J8EncodeOne(p_in, p_out, j8_escape);
291 if (invalid_utf8 && !j8_escape) { // first JSON pass got binary data?
292 return invalid_utf8; // early return
293 }
294 }
295 return 0;
296}
297
298static inline int BashDollarEncodeChunk(unsigned char** p_in,
299 unsigned char* in_end,
300 unsigned char** p_out,
301 unsigned char* out_end) {
302 while (*p_in < in_end && (*p_out + J8_MAX_BYTES_PER_INPUT_BYTE) <= out_end) {
303 BashDollarEncodeOne(p_in, p_out);
304 }
305 return 0;
306}
307
308static inline int BourneShellEncodeChunk(unsigned char** p_in,
309 unsigned char* in_end,
310 unsigned char** p_out,
311 unsigned char* out_end) {
312 while (*p_in < in_end && (*p_out + J8_MAX_BYTES_PER_INPUT_BYTE) <= out_end) {
313 int cannot_encode = BourneShellEncodeOne(p_in, p_out);
314 if (cannot_encode) { // we need escaping, e.g. \u0001 or \'
315 return cannot_encode; // early return
316 }
317 }
318 return 0;
319}
320
// Decide whether the `len` bytes at `s` can be written without quotes.
//
// Returns 1 when quotes can be omitted, 0 when they are needed.  Quotes are
// needed for the empty string, for the exact keywords null / true / false,
// and for any string with a byte outside [a-zA-Z0-9./_-].
static inline int CanOmitQuotes(unsigned char* s, int len) {
  // Empty string has to be quoted
  if (len == 0) {
    return 0;
  }

  // The 3 keywords are the only special cases, and they have 4 or 5 bytes
  switch (len) {
  case 4:
    if (memcmp(s, "null", 4) == 0 || memcmp(s, "true", 4) == 0) {
      return 0;
    }
    break;
  case 5:
    if (memcmp(s, "false", 5) == 0) {
      return 0;
    }
    break;
  default:
    break;
  }

  // Every byte must match the regex [a-zA-Z0-9./_-].  UTF-8 is deliberately
  // not allowed, because it can have chars that look like space or quotes.
  int pos = 0;
  while (pos < len) {
    unsigned char b = s[pos];
    pos++;

    int is_lower = (b >= 'a' && b <= 'z');
    int is_upper = (b >= 'A' && b <= 'Z');
    int is_digit = (b >= '0' && b <= '9');
    int is_punct = (b == '.' || b == '/' || b == '_' || b == '-');

    if (!(is_lower || is_upper || is_digit || is_punct)) {
      return 0;  // some byte requires quotes
    }
  }
  return 1;  // everything OK
}
364
365#endif // DATA_LANG_J8_H