data_lang/j8.h

OILS / data_lang / j8.h View on Github | oilshell.org

361 lines, 173 significant

1	#ifndef DATA_LANG_J8_H
2	#define DATA_LANG_J8_H
3
4	#include <stdio.h> // sprintf
5	#include <string.h> // memcmp, memcpy, strlen
6
7	#include "data_lang/utf8_impls/bjoern_dfa.h"
8	#include "data_lang/utf8.h"
9
// Append one byte at the output cursor and advance the cursor.
//
// Expects a variable `unsigned char** p_out` in the enclosing scope.
// Wrapped in do { } while (0) so the expansion is a single statement and is
// safe in unbraced if/else bodies.  Always follow the use with a semicolon.
#define J8_OUT(ch)  \
  do {              \
    **p_out = (ch); \
    (*p_out)++;     \
  } while (0)
13
14	static inline int J8EncodeOne(unsigned char p_in, unsigned char p_out,
15	int j8_escape) {
16	// We use a slightly weird double pointer style because
17	// *p_in may be advanced by 1 to 4 bytes (depending on whether it's UTF-8)
18	// *p_out may be advanced by 1 to 6 bytes (depending on escaping)
19
20	// IMPORTANT: J8EncodeOne(), BourneShellEncodeOne(), BashDollarEncodeOne()
21	// all call Bjoern DFA decode(), and there's a subtle issue where p_in MUST
22	// have a NUL terminator is required. This is so INCOMPLETE UTF-8 sequences
23	// are terminated with an INVALID byte that the state machine can accept, and
24	// 0x00 can only be ITSELF, never part of a sequence. An alternative would be
25	// to do more bounds checks in these functions.
26
27	// CALLER MUST CHECK that we are able to write up to 6 bytes!
28	// Because the longest output is \u001f or \u{1f} for control chars, since
29	// we don't escapes like \u{1f926} right now
30	//
31	// j8_escape: Whether to use j8 escapes, i.e. LOSSLESS encoding of data
32	// \yff instead of Unicode replacement char
33	// \u{1} instead of \u0001 for unprintable low chars
34
35	// Returns:
36	// 0 wrote valid UTF-8 (encoded or not)
37	// 1 wrote byte that's invalid UTF-8
38
39	unsigned char ch = **p_in;
40
41	//
42	// Handle \\ \b \f \n \r \t
43	//
44
45	// clang-format off
46	switch (ch) {
47	case '\\': J8_OUT('\\'); J8_OUT('\\'); (*p_in)++; return 0;
48	case '\b': J8_OUT('\\'); J8_OUT('b'); (*p_in)++; return 0;
49	case '\f': J8_OUT('\\'); J8_OUT('f'); (*p_in)++; return 0;
50	case '\n': J8_OUT('\\'); J8_OUT('n'); (*p_in)++; return 0;
51	case '\r': J8_OUT('\\'); J8_OUT('r'); (*p_in)++; return 0;
52	case '\t': J8_OUT('\\'); J8_OUT('t'); (*p_in)++; return 0;
53	}
54	// clang-format on
55
56	//
57	// Conditionally handle \' and \"
58	//
59	if (ch == '\'' && j8_escape) { // J8-style strings \'
60	J8_OUT('\\');
61	J8_OUT('\'');
62	(*p_in)++;
63	return 0;
64	}
65	if (ch == '"' && !j8_escape) { // JSON-style strings \"
66	J8_OUT('\\');
67	J8_OUT('"');
68	(*p_in)++;
69	return 0;
70	}
71
72	//
73	// Unprintable ASCII control codes
74	//
75	if (ch < 0x20) {
76	if (j8_escape) {
77	// printf("Writing for %04x %p\n", ch, *p_out);
78	int n = sprintf((char)p_out, "\\u{%x}", ch);
79	// printf("! Wrote %d bytes for %04x\n", n, ch);
80	*p_out += n;
81	} else {
82	// printf("Writing for %04x %p\n", ch, *p_out);
83	int n = sprintf((char)p_out, "\\u%04x", ch);
84	*p_out += n;
85	// printf("Wrote %d bytes for %04x\n", n, ch);
86	}
87	(*p_in)++;
88	return 0;
89	}
90
91	//
92	// UTF-8 encoded runes and invalid bytes
93	//
94	Utf8Result_t result;
95	utf8_decode(*p_in, &result);
96
97	if (result.error == UTF8_OK) {
98	memcpy(p_out, p_in, result.bytes_read);
99	*p_in += result.bytes_read;
100	*p_out += result.bytes_read;
101	return 0;
102	}
103
104	// We have a UTF-8 decoding error. This is handled one of three ways:
105	// 1. Losslessly encode as J8 byte literals (only applicable in J8)
106	// 2. Try to encode a lone surrogate
107	// 3. Insert a Unicode replacement char
108
109	if (j8_escape) {
110	int n = sprintf((char)p_out, "\\y%02x", ch);
111	*p_in += 1;
112	*p_out += n;
113	} else if (result.error == UTF8_ERR_SURROGATE) {
114	int n = sprintf((char)p_out, "\\u%04x", result.codepoint);
115	*p_in += result.bytes_read;
116	*p_out += n;
117	return 1;
118	} else {
119	// Unicode replacement char is U+FFFD, so write encoded form
120	// >>> '\ufffd'.encode('utf-8')
121	// b'\xef\xbf\xbd'
122	J8_OUT('\xef');
123	J8_OUT('\xbf');
124	J8_OUT('\xbd');
125	*p_in += 1; // Advance past the byte we wrote
126	}
127
128	return 1;
129	}
130
131	// Like the above, but
132	//
133	// \xff instead of \yff
134	// \u001f always, never \u{1f}
135	// No JSON vs. J8
136	// No \" escape ever
137	// No errors -- it can encode everything
138
139	static inline void BashDollarEncodeOne(unsigned char** p_in,
140	unsigned char** p_out) {
141	unsigned char ch = **p_in;
142
143	//
144	// Handle \\ \b \f \n \r \t \'
145	//
146
147	// clang-format off
148	switch (ch) {
149	case '\\': J8_OUT('\\'); J8_OUT('\\'); (*p_in)++; return;
150	case '\b': J8_OUT('\\'); J8_OUT('b'); (*p_in)++; return;
151	case '\f': J8_OUT('\\'); J8_OUT('f'); (*p_in)++; return;
152	case '\n': J8_OUT('\\'); J8_OUT('n'); (*p_in)++; return;
153	case '\r': J8_OUT('\\'); J8_OUT('r'); (*p_in)++; return;
154	case '\t': J8_OUT('\\'); J8_OUT('t'); (*p_in)++; return;
155	case '\'': J8_OUT('\\'); J8_OUT('\''); (*p_in)++; return;
156	}
157	// clang-format on
158
159	//
160	// Unprintable ASCII control codes
161	//
162	if (ch < 0x20) {
163	// printf("Writing for %04x %p\n", ch, *p_out);
164	int n = sprintf((char)p_out, "\\u%04x", ch);
165	*p_out += n;
166	// printf("Wrote %d bytes for %04x\n", n, ch);
167	(*p_in)++;
168	return;
169	}
170
171	//
172	// UTF-8 encoded runes and invalid bytes
173	//
174	unsigned char* start = *p_in; // save start position
175	uint32_t codepoint = 0;
176	uint32_t state = UTF8_ACCEPT;
177
178	while (1) {
179	// unsigned char byte = **p_in;
180	decode(&state, &codepoint, ch);
181	// printf(" state %d ch %d\n", state, ch);
182	switch (state) {
183	// BUG: we don't reject IMMEDIATELY
184	//
185	// We could be in another state for up to 4 chars
186	// And then we hit REJECT
187	// And then we need to output \yff\yff\yff\yff
188	// OK that's actually SIXTEEN at once?
189
190	case UTF8_REJECT: {
191	int n = sprintf((char)p_out, "\\x%02x", *start);
192	*p_out += n;
193	(*p_in) = start; // REWIND because we might have consumed NUL terminator!
194	(*p_in)++; // Advance past the byte we wrote
195	return;
196	}
197	case UTF8_ACCEPT: {
198	(*p_in)++;
199	// printf("start %p p_in %p\n", start, *p_in);
200	while (start < *p_in) {
201	J8_OUT(*start);
202	start++;
203	}
204	return;
205	}
206	default:
207	(*p_in)++; // advance, next UTF8_ACCEPT will write it
208	ch = **p_in;
209	// printf(" => ch %d\n", ch);
210	break;
211	}
212	}
213	// Unreachable
214	}
215
216	// BourneShellEncodeOne rules:
217	//
218	// must be valid UTF-8
219	// no control chars
220	// no ' is required
221	// no \ -- not required, but avoids ambiguous '\n'
222	//
223	// For example we write $'\\' or b'\\' not '\'
224	// The latter should be written r'\', but we're not outputing
225
226	static inline int BourneShellEncodeOne(unsigned char** p_in,
227	unsigned char** p_out) {
228	unsigned char ch = **p_in;
229
230	if (ch == '\'' \|\| ch == '\\') { // can't encode these in Bourne shell ''
231	return 1;
232	}
233	if (ch < 0x20) { // Unprintable ASCII control codes
234	return 1;
235	}
236
237	// UTF-8 encoded runes and invalid bytes
238	unsigned char* start = *p_in; // save start position
239	uint32_t codepoint = 0;
240	uint32_t state = UTF8_ACCEPT;
241
242	while (1) {
243	decode(&state, &codepoint, ch);
244	// printf(" state %d\n", state);
245	switch (state) {
246	case UTF8_REJECT: {
247	return 1;
248	}
249	case UTF8_ACCEPT: {
250	(*p_in)++;
251	// printf("start %p p_in %p\n", start, *p_in);
252	while (start < *p_in) {
253	J8_OUT(*start);
254	start++;
255	}
256	return 0;
257	}
258	default:
259	(*p_in)++; // advance, next UTF8_ACCEPT will write it
260	ch = **p_in;
261	break;
262	}
263	}
264	// Unreachable
265	}
266
// Right now \u001f and \u{1f} are the longest output sequences for a byte
// (6 bytes).  We need 6 + 1 for the NUL terminator that sprintf() writes, even
// though the terminator itself is never needed.
//
// Known issue: up to 16 bytes may be needed at once: \yaa\yaa\yaa\yaa
// If this value is too small, the encoding loops would make no progress and
// spin forever.

#define J8_MAX_BYTES_PER_INPUT_BYTE 7

// The minimum capacity must be more than the number above.
// TODO: Tune this for our allocator?  We call buf->EnsureMoreSpace(capacity);
#define J8_MIN_CAPACITY 16

// Enforce the invariant stated above at compile time.
#if J8_MIN_CAPACITY <= J8_MAX_BYTES_PER_INPUT_BYTE
  #error "J8_MIN_CAPACITY must be greater than J8_MAX_BYTES_PER_INPUT_BYTE"
#endif
280
281	static inline int J8EncodeChunk(unsigned char** p_in, unsigned char* in_end,
282	unsigned char** p_out, unsigned char* out_end,
283	int j8_escape) {
284	while (p_in < in_end && (p_out + J8_MAX_BYTES_PER_INPUT_BYTE) <= out_end) {
285	// printf("iter %d %p < %p \n", i++, *p_out, out_end);
286	int invalid_utf8 = J8EncodeOne(p_in, p_out, j8_escape);
287	if (invalid_utf8 && !j8_escape) { // first JSON pass got binary data?
288	return invalid_utf8; // early return
289	}
290	}
291	return 0;
292	}
293
294	static inline int BashDollarEncodeChunk(unsigned char** p_in,
295	unsigned char* in_end,
296	unsigned char** p_out,
297	unsigned char* out_end) {
298	while (p_in < in_end && (p_out + J8_MAX_BYTES_PER_INPUT_BYTE) <= out_end) {
299	BashDollarEncodeOne(p_in, p_out);
300	}
301	return 0;
302	}
303
304	static inline int BourneShellEncodeChunk(unsigned char** p_in,
305	unsigned char* in_end,
306	unsigned char** p_out,
307	unsigned char* out_end) {
308	while (p_in < in_end && (p_out + J8_MAX_BYTES_PER_INPUT_BYTE) <= out_end) {
309	int cannot_encode = BourneShellEncodeOne(p_in, p_out);
310	if (cannot_encode) { // we need escaping, e.g. \u0001 or \'
311	return cannot_encode; // early return
312	}
313	}
314	return 0;
315	}
316
// Decide whether the byte string s[0 .. len) can be written WITHOUT quotes.
//
// Returns 1 only if the string is non-empty, is not one of the keywords
// null / true / false, and every byte matches the regex [a-zA-Z0-9./_-].
// Returns 0 otherwise.
//
// Non-ASCII bytes always force quoting: UTF-8 is deliberately not accepted
// because it can contain chars that look like space or quotes.
static inline int CanOmitQuotes(unsigned char* s, int len) {
  if (len == 0) {  // empty string has to be quoted
    return 0;
  }

  // 3 special case keywords
  if (len == 4 &&
      (memcmp(s, "null", 4) == 0 || memcmp(s, "true", 4) == 0)) {
    return 0;
  }
  if (len == 5 && memcmp(s, "false", 5) == 0) {
    return 0;
  }

  for (int i = 0; i < len; ++i) {
    unsigned char ch = s[i];

    // Corresponds to regex [a-zA-Z0-9./_-]
    int is_bare = ('a' <= ch && ch <= 'z') ||
                  ('A' <= ch && ch <= 'Z') ||
                  ('0' <= ch && ch <= '9') ||
                  ch == '.' || ch == '/' || ch == '_' || ch == '-';
    if (!is_bare) {
      return 0;  // some byte requires quotes
    }
  }
  return 1;  // everything OK
}
360
361	#endif // DATA_LANG_J8_H