data_lang/j8.h

OILS / data_lang / j8.h View on Github | oilshell.org

365 lines, 177 significant

1	#ifndef DATA_LANG_J8_H
2	#define DATA_LANG_J8_H
3
4	#include <stdio.h> // sprintf
5	#include <string.h> // memcmp
6
7	#include "data_lang/utf8_impls/bjoern_dfa.h"
8
// Write one byte through the double pointer `p_out` and advance the cursor.
//
// Expects a variable named `p_out` of type `unsigned char**` to be in scope
// at the expansion site.  Wrapped in do { } while (0) so the two statements
// behave as ONE statement, e.g. under an unbraced if/else.
#define J8_OUT(ch)     \
  do {                 \
    **p_out = (ch);    \
    (*p_out)++;        \
  } while (0)
12
13	static inline int J8EncodeOne(unsigned char p_in, unsigned char p_out,
14	int j8_escape) {
15	// We use a slightly weird double pointer style because
16	// *p_in may be advanced by 1 to 4 bytes (depending on whether it's UTF-8)
17	// *p_out may be advanced by 1 to 6 bytes (depending on escaping)
18
19	// IMPORTANT: J8EncodeOne(), BourneShellEncodeOne(), BashDollarEncodeOne()
20	// all call Bjoern DFA decode(), and there's a subtle issue where p_in MUST
21	// have a NUL terminator is required. This is so INCOMPLETE UTF-8 sequences
22	// are terminated with an INVALID byte that the state machine can accept, and
23	// 0x00 can only be ITSELF, never part of a sequence. An alternative would be
24	// to do more bounds checks in these functions.
25
26	// CALLER MUST CHECK that we are able to write up to 6 bytes!
27	// Because the longest output is \u001f or \u{1f} for control chars, since
28	// we don't escapes like \u{1f926} right now
29	//
30	// j8_escape: Whether to use j8 escapes, i.e. LOSSLESS encoding of data
31	// \yff instead of Unicode replacement char
32	// \u{1} instead of \u0001 for unprintable low chars
33
34	// Returns:
35	// 0 wrote valid UTF-8 (encoded or not)
36	// 1 wrote byte that's invalid UTF-8
37
38	unsigned char ch = **p_in;
39
40	//
41	// Handle \\ \b \f \n \r \t
42	//
43
44	// clang-format off
45	switch (ch) {
46	case '\\': J8_OUT('\\'); J8_OUT('\\'); (*p_in)++; return 0;
47	case '\b': J8_OUT('\\'); J8_OUT('b'); (*p_in)++; return 0;
48	case '\f': J8_OUT('\\'); J8_OUT('f'); (*p_in)++; return 0;
49	case '\n': J8_OUT('\\'); J8_OUT('n'); (*p_in)++; return 0;
50	case '\r': J8_OUT('\\'); J8_OUT('r'); (*p_in)++; return 0;
51	case '\t': J8_OUT('\\'); J8_OUT('t'); (*p_in)++; return 0;
52	}
53	// clang-format on
54
55	//
56	// Conditionally handle \' and \"
57	//
58	if (ch == '\'' && j8_escape) { // J8-style strings \'
59	J8_OUT('\\');
60	J8_OUT('\'');
61	(*p_in)++;
62	return 0;
63	}
64	if (ch == '"' && !j8_escape) { // JSON-style strings \"
65	J8_OUT('\\');
66	J8_OUT('"');
67	(*p_in)++;
68	return 0;
69	}
70
71	//
72	// Unprintable ASCII control codes
73	//
74	if (ch < 0x20) {
75	if (j8_escape) {
76	// printf("Writing for %04x %p\n", ch, *p_out);
77	int n = sprintf((char)p_out, "\\u{%x}", ch);
78	// printf("! Wrote %d bytes for %04x\n", n, ch);
79	*p_out += n;
80	} else {
81	// printf("Writing for %04x %p\n", ch, *p_out);
82	int n = sprintf((char)p_out, "\\u%04x", ch);
83	*p_out += n;
84	// printf("Wrote %d bytes for %04x\n", n, ch);
85	}
86	(*p_in)++;
87	return 0;
88	}
89
90	//
91	// UTF-8 encoded runes and invalid bytes
92	//
93	unsigned char* start = *p_in; // save start position
94	uint32_t codepoint = 0;
95	uint32_t state = UTF8_ACCEPT;
96
97	while (1) {
98	decode(&state, &codepoint, ch);
99	// printf(" state %d\n", state);
100	switch (state) {
101	case UTF8_REJECT: {
102	if (j8_escape) {
103	int n = sprintf((char)p_out, "\\y%02x", *start);
104	*p_out += n;
105	} else {
106	// Unicode replacement char is U+FFFD, so write encoded form
107	// >>> '\ufffd'.encode('utf-8')
108	// b'\xef\xbf\xbd'
109	J8_OUT('\xef');
110	J8_OUT('\xbf');
111	J8_OUT('\xbd');
112	}
113	(*p_in) = start; // REWIND because we might have consumed NUL terminator!
114	(*p_in)++; // Advance past the byte we wrote
115	return 1;
116	}
117	case UTF8_ACCEPT: {
118	(*p_in)++;
119	// printf("start %p p_in %p\n", start, *p_in);
120	while (start < *p_in) {
121	J8_OUT(*start);
122	start++;
123	}
124	return 0;
125	}
126	default:
127	(*p_in)++; // advance, next UTF8_ACCEPT will write it
128	ch = **p_in;
129	break;
130	}
131	}
132	// Unreachable
133	}
134
135	// Like the above, but
136	//
137	// \xff instead of \yff
138	// \u001f always, never \u{1f}
139	// No JSON vs. J8
140	// No \" escape ever
141	// No errors -- it can encode everything
142
143	static inline void BashDollarEncodeOne(unsigned char** p_in,
144	unsigned char** p_out) {
145	unsigned char ch = **p_in;
146
147	//
148	// Handle \\ \b \f \n \r \t \'
149	//
150
151	// clang-format off
152	switch (ch) {
153	case '\\': J8_OUT('\\'); J8_OUT('\\'); (*p_in)++; return;
154	case '\b': J8_OUT('\\'); J8_OUT('b'); (*p_in)++; return;
155	case '\f': J8_OUT('\\'); J8_OUT('f'); (*p_in)++; return;
156	case '\n': J8_OUT('\\'); J8_OUT('n'); (*p_in)++; return;
157	case '\r': J8_OUT('\\'); J8_OUT('r'); (*p_in)++; return;
158	case '\t': J8_OUT('\\'); J8_OUT('t'); (*p_in)++; return;
159	case '\'': J8_OUT('\\'); J8_OUT('\''); (*p_in)++; return;
160	}
161	// clang-format off
162
163	//
164	// Unprintable ASCII control codes
165	//
166	if (ch < 0x20) {
167	// printf("Writing for %04x %p\n", ch, *p_out);
168	int n = sprintf((char)p_out, "\\u%04x", ch);
169	*p_out += n;
170	// printf("Wrote %d bytes for %04x\n", n, ch);
171	(*p_in)++;
172	return;
173	}
174
175	//
176	// UTF-8 encoded runes and invalid bytes
177	//
178	unsigned char* start = *p_in; // save start position
179	uint32_t codepoint = 0;
180	uint32_t state = UTF8_ACCEPT;
181
182	while (1) {
183	// unsigned char byte = **p_in;
184	decode(&state, &codepoint, ch);
185	// printf(" state %d ch %d\n", state, ch);
186	switch (state) {
187	// BUG: we don't reject IMMEDIATELY
188	//
189	// We could be in another state for up to 4 chars
190	// And then we hit REJECT
191	// And then we need to output \yff\yff\yff\yff
192	// OK that's actually SIXTEEN at once?
193
194	case UTF8_REJECT: {
195	int n = sprintf((char)p_out, "\\x%02x", *start);
196	*p_out += n;
197	(*p_in) = start; // REWIND because we might have consumed NUL terminator!
198	(*p_in)++; // Advance past the byte we wrote
199	return;
200	}
201	case UTF8_ACCEPT: {
202	(*p_in)++;
203	// printf("start %p p_in %p\n", start, *p_in);
204	while (start < *p_in) {
205	J8_OUT(*start);
206	start++;
207	}
208	return;
209	}
210	default:
211	(*p_in)++; // advance, next UTF8_ACCEPT will write it
212	ch = **p_in;
213	// printf(" => ch %d\n", ch);
214	break;
215	}
216	}
217	// Unreachable
218	}
219
220	// BourneShellEncodeOne rules:
221	//
222	// must be valid UTF-8
223	// no control chars
224	// no ' is required
225	// no \ -- not required, but avoids ambiguous '\n'
226	//
227	// For example we write $'\\' or b'\\' not '\'
228	// The latter should be written r'\', but we're not outputing
229
230	static inline int BourneShellEncodeOne(unsigned char** p_in,
231	unsigned char** p_out) {
232	unsigned char ch = **p_in;
233
234	if (ch == '\'' \|\| ch == '\\') { // can't encode these in Bourne shell ''
235	return 1;
236	}
237	if (ch < 0x20) { // Unprintable ASCII control codes
238	return 1;
239	}
240
241	// UTF-8 encoded runes and invalid bytes
242	unsigned char* start = *p_in; // save start position
243	uint32_t codepoint = 0;
244	uint32_t state = UTF8_ACCEPT;
245
246	while (1) {
247	decode(&state, &codepoint, ch);
248	// printf(" state %d\n", state);
249	switch (state) {
250	case UTF8_REJECT: {
251	return 1;
252	}
253	case UTF8_ACCEPT: {
254	(*p_in)++;
255	// printf("start %p p_in %p\n", start, *p_in);
256	while (start < *p_in) {
257	J8_OUT(*start);
258	start++;
259	}
260	return 0;
261	}
262	default:
263	(*p_in)++; // advance, next UTF8_ACCEPT will write it
264	ch = **p_in;
265	break;
266	}
267	}
268	// Unreachable
269	}
270
// Right now \u001f and \u{1f} are the longest output sequences for a byte:
// 6 bytes.  We need 6 + 1 because sprintf() also writes a NUL terminator
// (even though we don't technically need it).
//
// NOTE: a run of invalid bytes can expand to 16 bytes, e.g. \yaa\yaa\yaa\yaa,
// but the encoders escape only ONE invalid byte per call.  If this number is
// too small, we would enter an infinite loop.
#define J8_MAX_BYTES_PER_INPUT_BYTE 7

// The minimum capacity must be more than the number above.
// TODO: Tune this for our allocator?  We call buf->EnsureMoreSpace(capacity);
#define J8_MIN_CAPACITY 16

#if J8_MIN_CAPACITY <= J8_MAX_BYTES_PER_INPUT_BYTE
  #error "J8_MIN_CAPACITY must be greater than J8_MAX_BYTES_PER_INPUT_BYTE"
#endif
284
285	static inline int J8EncodeChunk(unsigned char** p_in, unsigned char* in_end,
286	unsigned char** p_out, unsigned char* out_end,
287	int j8_escape) {
288	while (p_in < in_end && (p_out + J8_MAX_BYTES_PER_INPUT_BYTE) <= out_end) {
289	// printf("iter %d %p < %p \n", i++, *p_out, out_end);
290	int invalid_utf8 = J8EncodeOne(p_in, p_out, j8_escape);
291	if (invalid_utf8 && !j8_escape) { // first JSON pass got binary data?
292	return invalid_utf8; // early return
293	}
294	}
295	return 0;
296	}
297
298	static inline int BashDollarEncodeChunk(unsigned char** p_in,
299	unsigned char* in_end,
300	unsigned char** p_out,
301	unsigned char* out_end) {
302	while (p_in < in_end && (p_out + J8_MAX_BYTES_PER_INPUT_BYTE) <= out_end) {
303	BashDollarEncodeOne(p_in, p_out);
304	}
305	return 0;
306	}
307
308	static inline int BourneShellEncodeChunk(unsigned char** p_in,
309	unsigned char* in_end,
310	unsigned char** p_out,
311	unsigned char* out_end) {
312	while (p_in < in_end && (p_out + J8_MAX_BYTES_PER_INPUT_BYTE) <= out_end) {
313	int cannot_encode = BourneShellEncodeOne(p_in, p_out);
314	if (cannot_encode) { // we need escaping, e.g. \u0001 or \'
315	return cannot_encode; // early return
316	}
317	}
318	return 0;
319	}
320
// Decide whether the `len` bytes at `s` can be written without quotes.
//
// Returns 1 if quotes can be omitted, 0 if the string has to be quoted.
// Quotes are required for:
//   - the empty string
//   - the 3 keywords null, true, false (exact match)
//   - any string with a byte outside [a-zA-Z0-9./_-]
static inline int CanOmitQuotes(unsigned char* s, int len) {
  if (len == 0) {  // empty string has to be quoted
    return 0;
  }

  // 3 special case keywords
  if (len == 4) {
    if (memcmp(s, "null", 4) == 0) {
      return 0;
    }
    if (memcmp(s, "true", 4) == 0) {
      return 0;
    }
  }
  if (len == 5) {
    if (memcmp(s, "false", 5) == 0) {
      return 0;
    }
  }

  for (int i = 0; i < len; ++i) {
    unsigned char ch = s[i];

    // Corresponds to regex [a-zA-Z0-9./_-]
    if ('a' <= ch && ch <= 'z') {
      continue;
    }
    if ('A' <= ch && ch <= 'Z') {
      continue;
    }
    if ('0' <= ch && ch <= '9') {
      continue;
    }
    if (ch == '.' || ch == '/' || ch == '_' || ch == '-') {
      continue;
    }
    // some byte requires quotes
    // Not including UTF-8 here because it can have chars that look like space
    // or quotes
    return 0;
  }
  return 1;  // everything OK
}
364
365	#endif // DATA_LANG_J8_H