OILS / cpp / data_lang.cc View on Github | oilshell.org

255 lines, 121 significant
1// data_lang.cc
2
3#include "cpp/data_lang.h"
4
5#include "data_lang/j8.h"
6#include "data_lang/utf8_impls/bjoern_dfa.h"
7
8// TODO: remove duplication
9#define LOSSY_JSON (1 << 3)
10
11namespace {
12
13void WriteBString(BigStr* s, mylib::BufWriter* buf, int capacity) {
14 uint8_t* in = reinterpret_cast<uint8_t*>(s->data_);
15 uint8_t* in_end = reinterpret_cast<uint8_t*>(s->data_ + len(s));
16
17 buf->WriteConst("b'");
18
19 // Set up pointers after writing opening quote
20 uint8_t* out = buf->LengthPointer(); // mutated
21 uint8_t* out_end = buf->CapacityPointer();
22
23 while (true) {
24 J8EncodeChunk(&in, in_end, &out, out_end, true); // Fill as much as we can
25 buf->SetLengthFrom(out);
26
27 if (in >= in_end) {
28 break;
29 }
30
31 // Same growth policy as below
32 capacity = capacity * 3 / 2;
33 // printf("[2] new capacity %d\n", capacity);
34 buf->EnsureMoreSpace(capacity);
35
36 // Recompute pointers
37 out = buf->LengthPointer();
38 out_end = buf->CapacityPointer();
39 }
40
41 buf->WriteConst("'");
42}
43
44void WriteBashDollarString(BigStr* s, mylib::BufWriter* buf, int capacity) {
45 uint8_t* in = reinterpret_cast<uint8_t*>(s->data_);
46 uint8_t* in_end = reinterpret_cast<uint8_t*>(s->data_ + len(s));
47
48 buf->WriteConst("$'");
49
50 // Set up pointers after writing opening quote
51 uint8_t* out = buf->LengthPointer(); // mutated
52 uint8_t* out_end = buf->CapacityPointer();
53
54 while (true) {
55 BashDollarEncodeChunk(&in, in_end, &out,
56 out_end); // Fill as much as we can
57 buf->SetLengthFrom(out);
58
59 if (in >= in_end) {
60 break;
61 }
62
63 // Same growth policy as below
64 capacity = capacity * 3 / 2;
65 // printf("[2] new capacity %d\n", capacity);
66 buf->EnsureMoreSpace(capacity);
67
68 // Recompute pointers
69 out = buf->LengthPointer();
70 out_end = buf->CapacityPointer();
71 }
72
73 buf->WriteConst("'");
74}
75
76// Style is COPIED from pyj8::WriteString()
77// Functionality is like j8_libc.c ShellEncodeString, that is:
78//
79// call BourneShellEncodeChunk()
80// then either
81// WriteBString()
82// WriteBashDollarString()
83
84void ShellEncodeString(BigStr* s, int ysh_fallback, mylib::BufWriter* buf) {
85 uint8_t* in = reinterpret_cast<uint8_t*>(s->data_);
86 uint8_t* in_end = reinterpret_cast<uint8_t*>(s->data_ + len(s));
87
88 // Growth policy: Start at a fixed size max(N + 3 + 2, J8_MIN_CAPACITY)
89 int capacity = len(s) + 3 + 2; // 3 for quotes, 2 potential \" \n
90 if (capacity < J8_MIN_CAPACITY) { // account for J8_MAX_BYTES_PER_INPUT_BYTE
91 capacity = J8_MIN_CAPACITY;
92 }
93 // printf("[1] capacity %d\n", capacity);
94
95 buf->EnsureMoreSpace(capacity);
96
97 int begin = buf->Length(); // maybe Truncate to this position
98 buf->WriteConst("'");
99
100 // Set up pointers after writing opening quote
101 uint8_t* out = buf->LengthPointer(); // mutated
102 uint8_t* out_end = buf->CapacityPointer();
103
104 while (true) {
105 // Fill in as much as we can
106 int cannot_encode = BourneShellEncodeChunk(&in, in_end, &out, out_end);
107 if (cannot_encode) {
108 buf->Truncate(begin);
109 if (ysh_fallback) {
110 WriteBString(s, buf, capacity); // fall back to b''
111 } else {
112 WriteBashDollarString(s, buf, capacity); // fall back to $''
113 }
114 return;
115 }
116 buf->SetLengthFrom(out);
117
118 // printf("[1] len %d\n", out_buf->len);
119
120 if (in >= in_end) {
121 break;
122 }
123
124 // Growth policy: every time through the loop, increase 1.5x
125 //
126 // The worst blowup is 6x, and 1.5 ** 5 > 6, so it will take 5 reallocs.
127 // This seems like a reasonable tradeoff between over-allocating and too
128 // many realloc().
129 capacity = capacity * 3 / 2;
130 // printf("[1] new capacity %d\n", capacity);
131 buf->EnsureMoreSpace(capacity);
132
133 // Recompute pointers
134 out = buf->LengthPointer(); // mutated
135 out_end = buf->CapacityPointer();
136 // printf("[1] out %p out_end %p\n", out, out_end);
137 }
138
139 buf->WriteConst("'");
140}
141
142} // namespace
143
144namespace fastfunc {
145
146bool CanOmitQuotes(BigStr* s) {
147 return ::CanOmitQuotes(reinterpret_cast<unsigned char*>(s->data_), len(s));
148}
149
150BigStr* J8EncodeString(BigStr* s, int j8_fallback) {
151 auto buf = Alloc<mylib::BufWriter>();
152 int options = j8_fallback ? 0 : LOSSY_JSON;
153 pyj8::WriteString(s, options, buf);
154 return buf->getvalue();
155}
156
157BigStr* ShellEncodeString(BigStr* s, int ysh_fallback) {
158 auto buf = Alloc<mylib::BufWriter>();
159 ::ShellEncodeString(s, ysh_fallback, buf);
160 return buf->getvalue();
161}
162
163} // namespace fastfunc
164
165namespace pyj8 {
166
167bool PartIsUtf8(BigStr* s, int start, int end) {
168 uint32_t codepoint;
169 uint32_t state = UTF8_ACCEPT;
170
171 for (int i = start; i < end; ++i) {
172 // This var or a static_cast<> is necessary. Should really change BigStr*
173 // to use unsigned type
174 uint8_t c = s->data_[i];
175 decode(&state, &codepoint, c);
176 if (state == UTF8_REJECT) {
177 return false;
178 }
179 }
180
181 return state == UTF8_ACCEPT;
182}
183
184void WriteString(BigStr* s, int options, mylib::BufWriter* buf) {
185 bool j8_fallback = !(options & LOSSY_JSON);
186
187 uint8_t* in = reinterpret_cast<uint8_t*>(s->data_);
188 uint8_t* in_end = reinterpret_cast<uint8_t*>(s->data_ + len(s));
189
190 // Growth policy: Start at a fixed size max(N + 3 + 2, J8_MIN_CAPACITY)
191 int capacity = len(s) + 3 + 2; // 3 for quotes, 2 potential \" \n
192 if (capacity < J8_MIN_CAPACITY) { // account for J8_MAX_BYTES_PER_INPUT_BYTE
193 capacity = J8_MIN_CAPACITY;
194 }
195 // printf("[1] capacity %d\n", capacity);
196
197 buf->EnsureMoreSpace(capacity);
198
199 int begin = buf->Length(); // maybe Truncate to this position
200 buf->WriteConst("\"");
201
202 // Set up pointers after writing opening quote
203 uint8_t* out = buf->LengthPointer(); // mutated
204 uint8_t* out_end = buf->CapacityPointer();
205
206 while (true) {
207 // Fill in as much as we can
208 int invalid_utf8 = J8EncodeChunk(&in, in_end, &out, out_end, false);
209 if (invalid_utf8 && j8_fallback) {
210 buf->Truncate(begin);
211 WriteBString(s, buf, capacity); // fall back to b''
212 return;
213 }
214 buf->SetLengthFrom(out);
215
216 // printf("[1] len %d\n", out_buf->len);
217
218 if (in >= in_end) {
219 break;
220 }
221
222 // Growth policy: every time through the loop, increase 1.5x
223 //
224 // The worst blowup is 6x, and 1.5 ** 5 > 6, so it will take 5 reallocs.
225 // This seems like a reasonable tradeoff between over-allocating and too
226 // many realloc().
227 capacity = capacity * 3 / 2;
228 // printf("[1] new capacity %d\n", capacity);
229 buf->EnsureMoreSpace(capacity);
230
231 // Recompute pointers
232 out = buf->LengthPointer(); // mutated
233 out_end = buf->CapacityPointer();
234 // printf("[1] out %p out_end %p\n", out, out_end);
235 }
236
237 buf->WriteConst("\"");
238}
239
240} // namespace pyj8
241
242namespace j8 {
243
244int HeapValueId(value_asdl::value_t* val) {
245#ifndef OPTIMIZED
246 // ASDL generates headers with HeapTag::Scanned, but HeapTag::FixedSize would
247 // also be valid.
248 ObjHeader* h = ObjHeader::FromObject(val);
249 DCHECK(h->heap_tag == HeapTag::Scanned || h->heap_tag == HeapTag::FixedSize);
250#endif
251
252 return ObjectId(val);
253}
254
255} // namespace j8