1 | // data_lang.cc
|
2 |
|
3 | #include "cpp/data_lang.h"
|
4 |
|
5 | #include "data_lang/j8.h"
|
6 | #include "data_lang/utf8_impls/bjoern_dfa.h"
|
7 |
|
8 | // TODO: remove duplication
|
9 | #define LOSSY_JSON (1 << 3)
|
10 |
|
11 | namespace {
|
12 |
|
13 | void WriteBString(BigStr* s, mylib::BufWriter* buf, int capacity) {
|
14 | uint8_t* in = reinterpret_cast<uint8_t*>(s->data_);
|
15 | uint8_t* in_end = reinterpret_cast<uint8_t*>(s->data_ + len(s));
|
16 |
|
17 | buf->WriteConst("b'");
|
18 |
|
19 | // Set up pointers after writing opening quote
|
20 | uint8_t* out = buf->LengthPointer(); // mutated
|
21 | uint8_t* out_end = buf->CapacityPointer();
|
22 |
|
23 | while (true) {
|
24 | J8EncodeChunk(&in, in_end, &out, out_end, true); // Fill as much as we can
|
25 | buf->SetLengthFrom(out);
|
26 |
|
27 | if (in >= in_end) {
|
28 | break;
|
29 | }
|
30 |
|
31 | // Same growth policy as below
|
32 | capacity = capacity * 3 / 2;
|
33 | // printf("[2] new capacity %d\n", capacity);
|
34 | buf->EnsureMoreSpace(capacity);
|
35 |
|
36 | // Recompute pointers
|
37 | out = buf->LengthPointer();
|
38 | out_end = buf->CapacityPointer();
|
39 | }
|
40 |
|
41 | buf->WriteConst("'");
|
42 | }
|
43 |
|
44 | void WriteBashDollarString(BigStr* s, mylib::BufWriter* buf, int capacity) {
|
45 | uint8_t* in = reinterpret_cast<uint8_t*>(s->data_);
|
46 | uint8_t* in_end = reinterpret_cast<uint8_t*>(s->data_ + len(s));
|
47 |
|
48 | buf->WriteConst("$'");
|
49 |
|
50 | // Set up pointers after writing opening quote
|
51 | uint8_t* out = buf->LengthPointer(); // mutated
|
52 | uint8_t* out_end = buf->CapacityPointer();
|
53 |
|
54 | while (true) {
|
55 | BashDollarEncodeChunk(&in, in_end, &out,
|
56 | out_end); // Fill as much as we can
|
57 | buf->SetLengthFrom(out);
|
58 |
|
59 | if (in >= in_end) {
|
60 | break;
|
61 | }
|
62 |
|
63 | // Same growth policy as below
|
64 | capacity = capacity * 3 / 2;
|
65 | // printf("[2] new capacity %d\n", capacity);
|
66 | buf->EnsureMoreSpace(capacity);
|
67 |
|
68 | // Recompute pointers
|
69 | out = buf->LengthPointer();
|
70 | out_end = buf->CapacityPointer();
|
71 | }
|
72 |
|
73 | buf->WriteConst("'");
|
74 | }
|
75 |
|
76 | // Style is COPIED from pyj8::WriteString()
|
77 | // Functionality is like j8_libc.c ShellEncodeString, that is:
|
78 | //
|
79 | // call BourneShellEncodeChunk()
|
80 | // then either
|
81 | // WriteBString()
|
82 | // WriteBashDollarString()
|
83 |
|
84 | void ShellEncodeString(BigStr* s, int ysh_fallback, mylib::BufWriter* buf) {
|
85 | uint8_t* in = reinterpret_cast<uint8_t*>(s->data_);
|
86 | uint8_t* in_end = reinterpret_cast<uint8_t*>(s->data_ + len(s));
|
87 |
|
88 | // Growth policy: Start at a fixed size max(N + 3 + 2, J8_MIN_CAPACITY)
|
89 | int capacity = len(s) + 3 + 2; // 3 for quotes, 2 potential \" \n
|
90 | if (capacity < J8_MIN_CAPACITY) { // account for J8_MAX_BYTES_PER_INPUT_BYTE
|
91 | capacity = J8_MIN_CAPACITY;
|
92 | }
|
93 | // printf("[1] capacity %d\n", capacity);
|
94 |
|
95 | buf->EnsureMoreSpace(capacity);
|
96 |
|
97 | int begin = buf->Length(); // maybe Truncate to this position
|
98 | buf->WriteConst("'");
|
99 |
|
100 | // Set up pointers after writing opening quote
|
101 | uint8_t* out = buf->LengthPointer(); // mutated
|
102 | uint8_t* out_end = buf->CapacityPointer();
|
103 |
|
104 | while (true) {
|
105 | // Fill in as much as we can
|
106 | int cannot_encode = BourneShellEncodeChunk(&in, in_end, &out, out_end);
|
107 | if (cannot_encode) {
|
108 | buf->Truncate(begin);
|
109 | if (ysh_fallback) {
|
110 | WriteBString(s, buf, capacity); // fall back to b''
|
111 | } else {
|
112 | WriteBashDollarString(s, buf, capacity); // fall back to $''
|
113 | }
|
114 | return;
|
115 | }
|
116 | buf->SetLengthFrom(out);
|
117 |
|
118 | // printf("[1] len %d\n", out_buf->len);
|
119 |
|
120 | if (in >= in_end) {
|
121 | break;
|
122 | }
|
123 |
|
124 | // Growth policy: every time through the loop, increase 1.5x
|
125 | //
|
126 | // The worst blowup is 6x, and 1.5 ** 5 > 6, so it will take 5 reallocs.
|
127 | // This seems like a reasonable tradeoff between over-allocating and too
|
128 | // many realloc().
|
129 | capacity = capacity * 3 / 2;
|
130 | // printf("[1] new capacity %d\n", capacity);
|
131 | buf->EnsureMoreSpace(capacity);
|
132 |
|
133 | // Recompute pointers
|
134 | out = buf->LengthPointer(); // mutated
|
135 | out_end = buf->CapacityPointer();
|
136 | // printf("[1] out %p out_end %p\n", out, out_end);
|
137 | }
|
138 |
|
139 | buf->WriteConst("'");
|
140 | }
|
141 |
|
142 | } // namespace
|
143 |
|
144 | namespace fastfunc {
|
145 |
|
146 | bool CanOmitQuotes(BigStr* s) {
|
147 | return ::CanOmitQuotes(reinterpret_cast<unsigned char*>(s->data_), len(s));
|
148 | }
|
149 |
|
150 | BigStr* J8EncodeString(BigStr* s, int j8_fallback) {
|
151 | auto buf = Alloc<mylib::BufWriter>();
|
152 | int options = j8_fallback ? 0 : LOSSY_JSON;
|
153 | pyj8::WriteString(s, options, buf);
|
154 | return buf->getvalue();
|
155 | }
|
156 |
|
157 | BigStr* ShellEncodeString(BigStr* s, int ysh_fallback) {
|
158 | auto buf = Alloc<mylib::BufWriter>();
|
159 | ::ShellEncodeString(s, ysh_fallback, buf);
|
160 | return buf->getvalue();
|
161 | }
|
162 |
|
163 | } // namespace fastfunc
|
164 |
|
165 | namespace pyj8 {
|
166 |
|
167 | bool PartIsUtf8(BigStr* s, int start, int end) {
|
168 | uint32_t codepoint;
|
169 | uint32_t state = UTF8_ACCEPT;
|
170 |
|
171 | for (int i = start; i < end; ++i) {
|
172 | // This var or a static_cast<> is necessary. Should really change BigStr*
|
173 | // to use unsigned type
|
174 | uint8_t c = s->data_[i];
|
175 | decode(&state, &codepoint, c);
|
176 | if (state == UTF8_REJECT) {
|
177 | return false;
|
178 | }
|
179 | }
|
180 |
|
181 | return state == UTF8_ACCEPT;
|
182 | }
|
183 |
|
184 | void WriteString(BigStr* s, int options, mylib::BufWriter* buf) {
|
185 | bool j8_fallback = !(options & LOSSY_JSON);
|
186 |
|
187 | uint8_t* in = reinterpret_cast<uint8_t*>(s->data_);
|
188 | uint8_t* in_end = reinterpret_cast<uint8_t*>(s->data_ + len(s));
|
189 |
|
190 | // Growth policy: Start at a fixed size max(N + 3 + 2, J8_MIN_CAPACITY)
|
191 | int capacity = len(s) + 3 + 2; // 3 for quotes, 2 potential \" \n
|
192 | if (capacity < J8_MIN_CAPACITY) { // account for J8_MAX_BYTES_PER_INPUT_BYTE
|
193 | capacity = J8_MIN_CAPACITY;
|
194 | }
|
195 | // printf("[1] capacity %d\n", capacity);
|
196 |
|
197 | buf->EnsureMoreSpace(capacity);
|
198 |
|
199 | int begin = buf->Length(); // maybe Truncate to this position
|
200 | buf->WriteConst("\"");
|
201 |
|
202 | // Set up pointers after writing opening quote
|
203 | uint8_t* out = buf->LengthPointer(); // mutated
|
204 | uint8_t* out_end = buf->CapacityPointer();
|
205 |
|
206 | while (true) {
|
207 | // Fill in as much as we can
|
208 | int invalid_utf8 = J8EncodeChunk(&in, in_end, &out, out_end, false);
|
209 | if (invalid_utf8 && j8_fallback) {
|
210 | buf->Truncate(begin);
|
211 | WriteBString(s, buf, capacity); // fall back to b''
|
212 | return;
|
213 | }
|
214 | buf->SetLengthFrom(out);
|
215 |
|
216 | // printf("[1] len %d\n", out_buf->len);
|
217 |
|
218 | if (in >= in_end) {
|
219 | break;
|
220 | }
|
221 |
|
222 | // Growth policy: every time through the loop, increase 1.5x
|
223 | //
|
224 | // The worst blowup is 6x, and 1.5 ** 5 > 6, so it will take 5 reallocs.
|
225 | // This seems like a reasonable tradeoff between over-allocating and too
|
226 | // many realloc().
|
227 | capacity = capacity * 3 / 2;
|
228 | // printf("[1] new capacity %d\n", capacity);
|
229 | buf->EnsureMoreSpace(capacity);
|
230 |
|
231 | // Recompute pointers
|
232 | out = buf->LengthPointer(); // mutated
|
233 | out_end = buf->CapacityPointer();
|
234 | // printf("[1] out %p out_end %p\n", out, out_end);
|
235 | }
|
236 |
|
237 | buf->WriteConst("\"");
|
238 | }
|
239 |
|
240 | } // namespace pyj8
|
241 |
|
242 | namespace j8 {
|
243 |
|
244 | int HeapValueId(value_asdl::value_t* val) {
|
245 | #ifndef OPTIMIZED
|
246 | // ASDL generates headers with HeapTag::Scanned, but HeapTag::FixedSize would
|
247 | // also be valid.
|
248 | ObjHeader* h = ObjHeader::FromObject(val);
|
249 | DCHECK(h->heap_tag == HeapTag::Scanned || h->heap_tag == HeapTag::FixedSize);
|
250 | #endif
|
251 |
|
252 | return ObjectId(val);
|
253 | }
|
254 |
|
255 | } // namespace j8
|