OILS / mycpp / gc_str.cc View on Github | oilshell.org

613 lines, 389 significant
1#include "mycpp/gc_str.h"
2
3#include <ctype.h> // isalpha(), isdigit()
4#include <stdarg.h>
5
6#include <regex>
7
8#include "mycpp/common.h"
9#include "mycpp/gc_alloc.h" // NewStr()
10#include "mycpp/gc_builtins.h" // StringToInt()
11#include "mycpp/gc_list.h" // join(), split() use it
12
13GLOBAL_STR(kEmptyString, "");
14
15static const std::regex gStrFmtRegex("([^%]*)(?:%(-?[0-9]*)(.))?");
16static const int kMaxFmtWidth = 256; // arbitrary...
17
18int BigStr::find(BigStr* needle, int pos) {
19 int length = len(this);
20 DCHECK(len(needle) == 1); // Oil's usage
21 char c = needle->data_[0];
22 for (int i = pos; i < length; ++i) {
23 if (data_[i] == c) {
24 return i;
25 }
26 }
27 return -1;
28}
29
30int BigStr::rfind(BigStr* needle) {
31 int length = len(this);
32 DCHECK(len(needle) == 1); // Oil's usage
33 char c = needle->data_[0];
34 for (int i = length - 1; i >= 0; --i) {
35 if (data_[i] == c) {
36 return i;
37 }
38 }
39 return -1;
40}
41
42bool BigStr::isdigit() {
43 int n = len(this);
44 if (n == 0) {
45 return false; // special case
46 }
47 for (int i = 0; i < n; ++i) {
48 if (!::isdigit(data_[i])) {
49 return false;
50 }
51 }
52 return true;
53}
54
55bool BigStr::isalpha() {
56 int n = len(this);
57 if (n == 0) {
58 return false; // special case
59 }
60 for (int i = 0; i < n; ++i) {
61 if (!::isalpha(data_[i])) {
62 return false;
63 }
64 }
65 return true;
66}
67
68// e.g. for osh/braces.py
69bool BigStr::isupper() {
70 int n = len(this);
71 if (n == 0) {
72 return false; // special case
73 }
74 for (int i = 0; i < n; ++i) {
75 if (!::isupper(data_[i])) {
76 return false;
77 }
78 }
79 return true;
80}
81
82bool BigStr::startswith(BigStr* s) {
83 int n = len(s);
84 if (n > len(this)) {
85 return false;
86 }
87 return memcmp(data_, s->data_, n) == 0;
88}
89
90bool BigStr::endswith(BigStr* s) {
91 int len_s = len(s);
92 int len_this = len(this);
93 if (len_s > len_this) {
94 return false;
95 }
96 const char* start = data_ + len_this - len_s;
97 return memcmp(start, s->data_, len_s) == 0;
98}
99
100// Get a string with one character
101BigStr* BigStr::at(int i) {
102 int length = len(this);
103 if (i < 0) {
104 i = length + i;
105 }
106 DCHECK(0 <= i);
107 DCHECK(i < length); // had a problem here!
108
109 BigStr* result = NewStr(1);
110 result->data_[0] = data_[i];
111 return result;
112}
113
114// s[begin:]
115BigStr* BigStr::slice(int begin) {
116 return slice(begin, len(this));
117}
118
119// s[begin:end]
120BigStr* BigStr::slice(int begin, int end) {
121 int length = len(this);
122 SLICE_ADJUST(begin, end, length);
123
124 DCHECK(0 <= begin && begin <= length);
125 DCHECK(0 <= end && end <= length);
126
127 int new_len = end - begin;
128 DCHECK(0 <= new_len && new_len <= length);
129
130 BigStr* result = NewStr(new_len);
131 memcpy(result->data_, data_ + begin, new_len);
132
133 return result;
134}
135
136// Used by 'help' builtin and --help, neither of which translate yet.
137
138List<BigStr*>* BigStr::splitlines(bool keep) {
139 DCHECK(keep == true);
140 FAIL(kNotImplemented);
141}
142
143BigStr* BigStr::upper() {
144 int length = len(this);
145 BigStr* result = NewStr(length);
146 char* buffer = result->data();
147 for (int char_index = 0; char_index < length; ++char_index) {
148 buffer[char_index] = toupper(data_[char_index]);
149 }
150 return result;
151}
152
153BigStr* BigStr::lower() {
154 int length = len(this);
155 BigStr* result = NewStr(length);
156 char* buffer = result->data();
157 for (int char_index = 0; char_index < length; ++char_index) {
158 buffer[char_index] = tolower(data_[char_index]);
159 }
160 return result;
161}
162
163BigStr* BigStr::ljust(int width, BigStr* fillchar) {
164 DCHECK(len(fillchar) == 1);
165
166 int length = len(this);
167 int num_fill = width - length;
168 if (num_fill < 0) {
169 return this;
170 } else {
171 BigStr* result = NewStr(width);
172 char c = fillchar->data_[0];
173 memcpy(result->data_, data_, length);
174 for (int i = length; i < width; ++i) {
175 result->data_[i] = c;
176 }
177 return result;
178 }
179}
180
181BigStr* BigStr::rjust(int width, BigStr* fillchar) {
182 DCHECK(len(fillchar) == 1);
183
184 int length = len(this);
185 int num_fill = width - length;
186 if (num_fill < 0) {
187 return this;
188 } else {
189 BigStr* result = NewStr(width);
190 char c = fillchar->data_[0];
191 for (int i = 0; i < num_fill; ++i) {
192 result->data_[i] = c;
193 }
194 memcpy(result->data_ + num_fill, data_, length);
195 return result;
196 }
197}
198
199BigStr* BigStr::replace(BigStr* old, BigStr* new_str) {
200 // Use -1 as in python2: "aaaa".replace(-1) -> "AAAA"
201 return replace(old, new_str, -1);
202}
203
204BigStr* BigStr::replace(BigStr* old, BigStr* new_str, int count) {
205 // log("replacing %s with %s", old_data, new_str->data_);
206 const char* old_data = old->data_;
207
208 int this_len = len(this);
209 int old_len = len(old);
210
211 const char* last_possible = data_ + this_len - old_len;
212
213 const char* p_this = data_; // advances through 'this'
214
215 // First pass: Calculate number of replacements, and hence new length
216 int replace_count = 0;
217 if (old_len == 0) {
218 replace_count = this_len + 1;
219 if (count > 0) {
220 replace_count = min(replace_count, count);
221 }
222 } else {
223 while (p_this <= last_possible) {
224 if (replace_count != count && // limit replacements (if count != -1)
225 memcmp(p_this, old_data, old_len) == 0) { // equal
226 replace_count++;
227 p_this += old_len;
228 } else {
229 p_this++;
230 }
231 }
232 }
233
234 // log("replacements %d", replace_count);
235
236 if (replace_count == 0) {
237 return this; // Reuse the string if there were no replacements
238 }
239
240 int new_str_len = len(new_str);
241 int result_len =
242 this_len - (replace_count * old_len) + (replace_count * new_str_len);
243
244 BigStr* result = NewStr(result_len);
245
246 const char* new_data = new_str->data_;
247 const size_t new_len = new_str_len;
248
249 // Second pass: Copy pieces into 'result'
250 p_this = data_; // back to beginning
251 char* p_result = result->data_; // advances through 'result'
252 replace_count = 0;
253
254 if (old_len == 0) {
255 // Should place new_str between each char in this
256 while (p_this < last_possible && replace_count != count) {
257 replace_count++;
258 memcpy(p_result, new_data, new_len); // Copy from new_str
259 p_result += new_len; // Move past new_str
260
261 // Write a char from this
262 *p_result = *p_this;
263 p_this++;
264 p_result++;
265 }
266
267 if (replace_count != count) {
268 // Write a copy of new_str at the end
269 assert(p_this == last_possible);
270 memcpy(p_result, new_data, new_len);
271 } else if (p_this <= last_possible) {
272 // Write the last part of string
273 memcpy(p_result, p_this, data_ + this_len - p_this);
274 }
275 } else {
276 while (p_this <= last_possible) {
277 // Note: would be more efficient if we remembered the match positions
278 if (replace_count != count && // limit replacements (if count != -1)
279 memcmp(p_this, old_data, old_len) == 0) { // equal
280 memcpy(p_result, new_data, new_len); // Copy from new_str
281 replace_count++;
282 p_result += new_len;
283 p_this += old_len;
284 } else { // copy 1 byte
285 *p_result = *p_this;
286 p_result++;
287 p_this++;
288 }
289 }
290 memcpy(p_result, p_this, data_ + this_len - p_this); // last part of string
291 }
292
293 return result;
294}
295
296enum class StripWhere {
297 Left,
298 Right,
299 Both,
300};
301
302const int kWhitespace = -1;
303
304bool OmitChar(uint8_t ch, int what) {
305 if (what == kWhitespace) {
306 return isspace(ch);
307 } else {
308 return what == ch;
309 }
310}
311
312// StripAny is modeled after CPython's do_strip() in stringobject.c, and can
313// implement 6 functions:
314//
315// strip / lstrip / rstrip
316// strip(char) / lstrip(char) / rstrip(char)
317//
318// Args:
319// where: which ends to strip from
320// what: kWhitespace, or an ASCII code 0-255
321
322BigStr* StripAny(BigStr* s, StripWhere where, int what) {
323 int length = len(s);
324 const char* char_data = s->data();
325
326 int i = 0;
327 if (where != StripWhere::Right) {
328 while (i < length && OmitChar(char_data[i], what)) {
329 i++;
330 }
331 }
332
333 int j = length;
334 if (where != StripWhere::Left) {
335 do {
336 j--;
337 } while (j >= i && OmitChar(char_data[j], what));
338 j++;
339 }
340
341 if (i == j) { // Optimization to reuse existing object
342 return kEmptyString;
343 }
344
345 if (i == 0 && j == length) { // nothing stripped
346 return s;
347 }
348
349 // Note: makes a copy in leaky version, and will in GC version too
350 int new_len = j - i;
351 BigStr* result = NewStr(new_len);
352 memcpy(result->data(), s->data() + i, new_len);
353 return result;
354}
355
356BigStr* BigStr::strip() {
357 return StripAny(this, StripWhere::Both, kWhitespace);
358}
359
360// Used for CommandSub in osh/cmd_exec.py
361BigStr* BigStr::rstrip(BigStr* chars) {
362 DCHECK(len(chars) == 1);
363 int c = chars->data_[0];
364 return StripAny(this, StripWhere::Right, c);
365}
366
367BigStr* BigStr::rstrip() {
368 return StripAny(this, StripWhere::Right, kWhitespace);
369}
370
371BigStr* BigStr::lstrip(BigStr* chars) {
372 DCHECK(len(chars) == 1);
373 int c = chars->data_[0];
374 return StripAny(this, StripWhere::Left, c);
375}
376
377BigStr* BigStr::lstrip() {
378 return StripAny(this, StripWhere::Left, kWhitespace);
379}
380
381BigStr* BigStr::join(List<BigStr*>* items) {
382 int length = 0;
383
384 int num_parts = len(items);
385
386 // " ".join([]) == ""
387 if (num_parts == 0) {
388 return kEmptyString;
389 }
390
391 // Common case
392 // 'anything'.join(["foo"]) == "foo"
393 if (num_parts == 1) {
394 return items->at(0);
395 }
396
397 for (int i = 0; i < num_parts; ++i) {
398 length += len(items->at(i));
399 }
400 // add length of all the separators
401 int this_len = len(this);
402 length += this_len * (num_parts - 1);
403
404 BigStr* result = NewStr(length);
405 char* p_result = result->data_; // advances through
406
407 for (int i = 0; i < num_parts; ++i) {
408 // log("i %d", i);
409 if (i != 0 && this_len) { // optimize common case of ''.join()
410 memcpy(p_result, data_, this_len); // copy the separator
411 p_result += this_len;
412 // log("this_len %d", this_len);
413 }
414
415 int n = len(items->at(i));
416 // log("n: %d", n);
417 memcpy(p_result, items->at(i)->data_, n); // copy the list item
418 p_result += n;
419 }
420
421 return result;
422}
423
424static void AppendPart(List<BigStr*>* result, BigStr* s, int left, int right) {
425 int new_len = right - left;
426 BigStr* part;
427 if (new_len == 0) {
428 part = kEmptyString;
429 } else {
430 part = NewStr(new_len);
431 memcpy(part->data_, s->data_ + left, new_len);
432 }
433 result->append(part);
434}
435
436// Split BigStr into List<BigStr*> of parts separated by 'sep'.
437// The code structure is taken from CPython's Objects/stringlib/split.h.
438List<BigStr*>* BigStr::split(BigStr* sep, int max_split) {
439 DCHECK(sep != nullptr);
440 DCHECK(len(sep) == 1); // we can only split one char
441 char sep_char = sep->data_[0];
442
443 int str_len = len(this);
444 if (str_len == 0) {
445 // weird case consistent with Python: ''.split(':') == ['']
446 return NewList<BigStr*>({kEmptyString});
447 }
448
449 List<BigStr*>* result = NewList<BigStr*>({});
450 int left = 0;
451 int right = 0;
452 int num_parts = 0; // 3 splits results in 4 parts
453
454 while (right < str_len && num_parts < max_split) {
455 // search for separator
456 for (; right < str_len; right++) {
457 if (data_[right] == sep_char) {
458 AppendPart(result, this, left, right);
459 right++;
460 left = right;
461 num_parts++;
462 break;
463 }
464 }
465 }
466 if (num_parts == 0) { // Optimization when there is no split
467 result->append(this);
468 } else if (left <= str_len) { // Last part
469 AppendPart(result, this, left, str_len);
470 }
471
472 return result;
473}
474
475List<BigStr*>* BigStr::split(BigStr* sep) {
476 return this->split(sep, len(this));
477}
478
479unsigned BigStr::hash(HashFunc h) {
480 if (!is_hashed_) {
481 hash_ = h(data_, len(this)) >> 1;
482 is_hashed_ = 1;
483 }
484 return hash_;
485}
486
487static inline BigStr* _StrFormat(const char* fmt, int fmt_len, va_list args) {
488 auto beg = std::cregex_iterator(fmt, fmt + fmt_len, gStrFmtRegex);
489 auto end = std::cregex_iterator();
490
491 char int_buf[kMaxFmtWidth];
492 std::string buf;
493 for (std::cregex_iterator it = beg; it != end; ++it) {
494 const std::cmatch& match = *it;
495
496 const std::csub_match& lit_m = match[1];
497 DCHECK(lit_m.matched);
498 const std::string& lit_s = lit_m.str();
499 buf.append(lit_s);
500
501 int width = 0;
502 bool zero_pad = false;
503 bool pad_back = false;
504 const std::csub_match& width_m = match[2];
505 const std::string& width_s = width_m.str();
506 bool ok = false;
507 if (width_m.matched && !width_s.empty()) {
508 if (width_s[0] == '0') {
509 zero_pad = true;
510 DCHECK(width_s.size() > 1);
511 ok = StringToInt(width_s.c_str() + 1, width_s.size() - 1, 10, &width);
512 DCHECK(ok);
513 (void)ok; // silence unused var warning in opt
514 } else {
515 ok = StringToInt(width_s.c_str(), width_s.size(), 10, &width);
516 DCHECK(ok);
517 }
518 if (width < 0) {
519 pad_back = true;
520 width *= -1;
521 }
522 DCHECK(0 <= width && width < kMaxFmtWidth);
523 }
524
525 char const* str_to_add = nullptr;
526 int add_len = 0;
527 const std::csub_match& code_m = match[3];
528 const std::string& code_s = code_m.str();
529 if (!code_m.matched) {
530 DCHECK(!width_m.matched); // python errors on invalid format operators
531 break;
532 }
533 DCHECK(code_s.size() == 1);
534 switch (code_s[0]) {
535 case '%': {
536 str_to_add = code_s.c_str();
537 add_len = 1;
538 break;
539 }
540 case 's': {
541 BigStr* s = va_arg(args, BigStr*);
542 // Check type unconditionally because mycpp doesn't always check it
543 CHECK(ObjHeader::FromObject(s)->type_tag == TypeTag::BigStr);
544
545 str_to_add = s->data();
546 add_len = len(s);
547 zero_pad = false; // python ignores the 0 directive for strings
548 break;
549 }
550 case 'r': {
551 BigStr* s = va_arg(args, BigStr*);
552 // Check type unconditionally because mycpp doesn't always check it
553 CHECK(ObjHeader::FromObject(s)->type_tag == TypeTag::BigStr);
554
555 s = repr(s);
556 str_to_add = s->data();
557 add_len = len(s);
558 zero_pad = false; // python ignores the 0 directive for strings
559 break;
560 }
561 case 'd': // fallthrough
562 case 'o': {
563 int d = va_arg(args, int);
564 add_len = snprintf(int_buf, kMaxFmtWidth,
565 match.str().c_str() + lit_s.size(), d);
566 DCHECK(add_len > 0);
567 str_to_add = int_buf;
568 break;
569 }
570 default:
571 DCHECK(0);
572 break;
573 }
574 DCHECK(str_to_add != nullptr);
575
576 if (pad_back) {
577 buf.append(str_to_add, add_len);
578 }
579 if (add_len < width) {
580 for (int i = 0; i < width - add_len; ++i) {
581 buf.push_back(zero_pad ? '0' : ' ');
582 }
583 }
584 if (!pad_back) {
585 buf.append(str_to_add, add_len);
586 }
587 }
588
589 return StrFromC(buf.c_str(), buf.size());
590}
591
592BigStr* StrIter::Value() { // similar to at()
593 BigStr* result = NewStr(1);
594 result->data_[0] = s_->data_[i_];
595 DCHECK(result->data_[1] == '\0');
596 return result;
597}
598
599BigStr* StrFormat(const char* fmt, ...) {
600 va_list args;
601 va_start(args, fmt);
602 BigStr* ret = _StrFormat(fmt, strlen(fmt), args);
603 va_end(args);
604 return ret;
605}
606
607BigStr* StrFormat(BigStr* fmt, ...) {
608 va_list args;
609 va_start(args, fmt);
610 BigStr* ret = _StrFormat(fmt->data(), len(fmt), args);
611 va_end(args);
612 return ret;
613}