1 | #include "mycpp/gc_str.h"
|
2 |
|
3 | #include <ctype.h> // isalpha(), isdigit()
|
4 | #include <stdarg.h>
|
5 |
|
6 | #include <regex>
|
7 |
|
8 | #include "mycpp/common.h"
|
9 | #include "mycpp/gc_alloc.h" // NewStr()
|
10 | #include "mycpp/gc_builtins.h" // StringToInt()
|
11 | #include "mycpp/gc_list.h" // join(), split() use it
|
12 |
|
13 | GLOBAL_STR(kEmptyString, "");
|
14 |
|
15 | static const std::regex gStrFmtRegex("([^%]*)(?:%(-?[0-9]*)(.))?");
|
16 | static const int kMaxFmtWidth = 256; // arbitrary...
|
17 |
|
18 | int BigStr::find(BigStr* needle, int pos) {
|
19 | int length = len(this);
|
20 | DCHECK(len(needle) == 1); // Oil's usage
|
21 | char c = needle->data_[0];
|
22 | for (int i = pos; i < length; ++i) {
|
23 | if (data_[i] == c) {
|
24 | return i;
|
25 | }
|
26 | }
|
27 | return -1;
|
28 | }
|
29 |
|
30 | int BigStr::rfind(BigStr* needle) {
|
31 | int length = len(this);
|
32 | DCHECK(len(needle) == 1); // Oil's usage
|
33 | char c = needle->data_[0];
|
34 | for (int i = length - 1; i >= 0; --i) {
|
35 | if (data_[i] == c) {
|
36 | return i;
|
37 | }
|
38 | }
|
39 | return -1;
|
40 | }
|
41 |
|
42 | bool BigStr::isdigit() {
|
43 | int n = len(this);
|
44 | if (n == 0) {
|
45 | return false; // special case
|
46 | }
|
47 | for (int i = 0; i < n; ++i) {
|
48 | if (!::isdigit(data_[i])) {
|
49 | return false;
|
50 | }
|
51 | }
|
52 | return true;
|
53 | }
|
54 |
|
55 | bool BigStr::isalpha() {
|
56 | int n = len(this);
|
57 | if (n == 0) {
|
58 | return false; // special case
|
59 | }
|
60 | for (int i = 0; i < n; ++i) {
|
61 | if (!::isalpha(data_[i])) {
|
62 | return false;
|
63 | }
|
64 | }
|
65 | return true;
|
66 | }
|
67 |
|
68 | // e.g. for osh/braces.py
|
69 | bool BigStr::isupper() {
|
70 | int n = len(this);
|
71 | if (n == 0) {
|
72 | return false; // special case
|
73 | }
|
74 | for (int i = 0; i < n; ++i) {
|
75 | if (!::isupper(data_[i])) {
|
76 | return false;
|
77 | }
|
78 | }
|
79 | return true;
|
80 | }
|
81 |
|
82 | bool BigStr::startswith(BigStr* s) {
|
83 | int n = len(s);
|
84 | if (n > len(this)) {
|
85 | return false;
|
86 | }
|
87 | return memcmp(data_, s->data_, n) == 0;
|
88 | }
|
89 |
|
90 | bool BigStr::endswith(BigStr* s) {
|
91 | int len_s = len(s);
|
92 | int len_this = len(this);
|
93 | if (len_s > len_this) {
|
94 | return false;
|
95 | }
|
96 | const char* start = data_ + len_this - len_s;
|
97 | return memcmp(start, s->data_, len_s) == 0;
|
98 | }
|
99 |
|
100 | // Get a string with one character
|
101 | BigStr* BigStr::at(int i) {
|
102 | int length = len(this);
|
103 | if (i < 0) {
|
104 | i = length + i;
|
105 | }
|
106 | DCHECK(0 <= i);
|
107 | DCHECK(i < length); // had a problem here!
|
108 |
|
109 | BigStr* result = NewStr(1);
|
110 | result->data_[0] = data_[i];
|
111 | return result;
|
112 | }
|
113 |
|
114 | // s[begin:]
|
115 | BigStr* BigStr::slice(int begin) {
|
116 | return slice(begin, len(this));
|
117 | }
|
118 |
|
119 | // s[begin:end]
|
120 | BigStr* BigStr::slice(int begin, int end) {
|
121 | int length = len(this);
|
122 | SLICE_ADJUST(begin, end, length);
|
123 |
|
124 | DCHECK(0 <= begin && begin <= length);
|
125 | DCHECK(0 <= end && end <= length);
|
126 |
|
127 | int new_len = end - begin;
|
128 | DCHECK(0 <= new_len && new_len <= length);
|
129 |
|
130 | BigStr* result = NewStr(new_len);
|
131 | memcpy(result->data_, data_ + begin, new_len);
|
132 |
|
133 | return result;
|
134 | }
|
135 |
|
136 | // Used by 'help' builtin and --help, neither of which translate yet.
|
137 |
|
138 | List<BigStr*>* BigStr::splitlines(bool keep) {
|
139 | DCHECK(keep == true);
|
140 | FAIL(kNotImplemented);
|
141 | }
|
142 |
|
143 | BigStr* BigStr::upper() {
|
144 | int length = len(this);
|
145 | BigStr* result = NewStr(length);
|
146 | char* buffer = result->data();
|
147 | for (int char_index = 0; char_index < length; ++char_index) {
|
148 | buffer[char_index] = toupper(data_[char_index]);
|
149 | }
|
150 | return result;
|
151 | }
|
152 |
|
153 | BigStr* BigStr::lower() {
|
154 | int length = len(this);
|
155 | BigStr* result = NewStr(length);
|
156 | char* buffer = result->data();
|
157 | for (int char_index = 0; char_index < length; ++char_index) {
|
158 | buffer[char_index] = tolower(data_[char_index]);
|
159 | }
|
160 | return result;
|
161 | }
|
162 |
|
163 | BigStr* BigStr::ljust(int width, BigStr* fillchar) {
|
164 | DCHECK(len(fillchar) == 1);
|
165 |
|
166 | int length = len(this);
|
167 | int num_fill = width - length;
|
168 | if (num_fill < 0) {
|
169 | return this;
|
170 | } else {
|
171 | BigStr* result = NewStr(width);
|
172 | char c = fillchar->data_[0];
|
173 | memcpy(result->data_, data_, length);
|
174 | for (int i = length; i < width; ++i) {
|
175 | result->data_[i] = c;
|
176 | }
|
177 | return result;
|
178 | }
|
179 | }
|
180 |
|
181 | BigStr* BigStr::rjust(int width, BigStr* fillchar) {
|
182 | DCHECK(len(fillchar) == 1);
|
183 |
|
184 | int length = len(this);
|
185 | int num_fill = width - length;
|
186 | if (num_fill < 0) {
|
187 | return this;
|
188 | } else {
|
189 | BigStr* result = NewStr(width);
|
190 | char c = fillchar->data_[0];
|
191 | for (int i = 0; i < num_fill; ++i) {
|
192 | result->data_[i] = c;
|
193 | }
|
194 | memcpy(result->data_ + num_fill, data_, length);
|
195 | return result;
|
196 | }
|
197 | }
|
198 |
|
199 | BigStr* BigStr::replace(BigStr* old, BigStr* new_str) {
|
200 | // Use -1 as in python2: "aaaa".replace(-1) -> "AAAA"
|
201 | return replace(old, new_str, -1);
|
202 | }
|
203 |
|
204 | BigStr* BigStr::replace(BigStr* old, BigStr* new_str, int count) {
|
205 | // log("replacing %s with %s", old_data, new_str->data_);
|
206 | const char* old_data = old->data_;
|
207 |
|
208 | int this_len = len(this);
|
209 | int old_len = len(old);
|
210 |
|
211 | const char* last_possible = data_ + this_len - old_len;
|
212 |
|
213 | const char* p_this = data_; // advances through 'this'
|
214 |
|
215 | // First pass: Calculate number of replacements, and hence new length
|
216 | int replace_count = 0;
|
217 | if (old_len == 0) {
|
218 | replace_count = this_len + 1;
|
219 | if (count > 0) {
|
220 | replace_count = min(replace_count, count);
|
221 | }
|
222 | } else {
|
223 | while (p_this <= last_possible) {
|
224 | if (replace_count != count && // limit replacements (if count != -1)
|
225 | memcmp(p_this, old_data, old_len) == 0) { // equal
|
226 | replace_count++;
|
227 | p_this += old_len;
|
228 | } else {
|
229 | p_this++;
|
230 | }
|
231 | }
|
232 | }
|
233 |
|
234 | // log("replacements %d", replace_count);
|
235 |
|
236 | if (replace_count == 0) {
|
237 | return this; // Reuse the string if there were no replacements
|
238 | }
|
239 |
|
240 | int new_str_len = len(new_str);
|
241 | int result_len =
|
242 | this_len - (replace_count * old_len) + (replace_count * new_str_len);
|
243 |
|
244 | BigStr* result = NewStr(result_len);
|
245 |
|
246 | const char* new_data = new_str->data_;
|
247 | const size_t new_len = new_str_len;
|
248 |
|
249 | // Second pass: Copy pieces into 'result'
|
250 | p_this = data_; // back to beginning
|
251 | char* p_result = result->data_; // advances through 'result'
|
252 | replace_count = 0;
|
253 |
|
254 | if (old_len == 0) {
|
255 | // Should place new_str between each char in this
|
256 | while (p_this < last_possible && replace_count != count) {
|
257 | replace_count++;
|
258 | memcpy(p_result, new_data, new_len); // Copy from new_str
|
259 | p_result += new_len; // Move past new_str
|
260 |
|
261 | // Write a char from this
|
262 | *p_result = *p_this;
|
263 | p_this++;
|
264 | p_result++;
|
265 | }
|
266 |
|
267 | if (replace_count != count) {
|
268 | // Write a copy of new_str at the end
|
269 | assert(p_this == last_possible);
|
270 | memcpy(p_result, new_data, new_len);
|
271 | } else if (p_this <= last_possible) {
|
272 | // Write the last part of string
|
273 | memcpy(p_result, p_this, data_ + this_len - p_this);
|
274 | }
|
275 | } else {
|
276 | while (p_this <= last_possible) {
|
277 | // Note: would be more efficient if we remembered the match positions
|
278 | if (replace_count != count && // limit replacements (if count != -1)
|
279 | memcmp(p_this, old_data, old_len) == 0) { // equal
|
280 | memcpy(p_result, new_data, new_len); // Copy from new_str
|
281 | replace_count++;
|
282 | p_result += new_len;
|
283 | p_this += old_len;
|
284 | } else { // copy 1 byte
|
285 | *p_result = *p_this;
|
286 | p_result++;
|
287 | p_this++;
|
288 | }
|
289 | }
|
290 | memcpy(p_result, p_this, data_ + this_len - p_this); // last part of string
|
291 | }
|
292 |
|
293 | return result;
|
294 | }
|
295 |
|
296 | enum class StripWhere {
|
297 | Left,
|
298 | Right,
|
299 | Both,
|
300 | };
|
301 |
|
302 | const int kWhitespace = -1;
|
303 |
|
304 | bool OmitChar(uint8_t ch, int what) {
|
305 | if (what == kWhitespace) {
|
306 | return isspace(ch);
|
307 | } else {
|
308 | return what == ch;
|
309 | }
|
310 | }
|
311 |
|
312 | // StripAny is modeled after CPython's do_strip() in stringobject.c, and can
|
313 | // implement 6 functions:
|
314 | //
|
315 | // strip / lstrip / rstrip
|
316 | // strip(char) / lstrip(char) / rstrip(char)
|
317 | //
|
318 | // Args:
|
319 | // where: which ends to strip from
|
320 | // what: kWhitespace, or an ASCII code 0-255
|
321 |
|
322 | BigStr* StripAny(BigStr* s, StripWhere where, int what) {
|
323 | int length = len(s);
|
324 | const char* char_data = s->data();
|
325 |
|
326 | int i = 0;
|
327 | if (where != StripWhere::Right) {
|
328 | while (i < length && OmitChar(char_data[i], what)) {
|
329 | i++;
|
330 | }
|
331 | }
|
332 |
|
333 | int j = length;
|
334 | if (where != StripWhere::Left) {
|
335 | do {
|
336 | j--;
|
337 | } while (j >= i && OmitChar(char_data[j], what));
|
338 | j++;
|
339 | }
|
340 |
|
341 | if (i == j) { // Optimization to reuse existing object
|
342 | return kEmptyString;
|
343 | }
|
344 |
|
345 | if (i == 0 && j == length) { // nothing stripped
|
346 | return s;
|
347 | }
|
348 |
|
349 | // Note: makes a copy in leaky version, and will in GC version too
|
350 | int new_len = j - i;
|
351 | BigStr* result = NewStr(new_len);
|
352 | memcpy(result->data(), s->data() + i, new_len);
|
353 | return result;
|
354 | }
|
355 |
|
356 | BigStr* BigStr::strip() {
|
357 | return StripAny(this, StripWhere::Both, kWhitespace);
|
358 | }
|
359 |
|
360 | // Used for CommandSub in osh/cmd_exec.py
|
361 | BigStr* BigStr::rstrip(BigStr* chars) {
|
362 | DCHECK(len(chars) == 1);
|
363 | int c = chars->data_[0];
|
364 | return StripAny(this, StripWhere::Right, c);
|
365 | }
|
366 |
|
367 | BigStr* BigStr::rstrip() {
|
368 | return StripAny(this, StripWhere::Right, kWhitespace);
|
369 | }
|
370 |
|
371 | BigStr* BigStr::lstrip(BigStr* chars) {
|
372 | DCHECK(len(chars) == 1);
|
373 | int c = chars->data_[0];
|
374 | return StripAny(this, StripWhere::Left, c);
|
375 | }
|
376 |
|
377 | BigStr* BigStr::lstrip() {
|
378 | return StripAny(this, StripWhere::Left, kWhitespace);
|
379 | }
|
380 |
|
381 | BigStr* BigStr::join(List<BigStr*>* items) {
|
382 | int length = 0;
|
383 |
|
384 | int num_parts = len(items);
|
385 |
|
386 | // " ".join([]) == ""
|
387 | if (num_parts == 0) {
|
388 | return kEmptyString;
|
389 | }
|
390 |
|
391 | // Common case
|
392 | // 'anything'.join(["foo"]) == "foo"
|
393 | if (num_parts == 1) {
|
394 | return items->at(0);
|
395 | }
|
396 |
|
397 | for (int i = 0; i < num_parts; ++i) {
|
398 | length += len(items->at(i));
|
399 | }
|
400 | // add length of all the separators
|
401 | int this_len = len(this);
|
402 | length += this_len * (num_parts - 1);
|
403 |
|
404 | BigStr* result = NewStr(length);
|
405 | char* p_result = result->data_; // advances through
|
406 |
|
407 | for (int i = 0; i < num_parts; ++i) {
|
408 | // log("i %d", i);
|
409 | if (i != 0 && this_len) { // optimize common case of ''.join()
|
410 | memcpy(p_result, data_, this_len); // copy the separator
|
411 | p_result += this_len;
|
412 | // log("this_len %d", this_len);
|
413 | }
|
414 |
|
415 | int n = len(items->at(i));
|
416 | // log("n: %d", n);
|
417 | memcpy(p_result, items->at(i)->data_, n); // copy the list item
|
418 | p_result += n;
|
419 | }
|
420 |
|
421 | return result;
|
422 | }
|
423 |
|
424 | static void AppendPart(List<BigStr*>* result, BigStr* s, int left, int right) {
|
425 | int new_len = right - left;
|
426 | BigStr* part;
|
427 | if (new_len == 0) {
|
428 | part = kEmptyString;
|
429 | } else {
|
430 | part = NewStr(new_len);
|
431 | memcpy(part->data_, s->data_ + left, new_len);
|
432 | }
|
433 | result->append(part);
|
434 | }
|
435 |
|
436 | // Split BigStr into List<BigStr*> of parts separated by 'sep'.
|
437 | // The code structure is taken from CPython's Objects/stringlib/split.h.
|
438 | List<BigStr*>* BigStr::split(BigStr* sep, int max_split) {
|
439 | DCHECK(sep != nullptr);
|
440 | DCHECK(len(sep) == 1); // we can only split one char
|
441 | char sep_char = sep->data_[0];
|
442 |
|
443 | int str_len = len(this);
|
444 | if (str_len == 0) {
|
445 | // weird case consistent with Python: ''.split(':') == ['']
|
446 | return NewList<BigStr*>({kEmptyString});
|
447 | }
|
448 |
|
449 | List<BigStr*>* result = NewList<BigStr*>({});
|
450 | int left = 0;
|
451 | int right = 0;
|
452 | int num_parts = 0; // 3 splits results in 4 parts
|
453 |
|
454 | while (right < str_len && num_parts < max_split) {
|
455 | // search for separator
|
456 | for (; right < str_len; right++) {
|
457 | if (data_[right] == sep_char) {
|
458 | AppendPart(result, this, left, right);
|
459 | right++;
|
460 | left = right;
|
461 | num_parts++;
|
462 | break;
|
463 | }
|
464 | }
|
465 | }
|
466 | if (num_parts == 0) { // Optimization when there is no split
|
467 | result->append(this);
|
468 | } else if (left <= str_len) { // Last part
|
469 | AppendPart(result, this, left, str_len);
|
470 | }
|
471 |
|
472 | return result;
|
473 | }
|
474 |
|
475 | List<BigStr*>* BigStr::split(BigStr* sep) {
|
476 | return this->split(sep, len(this));
|
477 | }
|
478 |
|
479 | unsigned BigStr::hash(HashFunc h) {
|
480 | if (!is_hashed_) {
|
481 | hash_ = h(data_, len(this)) >> 1;
|
482 | is_hashed_ = 1;
|
483 | }
|
484 | return hash_;
|
485 | }
|
486 |
|
487 | static inline BigStr* _StrFormat(const char* fmt, int fmt_len, va_list args) {
|
488 | auto beg = std::cregex_iterator(fmt, fmt + fmt_len, gStrFmtRegex);
|
489 | auto end = std::cregex_iterator();
|
490 |
|
491 | char int_buf[kMaxFmtWidth];
|
492 | std::string buf;
|
493 | for (std::cregex_iterator it = beg; it != end; ++it) {
|
494 | const std::cmatch& match = *it;
|
495 |
|
496 | const std::csub_match& lit_m = match[1];
|
497 | DCHECK(lit_m.matched);
|
498 | const std::string& lit_s = lit_m.str();
|
499 | buf.append(lit_s);
|
500 |
|
501 | int width = 0;
|
502 | bool zero_pad = false;
|
503 | bool pad_back = false;
|
504 | const std::csub_match& width_m = match[2];
|
505 | const std::string& width_s = width_m.str();
|
506 | bool ok = false;
|
507 | if (width_m.matched && !width_s.empty()) {
|
508 | if (width_s[0] == '0') {
|
509 | zero_pad = true;
|
510 | DCHECK(width_s.size() > 1);
|
511 | ok = StringToInt(width_s.c_str() + 1, width_s.size() - 1, 10, &width);
|
512 | DCHECK(ok);
|
513 | (void)ok; // silence unused var warning in opt
|
514 | } else {
|
515 | ok = StringToInt(width_s.c_str(), width_s.size(), 10, &width);
|
516 | DCHECK(ok);
|
517 | }
|
518 | if (width < 0) {
|
519 | pad_back = true;
|
520 | width *= -1;
|
521 | }
|
522 | DCHECK(0 <= width && width < kMaxFmtWidth);
|
523 | }
|
524 |
|
525 | char const* str_to_add = nullptr;
|
526 | int add_len = 0;
|
527 | const std::csub_match& code_m = match[3];
|
528 | const std::string& code_s = code_m.str();
|
529 | if (!code_m.matched) {
|
530 | DCHECK(!width_m.matched); // python errors on invalid format operators
|
531 | break;
|
532 | }
|
533 | DCHECK(code_s.size() == 1);
|
534 | switch (code_s[0]) {
|
535 | case '%': {
|
536 | str_to_add = code_s.c_str();
|
537 | add_len = 1;
|
538 | break;
|
539 | }
|
540 | case 's': {
|
541 | BigStr* s = va_arg(args, BigStr*);
|
542 | // Check type unconditionally because mycpp doesn't always check it
|
543 | CHECK(ObjHeader::FromObject(s)->type_tag == TypeTag::BigStr);
|
544 |
|
545 | str_to_add = s->data();
|
546 | add_len = len(s);
|
547 | zero_pad = false; // python ignores the 0 directive for strings
|
548 | break;
|
549 | }
|
550 | case 'r': {
|
551 | BigStr* s = va_arg(args, BigStr*);
|
552 | // Check type unconditionally because mycpp doesn't always check it
|
553 | CHECK(ObjHeader::FromObject(s)->type_tag == TypeTag::BigStr);
|
554 |
|
555 | s = repr(s);
|
556 | str_to_add = s->data();
|
557 | add_len = len(s);
|
558 | zero_pad = false; // python ignores the 0 directive for strings
|
559 | break;
|
560 | }
|
561 | case 'd': // fallthrough
|
562 | case 'o': {
|
563 | int d = va_arg(args, int);
|
564 | add_len = snprintf(int_buf, kMaxFmtWidth,
|
565 | match.str().c_str() + lit_s.size(), d);
|
566 | DCHECK(add_len > 0);
|
567 | str_to_add = int_buf;
|
568 | break;
|
569 | }
|
570 | default:
|
571 | DCHECK(0);
|
572 | break;
|
573 | }
|
574 | DCHECK(str_to_add != nullptr);
|
575 |
|
576 | if (pad_back) {
|
577 | buf.append(str_to_add, add_len);
|
578 | }
|
579 | if (add_len < width) {
|
580 | for (int i = 0; i < width - add_len; ++i) {
|
581 | buf.push_back(zero_pad ? '0' : ' ');
|
582 | }
|
583 | }
|
584 | if (!pad_back) {
|
585 | buf.append(str_to_add, add_len);
|
586 | }
|
587 | }
|
588 |
|
589 | return StrFromC(buf.c_str(), buf.size());
|
590 | }
|
591 |
|
592 | BigStr* StrIter::Value() { // similar to at()
|
593 | BigStr* result = NewStr(1);
|
594 | result->data_[0] = s_->data_[i_];
|
595 | DCHECK(result->data_[1] == '\0');
|
596 | return result;
|
597 | }
|
598 |
|
599 | BigStr* StrFormat(const char* fmt, ...) {
|
600 | va_list args;
|
601 | va_start(args, fmt);
|
602 | BigStr* ret = _StrFormat(fmt, strlen(fmt), args);
|
603 | va_end(args);
|
604 | return ret;
|
605 | }
|
606 |
|
607 | BigStr* StrFormat(BigStr* fmt, ...) {
|
608 | va_list args;
|
609 | va_start(args, fmt);
|
610 | BigStr* ret = _StrFormat(fmt->data(), len(fmt), args);
|
611 | va_end(args);
|
612 | return ret;
|
613 | }
|