mycpp/gc_str.cc

OILS / mycpp / gc_str.cc View on Github | oilshell.org

613 lines, 389 significant

1	#include "mycpp/gc_str.h"
2
3	#include <ctype.h> // isalpha(), isdigit()
4	#include <stdarg.h>
5
6	#include <regex>
7
8	#include "mycpp/common.h"
9	#include "mycpp/gc_alloc.h" // NewStr()
10	#include "mycpp/gc_builtins.h" // StringToInt()
11	#include "mycpp/gc_list.h" // join(), split() use it
12
13	GLOBAL_STR(kEmptyString, "");
14
15	static const std::regex gStrFmtRegex("([^%])(?:%(-?[0-9])(.))?");
16	static const int kMaxFmtWidth = 256; // arbitrary...
17
18	int BigStr::find(BigStr* needle, int pos) {
19	int length = len(this);
20	DCHECK(len(needle) == 1); // Oil's usage
21	char c = needle->data_[0];
22	for (int i = pos; i < length; ++i) {
23	if (data_[i] == c) {
24	return i;
25	}
26	}
27	return -1;
28	}
29
30	int BigStr::rfind(BigStr* needle) {
31	int length = len(this);
32	DCHECK(len(needle) == 1); // Oil's usage
33	char c = needle->data_[0];
34	for (int i = length - 1; i >= 0; --i) {
35	if (data_[i] == c) {
36	return i;
37	}
38	}
39	return -1;
40	}
41
42	bool BigStr::isdigit() {
43	int n = len(this);
44	if (n == 0) {
45	return false; // special case
46	}
47	for (int i = 0; i < n; ++i) {
48	if (!::isdigit(data_[i])) {
49	return false;
50	}
51	}
52	return true;
53	}
54
55	bool BigStr::isalpha() {
56	int n = len(this);
57	if (n == 0) {
58	return false; // special case
59	}
60	for (int i = 0; i < n; ++i) {
61	if (!::isalpha(data_[i])) {
62	return false;
63	}
64	}
65	return true;
66	}
67
68	// e.g. for osh/braces.py
69	bool BigStr::isupper() {
70	int n = len(this);
71	if (n == 0) {
72	return false; // special case
73	}
74	for (int i = 0; i < n; ++i) {
75	if (!::isupper(data_[i])) {
76	return false;
77	}
78	}
79	return true;
80	}
81
82	bool BigStr::startswith(BigStr* s) {
83	int n = len(s);
84	if (n > len(this)) {
85	return false;
86	}
87	return memcmp(data_, s->data_, n) == 0;
88	}
89
90	bool BigStr::endswith(BigStr* s) {
91	int len_s = len(s);
92	int len_this = len(this);
93	if (len_s > len_this) {
94	return false;
95	}
96	const char* start = data_ + len_this - len_s;
97	return memcmp(start, s->data_, len_s) == 0;
98	}
99
100	// Get a string with one character
101	BigStr* BigStr::at(int i) {
102	int length = len(this);
103	if (i < 0) {
104	i = length + i;
105	}
106	DCHECK(0 <= i);
107	DCHECK(i < length); // had a problem here!
108
109	BigStr* result = NewStr(1);
110	result->data_[0] = data_[i];
111	return result;
112	}
113
114	// s[begin:]
115	BigStr* BigStr::slice(int begin) {
116	return slice(begin, len(this));
117	}
118
119	// s[begin:end]
120	BigStr* BigStr::slice(int begin, int end) {
121	int length = len(this);
122	SLICE_ADJUST(begin, end, length);
123
124	DCHECK(0 <= begin && begin <= length);
125	DCHECK(0 <= end && end <= length);
126
127	int new_len = end - begin;
128	DCHECK(0 <= new_len && new_len <= length);
129
130	BigStr* result = NewStr(new_len);
131	memcpy(result->data_, data_ + begin, new_len);
132
133	return result;
134	}
135
136	// Used by 'help' builtin and --help, neither of which translate yet.
137
138	List<BigStr> BigStr::splitlines(bool keep) {
139	DCHECK(keep == true);
140	FAIL(kNotImplemented);
141	}
142
143	BigStr* BigStr::upper() {
144	int length = len(this);
145	BigStr* result = NewStr(length);
146	char* buffer = result->data();
147	for (int char_index = 0; char_index < length; ++char_index) {
148	buffer[char_index] = toupper(data_[char_index]);
149	}
150	return result;
151	}
152
153	BigStr* BigStr::lower() {
154	int length = len(this);
155	BigStr* result = NewStr(length);
156	char* buffer = result->data();
157	for (int char_index = 0; char_index < length; ++char_index) {
158	buffer[char_index] = tolower(data_[char_index]);
159	}
160	return result;
161	}
162
163	BigStr* BigStr::ljust(int width, BigStr* fillchar) {
164	DCHECK(len(fillchar) == 1);
165
166	int length = len(this);
167	int num_fill = width - length;
168	if (num_fill < 0) {
169	return this;
170	} else {
171	BigStr* result = NewStr(width);
172	char c = fillchar->data_[0];
173	memcpy(result->data_, data_, length);
174	for (int i = length; i < width; ++i) {
175	result->data_[i] = c;
176	}
177	return result;
178	}
179	}
180
181	BigStr* BigStr::rjust(int width, BigStr* fillchar) {
182	DCHECK(len(fillchar) == 1);
183
184	int length = len(this);
185	int num_fill = width - length;
186	if (num_fill < 0) {
187	return this;
188	} else {
189	BigStr* result = NewStr(width);
190	char c = fillchar->data_[0];
191	for (int i = 0; i < num_fill; ++i) {
192	result->data_[i] = c;
193	}
194	memcpy(result->data_ + num_fill, data_, length);
195	return result;
196	}
197	}
198
199	BigStr* BigStr::replace(BigStr* old, BigStr* new_str) {
200	// Use -1 as in python2: "aaaa".replace(-1) -> "AAAA"
201	return replace(old, new_str, -1);
202	}
203
204	BigStr* BigStr::replace(BigStr* old, BigStr* new_str, int count) {
205	// log("replacing %s with %s", old_data, new_str->data_);
206	const char* old_data = old->data_;
207
208	int this_len = len(this);
209	int old_len = len(old);
210
211	const char* last_possible = data_ + this_len - old_len;
212
213	const char* p_this = data_; // advances through 'this'
214
215	// First pass: Calculate number of replacements, and hence new length
216	int replace_count = 0;
217	if (old_len == 0) {
218	replace_count = this_len + 1;
219	if (count > 0) {
220	replace_count = min(replace_count, count);
221	}
222	} else {
223	while (p_this <= last_possible) {
224	if (replace_count != count && // limit replacements (if count != -1)
225	memcmp(p_this, old_data, old_len) == 0) { // equal
226	replace_count++;
227	p_this += old_len;
228	} else {
229	p_this++;
230	}
231	}
232	}
233
234	// log("replacements %d", replace_count);
235
236	if (replace_count == 0) {
237	return this; // Reuse the string if there were no replacements
238	}
239
240	int new_str_len = len(new_str);
241	int result_len =
242	this_len - (replace_count * old_len) + (replace_count * new_str_len);
243
244	BigStr* result = NewStr(result_len);
245
246	const char* new_data = new_str->data_;
247	const size_t new_len = new_str_len;
248
249	// Second pass: Copy pieces into 'result'
250	p_this = data_; // back to beginning
251	char* p_result = result->data_; // advances through 'result'
252	replace_count = 0;
253
254	if (old_len == 0) {
255	// Should place new_str between each char in this
256	while (p_this < last_possible && replace_count != count) {
257	replace_count++;
258	memcpy(p_result, new_data, new_len); // Copy from new_str
259	p_result += new_len; // Move past new_str
260
261	// Write a char from this
262	p_result = p_this;
263	p_this++;
264	p_result++;
265	}
266
267	if (replace_count != count) {
268	// Write a copy of new_str at the end
269	assert(p_this == last_possible);
270	memcpy(p_result, new_data, new_len);
271	} else if (p_this <= last_possible) {
272	// Write the last part of string
273	memcpy(p_result, p_this, data_ + this_len - p_this);
274	}
275	} else {
276	while (p_this <= last_possible) {
277	// Note: would be more efficient if we remembered the match positions
278	if (replace_count != count && // limit replacements (if count != -1)
279	memcmp(p_this, old_data, old_len) == 0) { // equal
280	memcpy(p_result, new_data, new_len); // Copy from new_str
281	replace_count++;
282	p_result += new_len;
283	p_this += old_len;
284	} else { // copy 1 byte
285	p_result = p_this;
286	p_result++;
287	p_this++;
288	}
289	}
290	memcpy(p_result, p_this, data_ + this_len - p_this); // last part of string
291	}
292
293	return result;
294	}
295
296	enum class StripWhere {
297	Left,
298	Right,
299	Both,
300	};
301
302	const int kWhitespace = -1;
303
304	bool OmitChar(uint8_t ch, int what) {
305	if (what == kWhitespace) {
306	return isspace(ch);
307	} else {
308	return what == ch;
309	}
310	}
311
312	// StripAny is modeled after CPython's do_strip() in stringobject.c, and can
313	// implement 6 functions:
314	//
315	// strip / lstrip / rstrip
316	// strip(char) / lstrip(char) / rstrip(char)
317	//
318	// Args:
319	// where: which ends to strip from
320	// what: kWhitespace, or an ASCII code 0-255
321
322	BigStr* StripAny(BigStr* s, StripWhere where, int what) {
323	int length = len(s);
324	const char* char_data = s->data();
325
326	int i = 0;
327	if (where != StripWhere::Right) {
328	while (i < length && OmitChar(char_data[i], what)) {
329	i++;
330	}
331	}
332
333	int j = length;
334	if (where != StripWhere::Left) {
335	do {
336	j--;
337	} while (j >= i && OmitChar(char_data[j], what));
338	j++;
339	}
340
341	if (i == j) { // Optimization to reuse existing object
342	return kEmptyString;
343	}
344
345	if (i == 0 && j == length) { // nothing stripped
346	return s;
347	}
348
349	// Note: makes a copy in leaky version, and will in GC version too
350	int new_len = j - i;
351	BigStr* result = NewStr(new_len);
352	memcpy(result->data(), s->data() + i, new_len);
353	return result;
354	}
355
356	BigStr* BigStr::strip() {
357	return StripAny(this, StripWhere::Both, kWhitespace);
358	}
359
360	// Used for CommandSub in osh/cmd_exec.py
361	BigStr* BigStr::rstrip(BigStr* chars) {
362	DCHECK(len(chars) == 1);
363	int c = chars->data_[0];
364	return StripAny(this, StripWhere::Right, c);
365	}
366
367	BigStr* BigStr::rstrip() {
368	return StripAny(this, StripWhere::Right, kWhitespace);
369	}
370
371	BigStr* BigStr::lstrip(BigStr* chars) {
372	DCHECK(len(chars) == 1);
373	int c = chars->data_[0];
374	return StripAny(this, StripWhere::Left, c);
375	}
376
377	BigStr* BigStr::lstrip() {
378	return StripAny(this, StripWhere::Left, kWhitespace);
379	}
380
381	BigStr* BigStr::join(List<BigStr> items) {
382	int length = 0;
383
384	int num_parts = len(items);
385
386	// " ".join([]) == ""
387	if (num_parts == 0) {
388	return kEmptyString;
389	}
390
391	// Common case
392	// 'anything'.join(["foo"]) == "foo"
393	if (num_parts == 1) {
394	return items->at(0);
395	}
396
397	for (int i = 0; i < num_parts; ++i) {
398	length += len(items->at(i));
399	}
400	// add length of all the separators
401	int this_len = len(this);
402	length += this_len * (num_parts - 1);
403
404	BigStr* result = NewStr(length);
405	char* p_result = result->data_; // advances through
406
407	for (int i = 0; i < num_parts; ++i) {
408	// log("i %d", i);
409	if (i != 0 && this_len) { // optimize common case of ''.join()
410	memcpy(p_result, data_, this_len); // copy the separator
411	p_result += this_len;
412	// log("this_len %d", this_len);
413	}
414
415	int n = len(items->at(i));
416	// log("n: %d", n);
417	memcpy(p_result, items->at(i)->data_, n); // copy the list item
418	p_result += n;
419	}
420
421	return result;
422	}
423
424	static void AppendPart(List<BigStr> result, BigStr* s, int left, int right) {
425	int new_len = right - left;
426	BigStr* part;
427	if (new_len == 0) {
428	part = kEmptyString;
429	} else {
430	part = NewStr(new_len);
431	memcpy(part->data_, s->data_ + left, new_len);
432	}
433	result->append(part);
434	}
435
436	// Split BigStr into List<BigStr*> of parts separated by 'sep'.
437	// The code structure is taken from CPython's Objects/stringlib/split.h.
438	List<BigStr> BigStr::split(BigStr* sep, int max_split) {
439	DCHECK(sep != nullptr);
440	DCHECK(len(sep) == 1); // we can only split one char
441	char sep_char = sep->data_[0];
442
443	int str_len = len(this);
444	if (str_len == 0) {
445	// weird case consistent with Python: ''.split(':') == ['']
446	return NewList<BigStr*>({kEmptyString});
447	}
448
449	List<BigStr> result = NewList<BigStr*>({});
450	int left = 0;
451	int right = 0;
452	int num_parts = 0; // 3 splits results in 4 parts
453
454	while (right < str_len && num_parts < max_split) {
455	// search for separator
456	for (; right < str_len; right++) {
457	if (data_[right] == sep_char) {
458	AppendPart(result, this, left, right);
459	right++;
460	left = right;
461	num_parts++;
462	break;
463	}
464	}
465	}
466	if (num_parts == 0) { // Optimization when there is no split
467	result->append(this);
468	} else if (left <= str_len) { // Last part
469	AppendPart(result, this, left, str_len);
470	}
471
472	return result;
473	}
474
475	List<BigStr> BigStr::split(BigStr* sep) {
476	return this->split(sep, len(this));
477	}
478
479	unsigned BigStr::hash(HashFunc h) {
480	if (!is_hashed_) {
481	hash_ = h(data_, len(this)) >> 1;
482	is_hashed_ = 1;
483	}
484	return hash_;
485	}
486
487	static inline BigStr* _StrFormat(const char* fmt, int fmt_len, va_list args) {
488	auto beg = std::cregex_iterator(fmt, fmt + fmt_len, gStrFmtRegex);
489	auto end = std::cregex_iterator();
490
491	char int_buf[kMaxFmtWidth];
492	std::string buf;
493	for (std::cregex_iterator it = beg; it != end; ++it) {
494	const std::cmatch& match = *it;
495
496	const std::csub_match& lit_m = match[1];
497	DCHECK(lit_m.matched);
498	const std::string& lit_s = lit_m.str();
499	buf.append(lit_s);
500
501	int width = 0;
502	bool zero_pad = false;
503	bool pad_back = false;
504	const std::csub_match& width_m = match[2];
505	const std::string& width_s = width_m.str();
506	bool ok = false;
507	if (width_m.matched && !width_s.empty()) {
508	if (width_s[0] == '0') {
509	zero_pad = true;
510	DCHECK(width_s.size() > 1);
511	ok = StringToInt(width_s.c_str() + 1, width_s.size() - 1, 10, &width);
512	DCHECK(ok);
513	(void)ok; // silence unused var warning in opt
514	} else {
515	ok = StringToInt(width_s.c_str(), width_s.size(), 10, &width);
516	DCHECK(ok);
517	}
518	if (width < 0) {
519	pad_back = true;
520	width *= -1;
521	}
522	DCHECK(0 <= width && width < kMaxFmtWidth);
523	}
524
525	char const* str_to_add = nullptr;
526	int add_len = 0;
527	const std::csub_match& code_m = match[3];
528	const std::string& code_s = code_m.str();
529	if (!code_m.matched) {
530	DCHECK(!width_m.matched); // python errors on invalid format operators
531	break;
532	}
533	DCHECK(code_s.size() == 1);
534	switch (code_s[0]) {
535	case '%': {
536	str_to_add = code_s.c_str();
537	add_len = 1;
538	break;
539	}
540	case 's': {
541	BigStr* s = va_arg(args, BigStr*);
542	// Check type unconditionally because mycpp doesn't always check it
543	CHECK(ObjHeader::FromObject(s)->type_tag == TypeTag::BigStr);
544
545	str_to_add = s->data();
546	add_len = len(s);
547	zero_pad = false; // python ignores the 0 directive for strings
548	break;
549	}
550	case 'r': {
551	BigStr* s = va_arg(args, BigStr*);
552	// Check type unconditionally because mycpp doesn't always check it
553	CHECK(ObjHeader::FromObject(s)->type_tag == TypeTag::BigStr);
554
555	s = repr(s);
556	str_to_add = s->data();
557	add_len = len(s);
558	zero_pad = false; // python ignores the 0 directive for strings
559	break;
560	}
561	case 'd': // fallthrough
562	case 'o': {
563	int d = va_arg(args, int);
564	add_len = snprintf(int_buf, kMaxFmtWidth,
565	match.str().c_str() + lit_s.size(), d);
566	DCHECK(add_len > 0);
567	str_to_add = int_buf;
568	break;
569	}
570	default:
571	DCHECK(0);
572	break;
573	}
574	DCHECK(str_to_add != nullptr);
575
576	if (pad_back) {
577	buf.append(str_to_add, add_len);
578	}
579	if (add_len < width) {
580	for (int i = 0; i < width - add_len; ++i) {
581	buf.push_back(zero_pad ? '0' : ' ');
582	}
583	}
584	if (!pad_back) {
585	buf.append(str_to_add, add_len);
586	}
587	}
588
589	return StrFromC(buf.c_str(), buf.size());
590	}
591
592	BigStr* StrIter::Value() { // similar to at()
593	BigStr* result = NewStr(1);
594	result->data_[0] = s_->data_[i_];
595	DCHECK(result->data_[1] == '\0');
596	return result;
597	}
598
599	BigStr* StrFormat(const char* fmt, ...) {
600	va_list args;
601	va_start(args, fmt);
602	BigStr* ret = _StrFormat(fmt, strlen(fmt), args);
603	va_end(args);
604	return ret;
605	}
606
607	BigStr* StrFormat(BigStr* fmt, ...) {
608	va_list args;
609	va_start(args, fmt);
610	BigStr* ret = _StrFormat(fmt->data(), len(fmt), args);
611	va_end(args);
612	return ret;
613	}