OILS / mycpp / gc_str.h View on Github | oilshell.org

212 lines, 108 significant
1#ifndef MYCPP_GC_STR_H
2#define MYCPP_GC_STR_H
3
4#include "mycpp/common.h" // DISALLOW_COPY_AND_ASSIGN
5#include "mycpp/gc_obj.h" // GC_OBJ
6#include "mycpp/hash.h" // HashFunc
7
8template <typename T>
9class List;
10
11class BigStr {
12 public:
13 // Don't call this directly. Call NewStr() instead, which calls this.
14 BigStr() {
15 }
16
17 char* data() {
18 return data_;
19 }
20
21 // Call this after writing into buffer created by OverAllocatedStr()
22 void MaybeShrink(int str_len);
23
24 BigStr* at(int i);
25
26 // TODO: take (start, end) for equality without allocation (not pos)
27 int find(BigStr* needle, int pos = 0);
28 int rfind(BigStr* needle);
29
30 BigStr* slice(int begin);
31 BigStr* slice(int begin, int end);
32
33 BigStr* strip();
34 // Used for CommandSub in osh/cmd_exec.py
35 BigStr* rstrip(BigStr* chars);
36 BigStr* rstrip();
37
38 BigStr* lstrip(BigStr* chars);
39 BigStr* lstrip();
40
41 BigStr* ljust(int width, BigStr* fillchar);
42 BigStr* rjust(int width, BigStr* fillchar);
43
44 // Can take (start, end) so Tokens can be compared without allocation
45 bool startswith(BigStr* s);
46 bool endswith(BigStr* s);
47
48 BigStr* replace(BigStr* old, BigStr* new_str);
49 BigStr* replace(BigStr* old, BigStr* new_str, int count);
50 BigStr* join(List<BigStr*>* items);
51
52 List<BigStr*>* split(BigStr* sep);
53 List<BigStr*>* split(BigStr* sep, int max_split);
54 List<BigStr*>* splitlines(bool keep);
55
56 // TODO: Move unicode functions out of mycpp runtime? Because we won't match
57 // Python exactly
58 bool isdigit();
59 bool isalpha();
60 bool isupper();
61
62 BigStr* upper();
63 BigStr* lower();
64
65 // Other options for fast comparison / hashing / string interning:
66 // - unique_id_: an index into intern table. I don't think this works unless
67 // you want to deal with rehashing all strings when the set grows.
68 // - although note that the JVM has -XX:StringTableSize=FIXED, which means
69 // - it can degrade into linked list performance
70 // - Hashed strings become GLOBAL_STR(). Never deallocated.
71 // - Hashed strings become part of the "large object space", which might be
72 // managed by mark and sweep. This requires linked list overhead.
73 // (doubly-linked?)
74 // - Intern strings at GARBAGE COLLECTION TIME, with
75 // LayoutForwarded::new_location_? Is this possible? Does it introduce
76 // too much coupling between strings, hash tables, and GC?
77
78 static constexpr ObjHeader obj_header() {
79 return ObjHeader::BigStr();
80 }
81
82 unsigned hash(HashFunc h);
83
84 int len_;
85 unsigned hash_ : 31;
86 unsigned is_hashed_ : 1;
87 char data_[1]; // flexible array
88
89 private:
90 int _strip_left_pos();
91 int _strip_right_pos();
92
93 DISALLOW_COPY_AND_ASSIGN(BigStr)
94};
95
96constexpr int kStrHeaderSize = offsetof(BigStr, data_);
97
98// Note: for SmallStr, we might copy into the VALUE
99inline void BigStr::MaybeShrink(int str_len) {
100 len_ = str_len;
101 data_[len_] = '\0'; // NUL terminate
102}
103
104inline int len(const BigStr* s) {
105 return s->len_;
106}
107
108BigStr* StrFormat(const char* fmt, ...);
109BigStr* StrFormat(BigStr* fmt, ...);
110
111// NOTE: This iterates over bytes.
112class StrIter {
113 public:
114 explicit StrIter(BigStr* s) : s_(s), i_(0), len_(len(s)) {
115 // Cheney only: s_ could be moved during iteration.
116 // gHeap.PushRoot(reinterpret_cast<RawObject**>(&s_));
117 }
118 ~StrIter() {
119 // gHeap.PopRoot();
120 }
121 void Next() {
122 i_++;
123 }
124 bool Done() {
125 return i_ >= len_;
126 }
127 BigStr* Value(); // similar to at()
128
129 private:
130 BigStr* s_;
131 int i_;
132 int len_;
133
134 DISALLOW_COPY_AND_ASSIGN(StrIter)
135};
136
137extern BigStr* kEmptyString;
138
139// GlobalStr notes:
140// - sizeof("foo") == 4, for the NUL terminator.
141// - gc_heap_test.cc has a static_assert that GlobalStr matches BigStr. We
142// don't put it here because it triggers -Winvalid-offsetof
143
144template <int N>
145class GlobalStr {
146 // A template type with the same layout as BigStr with length N-1 (which needs
147 // a buffer of size N). For initializing global constant instances.
148 public:
149 int len_;
150 unsigned hash_ : 31;
151 unsigned is_hashed_ : 1;
152 const char data_[N];
153
154 DISALLOW_COPY_AND_ASSIGN(GlobalStr)
155};
156
157union Str {
158 public:
159 // Instead of this at the start of every function:
160 // Str* s = nullptr;
161 // It will now be:
162 // Str s(nullptr);
163 //
164 // StackRoot _root(&s);
165 explicit Str(BigStr* big) : big_(big) {
166 }
167
168 char* data() {
169 return big_->data();
170 }
171
172 Str at(int i) {
173 return Str(big_->at(i));
174 }
175
176 Str upper() {
177 return Str(big_->upper());
178 }
179
180 uint64_t raw_bytes_;
181 BigStr* big_;
182 // TODO: add SmallStr, see mycpp/small_str_test.cc
183};
184
185inline int len(const Str s) {
186 return len(s.big_);
187}
188
189// This macro is a workaround for the fact that it's impossible to have a
190// a constexpr initializer for char[N]. The "String Literals as Non-Type
191// Template Parameters" feature of C++ 20 would have done it, but it's not
192// there.
193//
194// https://old.reddit.com/r/cpp_questions/comments/j0khh6/how_to_constexpr_initialize_class_member_thats/
195// https://stackoverflow.com/questions/10422487/how-can-i-initialize-char-arrays-in-a-constructor
196//
197// TODO: Can we hash values at compile time so they can be in the intern table?
198
199#define GLOBAL_STR(name, val) \
200 GcGlobal<GlobalStr<sizeof(val)>> _##name = { \
201 ObjHeader::Global(TypeTag::BigStr), \
202 {.len_ = sizeof(val) - 1, .hash_ = 0, .is_hashed_ = 0, .data_ = val}}; \
203 BigStr* name = reinterpret_cast<BigStr*>(&_##name.obj);
204
205// New style for SmallStr compatibility
206#define GLOBAL_STR2(name, val) \
207 GcGlobal<GlobalStr<sizeof(val)>> _##name = { \
208 ObjHeader::Global(TypeTag::BigStr), \
209 {.len_ = sizeof(val) - 1, .hash_ = 0, .is_hashed_ = 0, .data_ = val}}; \
210 Str name(reinterpret_cast<BigStr*>(&_##name.obj));
211
212#endif // MYCPP_GC_STR_H