1 | #ifndef MYCPP_GC_STR_H
|
2 | #define MYCPP_GC_STR_H
|
3 |
|
4 | #include "mycpp/common.h" // DISALLOW_COPY_AND_ASSIGN
|
5 | #include "mycpp/gc_obj.h" // GC_OBJ
|
6 | #include "mycpp/hash.h" // HashFunc
|
7 |
|
8 | template <typename T>
|
9 | class List;
|
10 |
|
11 | class BigStr {
|
12 | public:
|
13 | // Don't call this directly. Call NewStr() instead, which calls this.
|
14 | BigStr() {
|
15 | }
|
16 |
|
17 | char* data() {
|
18 | return data_;
|
19 | }
|
20 |
|
21 | // Call this after writing into buffer created by OverAllocatedStr()
|
22 | void MaybeShrink(int str_len);
|
23 |
|
24 | BigStr* at(int i);
|
25 |
|
26 | // TODO: take (start, end) for equality without allocation (not pos)
|
27 | int find(BigStr* needle, int pos = 0);
|
28 | int rfind(BigStr* needle);
|
29 |
|
30 | BigStr* slice(int begin);
|
31 | BigStr* slice(int begin, int end);
|
32 |
|
33 | BigStr* strip();
|
34 | // Used for CommandSub in osh/cmd_exec.py
|
35 | BigStr* rstrip(BigStr* chars);
|
36 | BigStr* rstrip();
|
37 |
|
38 | BigStr* lstrip(BigStr* chars);
|
39 | BigStr* lstrip();
|
40 |
|
41 | BigStr* ljust(int width, BigStr* fillchar);
|
42 | BigStr* rjust(int width, BigStr* fillchar);
|
43 |
|
44 | // Can take (start, end) so Tokens can be compared without allocation
|
45 | bool startswith(BigStr* s);
|
46 | bool endswith(BigStr* s);
|
47 |
|
48 | BigStr* replace(BigStr* old, BigStr* new_str);
|
49 | BigStr* replace(BigStr* old, BigStr* new_str, int count);
|
50 | BigStr* join(List<BigStr*>* items);
|
51 |
|
52 | List<BigStr*>* split(BigStr* sep);
|
53 | List<BigStr*>* split(BigStr* sep, int max_split);
|
54 | List<BigStr*>* splitlines(bool keep);
|
55 |
|
56 | // TODO: Move unicode functions out of mycpp runtime? Because we won't match
|
57 | // Python exactly
|
58 | bool isdigit();
|
59 | bool isalpha();
|
60 | bool isupper();
|
61 |
|
62 | BigStr* upper();
|
63 | BigStr* lower();
|
64 |
|
65 | // Other options for fast comparison / hashing / string interning:
|
66 | // - unique_id_: an index into intern table. I don't think this works unless
|
67 | // you want to deal with rehashing all strings when the set grows.
|
68 | // - although note that the JVM has -XX:StringTableSize=FIXED, which means
|
69 | // - it can degrade into linked list performance
|
70 | // - Hashed strings become GLOBAL_STR(). Never deallocated.
|
71 | // - Hashed strings become part of the "large object space", which might be
|
72 | // managed by mark and sweep. This requires linked list overhead.
|
73 | // (doubly-linked?)
|
74 | // - Intern strings at GARBAGE COLLECTION TIME, with
|
75 | // LayoutForwarded::new_location_? Is this possible? Does it introduce
|
76 | // too much coupling between strings, hash tables, and GC?
|
77 |
|
78 | static constexpr ObjHeader obj_header() {
|
79 | return ObjHeader::BigStr();
|
80 | }
|
81 |
|
82 | unsigned hash(HashFunc h);
|
83 |
|
84 | int len_;
|
85 | unsigned hash_ : 31;
|
86 | unsigned is_hashed_ : 1;
|
87 | char data_[1]; // flexible array
|
88 |
|
89 | private:
|
90 | int _strip_left_pos();
|
91 | int _strip_right_pos();
|
92 |
|
93 | DISALLOW_COPY_AND_ASSIGN(BigStr)
|
94 | };
|
95 |
|
96 | constexpr int kStrHeaderSize = offsetof(BigStr, data_);
|
97 |
|
98 | // Note: for SmallStr, we might copy into the VALUE
|
99 | inline void BigStr::MaybeShrink(int str_len) {
|
100 | len_ = str_len;
|
101 | data_[len_] = '\0'; // NUL terminate
|
102 | }
|
103 |
|
104 | inline int len(const BigStr* s) {
|
105 | return s->len_;
|
106 | }
|
107 |
|
108 | BigStr* StrFormat(const char* fmt, ...);
|
109 | BigStr* StrFormat(BigStr* fmt, ...);
|
110 |
|
111 | // NOTE: This iterates over bytes.
|
112 | class StrIter {
|
113 | public:
|
114 | explicit StrIter(BigStr* s) : s_(s), i_(0), len_(len(s)) {
|
115 | // Cheney only: s_ could be moved during iteration.
|
116 | // gHeap.PushRoot(reinterpret_cast<RawObject**>(&s_));
|
117 | }
|
118 | ~StrIter() {
|
119 | // gHeap.PopRoot();
|
120 | }
|
121 | void Next() {
|
122 | i_++;
|
123 | }
|
124 | bool Done() {
|
125 | return i_ >= len_;
|
126 | }
|
127 | BigStr* Value(); // similar to at()
|
128 |
|
129 | private:
|
130 | BigStr* s_;
|
131 | int i_;
|
132 | int len_;
|
133 |
|
134 | DISALLOW_COPY_AND_ASSIGN(StrIter)
|
135 | };
|
136 |
|
137 | extern BigStr* kEmptyString;
|
138 |
|
139 | // GlobalStr notes:
|
140 | // - sizeof("foo") == 4, for the NUL terminator.
|
141 | // - gc_heap_test.cc has a static_assert that GlobalStr matches BigStr. We
|
142 | // don't put it here because it triggers -Winvalid-offsetof
|
143 |
|
144 | template <int N>
|
145 | class GlobalStr {
|
146 | // A template type with the same layout as BigStr with length N-1 (which needs
|
147 | // a buffer of size N). For initializing global constant instances.
|
148 | public:
|
149 | int len_;
|
150 | unsigned hash_ : 31;
|
151 | unsigned is_hashed_ : 1;
|
152 | const char data_[N];
|
153 |
|
154 | DISALLOW_COPY_AND_ASSIGN(GlobalStr)
|
155 | };
|
156 |
|
157 | union Str {
|
158 | public:
|
159 | // Instead of this at the start of every function:
|
160 | // Str* s = nullptr;
|
161 | // It will now be:
|
162 | // Str s(nullptr);
|
163 | //
|
164 | // StackRoot _root(&s);
|
165 | explicit Str(BigStr* big) : big_(big) {
|
166 | }
|
167 |
|
168 | char* data() {
|
169 | return big_->data();
|
170 | }
|
171 |
|
172 | Str at(int i) {
|
173 | return Str(big_->at(i));
|
174 | }
|
175 |
|
176 | Str upper() {
|
177 | return Str(big_->upper());
|
178 | }
|
179 |
|
180 | uint64_t raw_bytes_;
|
181 | BigStr* big_;
|
182 | // TODO: add SmallStr, see mycpp/small_str_test.cc
|
183 | };
|
184 |
|
185 | inline int len(const Str s) {
|
186 | return len(s.big_);
|
187 | }
|
188 |
|
189 | // This macro is a workaround for the fact that it's impossible to have a
|
190 | // a constexpr initializer for char[N]. The "String Literals as Non-Type
|
191 | // Template Parameters" feature of C++ 20 would have done it, but it's not
|
192 | // there.
|
193 | //
|
194 | // https://old.reddit.com/r/cpp_questions/comments/j0khh6/how_to_constexpr_initialize_class_member_thats/
|
195 | // https://stackoverflow.com/questions/10422487/how-can-i-initialize-char-arrays-in-a-constructor
|
196 | //
|
197 | // TODO: Can we hash values at compile time so they can be in the intern table?
|
198 |
|
199 | #define GLOBAL_STR(name, val) \
|
200 | GcGlobal<GlobalStr<sizeof(val)>> _##name = { \
|
201 | ObjHeader::Global(TypeTag::BigStr), \
|
202 | {.len_ = sizeof(val) - 1, .hash_ = 0, .is_hashed_ = 0, .data_ = val}}; \
|
203 | BigStr* name = reinterpret_cast<BigStr*>(&_##name.obj);
|
204 |
|
205 | // New style for SmallStr compatibility
|
206 | #define GLOBAL_STR2(name, val) \
|
207 | GcGlobal<GlobalStr<sizeof(val)>> _##name = { \
|
208 | ObjHeader::Global(TypeTag::BigStr), \
|
209 | {.len_ = sizeof(val) - 1, .hash_ = 0, .is_hashed_ = 0, .data_ = val}}; \
|
210 | Str name(reinterpret_cast<BigStr*>(&_##name.obj));
|
211 |
|
212 | #endif // MYCPP_GC_STR_H
|