/home/uke/oil/mycpp/gc_str.h
Line | Count | Source (jump to first uncovered line) |
1 | | #ifndef MYCPP_GC_STR_H |
2 | | #define MYCPP_GC_STR_H |
3 | | |
4 | | #include "mycpp/common.h" // DISALLOW_COPY_AND_ASSIGN |
5 | | #include "mycpp/gc_obj.h" // GC_OBJ |
6 | | #include "mycpp/hash.h" // HashFunc |
7 | | |
8 | | template <typename T> |
9 | | class List; |
10 | | |
// Heap string object: a fixed header (length plus cached-hash bits) followed
// directly by the character bytes.  Apart from the trivial inline accessors,
// every method is only declared here.  The method names follow Python's str
// API (see the TODO below about not matching Python exactly).
class BigStr {
 public:
  // Don't call this directly.  Call NewStr() instead, which calls this.
  // The constructor itself initializes none of the fields.
  BigStr() {
  }

  // Mutable pointer to the first byte of the character storage.
  char* data() {
    return data_;
  }

  // Call this after writing into buffer created by OverAllocatedStr()
  void MaybeShrink(int str_len);

  BigStr* at(int i);

  // NOTE(review): end = -1 presumably means "search to the end of the
  // string" -- confirm against the definition.
  int find(BigStr* needle, int start = 0, int end = -1);
  int rfind(BigStr* needle);

  BigStr* slice(int begin);
  BigStr* slice(int begin, int end);

  BigStr* strip();
  // Used for CommandSub in osh/cmd_exec.py
  BigStr* rstrip(BigStr* chars);
  BigStr* rstrip();

  BigStr* lstrip(BigStr* chars);
  BigStr* lstrip();

  BigStr* ljust(int width, BigStr* fillchar);
  BigStr* rjust(int width, BigStr* fillchar);

  // TODO(review): the note that used to sit here said these "can take
  // (start, end) so Tokens can be compared without allocation", but no such
  // overloads are declared in this class.  Confirm whether they are planned
  // or were removed.
  bool startswith(BigStr* s);
  bool endswith(BigStr* s);

  BigStr* replace(BigStr* old, BigStr* new_str);
  BigStr* replace(BigStr* old, BigStr* new_str, int count);
  BigStr* join(List<BigStr*>* items);

  List<BigStr*>* split(BigStr* sep);
  List<BigStr*>* split(BigStr* sep, int max_split);
  List<BigStr*>* splitlines(bool keep);

  // TODO: Move unicode functions out of mycpp runtime?  Because we won't match
  // Python exactly
  bool isdigit();
  bool isalpha();
  bool isupper();

  BigStr* upper();
  BigStr* lower();

  // Other options for fast comparison / hashing / string interning:
  // - unique_id_: an index into intern table.  I don't think this works unless
  //   you want to deal with rehashing all strings when the set grows.
  //   - although note that the JVM has -XX:StringTableSize=FIXED, which means
  //   - it can degrade into linked list performance
  // - Hashed strings become GLOBAL_STR().  Never deallocated.
  // - Hashed strings become part of the "large object space", which might be
  //   managed by mark and sweep.  This requires linked list overhead.
  //   (doubly-linked?)
  // - Intern strings at GARBAGE COLLECTION TIME, with
  //   LayoutForwarded::new_location_?  Is this possible?  Does it introduce
  //   too much coupling between strings, hash tables, and GC?

  // Object header value for this type; simply ObjHeader::BigStr().
  static constexpr ObjHeader obj_header() {
    return ObjHeader::BigStr();
  }

  unsigned hash(HashFunc h);

  // Data members.  GlobalStr<N> repeats these in the same order and with the
  // same bit-field widths, so keep the two definitions in sync.
  int len_;
  unsigned hash_ : 31;      // presumably the cached result of hash() -- confirm
  unsigned is_hashed_ : 1;  // presumably set once hash_ is valid -- confirm
  char data_[1];            // flexible array; indexed up to len_ (see MaybeShrink)

 private:
  int _strip_left_pos();
  int _strip_right_pos();

  DISALLOW_COPY_AND_ASSIGN(BigStr)
};
94 | | |
// Byte offset of data_ within BigStr, i.e. the size of the fields stored
// ahead of the character data.
constexpr int kStrHeaderSize = offsetof(BigStr, data_);
96 | | |
97 | | // Note: for SmallStr, we might copy into the VALUE |
98 | 32 | inline void BigStr::MaybeShrink(int str_len) { |
99 | 32 | len_ = str_len; |
100 | 32 | data_[len_] = '\0'; // NUL terminate |
101 | 32 | } |
102 | | |
103 | 3.75k | inline int len(const BigStr* s) { |
104 | 3.75k | return s->len_; |
105 | 3.75k | } |
106 | | |
107 | | BigStr* StrFormat(const char* fmt, ...); |
108 | | BigStr* StrFormat(BigStr* fmt, ...); |
109 | | |
110 | | // NOTE: This iterates over bytes. |
111 | | class StrIter { |
112 | | public: |
113 | 1 | explicit StrIter(BigStr* s) : s_(s), i_(0), len_(len(s)) { |
114 | | // Cheney only: s_ could be moved during iteration. |
115 | | // gHeap.PushRoot(reinterpret_cast<RawObject**>(&s_)); |
116 | 1 | } |
117 | 1 | ~StrIter() { |
118 | | // gHeap.PopRoot(); |
119 | 1 | } |
120 | 4 | void Next() { |
121 | 4 | i_++; |
122 | 4 | } |
123 | 5 | bool Done() { |
124 | 5 | return i_ >= len_; |
125 | 5 | } |
126 | | BigStr* Value(); // similar to at() |
127 | | |
128 | | private: |
129 | | BigStr* s_; |
130 | | int i_; |
131 | | int len_; |
132 | | |
133 | | DISALLOW_COPY_AND_ASSIGN(StrIter) |
134 | | }; |
135 | | |
136 | | extern BigStr* kEmptyString; |
137 | | |
138 | | // GlobalStr notes: |
139 | | // - sizeof("foo") == 4, for the NUL terminator. |
140 | | // - gc_heap_test.cc has a static_assert that GlobalStr matches BigStr. We |
141 | | // don't put it here because it triggers -Winvalid-offsetof |
142 | | |
template <int N>
class GlobalStr {
  // A template type with the same layout as BigStr with length N-1 (which needs
  // a buffer of size N).  For initializing global constant instances.
 public:
  // These fields repeat BigStr's data members in the same order and with the
  // same bit-field widths; any change to one must be made to the other.
  int len_;
  unsigned hash_ : 31;
  unsigned is_hashed_ : 1;
  const char data_[N];  // N bytes: the N-1 characters plus the NUL terminator

  DISALLOW_COPY_AND_ASSIGN(GlobalStr)
};
155 | | |
156 | | union Str { |
157 | | public: |
158 | | // Instead of this at the start of every function: |
159 | | // Str* s = nullptr; |
160 | | // It will now be: |
161 | | // Str s(nullptr); |
162 | | // |
163 | | // StackRoot _root(&s); |
164 | 6 | explicit Str(BigStr* big) : big_(big) { |
165 | 6 | } |
166 | | |
167 | 5 | char* data() { |
168 | 5 | return big_->data(); |
169 | 5 | } |
170 | | |
171 | 5 | Str at(int i) { |
172 | 5 | return Str(big_->at(i)); |
173 | 5 | } |
174 | | |
175 | 0 | Str upper() { |
176 | 0 | return Str(big_->upper()); |
177 | 0 | } |
178 | | |
179 | | uint64_t raw_bytes_; |
180 | | BigStr* big_; |
181 | | // TODO: add SmallStr, see mycpp/small_str_test.cc |
182 | | }; |
183 | | |
184 | 6 | inline int len(const Str s) { |
185 | 6 | return len(s.big_); |
186 | 6 | } |
187 | | |
// This macro is a workaround for the fact that it's impossible to have a
// constexpr initializer for char[N].  The "String Literals as Non-Type
// Template Parameters" feature of C++ 20 would have done it, but it's not
// there.
//
// https://old.reddit.com/r/cpp_questions/comments/j0khh6/how_to_constexpr_initialize_class_member_thats/
// https://stackoverflow.com/questions/10422487/how-can-i-initialize-char-arrays-in-a-constructor
//
// TODO: Can we hash values at compile time so they can be in the intern table?
//
// GLOBAL_STR(name, val) defines two variables:
//   _<name>  a GcGlobal<GlobalStr<sizeof(val)>> holding a Global ObjHeader
//            tagged TypeTag::BigStr plus the string fields: len_ leaves out
//            the NUL that sizeof counts, hash_ and is_hashed_ start at 0,
//            and data_ is initialized from val
//   <name>   a BigStr* pointing at _<name>.obj
// NOTE(review): presumably val must be a string literal, so that sizeof(val)
// is its length plus one -- confirm at the call sites.

#define GLOBAL_STR(name, val)                                                \
  GcGlobal<GlobalStr<sizeof(val)>> _##name = {                               \
      ObjHeader::Global(TypeTag::BigStr),                                    \
      {.len_ = sizeof(val) - 1, .hash_ = 0, .is_hashed_ = 0, .data_ = val}}; \
  BigStr* name = reinterpret_cast<BigStr*>(&_##name.obj);
203 | | |
// New style for SmallStr compatibility
//
// Defines the same _<name> storage object as GLOBAL_STR, but declares <name>
// as a Str constructed from the object's address rather than as a raw
// BigStr*.
#define GLOBAL_STR2(name, val)                                               \
  GcGlobal<GlobalStr<sizeof(val)>> _##name = {                               \
      ObjHeader::Global(TypeTag::BigStr),                                    \
      {.len_ = sizeof(val) - 1, .hash_ = 0, .is_hashed_ = 0, .data_ = val}}; \
  Str name(reinterpret_cast<BigStr*>(&_##name.obj));
210 | | |
// Helper function that's consistent with JSON definition of ASCII whitespace,
// e.g.
//   {"age": \t 42} is OK
//   {"age": \v 42} is NOT OK
//
// Accepts exactly four characters: space, tab, carriage return and newline.
inline bool IsAsciiWhitespace(int ch) {
  switch (ch) {
  case ' ':
  case '\t':
  case '\r':
  case '\n':
    return true;
  default:
    return false;
  }
}
218 | | |
219 | | #endif // MYCPP_GC_STR_H |