mycpp

Coverage Report

Created: 2024-06-09 06:28

/home/uke/oil/mycpp/gc_str.h
Line
Count
Source (jump to first uncovered line)
1
#ifndef MYCPP_GC_STR_H
2
#define MYCPP_GC_STR_H
3
4
#include "mycpp/common.h"  // DISALLOW_COPY_AND_ASSIGN
5
#include "mycpp/gc_obj.h"  // GC_OBJ
6
#include "mycpp/hash.h"    // HashFunc
7
8
template <typename T>
9
class List;
10
11
// String object whose visible layout is an int length, a 31-bit hash_
// bitfield with a 1-bit is_hashed_ flag, and the character bytes stored
// inline in data_.  Most member functions are only declared in this header.
// NOTE(review): the method names mirror Python's str API; their exact
// semantics live in the definitions, which are not shown here -- confirm
// there before relying on edge-case behavior.
class BigStr {
12
 public:
13
  // Don't call this directly.  Call NewStr() instead, which calls this.
14
1.94k
  // The body is empty, so no field is initialized by this constructor.
  BigStr() {
15
1.94k
  }
16
17
101
  // Returns a mutable pointer to the first byte of the inline buffer.
  char* data() {
18
101
    return data_;
19
101
  }
20
21
  // Call this after writing into buffer created by OverAllocatedStr()
22
  // (Defined inline below the class: it stores str_len in len_ and writes
  // the NUL terminator.)
  void MaybeShrink(int str_len);
23
24
  BigStr* at(int i);
25
26
  int find(BigStr* needle, int start = 0, int end = -1);
27
  int rfind(BigStr* needle);
28
29
  BigStr* slice(int begin);
30
  BigStr* slice(int begin, int end);
31
32
  BigStr* strip();
33
  // Used for CommandSub in osh/cmd_exec.py
34
  BigStr* rstrip(BigStr* chars);
35
  BigStr* rstrip();
36
37
  BigStr* lstrip(BigStr* chars);
38
  BigStr* lstrip();
39
40
  BigStr* ljust(int width, BigStr* fillchar);
41
  BigStr* rjust(int width, BigStr* fillchar);
42
43
  // Can take (start, end) so Tokens can be compared without allocation
  // NOTE(review): the two overloads declared here take only a BigStr*; no
  // (start, end) form is visible in this header -- confirm whether the
  // remark above is stale.
44
  bool startswith(BigStr* s);
45
  bool endswith(BigStr* s);
46
47
  BigStr* replace(BigStr* old, BigStr* new_str);
48
  BigStr* replace(BigStr* old, BigStr* new_str, int count);
49
  BigStr* join(List<BigStr*>* items);
50
51
  List<BigStr*>* split(BigStr* sep);
52
  List<BigStr*>* split(BigStr* sep, int max_split);
53
  List<BigStr*>* splitlines(bool keep);
54
55
  // TODO: Move unicode functions out of mycpp runtime?  Because we won't match
56
  // Python exactly
57
  bool isdigit();
58
  bool isalpha();
59
  bool isupper();
60
61
  BigStr* upper();
62
  BigStr* lower();
63
64
  // Other options for fast comparison / hashing / string interning:
65
  // - unique_id_: an index into intern table.  I don't think this works unless
66
  //   you want to deal with rehashing all strings when the set grows.
67
  //   - although note that the JVM has -XX:StringTableSize=FIXED, which means
68
  //     it can degrade into linked list performance
69
  // - Hashed strings become GLOBAL_STR().  Never deallocated.
70
  // - Hashed strings become part of the "large object space", which might be
71
  //   managed by mark and sweep.  This requires linked list overhead.
72
  //   (doubly-linked?)
73
  // - Intern strings at GARBAGE COLLECTION TIME, with
74
  //   LayoutForwarded::new_location_?  Is this possible?  Does it introduce
75
  //   too much coupling between strings, hash tables, and GC?
76
77
1.94k
  // Object header for this type; simply forwards to ObjHeader::BigStr().
  static constexpr ObjHeader obj_header() {
78
1.94k
    return ObjHeader::BigStr();
79
1.94k
  }
80
81
  // NOTE(review): presumably computes the hash with h and records it in
  // hash_ / is_hashed_ -- confirm in the definition.
  unsigned hash(HashFunc h);
82
83
  // Length in bytes, not counting the NUL that MaybeShrink() writes at
  // data_[len_].
  int len_;
84
  // 31 + 1 bits of hash state, declared as adjacent unsigned bitfields.
  unsigned hash_ : 31;
85
  unsigned is_hashed_ : 1;
86
  // Declared with one element; kStrHeaderSize below is offsetof(BigStr, data_).
  char data_[1];  // flexible array
87
88
 private:
89
  // NOTE(review): presumably shared by the strip()/lstrip()/rstrip() family;
  // definitions are not shown here.
  int _strip_left_pos();
90
  int _strip_right_pos();
91
92
  DISALLOW_COPY_AND_ASSIGN(BigStr)
93
};
94
95
// Byte offset of data_ inside BigStr, i.e. the size of the fields that
// precede the inline character data.
constexpr int kStrHeaderSize = offsetof(BigStr, data_);
96
97
// Note: for SmallStr, we might copy into the VALUE
98
32
// Records str_len as the string's length and writes a NUL byte at
// data_[str_len].  No bounds check is performed: the caller must have a
// buffer with room for str_len bytes plus the terminator.
// NOTE(review): despite the name, nothing in this body releases or resizes
// memory -- confirm whether shrinking happens elsewhere or is still TODO.
inline void BigStr::MaybeShrink(int str_len) {
99
32
  len_ = str_len;
100
32
  data_[len_] = '\0';  // NUL terminate
101
32
}
102
103
3.75k
// Returns the len_ field of s.  s is dereferenced unconditionally, so it
// must not be null.
inline int len(const BigStr* s) {
104
3.75k
  return s->len_;
105
3.75k
}
106
107
BigStr* StrFormat(const char* fmt, ...);
108
BigStr* StrFormat(BigStr* fmt, ...);
109
110
// NOTE: This iterates over bytes.
111
// Index-based cursor over a BigStr.  Usage shape visible here: construct,
// then loop while !Done(), calling Value() and Next().
class StrIter {
112
 public:
113
1
  // Starts at index 0 and caches len(s) once; the length is not re-read
  // during iteration.
  explicit StrIter(BigStr* s) : s_(s), i_(0), len_(len(s)) {
114
    // Cheney only: s_ could be moved during iteration.
115
    // gHeap.PushRoot(reinterpret_cast<RawObject**>(&s_));
116
1
  }
117
1
  // Currently does nothing; the matching PopRoot() is commented out along
  // with the PushRoot() in the constructor.
  ~StrIter() {
118
    // gHeap.PopRoot();
119
1
  }
120
4
  // Advances the index by one; does not check Done().
  void Next() {
121
4
    i_++;
122
4
  }
123
5
  // True once the index has reached (or passed) the cached length.
  bool Done() {
124
5
    return i_ >= len_;
125
5
  }
126
  // Declared only; defined elsewhere.
  BigStr* Value();  // similar to at()
127
128
 private:
129
  BigStr* s_;  // string being iterated
130
  int i_;      // current index
131
  int len_;    // len(s) captured at construction
132
133
  DISALLOW_COPY_AND_ASSIGN(StrIter)
134
};
135
136
extern BigStr* kEmptyString;
137
138
// GlobalStr notes:
139
// - sizeof("foo") == 4, for the NUL terminator.
140
// - gc_heap_test.cc has a static_assert that GlobalStr matches BigStr.  We
141
// don't put it here because it triggers -Winvalid-offsetof
142
143
// N is the size of the character array including the NUL terminator, so the
// string length is N-1 (see the GLOBAL_STR macros, which pass sizeof(val)).
template <int N>
144
class GlobalStr {
145
  // A template type with the same layout as BigStr with length N-1 (which needs
146
  // a buffer of size N).  For initializing global constant instances.
147
 public:
148
  // The first three fields repeat BigStr's declarations in the same order.
  int len_;
149
  unsigned hash_ : 31;
150
  unsigned is_hashed_ : 1;
151
  // Unlike BigStr::data_, this array is const and sized exactly N.
  const char data_[N];
152
153
  DISALLOW_COPY_AND_ASSIGN(GlobalStr)
154
};
155
156
// Union of a raw 64-bit word and a BigStr pointer.  Every method visible
// here forwards to the BigStr behind big_ and wraps any BigStr* result back
// into a Str.
union Str {
157
 public:
158
  // Instead of this at the start of every function:
159
  //   Str* s = nullptr;
160
  // It will now be:
161
  //   Str s(nullptr);
162
  //
163
  //   StackRoot _root(&s);
164
6
  // Stores the pointer in big_; raw_bytes_ is not written separately.
  explicit Str(BigStr* big) : big_(big) {
165
6
  }
166
167
5
  // Forwards to BigStr::data(); big_ is dereferenced without a null check.
  char* data() {
168
5
    return big_->data();
169
5
  }
170
171
5
  // Forwards to BigStr::at(i) and wraps the result.
  Str at(int i) {
172
5
    return Str(big_->at(i));
173
5
  }
174
175
0
  // Forwards to BigStr::upper() and wraps the result.
  Str upper() {
176
0
    return Str(big_->upper());
177
0
  }
178
179
  // NOTE(review): raw_bytes_ is not used by any code visible here;
  // presumably reserved for the SmallStr representation in the TODO below.
  uint64_t raw_bytes_;
180
  BigStr* big_;
181
  // TODO: add SmallStr, see mycpp/small_str_test.cc
182
};
183
184
6
// Length of the BigStr that s points to: forwards s.big_ to
// len(const BigStr*).  s is taken by value.
inline int len(const Str s) {
185
6
  return len(s.big_);
186
6
}
187
188
// This macro is a workaround for the fact that it's impossible to have a
189
// constexpr initializer for char[N].  The "String Literals as Non-Type
190
// Template Parameters" feature of C++ 20 would have done it, but it's not
191
// there.
192
//
193
// https://old.reddit.com/r/cpp_questions/comments/j0khh6/how_to_constexpr_initialize_class_member_thats/
194
// https://stackoverflow.com/questions/10422487/how-can-i-initialize-char-arrays-in-a-constructor
195
//
196
// TODO: Can we hash values at compile time so they can be in the intern table?
197
198
// GLOBAL_STR(name, val) expands to two definitions:
//   _<name> : a GcGlobal<GlobalStr<sizeof(val)>> whose header is
//             ObjHeader::Global(TypeTag::BigStr), with len_ = sizeof(val) - 1,
//             hash_ = 0, is_hashed_ = 0 and data_ = val
//   <name>  : a BigStr* obtained by reinterpret_cast from &_<name>.obj
// sizeof(val) is used for both the buffer size and the length, so val is
// expected to be a string literal (a pointer would give the wrong size).
#define GLOBAL_STR(name, val)                                                \
199
  GcGlobal<GlobalStr<sizeof(val)>> _##name = {                               \
200
      ObjHeader::Global(TypeTag::BigStr),                                    \
201
      {.len_ = sizeof(val) - 1, .hash_ = 0, .is_hashed_ = 0, .data_ = val}}; \
202
  BigStr* name = reinterpret_cast<BigStr*>(&_##name.obj);
203
204
// New style for SmallStr compatibility
205
// GLOBAL_STR2(name, val) defines the same _<name> global object as
// GLOBAL_STR (same header, len_ = sizeof(val) - 1, unhashed, data_ = val),
// but <name> is a Str constructed from the reinterpret_cast'ed pointer
// instead of a bare BigStr*.
#define GLOBAL_STR2(name, val)                                               \
206
  GcGlobal<GlobalStr<sizeof(val)>> _##name = {                               \
207
      ObjHeader::Global(TypeTag::BigStr),                                    \
208
      {.len_ = sizeof(val) - 1, .hash_ = 0, .is_hashed_ = 0, .data_ = val}}; \
209
  Str name(reinterpret_cast<BigStr*>(&_##name.obj));
210
211
// Helper function that's consistent with JSON definition of ASCII whitespace,
212
// e.g.
213
// {"age": \t 42} is OK
214
// {"age": \v 42} is NOT OK
215
70
// Returns true only for space, tab, carriage return and newline.  Every
// other value -- including '\v' and '\f' -- yields false.
inline bool IsAsciiWhitespace(int ch) {
216
70
  return ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n';
217
70
}
218
219
#endif  // MYCPP_GC_STR_H