| 1 | #include "cpp/libc.h"
|
| 2 |
|
| 3 | #include <locale.h> // setlocale()
|
| 4 | #include <regex.h> // regcomp()
|
| 5 | #include <unistd.h> // gethostname()
|
| 6 | #include <wctype.h> // towupper()
|
| 7 |
|
| 8 | #include "mycpp/runtime.h"
|
| 9 | #include "vendor/greatest.h"
|
| 10 |
|
| 11 | TEST hostname_test() {
|
| 12 | BigStr* s0 = libc::gethostname();
|
| 13 | ASSERT(s0 != nullptr);
|
| 14 |
|
| 15 | char buf[1024];
|
| 16 | ASSERT(gethostname(buf, HOST_NAME_MAX) == 0);
|
| 17 | ASSERT(str_equals(s0, StrFromC(buf)));
|
| 18 |
|
| 19 | PASS();
|
| 20 | }
|
| 21 |
|
| 22 | TEST realpath_test() {
|
| 23 | BigStr* result = libc::realpath(StrFromC("/"));
|
| 24 | ASSERT(str_equals(StrFromC("/"), result));
|
| 25 |
|
| 26 | bool caught = false;
|
| 27 | try {
|
| 28 | libc::realpath(StrFromC("/nonexistent_ZZZ"));
|
| 29 | } catch (IOError_OSError* e) {
|
| 30 | caught = true;
|
| 31 | }
|
| 32 | ASSERT(caught);
|
| 33 |
|
| 34 | PASS();
|
| 35 | }
|
| 36 |
|
| 37 | TEST libc_test() {
|
| 38 | log("sizeof(wchar_t) = %d", sizeof(wchar_t));
|
| 39 |
|
| 40 | int width = 0;
|
| 41 |
|
| 42 | // TODO: enable this test. Is it not picking LC_CTYPE?
|
| 43 | // Do we have to do some initialization like libc.cpython_reset_locale() ?
|
| 44 | #if 0
|
| 45 | try {
|
| 46 | // mu character \u{03bc} in utf-8
|
| 47 | width = libc::wcswidth(StrFromC("\xce\xbc"));
|
| 48 | } catch (UnicodeError* e) {
|
| 49 | log("UnicodeError %s", e->message->data_);
|
| 50 | }
|
| 51 | ASSERT_EQ_FMT(2, width, "%d");
|
| 52 | #endif
|
| 53 |
|
| 54 | BigStr* h = libc::gethostname();
|
| 55 | log("gethostname() = %s %d", h->data_, len(h));
|
| 56 |
|
| 57 | width = libc::wcswidth(StrFromC("foo"));
|
| 58 | ASSERT_EQ(3, width);
|
| 59 |
|
| 60 | libc::print_time(0.1, 0.2, 0.3);
|
| 61 |
|
| 62 | PASS();
|
| 63 | }
|
| 64 |
|
| 65 | static List<BigStr*>* Groups(BigStr* s, List<int>* indices) {
|
| 66 | List<BigStr*>* groups = NewList<BigStr*>();
|
| 67 | int n = len(indices) / 2;
|
| 68 | for (int i = 0; i < n; ++i) {
|
| 69 | int start = indices->at(2 * i);
|
| 70 | int end = indices->at(2 * i + 1);
|
| 71 | if (start == -1) {
|
| 72 | groups->append(nullptr);
|
| 73 | } else {
|
| 74 | groups->append(s->slice(start, end));
|
| 75 | }
|
| 76 | }
|
| 77 | return groups;
|
| 78 | }
|
| 79 |
|
| 80 | TEST regex_wrapper_test() {
|
| 81 | BigStr* s1 = StrFromC("-abaacaaa");
|
| 82 | List<int>* indices = libc::regex_search(StrFromC("(a+).(a+)"), 0, s1, 0);
|
| 83 | List<BigStr*>* results = Groups(s1, indices);
|
| 84 | ASSERT_EQ_FMT(3, len(results), "%d");
|
| 85 | ASSERT(str_equals(StrFromC("abaa"), results->at(0))); // whole match
|
| 86 | ASSERT(str_equals(StrFromC("a"), results->at(1)));
|
| 87 | ASSERT(str_equals(StrFromC("aa"), results->at(2)));
|
| 88 |
|
| 89 | indices = libc::regex_search(StrFromC("z+"), 0, StrFromC("abaacaaa"), 0);
|
| 90 | ASSERT_EQ(nullptr, indices);
|
| 91 |
|
| 92 | // Alternation gives unmatched group
|
| 93 | BigStr* s2 = StrFromC("b");
|
| 94 | indices = libc::regex_search(StrFromC("(a)|(b)"), 0, s2, 0);
|
| 95 | results = Groups(s2, indices);
|
| 96 | ASSERT_EQ_FMT(3, len(results), "%d");
|
| 97 | ASSERT(str_equals(StrFromC("b"), results->at(0))); // whole match
|
| 98 | ASSERT_EQ(nullptr, results->at(1));
|
| 99 | ASSERT(str_equals(StrFromC("b"), results->at(2)));
|
| 100 |
|
| 101 | // Like Unicode test below
|
| 102 | indices = libc::regex_search(StrFromC("_._"), 0, StrFromC("_x_"), 0);
|
| 103 | ASSERT(indices != nullptr);
|
| 104 | ASSERT_EQ_FMT(2, len(indices), "%d");
|
| 105 | ASSERT_EQ_FMT(0, indices->at(0), "%d");
|
| 106 | ASSERT_EQ_FMT(3, indices->at(1), "%d");
|
| 107 |
|
| 108 | // TODO(unicode)
|
| 109 | #if 0
|
| 110 | //indices = libc::regex_search(StrFromC("_._"), 0, StrFromC("_\u03bc_"), 0);
|
| 111 | indices = libc::regex_search(StrFromC("_._"), 0, StrFromC("_μ_"), 0);
|
| 112 | ASSERT(indices != nullptr);
|
| 113 | ASSERT_EQ_FMT(2, len(indices), "%d");
|
| 114 | ASSERT_EQ_FMT(0, indices->at(0), "%d");
|
| 115 | ASSERT_EQ_FMT(0, indices->at(0), "%d");
|
| 116 | #endif
|
| 117 |
|
| 118 | Tuple2<int, int>* result;
|
| 119 | BigStr* s = StrFromC("oXooXoooXoX");
|
| 120 | result = libc::regex_first_group_match(StrFromC("(X.)"), s, 0);
|
| 121 | ASSERT_EQ_FMT(1, result->at0(), "%d");
|
| 122 | ASSERT_EQ_FMT(3, result->at1(), "%d");
|
| 123 |
|
| 124 | result = libc::regex_first_group_match(StrFromC("(X.)"), s, 3);
|
| 125 | ASSERT_EQ_FMT(4, result->at0(), "%d");
|
| 126 | ASSERT_EQ_FMT(6, result->at1(), "%d");
|
| 127 |
|
| 128 | result = libc::regex_first_group_match(StrFromC("(X.)"), s, 6);
|
| 129 | ASSERT_EQ_FMT(8, result->at0(), "%d");
|
| 130 | ASSERT_EQ_FMT(10, result->at1(), "%d");
|
| 131 |
|
| 132 | PASS();
|
| 133 | }
|
| 134 |
|
| 135 | TEST glob_test() {
|
| 136 | // This depends on the file system
|
| 137 | auto files = libc::glob(StrFromC("*.testdata"));
|
| 138 | // 3 files are made by the shell wrapper
|
| 139 | ASSERT_EQ_FMT(3, len(files), "%d");
|
| 140 |
|
| 141 | print(files->at(0));
|
| 142 |
|
| 143 | auto files2 = libc::glob(StrFromC("*.pyzzz"));
|
| 144 | ASSERT_EQ_FMT(0, len(files2), "%d");
|
| 145 |
|
| 146 | PASS();
|
| 147 | }
|
| 148 |
|
| 149 | TEST fnmatch_test() {
|
| 150 | BigStr* s1 = (StrFromC("foo.py "))->strip();
|
| 151 | ASSERT(libc::fnmatch(StrFromC("*.py"), s1));
|
| 152 | ASSERT(!libc::fnmatch(StrFromC("*.py"), StrFromC("foo.p")));
|
| 153 |
|
| 154 | // Unicode - ? is byte or code point?
|
| 155 | ASSERT(libc::fnmatch(StrFromC("_?_"), StrFromC("_x_")));
|
| 156 |
|
| 157 | // TODO(unicode)
|
| 158 | // ASSERT(libc::fnmatch(StrFromC("_?_"), StrFromC("_\u03bc_")));
|
| 159 | // ASSERT(libc::fnmatch(StrFromC("_?_"), StrFromC("_μ_")));
|
| 160 |
|
| 161 | // extended glob
|
| 162 | ASSERT(libc::fnmatch(StrFromC("*(foo|bar).py"), StrFromC("foo.py")));
|
| 163 | ASSERT(!libc::fnmatch(StrFromC("*(foo|bar).py"), StrFromC("foo.p")));
|
| 164 |
|
| 165 | PASS();
|
| 166 | }
|
| 167 |
|
| 168 | TEST for_test_coverage() {
|
| 169 | // Sometimes we're not connected to a terminal
|
| 170 | try {
|
| 171 | libc::get_terminal_width();
|
| 172 | } catch (IOError_OSError* e) {
|
| 173 | }
|
| 174 |
|
| 175 | PASS();
|
| 176 | }
|
| 177 |
|
| 178 | void FindAll(const char* p, const char* s) {
|
| 179 | regex_t pat;
|
| 180 |
|
| 181 | int cflags = REG_EXTENDED;
|
| 182 | if (regcomp(&pat, p, cflags) != 0) {
|
| 183 | FAIL();
|
| 184 | }
|
| 185 | int outlen = pat.re_nsub + 1; // number of captures
|
| 186 |
|
| 187 | // TODO: Could statically allocate 99, and assert that re_nsub is less than
|
| 188 | // 99. Would speed up loops.
|
| 189 | regmatch_t* pmatch =
|
| 190 | static_cast<regmatch_t*>(malloc(sizeof(regmatch_t) * outlen));
|
| 191 |
|
| 192 | int cur_pos = 0;
|
| 193 | // int n = strlen(s);
|
| 194 | while (true) {
|
| 195 | // Necessary so ^ doesn't match in the middle!
|
| 196 | int eflags = cur_pos == 0 ? 0 : REG_NOTBOL;
|
| 197 | bool match = regexec(&pat, s + cur_pos, outlen, pmatch, eflags) == 0;
|
| 198 |
|
| 199 | if (!match) {
|
| 200 | break;
|
| 201 | }
|
| 202 | int i;
|
| 203 | for (i = 0; i < outlen; i++) {
|
| 204 | int start = pmatch[i].rm_so;
|
| 205 | int end = pmatch[i].rm_eo;
|
| 206 | int len = end - start;
|
| 207 | BigStr* m = StrFromC(s + cur_pos + start, len);
|
| 208 | log("%d GROUP %d (%d .. %d) = [%s]", cur_pos, i, start, end, m->data_);
|
| 209 | }
|
| 210 | log("");
|
| 211 | int match_len = pmatch[0].rm_eo;
|
| 212 | if (match_len == 0) {
|
| 213 | break;
|
| 214 | }
|
| 215 | cur_pos += match_len;
|
| 216 | }
|
| 217 |
|
| 218 | free(pmatch);
|
| 219 | regfree(&pat);
|
| 220 | }
|
| 221 |
|
// Subject string passed to FindAll() by regex_unanchored, regex_caret and
// regex_lexer.  It contains adjacent matches.
const char* s = "a345y-axy- there b789y- cy-";
|
| 224 |
|
| 225 | TEST regex_unanchored() {
|
| 226 | const char* unanchored = "[abc]([0-9]*)(x?)(y)-";
|
| 227 | FindAll(unanchored, s);
|
| 228 |
|
| 229 | PASS();
|
| 230 | }
|
| 231 |
|
| 232 | TEST regex_caret() {
|
| 233 | const char* anchored = "^[abc]([0-9]*)(x?)(y)-";
|
| 234 | FindAll(anchored, s);
|
| 235 |
|
| 236 | PASS();
|
| 237 | }
|
| 238 |
|
| 239 | TEST regex_lexer() {
|
| 240 | // like the Yaks / Make-a-Lisp pattern
|
| 241 | const char* lexer = "([a-z]+)|([0-9]+)|([ ]+)|([+-])";
|
| 242 | FindAll(lexer, s);
|
| 243 |
|
| 244 | PASS();
|
| 245 | }
|
| 246 |
|
| 247 | TEST regex_repeat_with_capture() {
|
| 248 | const char* lexer = "(([a-z]+)([0-9]+)-)*((A+)|(Z+))*";
|
| 249 | FindAll(lexer, "a0-b1-c2-AAZZZA");
|
| 250 | // Groups are weird
|
| 251 | // whole match 0: a0-b1-c2-
|
| 252 | // 1: c2- # last repetition
|
| 253 | // 2: c # last one
|
| 254 | // 3: 2 # last one
|
| 255 | //
|
| 256 | // And then there's an empty match
|
| 257 | //
|
| 258 | // Ideas:
|
| 259 | // - disallow nested groups in Eggex?
|
| 260 | // - I really care about the inner ones -- groups 2 and 3
|
| 261 | // - I want flat groups
|
| 262 |
|
| 263 | PASS();
|
| 264 | }
|
| 265 |
|
| 266 | // Disallow this in eggex, as well as the above
|
| 267 | TEST regex_nested_capture() {
|
| 268 | const char* lexer = "(([a-z]+)([0-9]+))";
|
| 269 | FindAll(lexer, "a0");
|
| 270 | PASS();
|
| 271 | }
|
| 272 |
|
| 273 | // I think we allow this in eggex
|
| 274 | TEST regex_alt_with_capture() {
|
| 275 | const char* lexer = "([a-z]+)|([0-9]+)(-)";
|
| 276 | FindAll(lexer, "x-");
|
| 277 | FindAll(lexer, "7-");
|
| 278 | PASS();
|
| 279 | }
|
| 280 |
|
| 281 | TEST regex_unicode() {
|
| 282 | regex_t pat;
|
| 283 |
|
| 284 | // 1 or 2 bytes
|
| 285 | // const char* p = "_..?_";
|
| 286 | // const char* p = "_[^a]_";
|
| 287 | const char* p = "_._"; // 1 byte, not code point?
|
| 288 |
|
| 289 | if (regcomp(&pat, p, REG_EXTENDED) != 0) {
|
| 290 | FAIL();
|
| 291 | }
|
| 292 | int outlen = pat.re_nsub + 1; // number of captures
|
| 293 | regmatch_t* pmatch =
|
| 294 | static_cast<regmatch_t*>(malloc(sizeof(regmatch_t) * outlen));
|
| 295 |
|
| 296 | int result;
|
| 297 |
|
| 298 | const char* bad = "_xyz_";
|
| 299 | result = regexec(&pat, bad, outlen, pmatch, 0);
|
| 300 | ASSERT_EQ_FMT(1, result, "%d"); // does not match
|
| 301 |
|
| 302 | const char* a = "_x_";
|
| 303 | result = regexec(&pat, a, outlen, pmatch, 0);
|
| 304 | ASSERT_EQ_FMT(0, result, "%d");
|
| 305 |
|
| 306 | // Doesn't change anything
|
| 307 | // int lc_what = LC_ALL;
|
| 308 | int lc_what = LC_CTYPE;
|
| 309 |
|
| 310 | // char* saved_locale = setlocale(LC_ALL, "");
|
| 311 | // char* saved_locale = setlocale(LC_ALL, NULL);
|
| 312 |
|
| 313 | // char* saved_locale = setlocale(lc_what, NULL);
|
| 314 |
|
| 315 | #if 0
|
| 316 | // Doesn't change anything?
|
| 317 | //if (setlocale(LC_CTYPE, "C.utf8") == NULL) {
|
| 318 | if (setlocale(LC_CTYPE, "en_US.UTF-8") == NULL) {
|
| 319 | log("Couldn't set locale to C.utf8");
|
| 320 | FAIL();
|
| 321 | }
|
| 322 | #endif
|
| 323 |
|
| 324 | // const char* u = "_μ_";
|
| 325 | const char* u = "_\u03bc_";
|
| 326 | log("a = %d bytes", strlen(a));
|
| 327 | log("u = %d bytes", strlen(u));
|
| 328 | result = regexec(&pat, u, outlen, pmatch, 0);
|
| 329 |
|
| 330 | #if 0
|
| 331 | if (setlocale(lc_what, saved_locale) == NULL) {
|
| 332 | log("Couldn't restore locale");
|
| 333 | FAIL();
|
| 334 | }
|
| 335 | #endif
|
| 336 |
|
| 337 | free(pmatch); // Clean up before test failures
|
| 338 | regfree(&pat);
|
| 339 |
|
| 340 | // TODO(unicode)
|
| 341 | // ASSERT_EQ_FMT(0, result, "%d");
|
| 342 |
|
| 343 | PASS();
|
| 344 | }
|
| 345 |
|
| 346 | TEST casefold_test() {
|
| 347 | #if 0
|
| 348 | // Turkish
|
| 349 | if (setlocale(LC_CTYPE, "tr_TR.utf8") == NULL) {
|
| 350 | log("Couldn't set locale to tr_TR.utf8");
|
| 351 | FAIL();
|
| 352 | }
|
| 353 | #endif
|
| 354 |
|
| 355 | // LC_CTYPE_MASK instead of LC_CTYPE
|
| 356 | locale_t turkish = newlocale(LC_CTYPE_MASK, "tr_TR.utf8", NULL);
|
| 357 |
|
| 358 | int u = toupper('i');
|
| 359 | int wu = towupper('i');
|
| 360 | int wul = towupper_l('i', turkish);
|
| 361 |
|
| 362 | // Regular: upper case i is I, 73
|
| 363 | // Turkish: upper case is 304
|
| 364 | log("upper = %d", u);
|
| 365 | log("wide upper = %d", wu);
|
| 366 | log("wide upper locale = %d", wul);
|
| 367 |
|
| 368 | freelocale(turkish);
|
| 369 |
|
| 370 | PASS();
|
| 371 | }
|
| 372 |
|
// Definitions required by the greatest test runner (macro from
// vendor/greatest.h).
GREATEST_MAIN_DEFS();

// Test driver: initializes the heap, runs each test in a fixed order, then
// shuts the heap down and lets greatest finish the run.
int main(int argc, char** argv) {
  // Initialize the heap before any test allocates.
  gHeap.Init();

  GREATEST_MAIN_BEGIN();

  // Tests for the libc:: wrappers
  RUN_TEST(hostname_test);
  RUN_TEST(realpath_test);
  RUN_TEST(libc_test);
  RUN_TEST(regex_wrapper_test);
  RUN_TEST(glob_test);
  RUN_TEST(fnmatch_test);
  RUN_TEST(for_test_coverage);

  // Tests that call the POSIX regex API directly
  RUN_TEST(regex_unanchored);
  RUN_TEST(regex_caret);
  RUN_TEST(regex_lexer);
  RUN_TEST(regex_repeat_with_capture);
  RUN_TEST(regex_alt_with_capture);
  RUN_TEST(regex_nested_capture);
  RUN_TEST(regex_unicode);

  // Crashes in CI?  Because of Turkish locale?
  // RUN_TEST(casefold_test);

  gHeap.CleanProcessExit();

  GREATEST_MAIN_END();
  return 0;
}
|