| 1 | /*
|
| 2 | * Souffle - A Datalog Compiler
|
| 3 | * Copyright (c) 2021, The Souffle Developers. All rights reserved
|
| 4 | * Licensed under the Universal Permissive License v 1.0 as shown at:
|
| 5 | * - https://opensource.org/licenses/UPL
|
| 6 | * - <souffle root>/licenses/SOUFFLE-UPL.txt
|
| 7 | */
|
| 8 |
|
| 9 | /************************************************************************
|
| 10 | *
|
| 11 | * @file StringUtil.h
|
| 12 | *
|
| 13 | * @brief Datalog project utilities
|
| 14 | *
|
| 15 | ***********************************************************************/
|
| 16 |
|
| 17 | #pragma once
|
| 18 |
|
| 19 | #include "souffle/RamTypes.h"
|
| 20 | #include <algorithm>
|
| 21 | #include <cctype>
|
| 22 | #include <cstdlib>
|
| 23 | #include <fstream>
|
| 24 | #include <limits>
|
| 25 | #include <set>
|
| 26 | #include <sstream>
|
| 27 | #include <stdexcept>
|
| 28 | #include <string>
|
| 29 | #include <type_traits>
|
| 30 | #include <typeinfo>
|
| 31 | #include <vector>
|
| 32 |
|
| 33 | namespace souffle {
|
| 34 |
|
| 35 | // Forward declaration
|
| 36 | inline bool isPrefix(const std::string& prefix, const std::string& element);
|
| 37 |
|
| 38 | /**
|
| 39 | * Converts a string to a RamSigned
|
| 40 | *
|
| 41 | * This procedure has similar behaviour to std::stoi/stoll.
|
| 42 | *
|
| 43 | * The procedure accepts prefixes 0b (if base = 2) and 0x (if base = 16)
|
| 44 | * If base = 0, the procedure will try to infer the base from the prefix, if present.
|
| 45 | */
|
| 46 | inline RamSigned RamSignedFromString(
|
| 47 | const std::string& str, std::size_t* position = nullptr, const int base = 10) {
|
| 48 | RamSigned val;
|
| 49 |
|
| 50 | if (base == 0) {
|
| 51 | if (isPrefix("-0b", str) || isPrefix("0b", str)) {
|
| 52 | return RamSignedFromString(str, position, 2);
|
| 53 | } else if (isPrefix("-0x", str) || isPrefix("0x", str)) {
|
| 54 | return RamSignedFromString(str, position, 16);
|
| 55 | } else {
|
| 56 | return RamSignedFromString(str, position);
|
| 57 | }
|
| 58 | }
|
| 59 | std::string binaryNumber;
|
| 60 | bool parsingBinary = base == 2;
|
| 61 |
|
| 62 | // stoi/stoll can't handle base 2 prefix by default.
|
| 63 | if (parsingBinary) {
|
| 64 | if (isPrefix("-0b", str)) {
|
| 65 | binaryNumber = "-" + str.substr(3);
|
| 66 | } else if (isPrefix("0b", str)) {
|
| 67 | binaryNumber = str.substr(2);
|
| 68 | }
|
| 69 | }
|
| 70 | const std::string& tmp = parsingBinary ? binaryNumber : str;
|
| 71 |
|
| 72 | #if RAM_DOMAIN_SIZE == 64
|
| 73 | val = std::stoll(tmp, position, base);
|
| 74 | #else
|
| 75 | val = std::stoi(tmp, position, base);
|
| 76 | #endif
|
| 77 |
|
| 78 | if (parsingBinary && position != nullptr) {
|
| 79 | *position += 2;
|
| 80 | }
|
| 81 |
|
| 82 | return val;
|
| 83 | }
|
| 84 |
|
| 85 | /**
|
| 86 | * Converts a string to a RamFloat
|
| 87 | */
|
| 88 | inline RamFloat RamFloatFromString(const std::string& str, std::size_t* position = nullptr) {
|
| 89 | RamFloat val;
|
| 90 | #if RAM_DOMAIN_SIZE == 64
|
| 91 | val = std::stod(str, position);
|
| 92 | #else
|
| 93 | val = std::stof(str, position);
|
| 94 | #endif
|
| 95 | return static_cast<RamFloat>(val);
|
| 96 | }
|
| 97 | /**
|
| 98 | * Converts a string to a RamUnsigned
|
| 99 | *
|
| 100 | * This procedure has similar behaviour to std::stoul/stoull.
|
| 101 | *
|
| 102 | * The procedure accepts prefixes 0b (if base = 2) and 0x (if base = 16)
|
| 103 | * If base = 0, the procedure will try to infer the base from the prefix, if present.
|
| 104 | */
|
| 105 | inline RamUnsigned RamUnsignedFromString(
|
| 106 | const std::string& str, std::size_t* position = nullptr, const int base = 10) {
|
| 107 | // Be default C++ (stoul) allows unsigned numbers starting with "-".
|
| 108 | if (isPrefix("-", str)) {
|
| 109 | throw std::invalid_argument("Unsigned number can't start with minus.");
|
| 110 | }
|
| 111 |
|
| 112 | if (base == 0) {
|
| 113 | if (isPrefix("0b", str)) {
|
| 114 | return RamUnsignedFromString(str, position, 2);
|
| 115 | } else if (isPrefix("0x", str)) {
|
| 116 | return RamUnsignedFromString(str, position, 16);
|
| 117 | } else {
|
| 118 | return RamUnsignedFromString(str, position);
|
| 119 | }
|
| 120 | }
|
| 121 |
|
| 122 | // stoul/stoull can't handle binary prefix by default.
|
| 123 | std::string binaryNumber;
|
| 124 | bool parsingBinary = false;
|
| 125 | if (base == 2 && isPrefix("0b", str)) {
|
| 126 | binaryNumber = str.substr(2);
|
| 127 | parsingBinary = true;
|
| 128 | }
|
| 129 | const std::string& tmp = parsingBinary ? binaryNumber : str;
|
| 130 |
|
| 131 | RamUnsigned val;
|
| 132 | #if RAM_DOMAIN_SIZE == 64
|
| 133 | val = std::stoull(tmp, position, base);
|
| 134 | #else
|
| 135 | val = std::stoul(tmp, position, base);
|
| 136 | #endif
|
| 137 |
|
| 138 | if (parsingBinary && position != nullptr) {
|
| 139 | *position += 2;
|
| 140 | }
|
| 141 |
|
| 142 | // check if it's safe to cast (stoul returns unsigned long)
|
| 143 | if (val > std::numeric_limits<RamUnsigned>::max()) {
|
| 144 | throw std::invalid_argument("Unsigned number of of bounds");
|
| 145 | }
|
| 146 |
|
| 147 | return static_cast<RamUnsigned>(val);
|
| 148 | }
|
| 149 |
|
| 150 | /**
|
| 151 | * Can a string be parsed as RamSigned.
|
| 152 | *
|
| 153 | * Souffle (parser, not fact file readers) accepts: hex, binary and base 10.
|
| 154 | * Integer can be negative, in all 3 formats this means that it
|
| 155 | * starts with minus (c++ default semantics).
|
| 156 | */
|
| 157 | inline bool canBeParsedAsRamSigned(const std::string& string) {
|
| 158 | std::size_t charactersRead = 0;
|
| 159 |
|
| 160 | try {
|
| 161 | RamSignedFromString(string, &charactersRead, 0);
|
| 162 | } catch (...) {
|
| 163 | return false;
|
| 164 | }
|
| 165 |
|
| 166 | return charactersRead == string.size();
|
| 167 | }
|
| 168 |
|
| 169 | /**
|
| 170 | * Can a string be parsed as RamUnsigned.
|
| 171 | *
|
| 172 | * Souffle accepts: hex, binary and base 10.
|
| 173 | */
|
| 174 | inline bool canBeParsedAsRamUnsigned(const std::string& string) {
|
| 175 | std::size_t charactersRead = 0;
|
| 176 | try {
|
| 177 | RamUnsignedFromString(string, &charactersRead, 0);
|
| 178 | } catch (...) {
|
| 179 | return false;
|
| 180 | }
|
| 181 | return charactersRead == string.size();
|
| 182 | }
|
| 183 |
|
| 184 | /**
|
| 185 | * Can a string be parsed as RamFloat.
|
| 186 | */
|
| 187 | inline bool canBeParsedAsRamFloat(const std::string& string) {
|
| 188 | std::size_t charactersRead = 0;
|
| 189 | try {
|
| 190 | RamFloatFromString(string, &charactersRead);
|
| 191 | } catch (...) {
|
| 192 | return false;
|
| 193 | }
|
| 194 | return charactersRead == string.size();
|
| 195 | }
|
| 196 |
|
| 197 | #if RAM_DOMAIN_SIZE == 64
|
| 198 | inline RamDomain stord(const std::string& str, std::size_t* pos = nullptr, int base = 10) {
|
| 199 | return static_cast<RamDomain>(std::stoull(str, pos, base));
|
| 200 | }
|
| 201 | #elif RAM_DOMAIN_SIZE == 32
|
| 202 | inline RamDomain stord(const std::string& str, std::size_t* pos = nullptr, int base = 10) {
|
| 203 | return static_cast<RamDomain>(std::stoul(str, pos, base));
|
| 204 | }
|
| 205 | #else
|
| 206 | #error RAM Domain is neither 32bit nor 64bit
|
| 207 | #endif
|
| 208 |
|
/**
 * Check whether a string is a sequence of digits
 *
 * @param str a null-terminated string, or nullptr
 * @return false for nullptr or if any character is not a decimal digit;
 *         true otherwise (note that this includes the empty string)
 */
inline bool isNumber(const char* str) {
    if (str == nullptr) {
        return false;
    }

    for (; *str != '\0'; ++str) {
        // The <cctype> classifiers have undefined behaviour for negative values,
        // which a plain char can hold; go through unsigned char first.
        if (std::isdigit(static_cast<unsigned char>(*str)) == 0) {
            return false;
        }
    }
    return true;
}
|
| 225 |
|
/**
 * The trivial overload of the generic to-string conversion: a string is
 * already a string, so the argument itself is handed back without copying.
 *
 * @return a reference to the very object passed in
 */
inline auto toString(const std::string& str) -> const std::string& {
    return str;
}
|
| 232 |
|
namespace detail {

/**
 * Type trait telling whether values of type T can be printed.
 * This primary template is the fallback: nothing is printable.
 */
template <typename T, typename = void>
struct is_printable : std::false_type {};

/**
 * Type trait telling whether values of type T can be printed.
 * This specialization is selected whenever streaming a T into a
 * std::ostream is a well-formed expression.
 */
template <typename T>
struct is_printable<T, std::void_t<decltype(std::declval<std::ostream&>() << std::declval<T>())>>
        : std::true_type {};

/**
 * Type trait telling whether type T offers HTML output.
 * This primary template is the fallback: nothing is HTML printable.
 */
template <typename T, typename = void>
struct is_html_printable : std::false_type {};

/**
 * Type trait telling whether type T offers HTML output.
 * This specialization is selected whenever calling `printHTML` on a T
 * with a std::ostream argument is a well-formed expression.
 */
template <typename T>
struct is_html_printable<T, std::void_t<decltype(std::declval<T>().printHTML(std::declval<std::ostream&>()))>>
        : std::true_type {};

}  // namespace detail
|
| 260 |
|
| 261 | /**
|
| 262 | * A generic function converting arbitrary objects to strings by utilizing
|
| 263 | * their print capability.
|
| 264 | *
|
| 265 | * This function is mainly intended for implementing test cases and debugging
|
| 266 | * operations.
|
| 267 | */
|
| 268 | template <typename T>
|
| 269 | typename std::enable_if<detail::is_printable<T>::value, std::string>::type toString(const T& value) {
|
| 270 | // write value into stream and return result
|
| 271 | std::stringstream ss;
|
| 272 | ss << value;
|
| 273 | return ss.str();
|
| 274 | }
|
| 275 |
|
| 276 | /**
|
| 277 | * A fallback for the to-string function in case an unprintable object is supposed
|
| 278 | * to be printed.
|
| 279 | */
|
| 280 | template <typename T>
|
| 281 | typename std::enable_if<!detail::is_printable<T>::value, std::string>::type toString(const T&) {
|
| 282 | std::stringstream ss;
|
| 283 | ss << "(print for type ";
|
| 284 | ss << typeid(T).name();
|
| 285 | ss << " not supported)";
|
| 286 | return ss.str();
|
| 287 | }
|
| 288 |
|
| 289 | template <typename T>
|
| 290 | auto toHtml(const T& obj) -> typename std::enable_if<detail::is_html_printable<T>::value, std::string>::type {
|
| 291 | std::stringstream out;
|
| 292 | obj.printHTML(out);
|
| 293 | return out.str();
|
| 294 | }
|
| 295 |
|
| 296 | /** Fallback to `toString` */
|
| 297 | template <typename T>
|
| 298 | auto toHtml(const T& obj) ->
|
| 299 | typename std::enable_if<not detail::is_html_printable<T>::value, std::string>::type {
|
| 300 | return toString(obj);
|
| 301 | }
|
| 302 |
|
| 303 | // -------------------------------------------------------------------------------
|
| 304 | // String Utils
|
| 305 | // -------------------------------------------------------------------------------
|
| 306 |
|
/**
 * Determine if one string is a prefix of another
 *
 * @param prefix the candidate prefix
 * @param element the string that is checked for starting with `prefix`
 * @return true iff `element` starts with `prefix`; the empty string is a
 *         prefix of every string
 */
inline bool isPrefix(const std::string& prefix, const std::string& element) {
    // a prefix can never be longer than the string it belongs to
    if (prefix.size() > element.size()) {
        return false;
    }
    return std::equal(prefix.begin(), prefix.end(), element.begin());
}
|
| 324 |
|
/**
 * Determines whether the given value string ends with the given
 * end string.
 *
 * @return true iff `ending` is a suffix of `value`; the empty string is a
 *         suffix of every string
 */
inline bool endsWith(const std::string& value, const std::string& ending) {
    const std::size_t tailLength = ending.size();
    // compare the last `tailLength` characters of the value against the ending
    return tailLength <= value.size() &&
           value.compare(value.size() - tailLength, tailLength, ending) == 0;
}
|
| 335 |
|
/**
 * Splits a string given a delimiter
 *
 * The returned views refer to the characters of `toSplit`; no text is copied.
 *
 * @param toSplit the text to cut into parts
 * @param delimiter the separator between parts; it is not included in the parts
 * @return the parts in order of appearance. Adjacent, leading or trailing
 *         delimiters produce empty parts. If `toSplit` or `delimiter` is empty
 *         the result is the single part `toSplit`.
 */
inline std::vector<std::string_view> splitView(std::string_view toSplit, std::string_view delimiter) {
    // An empty delimiter matches at every position without consuming anything;
    // searching for it would only emit empty parts and finally step past the
    // end of the input. Treat it as "no separator present".
    if (toSplit.empty() || delimiter.empty()) return {toSplit};

    std::vector<std::string_view> parts;
    for (auto tail = toSplit;;) {
        const auto pos = tail.find(delimiter);
        // with pos == npos this takes everything that is left
        parts.push_back(tail.substr(0, pos));
        if (pos == std::string_view::npos) break;

        tail.remove_prefix(pos + delimiter.size());
    }

    return parts;
}
|
| 355 |
|
| 356 | /**
|
| 357 | * Splits a string given a delimiter
|
| 358 | */
|
| 359 | inline std::vector<std::string> splitString(std::string_view str, char delimiter) {
|
| 360 | std::vector<std::string> xs;
|
| 361 | for (auto&& x : splitView(str, std::string_view{&delimiter, 1}))
|
| 362 | xs.push_back(std::string(x));
|
| 363 | return xs;
|
| 364 | }
|
| 365 |
|
| 366 | /**
|
| 367 | * Strips the prefix of a given string if it exists. No change otherwise.
|
| 368 | */
|
| 369 | inline std::string stripPrefix(const std::string& prefix, const std::string& element) {
|
| 370 | return isPrefix(prefix, element) ? element.substr(prefix.length()) : element;
|
| 371 | }
|
| 372 |
|
/**
 * Stringify a string using escapes for escape, newline, tab, double-quotes and semicolons
 *
 * Each backslash, semicolon and double-quote is preceded by a backslash; a
 * newline becomes the two characters `\n` and a tab the two characters `\t`.
 * All other characters are copied unchanged.
 */
inline std::string stringify(const std::string& input) {
    std::string escaped;
    escaped.reserve(input.size());

    for (const char ch : input) {
        switch (ch) {
            case '\\': escaped += "\\\\"; break;
            case ';': escaped += "\\;"; break;
            case '"': escaped += "\\\""; break;
            case '\n': escaped += "\\n"; break;
            case '\t': escaped += "\\t"; break;
            default: escaped += ch; break;
        }
    }
    return escaped;
}
|
| 411 |
|
/**
 * Escape JSON string.
 *
 * Puts a backslash in front of every double-quote of the input. Nothing else
 * is escaped: all other characters, backslashes included, are copied as they are.
 */
inline std::string escapeJSONstring(const std::string& JSONstr) {
    std::string escaped;
    escaped.reserve(JSONstr.size());

    // every character of the input is visited
    for (const char ch : JSONstr) {
        if (ch == '"') {
            escaped.push_back('\\');
        }
        escaped.push_back(ch);
    }
    return escaped;
}
|
| 427 |
|
/**
 * Valid C++ identifier, note that this does not ensure the uniqueness of identifiers returned.
 *
 * Every character that is neither alphanumeric nor an underscore is replaced
 * by an underscore; the first character is additionally replaced if it is not
 * a letter. The length of the string is never changed, so an empty input
 * stays empty.
 */
inline std::string identifier(std::string id) {
    for (std::size_t i = 0; i < id.length(); i++) {
        // The <cctype> classifiers have undefined behaviour for negative values,
        // which a plain char can hold; go through unsigned char first.
        const auto ch = static_cast<unsigned char>(id[i]);
        const bool invalidLeading = (i == 0) && (std::isalpha(ch) == 0);
        const bool invalidAnywhere = (std::isalnum(ch) == 0) && id[i] != '_';
        if (invalidLeading || invalidAnywhere) {
            id[i] = '_';
        }
    }
    return id;
}
|
| 437 |
|
| 438 | // TODO (b-scholz): tidy up unescape/escape functions
|
| 439 |
|
/**
 * Replaces every occurrence of `needle` in `inputString` by `replacement`.
 *
 * The search continues behind each inserted replacement, so text that was
 * inserted is never matched again.
 *
 * @return the rewritten copy; an unchanged copy if `needle` is empty
 */
inline std::string unescape(
        const std::string& inputString, const std::string& needle, const std::string& replacement) {
    std::string result = inputString;
    // An empty needle matches at every position without consuming input, so the
    // scan below would never terminate.
    if (needle.empty()) {
        return result;
    }

    std::size_t pos = 0;
    while ((pos = result.find(needle, pos)) != std::string::npos) {
        result.replace(pos, needle.length(), replacement);
        pos += replacement.length();
    }
    return result;
}
|
| 450 |
|
| 451 | inline std::string unescape(const std::string& inputString) {
|
| 452 | std::string unescaped = unescape(inputString, "\\\"", "\"");
|
| 453 | unescaped = unescape(unescaped, "\\t", "\t");
|
| 454 | unescaped = unescape(unescaped, "\\r", "\r");
|
| 455 | unescaped = unescape(unescaped, "\\n", "\n");
|
| 456 | return unescaped;
|
| 457 | }
|
| 458 |
|
/**
 * Replaces every occurrence of `needle` in `inputString` by `replacement`.
 *
 * The search continues behind each inserted replacement, so text that was
 * inserted is never matched again.
 *
 * @return the rewritten copy; an unchanged copy if `needle` is empty
 */
inline std::string escape(
        const std::string& inputString, const std::string& needle, const std::string& replacement) {
    std::string result = inputString;
    // An empty needle matches at every position without consuming input, so the
    // scan below would never terminate.
    if (needle.empty()) {
        return result;
    }

    std::size_t pos = 0;
    while ((pos = result.find(needle, pos)) != std::string::npos) {
        result.replace(pos, needle.length(), replacement);
        pos += replacement.length();
    }
    return result;
}
|
| 469 |
|
| 470 | inline std::string escape(const std::string& inputString) {
|
| 471 | std::string escaped = escape(inputString, "\"", "\\\"");
|
| 472 | escaped = escape(escaped, "\t", "\\t");
|
| 473 | escaped = escape(escaped, "\r", "\\r");
|
| 474 | escaped = escape(escaped, "\n", "\\n");
|
| 475 | return escaped;
|
| 476 | }
|
| 477 |
|
/**
 * Writes `str` to the stream `os`, inserting `esc` in front of every character
 * that is contained in `needs_escape`.
 *
 * @param os the output stream; may be an lvalue or a temporary
 * @param str the text to write
 * @param needs_escape the characters that have to be escaped
 * @param esc the text put in front of each escaped character
 * @return the stream that was passed in: a reference to it if `os` was an
 *         lvalue, the stream itself (moved) if `os` was a temporary
 */
template <typename C>
C escape(C&& os, std::string_view str, std::set<char> const& needs_escape, std::string_view esc) {
    for (const char x : str) {
        if (needs_escape.count(x) != 0) {
            os << esc;
        }
        os << x;
    }

    // The return type is C rather than a deduced `auto`: a deduced type always
    // decays to a value and would try to copy an lvalue stream, which can not
    // be copied.
    return std::forward<C>(os);
}
|
| 489 |
|
| 490 | inline std::string escape(std::string_view str, std::set<char> const& needs_escape, std::string_view esc) {
|
| 491 | return escape(std::stringstream{}, str, needs_escape, esc).str();
|
| 492 | }
|
| 493 |
|
| 494 | } // end namespace souffle
|