annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/include/kj/parse/char.h @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
rev   line source
jpayne@69 1 // Copyright (c) 2013-2014 Sandstorm Development Group, Inc. and contributors
jpayne@69 2 // Licensed under the MIT License:
jpayne@69 3 //
jpayne@69 4 // Permission is hereby granted, free of charge, to any person obtaining a copy
jpayne@69 5 // of this software and associated documentation files (the "Software"), to deal
jpayne@69 6 // in the Software without restriction, including without limitation the rights
jpayne@69 7 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
jpayne@69 8 // copies of the Software, and to permit persons to whom the Software is
jpayne@69 9 // furnished to do so, subject to the following conditions:
jpayne@69 10 //
jpayne@69 11 // The above copyright notice and this permission notice shall be included in
jpayne@69 12 // all copies or substantial portions of the Software.
jpayne@69 13 //
jpayne@69 14 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
jpayne@69 15 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
jpayne@69 16 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
jpayne@69 17 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
jpayne@69 18 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
jpayne@69 19 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
jpayne@69 20 // THE SOFTWARE.
jpayne@69 21
jpayne@69 22 // This file contains parsers useful for character stream inputs, including parsers to parse
jpayne@69 23 // common kinds of tokens like identifiers, numbers, and quoted strings.
jpayne@69 24
jpayne@69 25 #pragma once
jpayne@69 26
jpayne@69 27 #include "common.h"
jpayne@69 28 #include "../string.h"
jpayne@69 29 #include <inttypes.h>
jpayne@69 30
jpayne@69 31 KJ_BEGIN_HEADER
jpayne@69 32
jpayne@69 33 namespace kj {
jpayne@69 34 namespace parse {
jpayne@69 35
jpayne@69 36 // =======================================================================================
jpayne@69 37 // Exact char/string.
jpayne@69 38
jpayne@69 39 class ExactString_ {
jpayne@69 40 public:
jpayne@69 41 constexpr inline ExactString_(const char* str): str(str) {}
jpayne@69 42
jpayne@69 43 template <typename Input>
jpayne@69 44 Maybe<Tuple<>> operator()(Input& input) const {
jpayne@69 45 const char* ptr = str;
jpayne@69 46
jpayne@69 47 while (*ptr != '\0') {
jpayne@69 48 if (input.atEnd() || input.current() != *ptr) return nullptr;
jpayne@69 49 input.next();
jpayne@69 50 ++ptr;
jpayne@69 51 }
jpayne@69 52
jpayne@69 53 return Tuple<>();
jpayne@69 54 }
jpayne@69 55
jpayne@69 56 private:
jpayne@69 57 const char* str;
jpayne@69 58 };
jpayne@69 59
jpayne@69 60 constexpr inline ExactString_ exactString(const char* str) {
jpayne@69 61 return ExactString_(str);
jpayne@69 62 }
jpayne@69 63
jpayne@69 64 template <char c>
jpayne@69 65 constexpr ExactlyConst_<char, c> exactChar() {
jpayne@69 66 // Returns a parser that matches exactly the character given by the template argument (returning
jpayne@69 67 // no result).
jpayne@69 68 return ExactlyConst_<char, c>();
jpayne@69 69 }
jpayne@69 70
jpayne@69 71 // =======================================================================================
jpayne@69 72 // Char ranges / sets
jpayne@69 73
jpayne@69 74 class CharGroup_ {
jpayne@69 75 public:
jpayne@69 76 constexpr inline CharGroup_(): bits{0, 0, 0, 0} {}
jpayne@69 77
jpayne@69 78 constexpr inline CharGroup_ orRange(unsigned char first, unsigned char last) const {
jpayne@69 79 return CharGroup_(bits[0] | (oneBits(last + 1) & ~oneBits(first )),
jpayne@69 80 bits[1] | (oneBits(last - 63) & ~oneBits(first - 64)),
jpayne@69 81 bits[2] | (oneBits(last - 127) & ~oneBits(first - 128)),
jpayne@69 82 bits[3] | (oneBits(last - 191) & ~oneBits(first - 192)));
jpayne@69 83 }
jpayne@69 84
jpayne@69 85 constexpr inline CharGroup_ orAny(const char* chars) const {
jpayne@69 86 return *chars == 0 ? *this : orChar(*chars).orAny(chars + 1);
jpayne@69 87 }
jpayne@69 88
jpayne@69 89 constexpr inline CharGroup_ orChar(unsigned char c) const {
jpayne@69 90 return CharGroup_(bits[0] | bit(c),
jpayne@69 91 bits[1] | bit(c - 64),
jpayne@69 92 bits[2] | bit(c - 128),
jpayne@69 93 bits[3] | bit(c - 256));
jpayne@69 94 }
jpayne@69 95
jpayne@69 96 constexpr inline CharGroup_ orGroup(CharGroup_ other) const {
jpayne@69 97 return CharGroup_(bits[0] | other.bits[0],
jpayne@69 98 bits[1] | other.bits[1],
jpayne@69 99 bits[2] | other.bits[2],
jpayne@69 100 bits[3] | other.bits[3]);
jpayne@69 101 }
jpayne@69 102
jpayne@69 103 constexpr inline CharGroup_ invert() const {
jpayne@69 104 return CharGroup_(~bits[0], ~bits[1], ~bits[2], ~bits[3]);
jpayne@69 105 }
jpayne@69 106
jpayne@69 107 constexpr inline bool contains(unsigned char c) const {
jpayne@69 108 return (bits[c / 64] & (1ll << (c % 64))) != 0;
jpayne@69 109 }
jpayne@69 110
jpayne@69 111 inline bool containsAll(ArrayPtr<const char> text) const {
jpayne@69 112 for (char c: text) {
jpayne@69 113 if (!contains(c)) return false;
jpayne@69 114 }
jpayne@69 115 return true;
jpayne@69 116 }
jpayne@69 117
jpayne@69 118 template <typename Input>
jpayne@69 119 Maybe<char> operator()(Input& input) const {
jpayne@69 120 if (input.atEnd()) return nullptr;
jpayne@69 121 unsigned char c = input.current();
jpayne@69 122 if (contains(c)) {
jpayne@69 123 input.next();
jpayne@69 124 return c;
jpayne@69 125 } else {
jpayne@69 126 return nullptr;
jpayne@69 127 }
jpayne@69 128 }
jpayne@69 129
jpayne@69 130 private:
jpayne@69 131 typedef unsigned long long Bits64;
jpayne@69 132
jpayne@69 133 constexpr inline CharGroup_(Bits64 a, Bits64 b, Bits64 c, Bits64 d): bits{a, b, c, d} {}
jpayne@69 134 Bits64 bits[4];
jpayne@69 135
jpayne@69 136 static constexpr inline Bits64 oneBits(int count) {
jpayne@69 137 return count <= 0 ? 0ll : count >= 64 ? -1ll : ((1ll << count) - 1);
jpayne@69 138 }
jpayne@69 139 static constexpr inline Bits64 bit(int index) {
jpayne@69 140 return index < 0 ? 0 : index >= 64 ? 0 : (1ll << index);
jpayne@69 141 }
jpayne@69 142 };
jpayne@69 143
jpayne@69 144 constexpr inline CharGroup_ charRange(char first, char last) {
jpayne@69 145 // Create a parser which accepts any character in the range from `first` to `last`, inclusive.
jpayne@69 146 // For example: `charRange('a', 'z')` matches all lower-case letters. The parser's result is the
jpayne@69 147 // character matched.
jpayne@69 148 //
jpayne@69 149 // The returned object has methods which can be used to match more characters. The following
jpayne@69 150 // produces a parser which accepts any letter as well as '_', '+', '-', and '.'.
jpayne@69 151 //
jpayne@69 152 // charRange('a', 'z').orRange('A', 'Z').orChar('_').orAny("+-.")
jpayne@69 153 //
jpayne@69 154 // You can also use `.invert()` to match the opposite set of characters.
jpayne@69 155
jpayne@69 156 return CharGroup_().orRange(first, last);
jpayne@69 157 }
jpayne@69 158
jpayne@69 159 #if _MSC_VER && !defined(__clang__)
jpayne@69 160 #define anyOfChars(chars) CharGroup_().orAny(chars)
jpayne@69 161 // TODO(msvc): MSVC ICEs on the proper definition of `anyOfChars()`, which in turn prevents us from
jpayne@69 162 // building the compiler or schema parser. We don't know why this happens, but Harris found that
jpayne@69 163 // this horrible, horrible hack makes things work. This is awful, but it's better than nothing.
jpayne@69 164 // Hopefully, MSVC will get fixed soon and we'll be able to remove this.
jpayne@69 165 #else
jpayne@69 166 constexpr inline CharGroup_ anyOfChars(const char* chars) {
jpayne@69 167 // Returns a parser that accepts any of the characters in the given string (which should usually
jpayne@69 168 // be a literal). The returned parser is of the same type as returned by `charRange()` -- see
jpayne@69 169 // that function for more info.
jpayne@69 170
jpayne@69 171 return CharGroup_().orAny(chars);
jpayne@69 172 }
jpayne@69 173 #endif
jpayne@69 174
jpayne@69 175 // =======================================================================================
jpayne@69 176
jpayne@69 177 namespace _ { // private
jpayne@69 178
jpayne@69 179 struct ArrayToString {
jpayne@69 180 inline String operator()(const Array<char>& arr) const {
jpayne@69 181 return heapString(arr);
jpayne@69 182 }
jpayne@69 183 };
jpayne@69 184
jpayne@69 185 } // namespace _ (private)
jpayne@69 186
jpayne@69 187 template <typename SubParser>
jpayne@69 188 constexpr inline auto charsToString(SubParser&& subParser)
jpayne@69 189 -> decltype(transform(kj::fwd<SubParser>(subParser), _::ArrayToString())) {
jpayne@69 190 // Wraps a parser that returns Array<char> such that it returns String instead.
jpayne@69 191 return parse::transform(kj::fwd<SubParser>(subParser), _::ArrayToString());
jpayne@69 192 }
jpayne@69 193
jpayne@69 194 // =======================================================================================
jpayne@69 195 // Basic character classes.
jpayne@69 196
jpayne@69 197 constexpr auto alpha = charRange('a', 'z').orRange('A', 'Z');
jpayne@69 198 constexpr auto digit = charRange('0', '9');
jpayne@69 199 constexpr auto alphaNumeric = alpha.orGroup(digit);
jpayne@69 200 constexpr auto nameStart = alpha.orChar('_');
jpayne@69 201 constexpr auto nameChar = alphaNumeric.orChar('_');
jpayne@69 202 constexpr auto hexDigit = charRange('0', '9').orRange('a', 'f').orRange('A', 'F');
jpayne@69 203 constexpr auto octDigit = charRange('0', '7');
jpayne@69 204 constexpr auto whitespaceChar = anyOfChars(" \f\n\r\t\v");
jpayne@69 205 constexpr auto controlChar = charRange(0, 0x1f).invert().orGroup(whitespaceChar).invert();
jpayne@69 206
jpayne@69 207 constexpr auto whitespace = many(anyOfChars(" \f\n\r\t\v"));
jpayne@69 208
jpayne@69 209 constexpr auto discardWhitespace = discard(many(discard(anyOfChars(" \f\n\r\t\v"))));
jpayne@69 210 // Like discard(whitespace) but avoids some memory allocation.
jpayne@69 211
jpayne@69 212 // =======================================================================================
jpayne@69 213 // Identifiers
jpayne@69 214
jpayne@69 215 namespace _ { // private
jpayne@69 216
jpayne@69 217 struct IdentifierToString {
jpayne@69 218 inline String operator()(char first, const Array<char>& rest) const {
jpayne@69 219 if (rest.size() == 0) return heapString(&first, 1);
jpayne@69 220 String result = heapString(rest.size() + 1);
jpayne@69 221 result[0] = first;
jpayne@69 222 memcpy(result.begin() + 1, rest.begin(), rest.size());
jpayne@69 223 return result;
jpayne@69 224 }
jpayne@69 225 };
jpayne@69 226
jpayne@69 227 } // namespace _ (private)
jpayne@69 228
jpayne@69 229 constexpr auto identifier = transform(sequence(nameStart, many(nameChar)), _::IdentifierToString());
jpayne@69 230 // Parses an identifier (e.g. a C variable name).
jpayne@69 231
jpayne@69 232 // =======================================================================================
jpayne@69 233 // Integers
jpayne@69 234
jpayne@69 235 namespace _ { // private
jpayne@69 236
inline char parseDigit(char c) {
  // Maps a digit character to its numeric value: '0'-'9' -> 0-9, 'A'-'Z' and 'a'-'z' -> 10 and
  // up. The character is assumed to already be a valid digit; no validation is done here.
  return c >= 'a' ? c - 'a' + 10
       : c >= 'A' ? c - 'A' + 10
       : c - '0';
}
jpayne@69 242
jpayne@69 243 template <uint base>
jpayne@69 244 struct ParseInteger {
jpayne@69 245 inline uint64_t operator()(const Array<char>& digits) const {
jpayne@69 246 return operator()('0', digits);
jpayne@69 247 }
jpayne@69 248 uint64_t operator()(char first, const Array<char>& digits) const {
jpayne@69 249 uint64_t result = parseDigit(first);
jpayne@69 250 for (char digit: digits) {
jpayne@69 251 result = result * base + parseDigit(digit);
jpayne@69 252 }
jpayne@69 253 return result;
jpayne@69 254 }
jpayne@69 255 };
jpayne@69 256
jpayne@69 257
jpayne@69 258 } // namespace _ (private)
jpayne@69 259
jpayne@69 260 constexpr auto integer = sequence(
jpayne@69 261 oneOf(
jpayne@69 262 transform(sequence(exactChar<'0'>(), exactChar<'x'>(), oneOrMore(hexDigit)), _::ParseInteger<16>()),
jpayne@69 263 transform(sequence(exactChar<'0'>(), many(octDigit)), _::ParseInteger<8>()),
jpayne@69 264 transform(sequence(charRange('1', '9'), many(digit)), _::ParseInteger<10>())),
jpayne@69 265 notLookingAt(alpha.orAny("_.")));
jpayne@69 266
jpayne@69 267 // =======================================================================================
jpayne@69 268 // Numbers (i.e. floats)
jpayne@69 269
jpayne@69 270 namespace _ { // private
jpayne@69 271
jpayne@69 272 struct ParseFloat {
jpayne@69 273 double operator()(const Array<char>& digits,
jpayne@69 274 const Maybe<Array<char>>& fraction,
jpayne@69 275 const Maybe<Tuple<Maybe<char>, Array<char>>>& exponent) const;
jpayne@69 276 };
jpayne@69 277
jpayne@69 278 } // namespace _ (private)
jpayne@69 279
jpayne@69 280 constexpr auto number = transform(
jpayne@69 281 sequence(
jpayne@69 282 oneOrMore(digit),
jpayne@69 283 optional(sequence(exactChar<'.'>(), many(digit))),
jpayne@69 284 optional(sequence(discard(anyOfChars("eE")), optional(anyOfChars("+-")), many(digit))),
jpayne@69 285 notLookingAt(alpha.orAny("_."))),
jpayne@69 286 _::ParseFloat());
jpayne@69 287
jpayne@69 288 // =======================================================================================
jpayne@69 289 // Quoted strings
jpayne@69 290
jpayne@69 291 namespace _ { // private
jpayne@69 292
struct InterpretEscape {
  // Transform functor: maps the letter of a single-character escape (the character after the
  // backslash) to the character it denotes. Anything not listed stands for itself.
  char operator()(char c) const {
    if (c == 'a') return '\a';
    if (c == 'b') return '\b';
    if (c == 'f') return '\f';
    if (c == 'n') return '\n';
    if (c == 'r') return '\r';
    if (c == 't') return '\t';
    if (c == 'v') return '\v';
    return c;
  }
};
jpayne@69 307
jpayne@69 308 struct ParseHexEscape {
jpayne@69 309 inline char operator()(char first, char second) const {
jpayne@69 310 return (parseDigit(first) << 4) | parseDigit(second);
jpayne@69 311 }
jpayne@69 312 };
jpayne@69 313
jpayne@69 314 struct ParseHexByte {
jpayne@69 315 inline byte operator()(char first, char second) const {
jpayne@69 316 return (parseDigit(first) << 4) | parseDigit(second);
jpayne@69 317 }
jpayne@69 318 };
jpayne@69 319
jpayne@69 320 struct ParseOctEscape {
jpayne@69 321 inline char operator()(char first, Maybe<char> second, Maybe<char> third) const {
jpayne@69 322 char result = first - '0';
jpayne@69 323 KJ_IF_MAYBE(digit1, second) {
jpayne@69 324 result = (result << 3) | (*digit1 - '0');
jpayne@69 325 KJ_IF_MAYBE(digit2, third) {
jpayne@69 326 result = (result << 3) | (*digit2 - '0');
jpayne@69 327 }
jpayne@69 328 }
jpayne@69 329 return result;
jpayne@69 330 }
jpayne@69 331 };
jpayne@69 332
jpayne@69 333 } // namespace _ (private)
jpayne@69 334
jpayne@69 335 constexpr auto escapeSequence =
jpayne@69 336 sequence(exactChar<'\\'>(), oneOf(
jpayne@69 337 transform(anyOfChars("abfnrtv'\"\\\?"), _::InterpretEscape()),
jpayne@69 338 transform(sequence(exactChar<'x'>(), hexDigit, hexDigit), _::ParseHexEscape()),
jpayne@69 339 transform(sequence(octDigit, optional(octDigit), optional(octDigit)),
jpayne@69 340 _::ParseOctEscape())));
jpayne@69 341 // A parser that parses a C-string-style escape sequence (starting with a backslash). Returns
jpayne@69 342 // a char.
jpayne@69 343
jpayne@69 344 constexpr auto doubleQuotedString = charsToString(sequence(
jpayne@69 345 exactChar<'\"'>(),
jpayne@69 346 many(oneOf(anyOfChars("\\\n\"").invert(), escapeSequence)),
jpayne@69 347 exactChar<'\"'>()));
jpayne@69 348 // Parses a C-style double-quoted string.
jpayne@69 349
jpayne@69 350 constexpr auto singleQuotedString = charsToString(sequence(
jpayne@69 351 exactChar<'\''>(),
jpayne@69 352 many(oneOf(anyOfChars("\\\n\'").invert(), escapeSequence)),
jpayne@69 353 exactChar<'\''>()));
jpayne@69 354 // Parses a C-style single-quoted string.
jpayne@69 355
jpayne@69 356 constexpr auto doubleQuotedHexBinary = sequence(
jpayne@69 357 exactChar<'0'>(), exactChar<'x'>(), exactChar<'\"'>(),
jpayne@69 358 oneOrMore(transform(sequence(discardWhitespace, hexDigit, hexDigit), _::ParseHexByte())),
jpayne@69 359 discardWhitespace,
jpayne@69 360 exactChar<'\"'>());
jpayne@69 361 // Parses a double-quoted hex binary literal. Returns Array<byte>.
jpayne@69 362
jpayne@69 363 } // namespace parse
jpayne@69 364 } // namespace kj
jpayne@69 365
jpayne@69 366 KJ_END_HEADER