// Copyright (c) 2013-2014 Sandstorm Development Group, Inc. and contributors
// Licensed under the MIT License:
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

// This file contains parsers useful for character stream inputs, including parsers to parse
// common kinds of tokens like identifiers, numbers, and quoted strings.

#pragma once

#include "common.h"
#include "../string.h"
// NOTE(review): the header name on the next line was lost in extraction; <inttypes.h> is assumed
// (uint64_t is used below) -- confirm against the original file.
#include <inttypes.h>

KJ_BEGIN_HEADER

namespace kj {
namespace parse {

// =======================================================================================
// Exact char/string.
jpayne@69: jpayne@69: class ExactString_ { jpayne@69: public: jpayne@69: constexpr inline ExactString_(const char* str): str(str) {} jpayne@69: jpayne@69: template jpayne@69: Maybe> operator()(Input& input) const { jpayne@69: const char* ptr = str; jpayne@69: jpayne@69: while (*ptr != '\0') { jpayne@69: if (input.atEnd() || input.current() != *ptr) return nullptr; jpayne@69: input.next(); jpayne@69: ++ptr; jpayne@69: } jpayne@69: jpayne@69: return Tuple<>(); jpayne@69: } jpayne@69: jpayne@69: private: jpayne@69: const char* str; jpayne@69: }; jpayne@69: jpayne@69: constexpr inline ExactString_ exactString(const char* str) { jpayne@69: return ExactString_(str); jpayne@69: } jpayne@69: jpayne@69: template jpayne@69: constexpr ExactlyConst_ exactChar() { jpayne@69: // Returns a parser that matches exactly the character given by the template argument (returning jpayne@69: // no result). jpayne@69: return ExactlyConst_(); jpayne@69: } jpayne@69: jpayne@69: // ======================================================================================= jpayne@69: // Char ranges / sets jpayne@69: jpayne@69: class CharGroup_ { jpayne@69: public: jpayne@69: constexpr inline CharGroup_(): bits{0, 0, 0, 0} {} jpayne@69: jpayne@69: constexpr inline CharGroup_ orRange(unsigned char first, unsigned char last) const { jpayne@69: return CharGroup_(bits[0] | (oneBits(last + 1) & ~oneBits(first )), jpayne@69: bits[1] | (oneBits(last - 63) & ~oneBits(first - 64)), jpayne@69: bits[2] | (oneBits(last - 127) & ~oneBits(first - 128)), jpayne@69: bits[3] | (oneBits(last - 191) & ~oneBits(first - 192))); jpayne@69: } jpayne@69: jpayne@69: constexpr inline CharGroup_ orAny(const char* chars) const { jpayne@69: return *chars == 0 ? 
*this : orChar(*chars).orAny(chars + 1); jpayne@69: } jpayne@69: jpayne@69: constexpr inline CharGroup_ orChar(unsigned char c) const { jpayne@69: return CharGroup_(bits[0] | bit(c), jpayne@69: bits[1] | bit(c - 64), jpayne@69: bits[2] | bit(c - 128), jpayne@69: bits[3] | bit(c - 256)); jpayne@69: } jpayne@69: jpayne@69: constexpr inline CharGroup_ orGroup(CharGroup_ other) const { jpayne@69: return CharGroup_(bits[0] | other.bits[0], jpayne@69: bits[1] | other.bits[1], jpayne@69: bits[2] | other.bits[2], jpayne@69: bits[3] | other.bits[3]); jpayne@69: } jpayne@69: jpayne@69: constexpr inline CharGroup_ invert() const { jpayne@69: return CharGroup_(~bits[0], ~bits[1], ~bits[2], ~bits[3]); jpayne@69: } jpayne@69: jpayne@69: constexpr inline bool contains(unsigned char c) const { jpayne@69: return (bits[c / 64] & (1ll << (c % 64))) != 0; jpayne@69: } jpayne@69: jpayne@69: inline bool containsAll(ArrayPtr text) const { jpayne@69: for (char c: text) { jpayne@69: if (!contains(c)) return false; jpayne@69: } jpayne@69: return true; jpayne@69: } jpayne@69: jpayne@69: template jpayne@69: Maybe operator()(Input& input) const { jpayne@69: if (input.atEnd()) return nullptr; jpayne@69: unsigned char c = input.current(); jpayne@69: if (contains(c)) { jpayne@69: input.next(); jpayne@69: return c; jpayne@69: } else { jpayne@69: return nullptr; jpayne@69: } jpayne@69: } jpayne@69: jpayne@69: private: jpayne@69: typedef unsigned long long Bits64; jpayne@69: jpayne@69: constexpr inline CharGroup_(Bits64 a, Bits64 b, Bits64 c, Bits64 d): bits{a, b, c, d} {} jpayne@69: Bits64 bits[4]; jpayne@69: jpayne@69: static constexpr inline Bits64 oneBits(int count) { jpayne@69: return count <= 0 ? 0ll : count >= 64 ? -1ll : ((1ll << count) - 1); jpayne@69: } jpayne@69: static constexpr inline Bits64 bit(int index) { jpayne@69: return index < 0 ? 0 : index >= 64 ? 
0 : (1ll << index); jpayne@69: } jpayne@69: }; jpayne@69: jpayne@69: constexpr inline CharGroup_ charRange(char first, char last) { jpayne@69: // Create a parser which accepts any character in the range from `first` to `last`, inclusive. jpayne@69: // For example: `charRange('a', 'z')` matches all lower-case letters. The parser's result is the jpayne@69: // character matched. jpayne@69: // jpayne@69: // The returned object has methods which can be used to match more characters. The following jpayne@69: // produces a parser which accepts any letter as well as '_', '+', '-', and '.'. jpayne@69: // jpayne@69: // charRange('a', 'z').orRange('A', 'Z').orChar('_').orAny("+-.") jpayne@69: // jpayne@69: // You can also use `.invert()` to match the opposite set of characters. jpayne@69: jpayne@69: return CharGroup_().orRange(first, last); jpayne@69: } jpayne@69: jpayne@69: #if _MSC_VER && !defined(__clang__) jpayne@69: #define anyOfChars(chars) CharGroup_().orAny(chars) jpayne@69: // TODO(msvc): MSVC ICEs on the proper definition of `anyOfChars()`, which in turn prevents us from jpayne@69: // building the compiler or schema parser. We don't know why this happens, but Harris found that jpayne@69: // this horrible, horrible hack makes things work. This is awful, but it's better than nothing. jpayne@69: // Hopefully, MSVC will get fixed soon and we'll be able to remove this. jpayne@69: #else jpayne@69: constexpr inline CharGroup_ anyOfChars(const char* chars) { jpayne@69: // Returns a parser that accepts any of the characters in the given string (which should usually jpayne@69: // be a literal). The returned parser is of the same type as returned by `charRange()` -- see jpayne@69: // that function for more info. 
jpayne@69: jpayne@69: return CharGroup_().orAny(chars); jpayne@69: } jpayne@69: #endif jpayne@69: jpayne@69: // ======================================================================================= jpayne@69: jpayne@69: namespace _ { // private jpayne@69: jpayne@69: struct ArrayToString { jpayne@69: inline String operator()(const Array& arr) const { jpayne@69: return heapString(arr); jpayne@69: } jpayne@69: }; jpayne@69: jpayne@69: } // namespace _ (private) jpayne@69: jpayne@69: template jpayne@69: constexpr inline auto charsToString(SubParser&& subParser) jpayne@69: -> decltype(transform(kj::fwd(subParser), _::ArrayToString())) { jpayne@69: // Wraps a parser that returns Array such that it returns String instead. jpayne@69: return parse::transform(kj::fwd(subParser), _::ArrayToString()); jpayne@69: } jpayne@69: jpayne@69: // ======================================================================================= jpayne@69: // Basic character classes. jpayne@69: jpayne@69: constexpr auto alpha = charRange('a', 'z').orRange('A', 'Z'); jpayne@69: constexpr auto digit = charRange('0', '9'); jpayne@69: constexpr auto alphaNumeric = alpha.orGroup(digit); jpayne@69: constexpr auto nameStart = alpha.orChar('_'); jpayne@69: constexpr auto nameChar = alphaNumeric.orChar('_'); jpayne@69: constexpr auto hexDigit = charRange('0', '9').orRange('a', 'f').orRange('A', 'F'); jpayne@69: constexpr auto octDigit = charRange('0', '7'); jpayne@69: constexpr auto whitespaceChar = anyOfChars(" \f\n\r\t\v"); jpayne@69: constexpr auto controlChar = charRange(0, 0x1f).invert().orGroup(whitespaceChar).invert(); jpayne@69: jpayne@69: constexpr auto whitespace = many(anyOfChars(" \f\n\r\t\v")); jpayne@69: jpayne@69: constexpr auto discardWhitespace = discard(many(discard(anyOfChars(" \f\n\r\t\v")))); jpayne@69: // Like discard(whitespace) but avoids some memory allocation. 
jpayne@69: jpayne@69: // ======================================================================================= jpayne@69: // Identifiers jpayne@69: jpayne@69: namespace _ { // private jpayne@69: jpayne@69: struct IdentifierToString { jpayne@69: inline String operator()(char first, const Array& rest) const { jpayne@69: if (rest.size() == 0) return heapString(&first, 1); jpayne@69: String result = heapString(rest.size() + 1); jpayne@69: result[0] = first; jpayne@69: memcpy(result.begin() + 1, rest.begin(), rest.size()); jpayne@69: return result; jpayne@69: } jpayne@69: }; jpayne@69: jpayne@69: } // namespace _ (private) jpayne@69: jpayne@69: constexpr auto identifier = transform(sequence(nameStart, many(nameChar)), _::IdentifierToString()); jpayne@69: // Parses an identifier (e.g. a C variable name). jpayne@69: jpayne@69: // ======================================================================================= jpayne@69: // Integers jpayne@69: jpayne@69: namespace _ { // private jpayne@69: jpayne@69: inline char parseDigit(char c) { jpayne@69: if (c < 'A') return c - '0'; jpayne@69: if (c < 'a') return c - 'A' + 10; jpayne@69: return c - 'a' + 10; jpayne@69: } jpayne@69: jpayne@69: template jpayne@69: struct ParseInteger { jpayne@69: inline uint64_t operator()(const Array& digits) const { jpayne@69: return operator()('0', digits); jpayne@69: } jpayne@69: uint64_t operator()(char first, const Array& digits) const { jpayne@69: uint64_t result = parseDigit(first); jpayne@69: for (char digit: digits) { jpayne@69: result = result * base + parseDigit(digit); jpayne@69: } jpayne@69: return result; jpayne@69: } jpayne@69: }; jpayne@69: jpayne@69: jpayne@69: } // namespace _ (private) jpayne@69: jpayne@69: constexpr auto integer = sequence( jpayne@69: oneOf( jpayne@69: transform(sequence(exactChar<'0'>(), exactChar<'x'>(), oneOrMore(hexDigit)), _::ParseInteger<16>()), jpayne@69: transform(sequence(exactChar<'0'>(), many(octDigit)), _::ParseInteger<8>()), jpayne@69: 
transform(sequence(charRange('1', '9'), many(digit)), _::ParseInteger<10>())), jpayne@69: notLookingAt(alpha.orAny("_."))); jpayne@69: jpayne@69: // ======================================================================================= jpayne@69: // Numbers (i.e. floats) jpayne@69: jpayne@69: namespace _ { // private jpayne@69: jpayne@69: struct ParseFloat { jpayne@69: double operator()(const Array& digits, jpayne@69: const Maybe>& fraction, jpayne@69: const Maybe, Array>>& exponent) const; jpayne@69: }; jpayne@69: jpayne@69: } // namespace _ (private) jpayne@69: jpayne@69: constexpr auto number = transform( jpayne@69: sequence( jpayne@69: oneOrMore(digit), jpayne@69: optional(sequence(exactChar<'.'>(), many(digit))), jpayne@69: optional(sequence(discard(anyOfChars("eE")), optional(anyOfChars("+-")), many(digit))), jpayne@69: notLookingAt(alpha.orAny("_."))), jpayne@69: _::ParseFloat()); jpayne@69: jpayne@69: // ======================================================================================= jpayne@69: // Quoted strings jpayne@69: jpayne@69: namespace _ { // private jpayne@69: jpayne@69: struct InterpretEscape { jpayne@69: char operator()(char c) const { jpayne@69: switch (c) { jpayne@69: case 'a': return '\a'; jpayne@69: case 'b': return '\b'; jpayne@69: case 'f': return '\f'; jpayne@69: case 'n': return '\n'; jpayne@69: case 'r': return '\r'; jpayne@69: case 't': return '\t'; jpayne@69: case 'v': return '\v'; jpayne@69: default: return c; jpayne@69: } jpayne@69: } jpayne@69: }; jpayne@69: jpayne@69: struct ParseHexEscape { jpayne@69: inline char operator()(char first, char second) const { jpayne@69: return (parseDigit(first) << 4) | parseDigit(second); jpayne@69: } jpayne@69: }; jpayne@69: jpayne@69: struct ParseHexByte { jpayne@69: inline byte operator()(char first, char second) const { jpayne@69: return (parseDigit(first) << 4) | parseDigit(second); jpayne@69: } jpayne@69: }; jpayne@69: jpayne@69: struct ParseOctEscape { jpayne@69: inline char 
operator()(char first, Maybe second, Maybe third) const { jpayne@69: char result = first - '0'; jpayne@69: KJ_IF_MAYBE(digit1, second) { jpayne@69: result = (result << 3) | (*digit1 - '0'); jpayne@69: KJ_IF_MAYBE(digit2, third) { jpayne@69: result = (result << 3) | (*digit2 - '0'); jpayne@69: } jpayne@69: } jpayne@69: return result; jpayne@69: } jpayne@69: }; jpayne@69: jpayne@69: } // namespace _ (private) jpayne@69: jpayne@69: constexpr auto escapeSequence = jpayne@69: sequence(exactChar<'\\'>(), oneOf( jpayne@69: transform(anyOfChars("abfnrtv'\"\\\?"), _::InterpretEscape()), jpayne@69: transform(sequence(exactChar<'x'>(), hexDigit, hexDigit), _::ParseHexEscape()), jpayne@69: transform(sequence(octDigit, optional(octDigit), optional(octDigit)), jpayne@69: _::ParseOctEscape()))); jpayne@69: // A parser that parses a C-string-style escape sequence (starting with a backslash). Returns jpayne@69: // a char. jpayne@69: jpayne@69: constexpr auto doubleQuotedString = charsToString(sequence( jpayne@69: exactChar<'\"'>(), jpayne@69: many(oneOf(anyOfChars("\\\n\"").invert(), escapeSequence)), jpayne@69: exactChar<'\"'>())); jpayne@69: // Parses a C-style double-quoted string. jpayne@69: jpayne@69: constexpr auto singleQuotedString = charsToString(sequence( jpayne@69: exactChar<'\''>(), jpayne@69: many(oneOf(anyOfChars("\\\n\'").invert(), escapeSequence)), jpayne@69: exactChar<'\''>())); jpayne@69: // Parses a C-style single-quoted string. jpayne@69: jpayne@69: constexpr auto doubleQuotedHexBinary = sequence( jpayne@69: exactChar<'0'>(), exactChar<'x'>(), exactChar<'\"'>(), jpayne@69: oneOrMore(transform(sequence(discardWhitespace, hexDigit, hexDigit), _::ParseHexByte())), jpayne@69: discardWhitespace, jpayne@69: exactChar<'\"'>()); jpayne@69: // Parses a double-quoted hex binary literal. Returns Array. jpayne@69: jpayne@69: } // namespace parse jpayne@69: } // namespace kj jpayne@69: jpayne@69: KJ_END_HEADER