jpayne@69
|
1 // Copyright (c) 2013-2014 Sandstorm Development Group, Inc. and contributors
|
jpayne@69
|
2 // Licensed under the MIT License:
|
jpayne@69
|
3 //
|
jpayne@69
|
4 // Permission is hereby granted, free of charge, to any person obtaining a copy
|
jpayne@69
|
5 // of this software and associated documentation files (the "Software"), to deal
|
jpayne@69
|
6 // in the Software without restriction, including without limitation the rights
|
jpayne@69
|
7 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
jpayne@69
|
8 // copies of the Software, and to permit persons to whom the Software is
|
jpayne@69
|
9 // furnished to do so, subject to the following conditions:
|
jpayne@69
|
10 //
|
jpayne@69
|
11 // The above copyright notice and this permission notice shall be included in
|
jpayne@69
|
12 // all copies or substantial portions of the Software.
|
jpayne@69
|
13 //
|
jpayne@69
|
14 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
jpayne@69
|
15 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
jpayne@69
|
16 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
jpayne@69
|
17 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
jpayne@69
|
18 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
jpayne@69
|
19 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
jpayne@69
|
20 // THE SOFTWARE.
|
jpayne@69
|
21
|
jpayne@69
|
22 // This file contains parsers useful for character stream inputs, including parsers to parse
|
jpayne@69
|
23 // common kinds of tokens like identifiers, numbers, and quoted strings.
|
jpayne@69
|
24
|
jpayne@69
|
25 #pragma once
|
jpayne@69
|
26
|
jpayne@69
|
27 #include "common.h"
|
jpayne@69
|
28 #include "../string.h"
|
jpayne@69
|
29 #include <inttypes.h>
|
jpayne@69
|
30
|
jpayne@69
|
31 KJ_BEGIN_HEADER
|
jpayne@69
|
32
|
jpayne@69
|
33 namespace kj {
|
jpayne@69
|
34 namespace parse {
|
jpayne@69
|
35
|
jpayne@69
|
36 // =======================================================================================
|
jpayne@69
|
37 // Exact char/string.
|
jpayne@69
|
38
|
jpayne@69
|
39 class ExactString_ {
|
jpayne@69
|
40 public:
|
jpayne@69
|
41 constexpr inline ExactString_(const char* str): str(str) {}
|
jpayne@69
|
42
|
jpayne@69
|
43 template <typename Input>
|
jpayne@69
|
44 Maybe<Tuple<>> operator()(Input& input) const {
|
jpayne@69
|
45 const char* ptr = str;
|
jpayne@69
|
46
|
jpayne@69
|
47 while (*ptr != '\0') {
|
jpayne@69
|
48 if (input.atEnd() || input.current() != *ptr) return nullptr;
|
jpayne@69
|
49 input.next();
|
jpayne@69
|
50 ++ptr;
|
jpayne@69
|
51 }
|
jpayne@69
|
52
|
jpayne@69
|
53 return Tuple<>();
|
jpayne@69
|
54 }
|
jpayne@69
|
55
|
jpayne@69
|
56 private:
|
jpayne@69
|
57 const char* str;
|
jpayne@69
|
58 };
|
jpayne@69
|
59
|
jpayne@69
|
60 constexpr inline ExactString_ exactString(const char* str) {
|
jpayne@69
|
61 return ExactString_(str);
|
jpayne@69
|
62 }
|
jpayne@69
|
63
|
jpayne@69
|
64 template <char c>
|
jpayne@69
|
65 constexpr ExactlyConst_<char, c> exactChar() {
|
jpayne@69
|
66 // Returns a parser that matches exactly the character given by the template argument (returning
|
jpayne@69
|
67 // no result).
|
jpayne@69
|
68 return ExactlyConst_<char, c>();
|
jpayne@69
|
69 }
|
jpayne@69
|
70
|
jpayne@69
|
71 // =======================================================================================
|
jpayne@69
|
72 // Char ranges / sets
|
jpayne@69
|
73
|
jpayne@69
|
74 class CharGroup_ {
|
jpayne@69
|
75 public:
|
jpayne@69
|
76 constexpr inline CharGroup_(): bits{0, 0, 0, 0} {}
|
jpayne@69
|
77
|
jpayne@69
|
78 constexpr inline CharGroup_ orRange(unsigned char first, unsigned char last) const {
|
jpayne@69
|
79 return CharGroup_(bits[0] | (oneBits(last + 1) & ~oneBits(first )),
|
jpayne@69
|
80 bits[1] | (oneBits(last - 63) & ~oneBits(first - 64)),
|
jpayne@69
|
81 bits[2] | (oneBits(last - 127) & ~oneBits(first - 128)),
|
jpayne@69
|
82 bits[3] | (oneBits(last - 191) & ~oneBits(first - 192)));
|
jpayne@69
|
83 }
|
jpayne@69
|
84
|
jpayne@69
|
85 constexpr inline CharGroup_ orAny(const char* chars) const {
|
jpayne@69
|
86 return *chars == 0 ? *this : orChar(*chars).orAny(chars + 1);
|
jpayne@69
|
87 }
|
jpayne@69
|
88
|
jpayne@69
|
89 constexpr inline CharGroup_ orChar(unsigned char c) const {
|
jpayne@69
|
90 return CharGroup_(bits[0] | bit(c),
|
jpayne@69
|
91 bits[1] | bit(c - 64),
|
jpayne@69
|
92 bits[2] | bit(c - 128),
|
jpayne@69
|
93 bits[3] | bit(c - 256));
|
jpayne@69
|
94 }
|
jpayne@69
|
95
|
jpayne@69
|
96 constexpr inline CharGroup_ orGroup(CharGroup_ other) const {
|
jpayne@69
|
97 return CharGroup_(bits[0] | other.bits[0],
|
jpayne@69
|
98 bits[1] | other.bits[1],
|
jpayne@69
|
99 bits[2] | other.bits[2],
|
jpayne@69
|
100 bits[3] | other.bits[3]);
|
jpayne@69
|
101 }
|
jpayne@69
|
102
|
jpayne@69
|
103 constexpr inline CharGroup_ invert() const {
|
jpayne@69
|
104 return CharGroup_(~bits[0], ~bits[1], ~bits[2], ~bits[3]);
|
jpayne@69
|
105 }
|
jpayne@69
|
106
|
jpayne@69
|
107 constexpr inline bool contains(unsigned char c) const {
|
jpayne@69
|
108 return (bits[c / 64] & (1ll << (c % 64))) != 0;
|
jpayne@69
|
109 }
|
jpayne@69
|
110
|
jpayne@69
|
111 inline bool containsAll(ArrayPtr<const char> text) const {
|
jpayne@69
|
112 for (char c: text) {
|
jpayne@69
|
113 if (!contains(c)) return false;
|
jpayne@69
|
114 }
|
jpayne@69
|
115 return true;
|
jpayne@69
|
116 }
|
jpayne@69
|
117
|
jpayne@69
|
118 template <typename Input>
|
jpayne@69
|
119 Maybe<char> operator()(Input& input) const {
|
jpayne@69
|
120 if (input.atEnd()) return nullptr;
|
jpayne@69
|
121 unsigned char c = input.current();
|
jpayne@69
|
122 if (contains(c)) {
|
jpayne@69
|
123 input.next();
|
jpayne@69
|
124 return c;
|
jpayne@69
|
125 } else {
|
jpayne@69
|
126 return nullptr;
|
jpayne@69
|
127 }
|
jpayne@69
|
128 }
|
jpayne@69
|
129
|
jpayne@69
|
130 private:
|
jpayne@69
|
131 typedef unsigned long long Bits64;
|
jpayne@69
|
132
|
jpayne@69
|
133 constexpr inline CharGroup_(Bits64 a, Bits64 b, Bits64 c, Bits64 d): bits{a, b, c, d} {}
|
jpayne@69
|
134 Bits64 bits[4];
|
jpayne@69
|
135
|
jpayne@69
|
136 static constexpr inline Bits64 oneBits(int count) {
|
jpayne@69
|
137 return count <= 0 ? 0ll : count >= 64 ? -1ll : ((1ll << count) - 1);
|
jpayne@69
|
138 }
|
jpayne@69
|
139 static constexpr inline Bits64 bit(int index) {
|
jpayne@69
|
140 return index < 0 ? 0 : index >= 64 ? 0 : (1ll << index);
|
jpayne@69
|
141 }
|
jpayne@69
|
142 };
|
jpayne@69
|
143
|
jpayne@69
|
144 constexpr inline CharGroup_ charRange(char first, char last) {
|
jpayne@69
|
145 // Create a parser which accepts any character in the range from `first` to `last`, inclusive.
|
jpayne@69
|
146 // For example: `charRange('a', 'z')` matches all lower-case letters. The parser's result is the
|
jpayne@69
|
147 // character matched.
|
jpayne@69
|
148 //
|
jpayne@69
|
149 // The returned object has methods which can be used to match more characters. The following
|
jpayne@69
|
150 // produces a parser which accepts any letter as well as '_', '+', '-', and '.'.
|
jpayne@69
|
151 //
|
jpayne@69
|
152 // charRange('a', 'z').orRange('A', 'Z').orChar('_').orAny("+-.")
|
jpayne@69
|
153 //
|
jpayne@69
|
154 // You can also use `.invert()` to match the opposite set of characters.
|
jpayne@69
|
155
|
jpayne@69
|
156 return CharGroup_().orRange(first, last);
|
jpayne@69
|
157 }
|
jpayne@69
|
158
|
jpayne@69
|
159 #if _MSC_VER && !defined(__clang__)
|
jpayne@69
|
160 #define anyOfChars(chars) CharGroup_().orAny(chars)
|
jpayne@69
|
161 // TODO(msvc): MSVC ICEs on the proper definition of `anyOfChars()`, which in turn prevents us from
|
jpayne@69
|
162 // building the compiler or schema parser. We don't know why this happens, but Harris found that
|
jpayne@69
|
163 // this horrible, horrible hack makes things work. This is awful, but it's better than nothing.
|
jpayne@69
|
164 // Hopefully, MSVC will get fixed soon and we'll be able to remove this.
|
jpayne@69
|
165 #else
|
jpayne@69
|
166 constexpr inline CharGroup_ anyOfChars(const char* chars) {
|
jpayne@69
|
167 // Returns a parser that accepts any of the characters in the given string (which should usually
|
jpayne@69
|
168 // be a literal). The returned parser is of the same type as returned by `charRange()` -- see
|
jpayne@69
|
169 // that function for more info.
|
jpayne@69
|
170
|
jpayne@69
|
171 return CharGroup_().orAny(chars);
|
jpayne@69
|
172 }
|
jpayne@69
|
173 #endif
|
jpayne@69
|
174
|
jpayne@69
|
175 // =======================================================================================
|
jpayne@69
|
176
|
jpayne@69
|
177 namespace _ { // private
|
jpayne@69
|
178
|
jpayne@69
|
179 struct ArrayToString {
|
jpayne@69
|
180 inline String operator()(const Array<char>& arr) const {
|
jpayne@69
|
181 return heapString(arr);
|
jpayne@69
|
182 }
|
jpayne@69
|
183 };
|
jpayne@69
|
184
|
jpayne@69
|
185 } // namespace _ (private)
|
jpayne@69
|
186
|
jpayne@69
|
187 template <typename SubParser>
|
jpayne@69
|
188 constexpr inline auto charsToString(SubParser&& subParser)
|
jpayne@69
|
189 -> decltype(transform(kj::fwd<SubParser>(subParser), _::ArrayToString())) {
|
jpayne@69
|
190 // Wraps a parser that returns Array<char> such that it returns String instead.
|
jpayne@69
|
191 return parse::transform(kj::fwd<SubParser>(subParser), _::ArrayToString());
|
jpayne@69
|
192 }
|
jpayne@69
|
193
|
jpayne@69
|
194 // =======================================================================================
|
jpayne@69
|
195 // Basic character classes.
|
jpayne@69
|
196
|
jpayne@69
|
197 constexpr auto alpha = charRange('a', 'z').orRange('A', 'Z');
|
jpayne@69
|
198 constexpr auto digit = charRange('0', '9');
|
jpayne@69
|
199 constexpr auto alphaNumeric = alpha.orGroup(digit);
|
jpayne@69
|
200 constexpr auto nameStart = alpha.orChar('_');
|
jpayne@69
|
201 constexpr auto nameChar = alphaNumeric.orChar('_');
|
jpayne@69
|
202 constexpr auto hexDigit = charRange('0', '9').orRange('a', 'f').orRange('A', 'F');
|
jpayne@69
|
203 constexpr auto octDigit = charRange('0', '7');
|
jpayne@69
|
204 constexpr auto whitespaceChar = anyOfChars(" \f\n\r\t\v");
|
jpayne@69
|
205 constexpr auto controlChar = charRange(0, 0x1f).invert().orGroup(whitespaceChar).invert();
|
jpayne@69
|
206
|
jpayne@69
|
207 constexpr auto whitespace = many(anyOfChars(" \f\n\r\t\v"));
|
jpayne@69
|
208
|
jpayne@69
|
209 constexpr auto discardWhitespace = discard(many(discard(anyOfChars(" \f\n\r\t\v"))));
|
jpayne@69
|
210 // Like discard(whitespace) but avoids some memory allocation.
|
jpayne@69
|
211
|
jpayne@69
|
212 // =======================================================================================
|
jpayne@69
|
213 // Identifiers
|
jpayne@69
|
214
|
jpayne@69
|
215 namespace _ { // private
|
jpayne@69
|
216
|
jpayne@69
|
217 struct IdentifierToString {
|
jpayne@69
|
218 inline String operator()(char first, const Array<char>& rest) const {
|
jpayne@69
|
219 if (rest.size() == 0) return heapString(&first, 1);
|
jpayne@69
|
220 String result = heapString(rest.size() + 1);
|
jpayne@69
|
221 result[0] = first;
|
jpayne@69
|
222 memcpy(result.begin() + 1, rest.begin(), rest.size());
|
jpayne@69
|
223 return result;
|
jpayne@69
|
224 }
|
jpayne@69
|
225 };
|
jpayne@69
|
226
|
jpayne@69
|
227 } // namespace _ (private)
|
jpayne@69
|
228
|
jpayne@69
|
229 constexpr auto identifier = transform(sequence(nameStart, many(nameChar)), _::IdentifierToString());
|
jpayne@69
|
230 // Parses an identifier (e.g. a C variable name).
|
jpayne@69
|
231
|
jpayne@69
|
232 // =======================================================================================
|
jpayne@69
|
233 // Integers
|
jpayne@69
|
234
|
jpayne@69
|
235 namespace _ { // private
|
jpayne@69
|
236
|
jpayne@69
|
inline char parseDigit(char c) {
  // Converts a digit character to its numeric value: '0'-'9' give 0-9, while 'A'-'F' and 'a'-'f'
  // give 10-15.  No validation is performed; the character is classified purely by comparing it
  // against 'A' and 'a', and the matching offset is subtracted.
  const char offset = c >= 'a' ? 'a' - 10
                    : c >= 'A' ? 'A' - 10
                    : '0';
  return c - offset;
}
|
jpayne@69
|
242
|
jpayne@69
|
243 template <uint base>
|
jpayne@69
|
244 struct ParseInteger {
|
jpayne@69
|
245 inline uint64_t operator()(const Array<char>& digits) const {
|
jpayne@69
|
246 return operator()('0', digits);
|
jpayne@69
|
247 }
|
jpayne@69
|
248 uint64_t operator()(char first, const Array<char>& digits) const {
|
jpayne@69
|
249 uint64_t result = parseDigit(first);
|
jpayne@69
|
250 for (char digit: digits) {
|
jpayne@69
|
251 result = result * base + parseDigit(digit);
|
jpayne@69
|
252 }
|
jpayne@69
|
253 return result;
|
jpayne@69
|
254 }
|
jpayne@69
|
255 };
|
jpayne@69
|
256
|
jpayne@69
|
257
|
jpayne@69
|
258 } // namespace _ (private)
|
jpayne@69
|
259
|
jpayne@69
|
260 constexpr auto integer = sequence(
|
jpayne@69
|
261 oneOf(
|
jpayne@69
|
262 transform(sequence(exactChar<'0'>(), exactChar<'x'>(), oneOrMore(hexDigit)), _::ParseInteger<16>()),
|
jpayne@69
|
263 transform(sequence(exactChar<'0'>(), many(octDigit)), _::ParseInteger<8>()),
|
jpayne@69
|
264 transform(sequence(charRange('1', '9'), many(digit)), _::ParseInteger<10>())),
|
jpayne@69
|
265 notLookingAt(alpha.orAny("_.")));
|
jpayne@69
|
266
|
jpayne@69
|
267 // =======================================================================================
|
jpayne@69
|
268 // Numbers (i.e. floats)
|
jpayne@69
|
269
|
jpayne@69
|
270 namespace _ { // private
|
jpayne@69
|
271
|
jpayne@69
|
272 struct ParseFloat {
|
jpayne@69
|
273 double operator()(const Array<char>& digits,
|
jpayne@69
|
274 const Maybe<Array<char>>& fraction,
|
jpayne@69
|
275 const Maybe<Tuple<Maybe<char>, Array<char>>>& exponent) const;
|
jpayne@69
|
276 };
|
jpayne@69
|
277
|
jpayne@69
|
278 } // namespace _ (private)
|
jpayne@69
|
279
|
jpayne@69
|
280 constexpr auto number = transform(
|
jpayne@69
|
281 sequence(
|
jpayne@69
|
282 oneOrMore(digit),
|
jpayne@69
|
283 optional(sequence(exactChar<'.'>(), many(digit))),
|
jpayne@69
|
284 optional(sequence(discard(anyOfChars("eE")), optional(anyOfChars("+-")), many(digit))),
|
jpayne@69
|
285 notLookingAt(alpha.orAny("_."))),
|
jpayne@69
|
286 _::ParseFloat());
|
jpayne@69
|
287
|
jpayne@69
|
288 // =======================================================================================
|
jpayne@69
|
289 // Quoted strings
|
jpayne@69
|
290
|
jpayne@69
|
291 namespace _ { // private
|
jpayne@69
|
292
|
jpayne@69
|
struct InterpretEscape {
  // Maps the letter of a single-character escape (the character after the backslash) to the
  // character it denotes: a, b, f, n, r, t and v become the corresponding control characters.
  // Any other character is returned unchanged.
  char operator()(char c) const {
    // Parallel tables: letters[i] translates to values[i].
    const char* const letters = "abfnrtv";
    const char* const values = "\a\b\f\n\r\t\v";

    for (int i = 0; letters[i] != '\0'; ++i) {
      if (letters[i] == c) return values[i];
    }
    return c;
  }
};
|
jpayne@69
|
307
|
jpayne@69
|
308 struct ParseHexEscape {
|
jpayne@69
|
309 inline char operator()(char first, char second) const {
|
jpayne@69
|
310 return (parseDigit(first) << 4) | parseDigit(second);
|
jpayne@69
|
311 }
|
jpayne@69
|
312 };
|
jpayne@69
|
313
|
jpayne@69
|
314 struct ParseHexByte {
|
jpayne@69
|
315 inline byte operator()(char first, char second) const {
|
jpayne@69
|
316 return (parseDigit(first) << 4) | parseDigit(second);
|
jpayne@69
|
317 }
|
jpayne@69
|
318 };
|
jpayne@69
|
319
|
jpayne@69
|
320 struct ParseOctEscape {
|
jpayne@69
|
321 inline char operator()(char first, Maybe<char> second, Maybe<char> third) const {
|
jpayne@69
|
322 char result = first - '0';
|
jpayne@69
|
323 KJ_IF_MAYBE(digit1, second) {
|
jpayne@69
|
324 result = (result << 3) | (*digit1 - '0');
|
jpayne@69
|
325 KJ_IF_MAYBE(digit2, third) {
|
jpayne@69
|
326 result = (result << 3) | (*digit2 - '0');
|
jpayne@69
|
327 }
|
jpayne@69
|
328 }
|
jpayne@69
|
329 return result;
|
jpayne@69
|
330 }
|
jpayne@69
|
331 };
|
jpayne@69
|
332
|
jpayne@69
|
333 } // namespace _ (private)
|
jpayne@69
|
334
|
jpayne@69
|
335 constexpr auto escapeSequence =
|
jpayne@69
|
336 sequence(exactChar<'\\'>(), oneOf(
|
jpayne@69
|
337 transform(anyOfChars("abfnrtv'\"\\\?"), _::InterpretEscape()),
|
jpayne@69
|
338 transform(sequence(exactChar<'x'>(), hexDigit, hexDigit), _::ParseHexEscape()),
|
jpayne@69
|
339 transform(sequence(octDigit, optional(octDigit), optional(octDigit)),
|
jpayne@69
|
340 _::ParseOctEscape())));
|
jpayne@69
|
341 // A parser that parses a C-string-style escape sequence (starting with a backslash). Returns
|
jpayne@69
|
342 // a char.
|
jpayne@69
|
343
|
jpayne@69
|
344 constexpr auto doubleQuotedString = charsToString(sequence(
|
jpayne@69
|
345 exactChar<'\"'>(),
|
jpayne@69
|
346 many(oneOf(anyOfChars("\\\n\"").invert(), escapeSequence)),
|
jpayne@69
|
347 exactChar<'\"'>()));
|
jpayne@69
|
348 // Parses a C-style double-quoted string.
|
jpayne@69
|
349
|
jpayne@69
|
350 constexpr auto singleQuotedString = charsToString(sequence(
|
jpayne@69
|
351 exactChar<'\''>(),
|
jpayne@69
|
352 many(oneOf(anyOfChars("\\\n\'").invert(), escapeSequence)),
|
jpayne@69
|
353 exactChar<'\''>()));
|
jpayne@69
|
354 // Parses a C-style single-quoted string.
|
jpayne@69
|
355
|
jpayne@69
|
356 constexpr auto doubleQuotedHexBinary = sequence(
|
jpayne@69
|
357 exactChar<'0'>(), exactChar<'x'>(), exactChar<'\"'>(),
|
jpayne@69
|
358 oneOrMore(transform(sequence(discardWhitespace, hexDigit, hexDigit), _::ParseHexByte())),
|
jpayne@69
|
359 discardWhitespace,
|
jpayne@69
|
360 exactChar<'\"'>());
|
jpayne@69
|
361 // Parses a double-quoted hex binary literal. Returns Array<byte>.
|
jpayne@69
|
362
|
jpayne@69
|
363 } // namespace parse
|
jpayne@69
|
364 } // namespace kj
|
jpayne@69
|
365
|
jpayne@69
|
366 KJ_END_HEADER
|