comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/include/kj/parse/char.h @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 69:33d812a61356
1 // Copyright (c) 2013-2014 Sandstorm Development Group, Inc. and contributors
2 // Licensed under the MIT License:
3 //
4 // Permission is hereby granted, free of charge, to any person obtaining a copy
5 // of this software and associated documentation files (the "Software"), to deal
6 // in the Software without restriction, including without limitation the rights
7 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 // copies of the Software, and to permit persons to whom the Software is
9 // furnished to do so, subject to the following conditions:
10 //
11 // The above copyright notice and this permission notice shall be included in
12 // all copies or substantial portions of the Software.
13 //
14 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 // THE SOFTWARE.
21
22 // This file contains parsers useful for character stream inputs, including parsers to parse
23 // common kinds of tokens like identifiers, numbers, and quoted strings.
24
25 #pragma once
26
27 #include "common.h"
28 #include "../string.h"
29 #include <inttypes.h>
30
31 KJ_BEGIN_HEADER
32
33 namespace kj {
34 namespace parse {
35
36 // =======================================================================================
37 // Exact char/string.
38
39 class ExactString_ {
40 public:
41 constexpr inline ExactString_(const char* str): str(str) {}
42
43 template <typename Input>
44 Maybe<Tuple<>> operator()(Input& input) const {
45 const char* ptr = str;
46
47 while (*ptr != '\0') {
48 if (input.atEnd() || input.current() != *ptr) return nullptr;
49 input.next();
50 ++ptr;
51 }
52
53 return Tuple<>();
54 }
55
56 private:
57 const char* str;
58 };
59
60 constexpr inline ExactString_ exactString(const char* str) {
61 return ExactString_(str);
62 }
63
64 template <char c>
65 constexpr ExactlyConst_<char, c> exactChar() {
66 // Returns a parser that matches exactly the character given by the template argument (returning
67 // no result).
68 return ExactlyConst_<char, c>();
69 }
70
71 // =======================================================================================
72 // Char ranges / sets
73
74 class CharGroup_ {
75 public:
76 constexpr inline CharGroup_(): bits{0, 0, 0, 0} {}
77
78 constexpr inline CharGroup_ orRange(unsigned char first, unsigned char last) const {
79 return CharGroup_(bits[0] | (oneBits(last + 1) & ~oneBits(first )),
80 bits[1] | (oneBits(last - 63) & ~oneBits(first - 64)),
81 bits[2] | (oneBits(last - 127) & ~oneBits(first - 128)),
82 bits[3] | (oneBits(last - 191) & ~oneBits(first - 192)));
83 }
84
85 constexpr inline CharGroup_ orAny(const char* chars) const {
86 return *chars == 0 ? *this : orChar(*chars).orAny(chars + 1);
87 }
88
89 constexpr inline CharGroup_ orChar(unsigned char c) const {
90 return CharGroup_(bits[0] | bit(c),
91 bits[1] | bit(c - 64),
92 bits[2] | bit(c - 128),
93 bits[3] | bit(c - 256));
94 }
95
96 constexpr inline CharGroup_ orGroup(CharGroup_ other) const {
97 return CharGroup_(bits[0] | other.bits[0],
98 bits[1] | other.bits[1],
99 bits[2] | other.bits[2],
100 bits[3] | other.bits[3]);
101 }
102
103 constexpr inline CharGroup_ invert() const {
104 return CharGroup_(~bits[0], ~bits[1], ~bits[2], ~bits[3]);
105 }
106
107 constexpr inline bool contains(unsigned char c) const {
108 return (bits[c / 64] & (1ll << (c % 64))) != 0;
109 }
110
111 inline bool containsAll(ArrayPtr<const char> text) const {
112 for (char c: text) {
113 if (!contains(c)) return false;
114 }
115 return true;
116 }
117
118 template <typename Input>
119 Maybe<char> operator()(Input& input) const {
120 if (input.atEnd()) return nullptr;
121 unsigned char c = input.current();
122 if (contains(c)) {
123 input.next();
124 return c;
125 } else {
126 return nullptr;
127 }
128 }
129
130 private:
131 typedef unsigned long long Bits64;
132
133 constexpr inline CharGroup_(Bits64 a, Bits64 b, Bits64 c, Bits64 d): bits{a, b, c, d} {}
134 Bits64 bits[4];
135
136 static constexpr inline Bits64 oneBits(int count) {
137 return count <= 0 ? 0ll : count >= 64 ? -1ll : ((1ll << count) - 1);
138 }
139 static constexpr inline Bits64 bit(int index) {
140 return index < 0 ? 0 : index >= 64 ? 0 : (1ll << index);
141 }
142 };
143
144 constexpr inline CharGroup_ charRange(char first, char last) {
145 // Create a parser which accepts any character in the range from `first` to `last`, inclusive.
146 // For example: `charRange('a', 'z')` matches all lower-case letters. The parser's result is the
147 // character matched.
148 //
149 // The returned object has methods which can be used to match more characters. The following
150 // produces a parser which accepts any letter as well as '_', '+', '-', and '.'.
151 //
152 // charRange('a', 'z').orRange('A', 'Z').orChar('_').orAny("+-.")
153 //
154 // You can also use `.invert()` to match the opposite set of characters.
155
156 return CharGroup_().orRange(first, last);
157 }
158
159 #if _MSC_VER && !defined(__clang__)
160 #define anyOfChars(chars) CharGroup_().orAny(chars)
161 // TODO(msvc): MSVC ICEs on the proper definition of `anyOfChars()`, which in turn prevents us from
162 // building the compiler or schema parser. We don't know why this happens, but Harris found that
163 // this horrible, horrible hack makes things work. This is awful, but it's better than nothing.
164 // Hopefully, MSVC will get fixed soon and we'll be able to remove this.
165 #else
166 constexpr inline CharGroup_ anyOfChars(const char* chars) {
167 // Returns a parser that accepts any of the characters in the given string (which should usually
168 // be a literal). The returned parser is of the same type as returned by `charRange()` -- see
169 // that function for more info.
170
171 return CharGroup_().orAny(chars);
172 }
173 #endif
174
175 // =======================================================================================
176
177 namespace _ { // private
178
179 struct ArrayToString {
180 inline String operator()(const Array<char>& arr) const {
181 return heapString(arr);
182 }
183 };
184
185 } // namespace _ (private)
186
187 template <typename SubParser>
188 constexpr inline auto charsToString(SubParser&& subParser)
189 -> decltype(transform(kj::fwd<SubParser>(subParser), _::ArrayToString())) {
190 // Wraps a parser that returns Array<char> such that it returns String instead.
191 return parse::transform(kj::fwd<SubParser>(subParser), _::ArrayToString());
192 }
193
194 // =======================================================================================
195 // Basic character classes.
196
197 constexpr auto alpha = charRange('a', 'z').orRange('A', 'Z');
198 constexpr auto digit = charRange('0', '9');
199 constexpr auto alphaNumeric = alpha.orGroup(digit);
200 constexpr auto nameStart = alpha.orChar('_');
201 constexpr auto nameChar = alphaNumeric.orChar('_');
202 constexpr auto hexDigit = charRange('0', '9').orRange('a', 'f').orRange('A', 'F');
203 constexpr auto octDigit = charRange('0', '7');
204 constexpr auto whitespaceChar = anyOfChars(" \f\n\r\t\v");
205 constexpr auto controlChar = charRange(0, 0x1f).invert().orGroup(whitespaceChar).invert();
206
207 constexpr auto whitespace = many(anyOfChars(" \f\n\r\t\v"));
208
209 constexpr auto discardWhitespace = discard(many(discard(anyOfChars(" \f\n\r\t\v"))));
210 // Like discard(whitespace) but avoids some memory allocation.
211
212 // =======================================================================================
213 // Identifiers
214
215 namespace _ { // private
216
217 struct IdentifierToString {
218 inline String operator()(char first, const Array<char>& rest) const {
219 if (rest.size() == 0) return heapString(&first, 1);
220 String result = heapString(rest.size() + 1);
221 result[0] = first;
222 memcpy(result.begin() + 1, rest.begin(), rest.size());
223 return result;
224 }
225 };
226
227 } // namespace _ (private)
228
229 constexpr auto identifier = transform(sequence(nameStart, many(nameChar)), _::IdentifierToString());
230 // Parses an identifier (e.g. a C variable name).
231
232 // =======================================================================================
233 // Integers
234
235 namespace _ { // private
236
inline char parseDigit(char c) {
  // Converts one digit character to its numeric value: '0'-'9' -> 0-9, 'A'-'Z' and 'a'-'z' ->
  // 10 and up. The input is not validated; the branch is picked purely by comparing against
  // 'a' and 'A', so other characters produce arithmetic garbage.
  if (c >= 'a') return c - 'a' + 10;
  if (c >= 'A') return c - 'A' + 10;
  return c - '0';
}
242
243 template <uint base>
244 struct ParseInteger {
245 inline uint64_t operator()(const Array<char>& digits) const {
246 return operator()('0', digits);
247 }
248 uint64_t operator()(char first, const Array<char>& digits) const {
249 uint64_t result = parseDigit(first);
250 for (char digit: digits) {
251 result = result * base + parseDigit(digit);
252 }
253 return result;
254 }
255 };
256
257
258 } // namespace _ (private)
259
260 constexpr auto integer = sequence(
261 oneOf(
262 transform(sequence(exactChar<'0'>(), exactChar<'x'>(), oneOrMore(hexDigit)), _::ParseInteger<16>()),
263 transform(sequence(exactChar<'0'>(), many(octDigit)), _::ParseInteger<8>()),
264 transform(sequence(charRange('1', '9'), many(digit)), _::ParseInteger<10>())),
265 notLookingAt(alpha.orAny("_.")));
266
267 // =======================================================================================
268 // Numbers (i.e. floats)
269
270 namespace _ { // private
271
272 struct ParseFloat {
273 double operator()(const Array<char>& digits,
274 const Maybe<Array<char>>& fraction,
275 const Maybe<Tuple<Maybe<char>, Array<char>>>& exponent) const;
276 };
277
278 } // namespace _ (private)
279
280 constexpr auto number = transform(
281 sequence(
282 oneOrMore(digit),
283 optional(sequence(exactChar<'.'>(), many(digit))),
284 optional(sequence(discard(anyOfChars("eE")), optional(anyOfChars("+-")), many(digit))),
285 notLookingAt(alpha.orAny("_."))),
286 _::ParseFloat());
287
288 // =======================================================================================
289 // Quoted strings
290
291 namespace _ { // private
292
struct InterpretEscape {
  // Maps the character following a backslash to the character the escape stands for. The
  // letters a, b, f, n, r, t, v become the matching control characters; every other character
  // (e.g. a quote or a backslash) stands for itself.
  char operator()(char c) const {
    if (c == 'a') return '\a';
    if (c == 'b') return '\b';
    if (c == 'f') return '\f';
    if (c == 'n') return '\n';
    if (c == 'r') return '\r';
    if (c == 't') return '\t';
    if (c == 'v') return '\v';
    return c;
  }
};
307
308 struct ParseHexEscape {
309 inline char operator()(char first, char second) const {
310 return (parseDigit(first) << 4) | parseDigit(second);
311 }
312 };
313
314 struct ParseHexByte {
315 inline byte operator()(char first, char second) const {
316 return (parseDigit(first) << 4) | parseDigit(second);
317 }
318 };
319
320 struct ParseOctEscape {
321 inline char operator()(char first, Maybe<char> second, Maybe<char> third) const {
322 char result = first - '0';
323 KJ_IF_MAYBE(digit1, second) {
324 result = (result << 3) | (*digit1 - '0');
325 KJ_IF_MAYBE(digit2, third) {
326 result = (result << 3) | (*digit2 - '0');
327 }
328 }
329 return result;
330 }
331 };
332
333 } // namespace _ (private)
334
335 constexpr auto escapeSequence =
336 sequence(exactChar<'\\'>(), oneOf(
337 transform(anyOfChars("abfnrtv'\"\\\?"), _::InterpretEscape()),
338 transform(sequence(exactChar<'x'>(), hexDigit, hexDigit), _::ParseHexEscape()),
339 transform(sequence(octDigit, optional(octDigit), optional(octDigit)),
340 _::ParseOctEscape())));
341 // A parser that parses a C-string-style escape sequence (starting with a backslash). Returns
342 // a char.
343
344 constexpr auto doubleQuotedString = charsToString(sequence(
345 exactChar<'\"'>(),
346 many(oneOf(anyOfChars("\\\n\"").invert(), escapeSequence)),
347 exactChar<'\"'>()));
348 // Parses a C-style double-quoted string.
349
350 constexpr auto singleQuotedString = charsToString(sequence(
351 exactChar<'\''>(),
352 many(oneOf(anyOfChars("\\\n\'").invert(), escapeSequence)),
353 exactChar<'\''>()));
354 // Parses a C-style single-quoted string.
355
356 constexpr auto doubleQuotedHexBinary = sequence(
357 exactChar<'0'>(), exactChar<'x'>(), exactChar<'\"'>(),
358 oneOrMore(transform(sequence(discardWhitespace, hexDigit, hexDigit), _::ParseHexByte())),
359 discardWhitespace,
360 exactChar<'\"'>());
361 // Parses a double-quoted hex binary literal. Returns Array<byte>.
362
363 } // namespace parse
364 } // namespace kj
365
366 KJ_END_HEADER