Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/include/kj/parse/char.h @ 69:33d812a61356
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 17:55:14 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 69:33d812a61356 |
---|---|
1 // Copyright (c) 2013-2014 Sandstorm Development Group, Inc. and contributors | |
2 // Licensed under the MIT License: | |
3 // | |
4 // Permission is hereby granted, free of charge, to any person obtaining a copy | |
5 // of this software and associated documentation files (the "Software"), to deal | |
6 // in the Software without restriction, including without limitation the rights | |
7 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
8 // copies of the Software, and to permit persons to whom the Software is | |
9 // furnished to do so, subject to the following conditions: | |
10 // | |
11 // The above copyright notice and this permission notice shall be included in | |
12 // all copies or substantial portions of the Software. | |
13 // | |
14 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
15 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
16 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
17 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
18 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
19 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | |
20 // THE SOFTWARE. | |
21 | |
22 // This file contains parsers useful for character stream inputs, including parsers to parse | |
23 // common kinds of tokens like identifiers, numbers, and quoted strings. | |
24 | |
25 #pragma once | |
26 | |
27 #include "common.h" | |
28 #include "../string.h" | |
29 #include <inttypes.h> | |
30 | |
31 KJ_BEGIN_HEADER | |
32 | |
33 namespace kj { | |
34 namespace parse { | |
35 | |
36 // ======================================================================================= | |
37 // Exact char/string. | |
38 | |
39 class ExactString_ { | |
40 public: | |
41 constexpr inline ExactString_(const char* str): str(str) {} | |
42 | |
43 template <typename Input> | |
44 Maybe<Tuple<>> operator()(Input& input) const { | |
45 const char* ptr = str; | |
46 | |
47 while (*ptr != '\0') { | |
48 if (input.atEnd() || input.current() != *ptr) return nullptr; | |
49 input.next(); | |
50 ++ptr; | |
51 } | |
52 | |
53 return Tuple<>(); | |
54 } | |
55 | |
56 private: | |
57 const char* str; | |
58 }; | |
59 | |
60 constexpr inline ExactString_ exactString(const char* str) { | |
61 return ExactString_(str); | |
62 } | |
63 | |
64 template <char c> | |
65 constexpr ExactlyConst_<char, c> exactChar() { | |
66 // Returns a parser that matches exactly the character given by the template argument (returning | |
67 // no result). | |
68 return ExactlyConst_<char, c>(); | |
69 } | |
70 | |
71 // ======================================================================================= | |
72 // Char ranges / sets | |
73 | |
74 class CharGroup_ { | |
75 public: | |
76 constexpr inline CharGroup_(): bits{0, 0, 0, 0} {} | |
77 | |
78 constexpr inline CharGroup_ orRange(unsigned char first, unsigned char last) const { | |
79 return CharGroup_(bits[0] | (oneBits(last + 1) & ~oneBits(first )), | |
80 bits[1] | (oneBits(last - 63) & ~oneBits(first - 64)), | |
81 bits[2] | (oneBits(last - 127) & ~oneBits(first - 128)), | |
82 bits[3] | (oneBits(last - 191) & ~oneBits(first - 192))); | |
83 } | |
84 | |
85 constexpr inline CharGroup_ orAny(const char* chars) const { | |
86 return *chars == 0 ? *this : orChar(*chars).orAny(chars + 1); | |
87 } | |
88 | |
89 constexpr inline CharGroup_ orChar(unsigned char c) const { | |
90 return CharGroup_(bits[0] | bit(c), | |
91 bits[1] | bit(c - 64), | |
92 bits[2] | bit(c - 128), | |
93 bits[3] | bit(c - 256)); | |
94 } | |
95 | |
96 constexpr inline CharGroup_ orGroup(CharGroup_ other) const { | |
97 return CharGroup_(bits[0] | other.bits[0], | |
98 bits[1] | other.bits[1], | |
99 bits[2] | other.bits[2], | |
100 bits[3] | other.bits[3]); | |
101 } | |
102 | |
103 constexpr inline CharGroup_ invert() const { | |
104 return CharGroup_(~bits[0], ~bits[1], ~bits[2], ~bits[3]); | |
105 } | |
106 | |
107 constexpr inline bool contains(unsigned char c) const { | |
108 return (bits[c / 64] & (1ll << (c % 64))) != 0; | |
109 } | |
110 | |
111 inline bool containsAll(ArrayPtr<const char> text) const { | |
112 for (char c: text) { | |
113 if (!contains(c)) return false; | |
114 } | |
115 return true; | |
116 } | |
117 | |
118 template <typename Input> | |
119 Maybe<char> operator()(Input& input) const { | |
120 if (input.atEnd()) return nullptr; | |
121 unsigned char c = input.current(); | |
122 if (contains(c)) { | |
123 input.next(); | |
124 return c; | |
125 } else { | |
126 return nullptr; | |
127 } | |
128 } | |
129 | |
130 private: | |
131 typedef unsigned long long Bits64; | |
132 | |
133 constexpr inline CharGroup_(Bits64 a, Bits64 b, Bits64 c, Bits64 d): bits{a, b, c, d} {} | |
134 Bits64 bits[4]; | |
135 | |
136 static constexpr inline Bits64 oneBits(int count) { | |
137 return count <= 0 ? 0ll : count >= 64 ? -1ll : ((1ll << count) - 1); | |
138 } | |
139 static constexpr inline Bits64 bit(int index) { | |
140 return index < 0 ? 0 : index >= 64 ? 0 : (1ll << index); | |
141 } | |
142 }; | |
143 | |
144 constexpr inline CharGroup_ charRange(char first, char last) { | |
145 // Create a parser which accepts any character in the range from `first` to `last`, inclusive. | |
146 // For example: `charRange('a', 'z')` matches all lower-case letters. The parser's result is the | |
147 // character matched. | |
148 // | |
149 // The returned object has methods which can be used to match more characters. The following | |
150 // produces a parser which accepts any letter as well as '_', '+', '-', and '.'. | |
151 // | |
152 // charRange('a', 'z').orRange('A', 'Z').orChar('_').orAny("+-.") | |
153 // | |
154 // You can also use `.invert()` to match the opposite set of characters. | |
155 | |
156 return CharGroup_().orRange(first, last); | |
157 } | |
158 | |
#if _MSC_VER && !defined(__clang__)
#define anyOfChars(chars) CharGroup_().orAny(chars)
// TODO(msvc): MSVC ICEs on the proper definition of `anyOfChars()`, which in turn prevents us from
//   building the compiler or schema parser. We don't know why this happens, but Harris found that
//   this horrible, horrible hack makes things work. This is awful, but it's better than nothing.
//   Hopefully, MSVC will get fixed soon and we'll be able to remove this.
//
//   Note that, being a macro, this variant is not scoped to any namespace and expands to an
//   unqualified `CharGroup_`, so it only works where that name is visible.
#else
constexpr inline CharGroup_ anyOfChars(const char* chars) {
  // Returns a parser that accepts any of the characters in the given string (which should usually
  // be a literal).  The returned parser is of the same type as returned by `charRange()` -- see
  // that function for more info.
  //
  // `chars` must be NUL-terminated; the set is built by `CharGroup_::orAny()`, which stops at the
  // terminator.

  return CharGroup_().orAny(chars);
}
#endif
174 | |
175 // ======================================================================================= | |
176 | |
177 namespace _ { // private | |
178 | |
179 struct ArrayToString { | |
180 inline String operator()(const Array<char>& arr) const { | |
181 return heapString(arr); | |
182 } | |
183 }; | |
184 | |
185 } // namespace _ (private) | |
186 | |
template <typename SubParser>
constexpr inline auto charsToString(SubParser&& subParser)
    -> decltype(transform(kj::fwd<SubParser>(subParser), _::ArrayToString())) {
  // Wraps a parser that returns Array<char> such that it returns String instead.
  //
  // The sub-parser is passed through `kj::fwd` to `transform()`, with `_::ArrayToString` as the
  // conversion applied to its result.
  return parse::transform(kj::fwd<SubParser>(subParser), _::ArrayToString());
}
193 | |
194 // ======================================================================================= | |
195 // Basic character classes. | |
196 | |
constexpr auto alpha = charRange('a', 'z').orRange('A', 'Z');
// ASCII letters of either case.
constexpr auto digit = charRange('0', '9');
// Decimal digits.
constexpr auto alphaNumeric = alpha.orGroup(digit);
// Letters and decimal digits.
constexpr auto nameStart = alpha.orChar('_');
// Characters allowed at the start of an identifier: a letter or '_'.
constexpr auto nameChar = alphaNumeric.orChar('_');
// Characters allowed after the start of an identifier: a letter, digit, or '_'.
constexpr auto hexDigit = charRange('0', '9').orRange('a', 'f').orRange('A', 'F');
// Hexadecimal digits of either case.
constexpr auto octDigit = charRange('0', '7');
// Octal digits.
constexpr auto whitespaceChar = anyOfChars(" \f\n\r\t\v");
// A single whitespace character: space, form feed, newline, carriage return, tab, vertical tab.
constexpr auto controlChar = charRange(0, 0x1f).invert().orGroup(whitespaceChar).invert();
// Characters 0x00 through 0x1f, except those in whitespaceChar.  (Computed as the complement of
// "not in 0x00-0x1f, or whitespace".)

constexpr auto whitespace = many(anyOfChars(" \f\n\r\t\v"));
// A run of whitespace characters.  Built with `many` rather than `oneOrMore`, so presumably an
// empty run also matches -- confirm against the definition of `many`.

constexpr auto discardWhitespace = discard(many(discard(anyOfChars(" \f\n\r\t\v"))));
// Like discard(whitespace) but avoids some memory allocation.
211 | |
212 // ======================================================================================= | |
213 // Identifiers | |
214 | |
215 namespace _ { // private | |
216 | |
217 struct IdentifierToString { | |
218 inline String operator()(char first, const Array<char>& rest) const { | |
219 if (rest.size() == 0) return heapString(&first, 1); | |
220 String result = heapString(rest.size() + 1); | |
221 result[0] = first; | |
222 memcpy(result.begin() + 1, rest.begin(), rest.size()); | |
223 return result; | |
224 } | |
225 }; | |
226 | |
227 } // namespace _ (private) | |
228 | |
constexpr auto identifier = transform(sequence(nameStart, many(nameChar)), _::IdentifierToString());
// Parses an identifier (e.g. a C variable name): one `nameStart` character followed by a run of
// `nameChar` characters, combined into a String by `_::IdentifierToString`.
231 | |
232 // ======================================================================================= | |
233 // Integers | |
234 | |
235 namespace _ { // private | |
236 | |
inline char parseDigit(char c) {
  // Converts one digit character ('0'-'9', 'A'-'Z' or 'a'-'z') to its numeric value, with
  // letters counting from 10.  The input is not validated.  The category is chosen purely by
  // comparing against 'A' and 'a', exactly as the character codes are ordered.
  const char zeroPoint = c < 'A' ? '0' : c < 'a' ? 'A' : 'a';
  const int bias = c < 'A' ? 0 : 10;
  return c - zeroPoint + bias;
}
242 | |
243 template <uint base> | |
244 struct ParseInteger { | |
245 inline uint64_t operator()(const Array<char>& digits) const { | |
246 return operator()('0', digits); | |
247 } | |
248 uint64_t operator()(char first, const Array<char>& digits) const { | |
249 uint64_t result = parseDigit(first); | |
250 for (char digit: digits) { | |
251 result = result * base + parseDigit(digit); | |
252 } | |
253 return result; | |
254 } | |
255 }; | |
256 | |
257 | |
258 } // namespace _ (private) | |
259 | |
constexpr auto integer = sequence(
    oneOf(
      // Hexadecimal: "0x" (lower-case 'x' only) followed by at least one hex digit.
      transform(sequence(exactChar<'0'>(), exactChar<'x'>(), oneOrMore(hexDigit)), _::ParseInteger<16>()),
      // Octal: a leading '0' followed by a run of octal digits built with `many`.
      transform(sequence(exactChar<'0'>(), many(octDigit)), _::ParseInteger<8>()),
      // Decimal: a non-zero digit followed by a run of decimal digits.
      transform(sequence(charRange('1', '9'), many(digit)), _::ParseInteger<10>())),
    // Reject the match if the literal is immediately followed by a letter, '_' or '.'.
    notLookingAt(alpha.orAny("_.")));
// Parses an unsigned integer literal written in hexadecimal, octal, or decimal notation.  The
// value is computed by `_::ParseInteger`, which returns uint64_t.
266 | |
267 // ======================================================================================= | |
268 // Numbers (i.e. floats) | |
269 | |
270 namespace _ { // private | |
271 | |
struct ParseFloat {
  // Transform functor used by the `number` parser to turn the pieces of a floating-point literal
  // into a double.  Only declared here; the definition is not part of this header.
  //
  // Going by the grammar of `number`: `digits` are the digits before the decimal point,
  // `fraction` (if present) the digits after it, and `exponent` (if present) the optional sign
  // character and the digits of the exponent.
  double operator()(const Array<char>& digits,
                    const Maybe<Array<char>>& fraction,
                    const Maybe<Tuple<Maybe<char>, Array<char>>>& exponent) const;
};
277 | |
278 } // namespace _ (private) | |
279 | |
constexpr auto number = transform(
    sequence(
        // Integer part: at least one decimal digit.
        oneOrMore(digit),
        // Optional fraction: '.' followed by a run of digits built with `many`.
        optional(sequence(exactChar<'.'>(), many(digit))),
        // Optional exponent: 'e' or 'E', an optional sign, then a run of digits built with `many`.
        optional(sequence(discard(anyOfChars("eE")), optional(anyOfChars("+-")), many(digit))),
        // Reject the match if the literal is immediately followed by a letter, '_' or '.'.
        notLookingAt(alpha.orAny("_."))),
    _::ParseFloat());
// Parses a floating-point literal and converts it to a double via `_::ParseFloat`.  Both the
// fraction and the exponent are optional, so a plain run of digits also matches.
287 | |
288 // ======================================================================================= | |
289 // Quoted strings | |
290 | |
291 namespace _ { // private | |
292 | |
struct InterpretEscape {
  char operator()(char c) const {
    // Maps the letter which follows a backslash to the character it denotes.  Any character
    // without a special meaning (quotes, backslash, '?', ...) denotes itself.
    const char letters[] = "abfnrtv";
    const char meanings[] = "\a\b\f\n\r\t\v";
    for (int i = 0; letters[i] != '\0'; ++i) {
      if (letters[i] == c) return meanings[i];
    }
    return c;
  }
};
307 | |
308 struct ParseHexEscape { | |
309 inline char operator()(char first, char second) const { | |
310 return (parseDigit(first) << 4) | parseDigit(second); | |
311 } | |
312 }; | |
313 | |
314 struct ParseHexByte { | |
315 inline byte operator()(char first, char second) const { | |
316 return (parseDigit(first) << 4) | parseDigit(second); | |
317 } | |
318 }; | |
319 | |
320 struct ParseOctEscape { | |
321 inline char operator()(char first, Maybe<char> second, Maybe<char> third) const { | |
322 char result = first - '0'; | |
323 KJ_IF_MAYBE(digit1, second) { | |
324 result = (result << 3) | (*digit1 - '0'); | |
325 KJ_IF_MAYBE(digit2, third) { | |
326 result = (result << 3) | (*digit2 - '0'); | |
327 } | |
328 } | |
329 return result; | |
330 } | |
331 }; | |
332 | |
333 } // namespace _ (private) | |
334 | |
constexpr auto escapeSequence =
    sequence(exactChar<'\\'>(), oneOf(
      // Single-character escapes: a b f n r t v map to control characters; quote, double quote,
      // backslash and '?' stand for themselves (see _::InterpretEscape).
      transform(anyOfChars("abfnrtv'\"\\\?"), _::InterpretEscape()),
      // Hex escape: 'x' followed by exactly two hex digits.
      transform(sequence(exactChar<'x'>(), hexDigit, hexDigit), _::ParseHexEscape()),
      // Octal escape: one to three octal digits.
      transform(sequence(octDigit, optional(octDigit), optional(octDigit)),
                _::ParseOctEscape())));
// A parser that parses a C-string-style escape sequence (starting with a backslash).  Returns
// a char.
343 | |
constexpr auto doubleQuotedString = charsToString(sequence(
    exactChar<'\"'>(),
    // Body: any character other than backslash, newline or double quote, or an escape sequence.
    many(oneOf(anyOfChars("\\\n\"").invert(), escapeSequence)),
    exactChar<'\"'>()));
// Parses a C-style double-quoted string.  The surrounding quotes are matched by `exactChar`,
// which yields no result; the body characters are converted to a String by `charsToString`.

constexpr auto singleQuotedString = charsToString(sequence(
    exactChar<'\''>(),
    // Body: any character other than backslash, newline or single quote, or an escape sequence.
    many(oneOf(anyOfChars("\\\n\'").invert(), escapeSequence)),
    exactChar<'\''>()));
// Parses a C-style single-quoted string.  Apart from the quote character, the grammar is the same
// as for `doubleQuotedString`; the body is not limited to a single character.
355 | |
constexpr auto doubleQuotedHexBinary = sequence(
    // Opening delimiter: the three characters 0 x ".
    exactChar<'0'>(), exactChar<'x'>(), exactChar<'\"'>(),
    // One or more bytes, each written as two adjacent hex digits.  Whitespace is skipped before
    // each pair, but not between the two digits of a pair.
    oneOrMore(transform(sequence(discardWhitespace, hexDigit, hexDigit), _::ParseHexByte())),
    // Whitespace is also skipped before the closing quote.
    discardWhitespace,
    exactChar<'\"'>());
// Parses a double-quoted hex binary literal.  Returns Array<byte>.
362 | |
363 } // namespace parse | |
364 } // namespace kj | |
365 | |
366 KJ_END_HEADER |