comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/include/kj/encoding.h @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
comparison
equal deleted inserted replaced
67:0e9998148a16 69:33d812a61356
1 // Copyright (c) 2017 Cloudflare, Inc. and contributors
2 // Licensed under the MIT License:
3 //
4 // Permission is hereby granted, free of charge, to any person obtaining a copy
5 // of this software and associated documentation files (the "Software"), to deal
6 // in the Software without restriction, including without limitation the rights
7 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 // copies of the Software, and to permit persons to whom the Software is
9 // furnished to do so, subject to the following conditions:
10 //
11 // The above copyright notice and this permission notice shall be included in
12 // all copies or substantial portions of the Software.
13 //
14 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17 // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
18 // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
19 // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
20 // THE SOFTWARE.
21
22 #pragma once
23 // Functions for encoding/decoding bytes and text in common formats, including:
24 // - UTF-{8,16,32}
25 // - Hex
26 // - URI encoding
27 // - Base64
28
29 #include "string.h"
30
31 KJ_BEGIN_HEADER
32
33 namespace kj {
34
35 template <typename ResultType>
36 struct EncodingResult: public ResultType {
37 // Equivalent to ResultType (a String or wide-char array) for all intents and purposes, except
38 // that the bool `hadErrors` can be inspected to see if any errors were encountered in the input.
39 // Each encoding/decoding function that returns this type will "work around" errors in some way,
40 // so an application doesn't strictly have to check for errors. E.g. the Unicode functions
41 // replace errors with U+FFFD in the output.
42 //
43 // Through magic, KJ_IF_MAYBE() and KJ_{REQUIRE,ASSERT}_NONNULL() work on EncodingResult<T>
44 // exactly if it were a Maybe<T> that is null in case of errors.
45
46 inline EncodingResult(ResultType&& result, bool hadErrors)
47 : ResultType(kj::mv(result)), hadErrors(hadErrors) {}
48
49 const bool hadErrors;
50 };
51
52 template <typename T>
53 inline auto KJ_STRINGIFY(const EncodingResult<T>& value)
54 -> decltype(toCharSequence(implicitCast<const T&>(value))) {
55 return toCharSequence(implicitCast<const T&>(value));
56 }
57
58 EncodingResult<Array<char16_t>> encodeUtf16(ArrayPtr<const char> text, bool nulTerminate = false);
59 EncodingResult<Array<char32_t>> encodeUtf32(ArrayPtr<const char> text, bool nulTerminate = false);
60 // Convert UTF-8 text (which KJ strings use) to UTF-16 or UTF-32.
61 //
62 // If `nulTerminate` is true, an extra NUL character will be added to the end of the output.
63 //
64 // The returned arrays are in platform-native endianness (otherwise they wouldn't really be
65 // char16_t / char32_t).
66 //
67 // Note that the KJ Unicode encoding and decoding functions actually implement
68 // [WTF-8 encoding](http://simonsapin.github.io/wtf-8/), which affects how invalid input is
69 // handled. See comments on decodeUtf16() for more info.
70
71 EncodingResult<String> decodeUtf16(ArrayPtr<const char16_t> utf16);
72 EncodingResult<String> decodeUtf32(ArrayPtr<const char32_t> utf32);
73 // Convert UTF-16 or UTF-32 to UTF-8 (which KJ strings use).
74 //
75 // The input should NOT include a NUL terminator; any NUL characters in the input array will be
76 // preserved in the output.
77 //
78 // The input must be in platform-native endianness. BOMs are NOT recognized by these functions.
79 //
80 // Note that the KJ Unicode encoding and decoding functions actually implement
81 // [WTF-8 encoding](http://simonsapin.github.io/wtf-8/). This means that if you start with an array
82 // of char16_t and you pass it through any number of conversions to other Unicode encodings,
83 // eventually returning it to UTF-16, all the while ignoring `hadErrors`, you will end up with
84 // exactly the same char16_t array you started with, *even if* the array is not valid UTF-16. This
85 // is useful because many real-world systems that were designed for UCS-2 (plain 16-bit Unicode)
86 // and later "upgraded" to UTF-16 do not enforce that their UTF-16 is well-formed. For example,
87 // file names on Windows NT are encoded using 16-bit characters, without enforcing that the
88 // character sequence is valid UTF-16. It is important that programs on Windows be able to handle
89 // such filenames, even if they choose to convert the name to UTF-8 for internal processing.
90 //
91 // Specifically, KJ's Unicode handling allows unpaired surrogate code points to round-trip through
92 // UTF-8 and UTF-32. Unpaired surrogates will be flagged as an error (setting `hadErrors` in the
93 // result), but will NOT be replaced with the Unicode replacement character as other erroneous
94 // sequences would be, but rather encoded as an invalid surrogate codepoint in the target encoding.
95 //
96 // KJ makes the following guarantees about invalid input:
97 // - A round trip from UTF-16 to other encodings and back will produce exactly the original input,
98 // with every leg of the trip raising the `hadErrors` flag if the original input was not valid.
99 // - A round trip from UTF-8 or UTF-32 to other encodings and back will either produce exactly
100 // the original input, or will have replaced some invalid sequences with the Unicode replacement
101 // character, U+FFFD. No code units will ever be removed unless they are replaced with U+FFFD,
102 // and no code units will ever be added except to encode U+FFFD. If the original input was not
103 // valid, the `hadErrors` flag will be raised on the first leg of the trip, and will also be
104 // raised on subsequent legs unless all invalid sequences were replaced with U+FFFD (which, after
105 // all, is a valid code point).
106
107 EncodingResult<Array<wchar_t>> encodeWideString(
108 ArrayPtr<const char> text, bool nulTerminate = false);
109 EncodingResult<String> decodeWideString(ArrayPtr<const wchar_t> wide);
110 // Encode / decode strings of wchar_t, aka "wide strings". Unfortunately, different platforms have
111 // different definitions for wchar_t. For example, on Windows they are 16-bit and encode UTF-16,
112 // but on Linux they are 32-bit and encode UTF-32. Some platforms even define wchar_t as 8-bit,
113 // encoding UTF-8 (e.g. BeOS did this).
114 //
115 // KJ assumes that wide strings use the UTF encoding that corresponds to the size of wchar_t on
116 // the target platform. So, these functions are simple aliases for encodeUtf*/decodeUtf*, above
117 // (or simply make a copy if wchar_t is 8 bits).
118
119 String encodeHex(ArrayPtr<const byte> bytes);
120 EncodingResult<Array<byte>> decodeHex(ArrayPtr<const char> text);
121 // Encode/decode bytes as hex strings.
122
123 String encodeUriComponent(ArrayPtr<const byte> bytes);
124 String encodeUriComponent(ArrayPtr<const char> bytes);
125 EncodingResult<String> decodeUriComponent(ArrayPtr<const char> text);
126 // Encode/decode URI components using % escapes for characters listed as "reserved" in RFC 2396.
127 // This is the same behavior as JavaScript's `encodeURIComponent()`.
128 //
129 // See https://tools.ietf.org/html/rfc2396#section-2.3
130
131 String encodeUriFragment(ArrayPtr<const byte> bytes);
132 String encodeUriFragment(ArrayPtr<const char> bytes);
133 // Encode URL fragment components using the fragment percent encode set defined by the WHATWG URL
134 // specification. Use decodeUriComponent() to decode.
135 //
136 // Quirk: We also percent-encode the '%' sign itself, because we expect to be called on percent-
137 // decoded data. In other words, this function is not idempotent, in contrast to the URL spec.
138 //
139 // See https://url.spec.whatwg.org/#fragment-percent-encode-set
140
141 String encodeUriPath(ArrayPtr<const byte> bytes);
142 String encodeUriPath(ArrayPtr<const char> bytes);
143 // Encode URL path components (not entire paths!) using the path percent encode set defined by the
144 // WHATWG URL specification. Use decodeUriComponent() to decode.
145 //
146 // Quirk: We also percent-encode the '%' sign itself, because we expect to be called on percent-
147 // decoded data. In other words, this function is not idempotent, in contrast to the URL spec.
148 //
149 // Quirk: This percent-encodes '/' and '\' characters as well, which are not actually in the set
150 // defined by the WHATWG URL spec. Since a conforming URL implementation will only ever call this
151 // function on individual path components, and never entire paths, augmenting the character set to
152 // include these separators allows this function to be used to implement a URL class that stores
153 // its path components in percent-decoded form.
154 //
155 // See https://url.spec.whatwg.org/#path-percent-encode-set
156
157 String encodeUriUserInfo(ArrayPtr<const byte> bytes);
158 String encodeUriUserInfo(ArrayPtr<const char> bytes);
159 // Encode URL userinfo components using the userinfo percent encode set defined by the WHATWG URL
160 // specification. Use decodeUriComponent() to decode.
161 //
162 // Quirk: We also percent-encode the '%' sign itself, because we expect to be called on percent-
163 // decoded data. In other words, this function is not idempotent, in contrast to the URL spec.
164 //
165 // See https://url.spec.whatwg.org/#userinfo-percent-encode-set
166
167 String encodeWwwForm(ArrayPtr<const byte> bytes);
168 String encodeWwwForm(ArrayPtr<const char> bytes);
169 EncodingResult<String> decodeWwwForm(ArrayPtr<const char> text);
170 // Encode/decode URI components using % escapes and '+' (for spaces) according to the
171 // application/x-www-form-urlencoded format defined by the WHATWG URL specification.
172 //
173 // Note: Like the fragment, path, and userinfo percent-encoding functions above, this function is
174 // not idempotent: we percent-encode '%' signs. However, in this particular case the spec happens
175 // to agree with us!
176 //
177 // See https://url.spec.whatwg.org/#concept-urlencoded-byte-serializer
178
struct DecodeUriOptions {
  // Options accepted by `decodeBinaryUriComponent()`.

  bool nulTerminate;
  // If true, a terminal NUL byte is appended to the decoded output.

  bool plusToSpace;
  // If true, '+' characters are converted to ' ' before percent decoding. Used to decode
  // application/x-www-form-urlencoded text, such as query strings.

  DecodeUriOptions(bool nulTerminate = false, bool plusToSpace = false)
      : nulTerminate(nulTerminate),
        plusToSpace(plusToSpace) {}
  // Deliberately not `explicit`: the struct must stay implicitly convertible from bool so that
  // code written when `decodeBinaryUriComponent()` took a boolean second parameter (meaning
  // `nulTerminate`) continues to compile.
};
195 EncodingResult<Array<byte>> decodeBinaryUriComponent(
196 ArrayPtr<const char> text, DecodeUriOptions options = DecodeUriOptions());
197 // Decode URI components using % escapes. This is a lower-level interface used to implement both
198 // `decodeUriComponent()` and `decodeWwwForm()`.
199
200 String encodeCEscape(ArrayPtr<const byte> bytes);
201 String encodeCEscape(ArrayPtr<const char> bytes);
202 EncodingResult<Array<byte>> decodeBinaryCEscape(
203 ArrayPtr<const char> text, bool nulTerminate = false);
204 EncodingResult<String> decodeCEscape(ArrayPtr<const char> text);
205
206 String encodeBase64(ArrayPtr<const byte> bytes, bool breakLines = false);
207 // Encode the given bytes as base64 text. If `breakLines` is true, line breaks will be inserted
208 // into the output every 72 characters (e.g. for encoding e-mail bodies).
209
210 EncodingResult<Array<byte>> decodeBase64(ArrayPtr<const char> text);
211 // Decode base64 text. This function reports errors required by the WHATWG HTML/Infra specs: see
212 // https://html.spec.whatwg.org/multipage/webappapis.html#atob for details.
213
214 String encodeBase64Url(ArrayPtr<const byte> bytes);
215 // Encode the given bytes as URL-safe base64 text. (RFC 4648, section 5)
216
217 // =======================================================================================
218 // inline implementation details
219
220 namespace _ { // private
221
222 template <typename T>
223 NullableValue<T> readMaybe(EncodingResult<T>&& value) {
224 if (value.hadErrors) {
225 return nullptr;
226 } else {
227 return kj::mv(value);
228 }
229 }
230
231 template <typename T>
232 T* readMaybe(EncodingResult<T>& value) {
233 if (value.hadErrors) {
234 return nullptr;
235 } else {
236 return &value;
237 }
238 }
239
240 template <typename T>
241 const T* readMaybe(const EncodingResult<T>& value) {
242 if (value.hadErrors) {
243 return nullptr;
244 } else {
245 return &value;
246 }
247 }
248
249 String encodeCEscapeImpl(ArrayPtr<const byte> bytes, bool isBinary);
250
251 } // namespace _ (private)
252
253 inline String encodeUriComponent(ArrayPtr<const char> text) {
254 return encodeUriComponent(text.asBytes());
255 }
256 inline EncodingResult<String> decodeUriComponent(ArrayPtr<const char> text) {
257 auto result = decodeBinaryUriComponent(text, DecodeUriOptions { /*.nulTerminate=*/true });
258 return { String(result.releaseAsChars()), result.hadErrors };
259 }
260
261 inline String encodeUriFragment(ArrayPtr<const char> text) {
262 return encodeUriFragment(text.asBytes());
263 }
264 inline String encodeUriPath(ArrayPtr<const char> text) {
265 return encodeUriPath(text.asBytes());
266 }
267 inline String encodeUriUserInfo(ArrayPtr<const char> text) {
268 return encodeUriUserInfo(text.asBytes());
269 }
270
271 inline String encodeWwwForm(ArrayPtr<const char> text) {
272 return encodeWwwForm(text.asBytes());
273 }
274 inline EncodingResult<String> decodeWwwForm(ArrayPtr<const char> text) {
275 auto result = decodeBinaryUriComponent(text, DecodeUriOptions { /*.nulTerminate=*/true,
276 /*.plusToSpace=*/true });
277 return { String(result.releaseAsChars()), result.hadErrors };
278 }
279
280 inline String encodeCEscape(ArrayPtr<const char> text) {
281 return _::encodeCEscapeImpl(text.asBytes(), false);
282 }
283
284 inline String encodeCEscape(ArrayPtr<const byte> bytes) {
285 return _::encodeCEscapeImpl(bytes, true);
286 }
287
288 inline EncodingResult<String> decodeCEscape(ArrayPtr<const char> text) {
289 auto result = decodeBinaryCEscape(text, true);
290 return { String(result.releaseAsChars()), result.hadErrors };
291 }
292
293 // If you pass a string literal to a function taking ArrayPtr<const char>, it'll include the NUL
294 // terminator, which is surprising. Let's add overloads that avoid that. In practice this probably
295 // only even matters for encoding-test.c++.
296
297 template <size_t s>
298 inline EncodingResult<Array<char16_t>> encodeUtf16(const char (&text)[s], bool nulTerminate=false) {
299 return encodeUtf16(arrayPtr(text, s - 1), nulTerminate);
300 }
301 template <size_t s>
302 inline EncodingResult<Array<char32_t>> encodeUtf32(const char (&text)[s], bool nulTerminate=false) {
303 return encodeUtf32(arrayPtr(text, s - 1), nulTerminate);
304 }
305 template <size_t s>
306 inline EncodingResult<Array<wchar_t>> encodeWideString(
307 const char (&text)[s], bool nulTerminate=false) {
308 return encodeWideString(arrayPtr(text, s - 1), nulTerminate);
309 }
310 template <size_t s>
311 inline EncodingResult<String> decodeUtf16(const char16_t (&utf16)[s]) {
312 return decodeUtf16(arrayPtr(utf16, s - 1));
313 }
314 template <size_t s>
315 inline EncodingResult<String> decodeUtf32(const char32_t (&utf32)[s]) {
316 return decodeUtf32(arrayPtr(utf32, s - 1));
317 }
318 template <size_t s>
319 inline EncodingResult<String> decodeWideString(const wchar_t (&utf32)[s]) {
320 return decodeWideString(arrayPtr(utf32, s - 1));
321 }
322 template <size_t s>
323 inline EncodingResult<Array<byte>> decodeHex(const char (&text)[s]) {
324 return decodeHex(arrayPtr(text, s - 1));
325 }
326 template <size_t s>
327 inline String encodeUriComponent(const char (&text)[s]) {
328 return encodeUriComponent(arrayPtr(text, s - 1));
329 }
330 template <size_t s>
331 inline Array<byte> decodeBinaryUriComponent(const char (&text)[s]) {
332 return decodeBinaryUriComponent(arrayPtr(text, s - 1));
333 }
334 template <size_t s>
335 inline EncodingResult<String> decodeUriComponent(const char (&text)[s]) {
336 return decodeUriComponent(arrayPtr(text, s-1));
337 }
338 template <size_t s>
339 inline String encodeUriFragment(const char (&text)[s]) {
340 return encodeUriFragment(arrayPtr(text, s - 1));
341 }
342 template <size_t s>
343 inline String encodeUriPath(const char (&text)[s]) {
344 return encodeUriPath(arrayPtr(text, s - 1));
345 }
346 template <size_t s>
347 inline String encodeUriUserInfo(const char (&text)[s]) {
348 return encodeUriUserInfo(arrayPtr(text, s - 1));
349 }
350 template <size_t s>
351 inline String encodeWwwForm(const char (&text)[s]) {
352 return encodeWwwForm(arrayPtr(text, s - 1));
353 }
354 template <size_t s>
355 inline EncodingResult<String> decodeWwwForm(const char (&text)[s]) {
356 return decodeWwwForm(arrayPtr(text, s-1));
357 }
358 template <size_t s>
359 inline String encodeCEscape(const char (&text)[s]) {
360 return encodeCEscape(arrayPtr(text, s - 1));
361 }
362 template <size_t s>
363 inline EncodingResult<Array<byte>> decodeBinaryCEscape(const char (&text)[s]) {
364 return decodeBinaryCEscape(arrayPtr(text, s - 1));
365 }
366 template <size_t s>
367 inline EncodingResult<String> decodeCEscape(const char (&text)[s]) {
368 return decodeCEscape(arrayPtr(text, s-1));
369 }
370 template <size_t s>
371 EncodingResult<Array<byte>> decodeBase64(const char (&text)[s]) {
372 return decodeBase64(arrayPtr(text, s - 1));
373 }
374
375 #if __cpp_char8_t
376 template <size_t s>
377 inline EncodingResult<Array<char16_t>> encodeUtf16(const char8_t (&text)[s], bool nulTerminate=false) {
378 return encodeUtf16(arrayPtr(reinterpret_cast<const char*>(text), s - 1), nulTerminate);
379 }
380 template <size_t s>
381 inline EncodingResult<Array<char32_t>> encodeUtf32(const char8_t (&text)[s], bool nulTerminate=false) {
382 return encodeUtf32(arrayPtr(reinterpret_cast<const char*>(text), s - 1), nulTerminate);
383 }
384 template <size_t s>
385 inline EncodingResult<Array<wchar_t>> encodeWideString(
386 const char8_t (&text)[s], bool nulTerminate=false) {
387 return encodeWideString(arrayPtr(reinterpret_cast<const char*>(text), s - 1), nulTerminate);
388 }
389 template <size_t s>
390 inline EncodingResult<Array<byte>> decodeHex(const char8_t (&text)[s]) {
391 return decodeHex(arrayPtr(reinterpret_cast<const char*>(text), s - 1));
392 }
393 template <size_t s>
394 inline String encodeUriComponent(const char8_t (&text)[s]) {
395 return encodeUriComponent(arrayPtr(reinterpret_cast<const char*>(text), s - 1));
396 }
397 template <size_t s>
398 inline Array<byte> decodeBinaryUriComponent(const char8_t (&text)[s]) {
399 return decodeBinaryUriComponent(arrayPtr(reinterpret_cast<const char*>(text), s - 1));
400 }
401 template <size_t s>
402 inline EncodingResult<String> decodeUriComponent(const char8_t (&text)[s]) {
403 return decodeUriComponent(arrayPtr(reinterpret_cast<const char*>(text), s-1));
404 }
405 template <size_t s>
406 inline String encodeUriFragment(const char8_t (&text)[s]) {
407 return encodeUriFragment(arrayPtr(reinterpret_cast<const char*>(text), s - 1));
408 }
409 template <size_t s>
410 inline String encodeUriPath(const char8_t (&text)[s]) {
411 return encodeUriPath(arrayPtr(reinterpret_cast<const char*>(text), s - 1));
412 }
413 template <size_t s>
414 inline String encodeUriUserInfo(const char8_t (&text)[s]) {
415 return encodeUriUserInfo(arrayPtr(reinterpret_cast<const char*>(text), s - 1));
416 }
417 template <size_t s>
418 inline String encodeWwwForm(const char8_t (&text)[s]) {
419 return encodeWwwForm(arrayPtr(reinterpret_cast<const char*>(text), s - 1));
420 }
421 template <size_t s>
422 inline EncodingResult<String> decodeWwwForm(const char8_t (&text)[s]) {
423 return decodeWwwForm(arrayPtr(reinterpret_cast<const char*>(text), s-1));
424 }
425 template <size_t s>
426 inline String encodeCEscape(const char8_t (&text)[s]) {
427 return encodeCEscape(arrayPtr(reinterpret_cast<const char*>(text), s - 1));
428 }
429 template <size_t s>
430 inline EncodingResult<Array<byte>> decodeBinaryCEscape(const char8_t (&text)[s]) {
431 return decodeBinaryCEscape(arrayPtr(reinterpret_cast<const char*>(text), s - 1));
432 }
433 template <size_t s>
434 inline EncodingResult<String> decodeCEscape(const char8_t (&text)[s]) {
435 return decodeCEscape(arrayPtr(reinterpret_cast<const char*>(text), s-1));
436 }
437 template <size_t s>
438 EncodingResult<Array<byte>> decodeBase64(const char8_t (&text)[s]) {
439 return decodeBase64(arrayPtr(reinterpret_cast<const char*>(text), s - 1));
440 }
441 #endif
442
443 } // namespace kj
444
445 KJ_END_HEADER