Mercurial > repos > rliterman > csp2
diff CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/include/kj/encoding.h @ 69:33d812a61356
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 17:55:14 -0400 |
parents | |
children |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/include/kj/encoding.h Tue Mar 18 17:55:14 2025 -0400 @@ -0,0 +1,445 @@ +// Copyright (c) 2017 Cloudflare, Inc. and contributors +// Licensed under the MIT License: +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +#pragma once +// Functions for encoding/decoding bytes and text in common formats, including: +// - UTF-{8,16,32} +// - Hex +// - URI encoding +// - Base64 + +#include "string.h" + +KJ_BEGIN_HEADER + +namespace kj { + +template <typename ResultType> +struct EncodingResult: public ResultType { + // Equivalent to ResultType (a String or wide-char array) for all intents and purposes, except + // that the bool `hadErrors` can be inspected to see if any errors were encountered in the input. 
+ // Each encoding/decoding function that returns this type will "work around" errors in some way, + // so an application doesn't strictly have to check for errors. E.g. the Unicode functions + // replace errors with U+FFFD in the output. + // + // Through magic, KJ_IF_MAYBE() and KJ_{REQUIRE,ASSERT}_NONNULL() work on EncodingResult<T> + // exactly as if it were a Maybe<T> that is null in case of errors. + + inline EncodingResult(ResultType&& result, bool hadErrors) + : ResultType(kj::mv(result)), hadErrors(hadErrors) {} + + const bool hadErrors; +}; + +template <typename T> +inline auto KJ_STRINGIFY(const EncodingResult<T>& value) + -> decltype(toCharSequence(implicitCast<const T&>(value))) { + return toCharSequence(implicitCast<const T&>(value)); +} + +EncodingResult<Array<char16_t>> encodeUtf16(ArrayPtr<const char> text, bool nulTerminate = false); +EncodingResult<Array<char32_t>> encodeUtf32(ArrayPtr<const char> text, bool nulTerminate = false); +// Convert UTF-8 text (which KJ strings use) to UTF-16 or UTF-32. +// +// If `nulTerminate` is true, an extra NUL character will be added to the end of the output. +// +// The returned arrays are in platform-native endianness (otherwise they wouldn't really be +// char16_t / char32_t). +// +// Note that the KJ Unicode encoding and decoding functions actually implement +// [WTF-8 encoding](http://simonsapin.github.io/wtf-8/), which affects how invalid input is +// handled. See comments on decodeUtf16() for more info. + +EncodingResult<String> decodeUtf16(ArrayPtr<const char16_t> utf16); +EncodingResult<String> decodeUtf32(ArrayPtr<const char32_t> utf32); +// Convert UTF-16 or UTF-32 to UTF-8 (which KJ strings use). +// +// The input should NOT include a NUL terminator; any NUL characters in the input array will be +// preserved in the output. +// +// The input must be in platform-native endianness. BOMs are NOT recognized by these functions. 
+// +// Note that the KJ Unicode encoding and decoding functions actually implement +// [WTF-8 encoding](http://simonsapin.github.io/wtf-8/). This means that if you start with an array +// of char16_t and you pass it through any number of conversions to other Unicode encodings, +// eventually returning it to UTF-16, all the while ignoring `hadErrors`, you will end up with +// exactly the same char16_t array you started with, *even if* the array is not valid UTF-16. This +// is useful because many real-world systems that were designed for UCS-2 (plain 16-bit Unicode) +// and later "upgraded" to UTF-16 do not enforce that their UTF-16 is well-formed. For example, +// file names on Windows NT are encoded using 16-bit characters, without enforcing that the +// character sequence is valid UTF-16. It is important that programs on Windows be able to handle +// such filenames, even if they choose to convert the name to UTF-8 for internal processing. +// +// Specifically, KJ's Unicode handling allows unpaired surrogate code points to round-trip through +// UTF-8 and UTF-32. Unpaired surrogates will be flagged as an error (setting `hadErrors` in the +// result), but will NOT be replaced with the Unicode replacement character as other erroneous +// sequences would be, but rather encoded as an invalid surrogate codepoint in the target encoding. +// +// KJ makes the following guarantees about invalid input: +// - A round trip from UTF-16 to other encodings and back will produce exactly the original input, +// with every leg of the trip raising the `hadErrors` flag if the original input was not valid. +// - A round trip from UTF-8 or UTF-32 to other encodings and back will either produce exactly +// the original input, or will have replaced some invalid sequences with the Unicode replacement +// character, U+FFFD. No code units will ever be removed unless they are replaced with U+FFFD, +// and no code units will ever be added except to encode U+FFFD. 
If the original input was not +// valid, the `hadErrors` flag will be raised on the first leg of the trip, and will also be +// raised on subsequent legs unless all invalid sequences were replaced with U+FFFD (which, after +// all, is a valid code point). + +EncodingResult<Array<wchar_t>> encodeWideString( + ArrayPtr<const char> text, bool nulTerminate = false); +EncodingResult<String> decodeWideString(ArrayPtr<const wchar_t> wide); +// Encode / decode strings of wchar_t, aka "wide strings". Unfortunately, different platforms have +// different definitions for wchar_t. For example, on Windows they are 16-bit and encode UTF-16, +// but on Linux they are 32-bit and encode UTF-32. Some platforms even define wchar_t as 8-bit, +// encoding UTF-8 (e.g. BeOS did this). +// +// KJ assumes that wide strings use the UTF encoding that corresponds to the size of wchar_t on +// the target platform. So, these functions are simple aliases for encodeUtf*/decodeUtf*, above +// (or simply make a copy if wchar_t is 8 bits). + +String encodeHex(ArrayPtr<const byte> bytes); +EncodingResult<Array<byte>> decodeHex(ArrayPtr<const char> text); +// Encode/decode bytes as hex strings. + +String encodeUriComponent(ArrayPtr<const byte> bytes); +String encodeUriComponent(ArrayPtr<const char> bytes); +EncodingResult<String> decodeUriComponent(ArrayPtr<const char> text); +// Encode/decode URI components using % escapes for characters listed as "reserved" in RFC 2396. +// This is the same behavior as JavaScript's `encodeURIComponent()`. +// +// See https://tools.ietf.org/html/rfc2396#section-2.3 + +String encodeUriFragment(ArrayPtr<const byte> bytes); +String encodeUriFragment(ArrayPtr<const char> bytes); +// Encode URL fragment components using the fragment percent encode set defined by the WHATWG URL +// specification. Use decodeUriComponent() to decode. +// +// Quirk: We also percent-encode the '%' sign itself, because we expect to be called on percent- +// decoded data. 
In other words, this function is not idempotent, in contrast to the URL spec. +// +// See https://url.spec.whatwg.org/#fragment-percent-encode-set + +String encodeUriPath(ArrayPtr<const byte> bytes); +String encodeUriPath(ArrayPtr<const char> bytes); +// Encode URL path components (not entire paths!) using the path percent encode set defined by the +// WHATWG URL specification. Use decodeUriComponent() to decode. +// +// Quirk: We also percent-encode the '%' sign itself, because we expect to be called on percent- +// decoded data. In other words, this function is not idempotent, in contrast to the URL spec. +// +// Quirk: This percent-encodes '/' and '\' characters as well, which are not actually in the set +// defined by the WHATWG URL spec. Since a conforming URL implementation will only ever call this +// function on individual path components, and never entire paths, augmenting the character set to +// include these separators allows this function to be used to implement a URL class that stores +// its path components in percent-decoded form. +// +// See https://url.spec.whatwg.org/#path-percent-encode-set + +String encodeUriUserInfo(ArrayPtr<const byte> bytes); +String encodeUriUserInfo(ArrayPtr<const char> bytes); +// Encode URL userinfo components using the userinfo percent encode set defined by the WHATWG URL +// specification. Use decodeUriComponent() to decode. +// +// Quirk: We also percent-encode the '%' sign itself, because we expect to be called on percent- +// decoded data. In other words, this function is not idempotent, in contrast to the URL spec. 
+// +// See https://url.spec.whatwg.org/#userinfo-percent-encode-set + +String encodeWwwForm(ArrayPtr<const byte> bytes); +String encodeWwwForm(ArrayPtr<const char> bytes); +EncodingResult<String> decodeWwwForm(ArrayPtr<const char> text); +// Encode/decode URI components using % escapes and '+' (for spaces) according to the +// application/x-www-form-urlencoded format defined by the WHATWG URL specification. +// +// Note: Like the fragment, path, and userinfo percent-encoding functions above, this function is +// not idempotent: we percent-encode '%' signs. However, in this particular case the spec happens +// to agree with us! +// +// See https://url.spec.whatwg.org/#concept-urlencoded-byte-serializer + +struct DecodeUriOptions { + // Parameter to `decodeBinaryUriComponent()`. + + // This struct is intentionally convertible from bool, in order to maintain backwards + // compatibility with code written when `decodeBinaryUriComponent()` took a boolean second + // parameter. + DecodeUriOptions(bool nulTerminate = false, bool plusToSpace = false) + : nulTerminate(nulTerminate), plusToSpace(plusToSpace) {} + + bool nulTerminate; + // Append a terminal NUL byte. + + bool plusToSpace; + // Convert '+' to ' ' characters before percent decoding. Used to decode + // application/x-www-form-urlencoded text, such as query strings. +}; +EncodingResult<Array<byte>> decodeBinaryUriComponent( + ArrayPtr<const char> text, DecodeUriOptions options = DecodeUriOptions()); +// Decode URI components using % escapes. 
This is a lower-level interface used to implement both +// `decodeUriComponent()` and `decodeWwwForm()` + +String encodeCEscape(ArrayPtr<const byte> bytes); +String encodeCEscape(ArrayPtr<const char> bytes); +EncodingResult<Array<byte>> decodeBinaryCEscape( + ArrayPtr<const char> text, bool nulTerminate = false); +EncodingResult<String> decodeCEscape(ArrayPtr<const char> text); + +String encodeBase64(ArrayPtr<const byte> bytes, bool breakLines = false); +// Encode the given bytes as base64 text. If `breakLines` is true, line breaks will be inserted +// into the output every 72 characters (e.g. for encoding e-mail bodies). + +EncodingResult<Array<byte>> decodeBase64(ArrayPtr<const char> text); +// Decode base64 text. This function reports errors required by the WHATWG HTML/Infra specs: see +// https://html.spec.whatwg.org/multipage/webappapis.html#atob for details. + +String encodeBase64Url(ArrayPtr<const byte> bytes); +// Encode the given bytes as URL-safe base64 text. (RFC 4648, section 5) + +// ======================================================================================= +// inline implementation details + +namespace _ { // private + +template <typename T> +NullableValue<T> readMaybe(EncodingResult<T>&& value) { + if (value.hadErrors) { + return nullptr; + } else { + return kj::mv(value); + } +} + +template <typename T> +T* readMaybe(EncodingResult<T>& value) { + if (value.hadErrors) { + return nullptr; + } else { + return &value; + } +} + +template <typename T> +const T* readMaybe(const EncodingResult<T>& value) { + if (value.hadErrors) { + return nullptr; + } else { + return &value; + } +} + +String encodeCEscapeImpl(ArrayPtr<const byte> bytes, bool isBinary); + +} // namespace _ (private) + +inline String encodeUriComponent(ArrayPtr<const char> text) { + return encodeUriComponent(text.asBytes()); +} +inline EncodingResult<String> decodeUriComponent(ArrayPtr<const char> text) { + auto result = decodeBinaryUriComponent(text, DecodeUriOptions { 
/*.nulTerminate=*/true }); + return { String(result.releaseAsChars()), result.hadErrors }; +} + +inline String encodeUriFragment(ArrayPtr<const char> text) { + return encodeUriFragment(text.asBytes()); +} +inline String encodeUriPath(ArrayPtr<const char> text) { + return encodeUriPath(text.asBytes()); +} +inline String encodeUriUserInfo(ArrayPtr<const char> text) { + return encodeUriUserInfo(text.asBytes()); +} + +inline String encodeWwwForm(ArrayPtr<const char> text) { + return encodeWwwForm(text.asBytes()); +} +inline EncodingResult<String> decodeWwwForm(ArrayPtr<const char> text) { + auto result = decodeBinaryUriComponent(text, DecodeUriOptions { /*.nulTerminate=*/true, + /*.plusToSpace=*/true }); + return { String(result.releaseAsChars()), result.hadErrors }; +} + +inline String encodeCEscape(ArrayPtr<const char> text) { + return _::encodeCEscapeImpl(text.asBytes(), false); +} + +inline String encodeCEscape(ArrayPtr<const byte> bytes) { + return _::encodeCEscapeImpl(bytes, true); +} + +inline EncodingResult<String> decodeCEscape(ArrayPtr<const char> text) { + auto result = decodeBinaryCEscape(text, true); + return { String(result.releaseAsChars()), result.hadErrors }; +} + +// If you pass a string literal to a function taking ArrayPtr<const char>, it'll include the NUL +// terminator, which is surprising. Let's add overloads that avoid that. In practice this probably +// only even matters for encoding-test.c++. 
+ +template <size_t s> +inline EncodingResult<Array<char16_t>> encodeUtf16(const char (&text)[s], bool nulTerminate=false) { + return encodeUtf16(arrayPtr(text, s - 1), nulTerminate); +} +template <size_t s> +inline EncodingResult<Array<char32_t>> encodeUtf32(const char (&text)[s], bool nulTerminate=false) { + return encodeUtf32(arrayPtr(text, s - 1), nulTerminate); +} +template <size_t s> +inline EncodingResult<Array<wchar_t>> encodeWideString( + const char (&text)[s], bool nulTerminate=false) { + return encodeWideString(arrayPtr(text, s - 1), nulTerminate); +} +template <size_t s> +inline EncodingResult<String> decodeUtf16(const char16_t (&utf16)[s]) { + return decodeUtf16(arrayPtr(utf16, s - 1)); +} +template <size_t s> +inline EncodingResult<String> decodeUtf32(const char32_t (&utf32)[s]) { + return decodeUtf32(arrayPtr(utf32, s - 1)); +} +template <size_t s> +inline EncodingResult<String> decodeWideString(const wchar_t (&utf32)[s]) { + return decodeWideString(arrayPtr(utf32, s - 1)); +} +template <size_t s> +inline EncodingResult<Array<byte>> decodeHex(const char (&text)[s]) { + return decodeHex(arrayPtr(text, s - 1)); +} +template <size_t s> +inline String encodeUriComponent(const char (&text)[s]) { + return encodeUriComponent(arrayPtr(text, s - 1)); +} +// Decodes a string literal (excluding its trailing NUL) with default DecodeUriOptions. +// Returns EncodingResult, like the ArrayPtr overload it forwards to, so that `hadErrors` is not +// sliced away for string-literal callers. EncodingResult<Array<byte>> derives from Array<byte>, +// so callers that only want the bytes keep working. +template <size_t s> +inline EncodingResult<Array<byte>> decodeBinaryUriComponent(const char (&text)[s]) { + return decodeBinaryUriComponent(arrayPtr(text, s - 1)); +} +template <size_t s> +inline EncodingResult<String> decodeUriComponent(const char (&text)[s]) { + return decodeUriComponent(arrayPtr(text, s-1)); +} +template <size_t s> +inline String encodeUriFragment(const char (&text)[s]) { + return encodeUriFragment(arrayPtr(text, s - 1)); +} +template <size_t s> +inline String encodeUriPath(const char (&text)[s]) { + return encodeUriPath(arrayPtr(text, s - 1)); +} +template <size_t s> +inline String encodeUriUserInfo(const char (&text)[s]) { + return encodeUriUserInfo(arrayPtr(text, s - 1)); +} +template <size_t s> 
+inline String encodeWwwForm(const char (&text)[s]) { + return encodeWwwForm(arrayPtr(text, s - 1)); +} +template <size_t s> +inline EncodingResult<String> decodeWwwForm(const char (&text)[s]) { + return decodeWwwForm(arrayPtr(text, s-1)); +} +template <size_t s> +inline String encodeCEscape(const char (&text)[s]) { + return encodeCEscape(arrayPtr(text, s - 1)); +} +template <size_t s> +inline EncodingResult<Array<byte>> decodeBinaryCEscape(const char (&text)[s]) { + return decodeBinaryCEscape(arrayPtr(text, s - 1)); +} +template <size_t s> +inline EncodingResult<String> decodeCEscape(const char (&text)[s]) { + return decodeCEscape(arrayPtr(text, s-1)); +} +template <size_t s> +EncodingResult<Array<byte>> decodeBase64(const char (&text)[s]) { + return decodeBase64(arrayPtr(text, s - 1)); +} + +#if __cpp_char8_t +template <size_t s> +inline EncodingResult<Array<char16_t>> encodeUtf16(const char8_t (&text)[s], bool nulTerminate=false) { + return encodeUtf16(arrayPtr(reinterpret_cast<const char*>(text), s - 1), nulTerminate); +} +template <size_t s> +inline EncodingResult<Array<char32_t>> encodeUtf32(const char8_t (&text)[s], bool nulTerminate=false) { + return encodeUtf32(arrayPtr(reinterpret_cast<const char*>(text), s - 1), nulTerminate); +} +template <size_t s> +inline EncodingResult<Array<wchar_t>> encodeWideString( + const char8_t (&text)[s], bool nulTerminate=false) { + return encodeWideString(arrayPtr(reinterpret_cast<const char*>(text), s - 1), nulTerminate); +} +template <size_t s> +inline EncodingResult<Array<byte>> decodeHex(const char8_t (&text)[s]) { + return decodeHex(arrayPtr(reinterpret_cast<const char*>(text), s - 1)); +} +template <size_t s> +inline String encodeUriComponent(const char8_t (&text)[s]) { + return encodeUriComponent(arrayPtr(reinterpret_cast<const char*>(text), s - 1)); +} +// Decodes a char8_t string literal (excluding its trailing NUL) with default DecodeUriOptions. +// Returns EncodingResult, like the ArrayPtr overload it forwards to, so that `hadErrors` is not +// sliced away. EncodingResult<Array<byte>> derives from Array<byte>, so callers that only want +// the bytes keep working. +template <size_t s> +inline EncodingResult<Array<byte>> decodeBinaryUriComponent(const char8_t (&text)[s]) { + return decodeBinaryUriComponent(arrayPtr(reinterpret_cast<const 
char*>(text), s - 1)); +} +template <size_t s> +inline EncodingResult<String> decodeUriComponent(const char8_t (&text)[s]) { + return decodeUriComponent(arrayPtr(reinterpret_cast<const char*>(text), s-1)); +} +template <size_t s> +inline String encodeUriFragment(const char8_t (&text)[s]) { + return encodeUriFragment(arrayPtr(reinterpret_cast<const char*>(text), s - 1)); +} +template <size_t s> +inline String encodeUriPath(const char8_t (&text)[s]) { + return encodeUriPath(arrayPtr(reinterpret_cast<const char*>(text), s - 1)); +} +template <size_t s> +inline String encodeUriUserInfo(const char8_t (&text)[s]) { + return encodeUriUserInfo(arrayPtr(reinterpret_cast<const char*>(text), s - 1)); +} +template <size_t s> +inline String encodeWwwForm(const char8_t (&text)[s]) { + return encodeWwwForm(arrayPtr(reinterpret_cast<const char*>(text), s - 1)); +} +template <size_t s> +inline EncodingResult<String> decodeWwwForm(const char8_t (&text)[s]) { + return decodeWwwForm(arrayPtr(reinterpret_cast<const char*>(text), s-1)); +} +template <size_t s> +inline String encodeCEscape(const char8_t (&text)[s]) { + return encodeCEscape(arrayPtr(reinterpret_cast<const char*>(text), s - 1)); +} +template <size_t s> +inline EncodingResult<Array<byte>> decodeBinaryCEscape(const char8_t (&text)[s]) { + return decodeBinaryCEscape(arrayPtr(reinterpret_cast<const char*>(text), s - 1)); +} +template <size_t s> +inline EncodingResult<String> decodeCEscape(const char8_t (&text)[s]) { + return decodeCEscape(arrayPtr(reinterpret_cast<const char*>(text), s-1)); +} +template <size_t s> +EncodingResult<Array<byte>> decodeBase64(const char8_t (&text)[s]) { + return decodeBase64(arrayPtr(reinterpret_cast<const char*>(text), s - 1)); +} +#endif + +} // namespace kj + +KJ_END_HEADER