annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/include/unicode/normalizer2.h @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
rev   line source
jpayne@69 1 // © 2016 and later: Unicode, Inc. and others.
jpayne@69 2 // License & terms of use: http://www.unicode.org/copyright.html
jpayne@69 3 /*
jpayne@69 4 *******************************************************************************
jpayne@69 5 *
jpayne@69 6 * Copyright (C) 2009-2013, International Business Machines
jpayne@69 7 * Corporation and others. All Rights Reserved.
jpayne@69 8 *
jpayne@69 9 *******************************************************************************
jpayne@69 10 * file name: normalizer2.h
jpayne@69 11 * encoding: UTF-8
jpayne@69 12 * tab size: 8 (not used)
jpayne@69 13 * indentation:4
jpayne@69 14 *
jpayne@69 15 * created on: 2009nov22
jpayne@69 16 * created by: Markus W. Scherer
jpayne@69 17 */
jpayne@69 18
jpayne@69 19 #ifndef __NORMALIZER2_H__
jpayne@69 20 #define __NORMALIZER2_H__
jpayne@69 21
jpayne@69 22 /**
jpayne@69 23 * \file
jpayne@69 24 * \brief C++ API: New API for Unicode Normalization.
jpayne@69 25 */
jpayne@69 26
jpayne@69 27 #include "unicode/utypes.h"
jpayne@69 28
jpayne@69 29 #if U_SHOW_CPLUSPLUS_API
jpayne@69 30
jpayne@69 31 #if !UCONFIG_NO_NORMALIZATION
jpayne@69 32
jpayne@69 33 #include "unicode/stringpiece.h"
jpayne@69 34 #include "unicode/uniset.h"
jpayne@69 35 #include "unicode/unistr.h"
jpayne@69 36 #include "unicode/unorm2.h"
jpayne@69 37
jpayne@69 38 U_NAMESPACE_BEGIN
jpayne@69 39
jpayne@69 40 class ByteSink;
jpayne@69 41
jpayne@69 42 /**
jpayne@69 43 * Unicode normalization functionality for standard Unicode normalization or
jpayne@69 44 * for using custom mapping tables.
jpayne@69 45 * All instances of this class are unmodifiable/immutable.
jpayne@69 46 * Instances returned by getInstance() are singletons that must not be deleted by the caller.
jpayne@69 47 * The Normalizer2 class is not intended for public subclassing.
jpayne@69 48 *
jpayne@69 49 * The primary functions are to produce a normalized string and to detect whether
jpayne@69 50 * a string is already normalized.
jpayne@69 51 * The most commonly used normalization forms are those defined in
jpayne@69 52 * http://www.unicode.org/unicode/reports/tr15/
jpayne@69 53 * However, this API supports additional normalization forms for specialized purposes.
jpayne@69 54 * For example, NFKC_Casefold is provided via getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, errorCode)
jpayne@69 55 * and can be used in implementations of UTS #46.
jpayne@69 56 *
jpayne@69 57 * Not only are the standard compose and decompose modes supplied,
jpayne@69 58 * but additional modes are provided as documented in the Mode enum.
jpayne@69 59 *
jpayne@69 60 * Some of the functions in this class identify normalization boundaries.
jpayne@69 61 * At a normalization boundary, the portions of the string
jpayne@69 62 * before it and starting from it do not interact and can be handled independently.
jpayne@69 63 *
jpayne@69 64 * The spanQuickCheckYes() stops at a normalization boundary.
jpayne@69 65 * When the goal is a normalized string, then the text before the boundary
jpayne@69 66 * can be copied, and the remainder can be processed with normalizeSecondAndAppend().
jpayne@69 67 *
jpayne@69 68 * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether
jpayne@69 69 * a character is guaranteed to be at a normalization boundary,
jpayne@69 70 * regardless of context.
jpayne@69 71 * This is used for moving from one normalization boundary to the next
jpayne@69 72 * or preceding boundary, and for performing iterative normalization.
jpayne@69 73 *
jpayne@69 74 * Iterative normalization is useful when only a small portion of a
jpayne@69 75 * longer string needs to be processed.
jpayne@69 76 * For example, in ICU, iterative normalization is used by the NormalizationTransliterator
jpayne@69 77 * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart()
jpayne@69 78 * (to process only the substring for which sort key bytes are computed).
jpayne@69 79 *
jpayne@69 80 * The set of normalization boundaries returned by these functions may not be
jpayne@69 81 * complete: There may be more boundaries that could be returned.
jpayne@69 82 * Different functions may return different boundaries.
jpayne@69 83 * @stable ICU 4.4
jpayne@69 84 */
jpayne@69 85 class U_COMMON_API Normalizer2 : public UObject {
jpayne@69 86 public:
jpayne@69 87 /**
jpayne@69 88 * Destructor.
jpayne@69 89 * @stable ICU 4.4
jpayne@69 90 */
jpayne@69 91 ~Normalizer2();
jpayne@69 92
jpayne@69 93 /**
jpayne@69 94 * Returns a Normalizer2 instance for Unicode NFC normalization.
jpayne@69 95 * Same as getInstance(NULL, "nfc", UNORM2_COMPOSE, errorCode).
jpayne@69 96 * Returns an unmodifiable singleton instance. Do not delete it.
jpayne@69 97 * @param errorCode Standard ICU error code. Its input value must
jpayne@69 98 * pass the U_SUCCESS() test, or else the function returns
jpayne@69 99 * immediately. Check for U_FAILURE() on output or use with
jpayne@69 100 * function chaining. (See User Guide for details.)
jpayne@69 101 * @return the requested Normalizer2, if successful
jpayne@69 102 * @stable ICU 49
jpayne@69 103 */
jpayne@69 104 static const Normalizer2 *
jpayne@69 105 getNFCInstance(UErrorCode &errorCode);
jpayne@69 106
jpayne@69 107 /**
jpayne@69 108 * Returns a Normalizer2 instance for Unicode NFD normalization.
jpayne@69 109 * Same as getInstance(NULL, "nfc", UNORM2_DECOMPOSE, errorCode); NFD reuses the "nfc" data.
jpayne@69 110 * Returns an unmodifiable singleton instance. Do not delete it.
jpayne@69 111 * @param errorCode Standard ICU error code. Its input value must
jpayne@69 112 * pass the U_SUCCESS() test, or else the function returns
jpayne@69 113 * immediately. Check for U_FAILURE() on output or use with
jpayne@69 114 * function chaining. (See User Guide for details.)
jpayne@69 115 * @return the requested Normalizer2, if successful
jpayne@69 116 * @stable ICU 49
jpayne@69 117 */
jpayne@69 118 static const Normalizer2 *
jpayne@69 119 getNFDInstance(UErrorCode &errorCode);
jpayne@69 120
jpayne@69 121 /**
jpayne@69 122 * Returns a Normalizer2 instance for Unicode NFKC normalization.
jpayne@69 123 * Same as getInstance(NULL, "nfkc", UNORM2_COMPOSE, errorCode).
jpayne@69 124 * Returns an unmodifiable singleton instance. Do not delete it.
jpayne@69 125 * @param errorCode Standard ICU error code. Its input value must
jpayne@69 126 * pass the U_SUCCESS() test, or else the function returns
jpayne@69 127 * immediately. Check for U_FAILURE() on output or use with
jpayne@69 128 * function chaining. (See User Guide for details.)
jpayne@69 129 * @return the requested Normalizer2, if successful
jpayne@69 130 * @stable ICU 49
jpayne@69 131 */
jpayne@69 132 static const Normalizer2 *
jpayne@69 133 getNFKCInstance(UErrorCode &errorCode);
jpayne@69 134
jpayne@69 135 /**
jpayne@69 136 * Returns a Normalizer2 instance for Unicode NFKD normalization.
jpayne@69 137 * Same as getInstance(NULL, "nfkc", UNORM2_DECOMPOSE, errorCode); NFKD reuses the "nfkc" data.
jpayne@69 138 * Returns an unmodifiable singleton instance. Do not delete it.
jpayne@69 139 * @param errorCode Standard ICU error code. Its input value must
jpayne@69 140 * pass the U_SUCCESS() test, or else the function returns
jpayne@69 141 * immediately. Check for U_FAILURE() on output or use with
jpayne@69 142 * function chaining. (See User Guide for details.)
jpayne@69 143 * @return the requested Normalizer2, if successful
jpayne@69 144 * @stable ICU 49
jpayne@69 145 */
jpayne@69 146 static const Normalizer2 *
jpayne@69 147 getNFKDInstance(UErrorCode &errorCode);
jpayne@69 148
jpayne@69 149 /**
jpayne@69 150 * Returns a Normalizer2 instance for Unicode NFKC_Casefold normalization.
jpayne@69 151 * Same as getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, errorCode).
jpayne@69 152 * Returns an unmodifiable singleton instance. Do not delete it.
jpayne@69 153 * @param errorCode Standard ICU error code. Its input value must
jpayne@69 154 * pass the U_SUCCESS() test, or else the function returns
jpayne@69 155 * immediately. Check for U_FAILURE() on output or use with
jpayne@69 156 * function chaining. (See User Guide for details.)
jpayne@69 157 * @return the requested Normalizer2, if successful
jpayne@69 158 * @stable ICU 49
jpayne@69 159 */
jpayne@69 160 static const Normalizer2 *
jpayne@69 161 getNFKCCasefoldInstance(UErrorCode &errorCode);
jpayne@69 162
jpayne@69 163 /**
jpayne@69 164 * Returns a Normalizer2 instance which uses the specified data file
jpayne@69 165 * (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle)
jpayne@69 166 * and which composes or decomposes text according to the specified mode.
jpayne@69 167 * Returns an unmodifiable singleton instance. Do not delete it.
jpayne@69 168 *
jpayne@69 169 * Use packageName=NULL for data files that are part of ICU's own data.
jpayne@69 170 * Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD.
jpayne@69 171 * Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD.
jpayne@69 172 * Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.
jpayne@69 173 *
jpayne@69 174 * @param packageName NULL for ICU built-in data, otherwise application data package name
jpayne@69 175 * @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file
jpayne@69 176 * @param mode normalization mode (compose or decompose etc.)
jpayne@69 177 * @param errorCode Standard ICU error code. Its input value must
jpayne@69 178 * pass the U_SUCCESS() test, or else the function returns
jpayne@69 179 * immediately. Check for U_FAILURE() on output or use with
jpayne@69 180 * function chaining. (See User Guide for details.)
jpayne@69 181 * @return the requested Normalizer2, if successful
jpayne@69 182 * @stable ICU 4.4
jpayne@69 183 */
jpayne@69 184 static const Normalizer2 *
jpayne@69 185 getInstance(const char *packageName,
jpayne@69 186 const char *name,
jpayne@69 187 UNormalization2Mode mode,
jpayne@69 188 UErrorCode &errorCode);
jpayne@69 189
jpayne@69 190 /**
jpayne@69 191 * Returns the normalized form of the source string as a new UnicodeString.
jpayne@69 192 * @param src source string
jpayne@69 193 * @param errorCode Standard ICU error code. Its input value must
jpayne@69 194 * pass the U_SUCCESS() test, or else the function returns
jpayne@69 195 * immediately. Check for U_FAILURE() on output or use with
jpayne@69 196 * function chaining. (See User Guide for details.)
jpayne@69 197 * @return normalized src
jpayne@69 198 * @stable ICU 4.4
jpayne@69 199 */
jpayne@69 200 UnicodeString
jpayne@69 201 normalize(const UnicodeString &src, UErrorCode &errorCode) const {
jpayne@69 202 UnicodeString result;
jpayne@69 203 normalize(src, result, errorCode);  // delegate to the virtual three-argument overload
jpayne@69 204 return result;
jpayne@69 205 }
jpayne@69 206 /**
jpayne@69 207 * Writes the normalized form of the source string to the destination string
jpayne@69 208 * (replacing its contents) and returns the destination string.
jpayne@69 209 * The source and destination strings must be different objects.
jpayne@69 210 * @param src source string
jpayne@69 211 * @param dest destination string; its contents are replaced with normalized src
jpayne@69 212 * @param errorCode Standard ICU error code. Its input value must
jpayne@69 213 * pass the U_SUCCESS() test, or else the function returns
jpayne@69 214 * immediately. Check for U_FAILURE() on output or use with
jpayne@69 215 * function chaining. (See User Guide for details.)
jpayne@69 216 * @return dest
jpayne@69 217 * @stable ICU 4.4
jpayne@69 218 */
jpayne@69 219 virtual UnicodeString &
jpayne@69 220 normalize(const UnicodeString &src,
jpayne@69 221 UnicodeString &dest,
jpayne@69 222 UErrorCode &errorCode) const = 0;
jpayne@69 223
jpayne@69 224 /**
jpayne@69 225 * Normalizes a UTF-8 string and optionally records how source substrings
jpayne@69 226 * relate to changed and unchanged result substrings.
jpayne@69 227 *
jpayne@69 228 * Currently implemented completely only for "compose" modes,
jpayne@69 229 * such as for NFC, NFKC, and NFKC_Casefold
jpayne@69 230 * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
jpayne@69 231 * Otherwise currently converts to & from UTF-16 and does not support edits.
jpayne@69 232 *
jpayne@69 233 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
jpayne@69 234 * @param src Source UTF-8 string.
jpayne@69 235 * @param sink A ByteSink to which the normalized UTF-8 result string is written.
jpayne@69 236 * sink.Flush() is called at the end.
jpayne@69 237 * @param edits Records edits for index mapping, working with styled text,
jpayne@69 238 * and getting only changes (if any).
jpayne@69 239 * The Edits contents are undefined if any error occurs.
jpayne@69 240 * This function calls edits->reset() first unless
jpayne@69 241 * options includes U_EDITS_NO_RESET. edits can be nullptr.
jpayne@69 242 * @param errorCode Standard ICU error code. Its input value must
jpayne@69 243 * pass the U_SUCCESS() test, or else the function returns
jpayne@69 244 * immediately. Check for U_FAILURE() on output or use with
jpayne@69 245 * function chaining. (See User Guide for details.)
jpayne@69 246 * @stable ICU 60
jpayne@69 247 */
jpayne@69 248 virtual void
jpayne@69 249 normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
jpayne@69 250 Edits *edits, UErrorCode &errorCode) const;
jpayne@69 251
jpayne@69 252 /**
jpayne@69 253 * Appends the normalized form of the second string to the first string
jpayne@69 254 * (merging them at the boundary) and returns the first string.
jpayne@69 255 * The result is normalized if the first string was normalized.
jpayne@69 256 * The first and second strings must be different objects.
jpayne@69 257 * @param first string, should be normalized
jpayne@69 258 * @param second string, will be normalized
jpayne@69 259 * @param errorCode Standard ICU error code. Its input value must
jpayne@69 260 * pass the U_SUCCESS() test, or else the function returns
jpayne@69 261 * immediately. Check for U_FAILURE() on output or use with
jpayne@69 262 * function chaining. (See User Guide for details.)
jpayne@69 263 * @return first
jpayne@69 264 * @stable ICU 4.4
jpayne@69 265 */
jpayne@69 266 virtual UnicodeString &
jpayne@69 267 normalizeSecondAndAppend(UnicodeString &first,
jpayne@69 268 const UnicodeString &second,
jpayne@69 269 UErrorCode &errorCode) const = 0;
jpayne@69 270 /**
jpayne@69 271 * Appends the second string to the first string
jpayne@69 272 * (merging them at the boundary) and returns the first string.
jpayne@69 273 * The result is normalized if both the strings were normalized.
jpayne@69 274 * The first and second strings must be different objects.
jpayne@69 275 * @param first string, should be normalized
jpayne@69 276 * @param second string, should be normalized
jpayne@69 277 * @param errorCode Standard ICU error code. Its input value must
jpayne@69 278 * pass the U_SUCCESS() test, or else the function returns
jpayne@69 279 * immediately. Check for U_FAILURE() on output or use with
jpayne@69 280 * function chaining. (See User Guide for details.)
jpayne@69 281 * @return first
jpayne@69 282 * @stable ICU 4.4
jpayne@69 283 */
jpayne@69 284 virtual UnicodeString &
jpayne@69 285 append(UnicodeString &first,
jpayne@69 286 const UnicodeString &second,
jpayne@69 287 UErrorCode &errorCode) const = 0;
jpayne@69 288
jpayne@69 289 /**
jpayne@69 290 * Gets the decomposition mapping of c.
jpayne@69 291 * Roughly equivalent to normalizing the String form of c
jpayne@69 292 * on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster; unlike normalization, this function
jpayne@69 293 * returns FALSE and does not write a string
jpayne@69 294 * if c does not have a decomposition mapping in this instance's data.
jpayne@69 295 * This function is independent of the mode of the Normalizer2.
jpayne@69 296 * @param c code point
jpayne@69 297 * @param decomposition String object which will be set to c's
jpayne@69 298 * decomposition mapping, if there is one.
jpayne@69 299 * @return TRUE if c has a decomposition, otherwise FALSE
jpayne@69 300 * @stable ICU 4.6
jpayne@69 301 */
jpayne@69 302 virtual UBool
jpayne@69 303 getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0;
jpayne@69 304
jpayne@69 305 /**
jpayne@69 306 * Gets the raw decomposition mapping of c.
jpayne@69 307 *
jpayne@69 308 * This is similar to the getDecomposition() method but returns the
jpayne@69 309 * raw decomposition mapping as specified in UnicodeData.txt or
jpayne@69 310 * (for custom data) in the mapping files processed by the gennorm2 tool.
jpayne@69 311 * By contrast, getDecomposition() returns the processed,
jpayne@69 312 * recursively-decomposed version of this mapping.
jpayne@69 313 *
jpayne@69 314 * When used on a standard NFKC Normalizer2 instance,
jpayne@69 315 * getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property.
jpayne@69 316 *
jpayne@69 317 * When used on a standard NFC Normalizer2 instance,
jpayne@69 318 * it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can);
jpayne@69 319 * in this case, the result contains either one or two code points (=1..4 char16_ts).
jpayne@69 320 *
jpayne@69 321 * This function is independent of the mode of the Normalizer2.
jpayne@69 322 * The default implementation returns FALSE.
jpayne@69 323 * @param c code point
jpayne@69 324 * @param decomposition String object which will be set to c's
jpayne@69 325 * raw decomposition mapping, if there is one.
jpayne@69 326 * @return TRUE if c has a decomposition, otherwise FALSE
jpayne@69 327 * @stable ICU 49
jpayne@69 328 */
jpayne@69 329 virtual UBool
jpayne@69 330 getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;
jpayne@69 331
jpayne@69 332 /**
jpayne@69 333 * Performs pairwise composition of a & b and returns the composite if there is one.
jpayne@69 334 *
jpayne@69 335 * Returns a composite code point c only if c has a two-way mapping to a+b.
jpayne@69 336 * In standard Unicode normalization, this means that
jpayne@69 337 * c has a canonical decomposition to a+b
jpayne@69 338 * and c does not have the Full_Composition_Exclusion property.
jpayne@69 339 *
jpayne@69 340 * This function is independent of the mode of the Normalizer2.
jpayne@69 341 * The default implementation returns a negative value.
jpayne@69 342 * @param a A (normalization starter) code point.
jpayne@69 343 * @param b Another code point.
jpayne@69 344 * @return The non-negative composite code point if there is one; otherwise a negative value.
jpayne@69 345 * @stable ICU 49
jpayne@69 346 */
jpayne@69 347 virtual UChar32
jpayne@69 348 composePair(UChar32 a, UChar32 b) const;
jpayne@69 349
jpayne@69 350 /**
jpayne@69 351 * Gets the combining class of c.
jpayne@69 352 * The default implementation returns 0
jpayne@69 353 * but all standard implementations return the Unicode Canonical_Combining_Class value.
jpayne@69 354 * @param c code point
jpayne@69 355 * @return c's combining class
jpayne@69 356 * @stable ICU 49
jpayne@69 357 */
jpayne@69 358 virtual uint8_t
jpayne@69 359 getCombiningClass(UChar32 c) const;
jpayne@69 360
jpayne@69 361 /**
jpayne@69 362 * Tests if the string is normalized.
jpayne@69 363 * Internally, in cases where the quickCheck() method would return "maybe"
jpayne@69 364 * (which is only possible for the two COMPOSE modes) this method
jpayne@69 365 * resolves to "yes" or "no" to provide a definitive result,
jpayne@69 366 * at the cost of doing more work in those cases.
jpayne@69 367 * @param s input string
jpayne@69 368 * @param errorCode Standard ICU error code. Its input value must
jpayne@69 369 * pass the U_SUCCESS() test, or else the function returns
jpayne@69 370 * immediately. Check for U_FAILURE() on output or use with
jpayne@69 371 * function chaining. (See User Guide for details.)
jpayne@69 372 * @return TRUE if s is normalized
jpayne@69 373 * @stable ICU 4.4
jpayne@69 374 */
jpayne@69 375 virtual UBool
jpayne@69 376 isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0;
jpayne@69 377 /**
jpayne@69 378 * Tests if the UTF-8 string is normalized.
jpayne@69 379 * Internally, in cases where the quickCheck() method would return "maybe"
jpayne@69 380 * (which is only possible for the two COMPOSE modes) this method
jpayne@69 381 * resolves to "yes" or "no" to provide a definitive result,
jpayne@69 382 * at the cost of doing more work in those cases.
jpayne@69 383 *
jpayne@69 384 * This works for all normalization modes,
jpayne@69 385 * but it is currently optimized for UTF-8 only for "compose" modes,
jpayne@69 386 * such as for NFC, NFKC, and NFKC_Casefold
jpayne@69 387 * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
jpayne@69 388 * For other modes it currently converts to UTF-16 and calls isNormalized().
jpayne@69 389 *
jpayne@69 390 * @param s UTF-8 input string
jpayne@69 391 * @param errorCode Standard ICU error code. Its input value must
jpayne@69 392 * pass the U_SUCCESS() test, or else the function returns
jpayne@69 393 * immediately. Check for U_FAILURE() on output or use with
jpayne@69 394 * function chaining. (See User Guide for details.)
jpayne@69 395 * @return TRUE if s is normalized
jpayne@69 396 * @stable ICU 60
jpayne@69 397 */
jpayne@69 398 virtual UBool
jpayne@69 399 isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const;
jpayne@69 400
jpayne@69 401
jpayne@69 402 /**
jpayne@69 403 * Tests if the string is normalized.
jpayne@69 404 * For the two COMPOSE modes, the result could be "maybe" in cases that
jpayne@69 405 * would take a little more work to resolve definitively.
jpayne@69 406 * Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster
jpayne@69 407 * combination of quick check + normalization, to avoid
jpayne@69 408 * re-checking the "yes" prefix.
jpayne@69 409 * @param s input string
jpayne@69 410 * @param errorCode Standard ICU error code. Its input value must
jpayne@69 411 * pass the U_SUCCESS() test, or else the function returns
jpayne@69 412 * immediately. Check for U_FAILURE() on output or use with
jpayne@69 413 * function chaining. (See User Guide for details.)
jpayne@69 414 * @return UNormalizationCheckResult
jpayne@69 415 * @stable ICU 4.4
jpayne@69 416 */
jpayne@69 417 virtual UNormalizationCheckResult
jpayne@69 418 quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0;
jpayne@69 419
jpayne@69 420 /**
jpayne@69 421 * Returns the end of the normalized substring of the input string.
jpayne@69 422 * In other words, with <code>end=spanQuickCheckYes(s, ec);</code>
jpayne@69 423 * the substring <code>UnicodeString(s, 0, end)</code>
jpayne@69 424 * will pass the quick check with a "yes" result.
jpayne@69 425 *
jpayne@69 426 * The returned end index is usually one or more characters before the
jpayne@69 427 * "no" or "maybe" character: The end index is at a normalization boundary.
jpayne@69 428 * (See the class documentation for more about normalization boundaries.)
jpayne@69 429 *
jpayne@69 430 * When the goal is a normalized string and most input strings are expected
jpayne@69 431 * to be normalized already, then call this method,
jpayne@69 432 * and if it returns a prefix shorter than the input string,
jpayne@69 433 * copy that prefix and use normalizeSecondAndAppend() for the remainder.
jpayne@69 434 * @param s input string
jpayne@69 435 * @param errorCode Standard ICU error code. Its input value must
jpayne@69 436 * pass the U_SUCCESS() test, or else the function returns
jpayne@69 437 * immediately. Check for U_FAILURE() on output or use with
jpayne@69 438 * function chaining. (See User Guide for details.)
jpayne@69 439 * @return "yes" span end index
jpayne@69 440 * @stable ICU 4.4
jpayne@69 441 */
jpayne@69 442 virtual int32_t
jpayne@69 443 spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0;
jpayne@69 444
jpayne@69 445 /**
jpayne@69 446 * Tests if the character always has a normalization boundary before it,
jpayne@69 447 * regardless of context.
jpayne@69 448 * If true, then the character does not normalization-interact with
jpayne@69 449 * preceding characters.
jpayne@69 450 * In other words, a string containing this character can be normalized
jpayne@69 451 * by processing portions before this character and starting from this
jpayne@69 452 * character independently.
jpayne@69 453 * This is used for iterative normalization. See the class documentation for details.
jpayne@69 454 * @param c character to test
jpayne@69 455 * @return TRUE if c has a normalization boundary before it
jpayne@69 456 * @stable ICU 4.4
jpayne@69 457 */
jpayne@69 458 virtual UBool hasBoundaryBefore(UChar32 c) const = 0;
jpayne@69 459
jpayne@69 460 /**
jpayne@69 461 * Tests if the character always has a normalization boundary after it,
jpayne@69 462 * regardless of context.
jpayne@69 463 * If true, then the character does not normalization-interact with
jpayne@69 464 * following characters.
jpayne@69 465 * In other words, a string containing this character can be normalized
jpayne@69 466 * by processing portions up to this character and after this
jpayne@69 467 * character independently.
jpayne@69 468 * This is used for iterative normalization. See the class documentation for details.
jpayne@69 469 * Note that this operation may be significantly slower than hasBoundaryBefore().
jpayne@69 470 * @param c character to test
jpayne@69 471 * @return TRUE if c has a normalization boundary after it
jpayne@69 472 * @stable ICU 4.4
jpayne@69 473 */
jpayne@69 474 virtual UBool hasBoundaryAfter(UChar32 c) const = 0;
jpayne@69 475
jpayne@69 476 /**
jpayne@69 477 * Tests if the character is normalization-inert.
jpayne@69 478 * If true, then the character does not change, nor normalization-interact with
jpayne@69 479 * preceding or following characters.
jpayne@69 480 * In other words, a string containing this character can be normalized
jpayne@69 481 * by processing portions before this character and after this
jpayne@69 482 * character independently.
jpayne@69 483 * This is used for iterative normalization. See the class documentation for details.
jpayne@69 484 * Note that this operation may be significantly slower than hasBoundaryBefore().
jpayne@69 485 * @param c character to test
jpayne@69 486 * @return TRUE if c is normalization-inert
jpayne@69 487 * @stable ICU 4.4
jpayne@69 488 */
jpayne@69 489 virtual UBool isInert(UChar32 c) const = 0;
jpayne@69 490 };
jpayne@69 491
jpayne@69 492 /**
jpayne@69 493 * Normalization filtered by a UnicodeSet.
jpayne@69 494 * Normalizes portions of the text contained in the filter set and leaves
jpayne@69 495 * portions not contained in the filter set unchanged.
jpayne@69 496 * Filtering is done via UnicodeSet::span(..., USET_SPAN_SIMPLE).
jpayne@69 497 * Not-in-the-filter text is treated as "is normalized" and "quick check yes".
jpayne@69 498 * This class implements all of (and only) the Normalizer2 API.
jpayne@69 499 * An instance of this class is unmodifiable/immutable but is constructed and
jpayne@69 500 * must be destructed by the owner.
jpayne@69 501 * @stable ICU 4.4
jpayne@69 502 */
jpayne@69 503 class U_COMMON_API FilteredNormalizer2 : public Normalizer2 {
jpayne@69 504 public:
jpayne@69 505 /**
jpayne@69 506 * Constructs a filtered normalizer wrapping any Normalizer2 instance
jpayne@69 507 * and a filter set.
jpayne@69 508 * Both are aliased and must not be modified or deleted while this object
jpayne@69 509 * is used.
jpayne@69 510 * The filter set should be frozen; otherwise the performance will suffer greatly.
jpayne@69 511 * @param n2 wrapped Normalizer2 instance
jpayne@69 512 * @param filterSet UnicodeSet which determines the characters to be normalized
jpayne@69 513 * @stable ICU 4.4
jpayne@69 514 */
jpayne@69 515 FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) :
jpayne@69 516 norm2(n2), set(filterSet) {}
jpayne@69 517
jpayne@69 518 /**
jpayne@69 519 * Destructor.
jpayne@69 520 * @stable ICU 4.4
jpayne@69 521 */
jpayne@69 522 ~FilteredNormalizer2();
jpayne@69 523
jpayne@69 524 /**
jpayne@69 525 * Writes the normalized form of the source string to the destination string
jpayne@69 526 * (replacing its contents) and returns the destination string.
jpayne@69 527 * The source and destination strings must be different objects.
jpayne@69 528 * @param src source string
jpayne@69 529 * @param dest destination string; its contents are replaced with normalized src
jpayne@69 530 * @param errorCode Standard ICU error code. Its input value must
jpayne@69 531 * pass the U_SUCCESS() test, or else the function returns
jpayne@69 532 * immediately. Check for U_FAILURE() on output or use with
jpayne@69 533 * function chaining. (See User Guide for details.)
jpayne@69 534 * @return dest
jpayne@69 535 * @stable ICU 4.4
jpayne@69 536 */
jpayne@69 537 virtual UnicodeString &
jpayne@69 538 normalize(const UnicodeString &src,
jpayne@69 539 UnicodeString &dest,
jpayne@69 540 UErrorCode &errorCode) const U_OVERRIDE;
jpayne@69 541
jpayne@69 542 /**
jpayne@69 543 * Normalizes a UTF-8 string and optionally records how source substrings
jpayne@69 544 * relate to changed and unchanged result substrings.
jpayne@69 545 *
jpayne@69 546 * Currently implemented completely only for "compose" modes,
jpayne@69 547 * such as for NFC, NFKC, and NFKC_Casefold
jpayne@69 548 * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
jpayne@69 549 * Otherwise currently converts to & from UTF-16 and does not support edits.
jpayne@69 550 *
jpayne@69 551 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
jpayne@69 552 * @param src Source UTF-8 string.
jpayne@69 553 * @param sink A ByteSink to which the normalized UTF-8 result string is written.
jpayne@69 554 * sink.Flush() is called at the end.
jpayne@69 555 * @param edits Records edits for index mapping, working with styled text,
jpayne@69 556 * and getting only changes (if any).
jpayne@69 557 * The Edits contents are undefined if any error occurs.
jpayne@69 558 * This function calls edits->reset() first unless
jpayne@69 559 * options includes U_EDITS_NO_RESET. edits can be nullptr.
jpayne@69 560 * @param errorCode Standard ICU error code. Its input value must
jpayne@69 561 * pass the U_SUCCESS() test, or else the function returns
jpayne@69 562 * immediately. Check for U_FAILURE() on output or use with
jpayne@69 563 * function chaining. (See User Guide for details.)
jpayne@69 564 * @stable ICU 60
jpayne@69 565 */
jpayne@69 566 virtual void
jpayne@69 567 normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
jpayne@69 568 Edits *edits, UErrorCode &errorCode) const U_OVERRIDE;
jpayne@69 569
jpayne@69 570 /**
jpayne@69 571 * Appends the normalized form of the second string to the first string
jpayne@69 572 * (merging them at the boundary) and returns the first string.
jpayne@69 573 * The result is normalized if the first string was normalized.
jpayne@69 574 * The first and second strings must be different objects.
jpayne@69 575 * @param first string, should be normalized
jpayne@69 576 * @param second string, will be normalized
jpayne@69 577 * @param errorCode Standard ICU error code. Its input value must
jpayne@69 578 * pass the U_SUCCESS() test, or else the function returns
jpayne@69 579 * immediately. Check for U_FAILURE() on output or use with
jpayne@69 580 * function chaining. (See User Guide for details.)
jpayne@69 581 * @return first
jpayne@69 582 * @stable ICU 4.4
jpayne@69 583 */
jpayne@69 584 virtual UnicodeString &
jpayne@69 585 normalizeSecondAndAppend(UnicodeString &first,
jpayne@69 586 const UnicodeString &second,
jpayne@69 587 UErrorCode &errorCode) const U_OVERRIDE;
jpayne@69 588 /**
jpayne@69 589 * Appends the second string to the first string
jpayne@69 590 * (merging them at the boundary) and returns the first string.
jpayne@69 591 * The result is normalized if both the strings were normalized.
jpayne@69 592 * The first and second strings must be different objects.
jpayne@69 593 * @param first string that is appended to (modified in place); should be normalized
jpayne@69 594 * @param second string to append (not modified); should be normalized
jpayne@69 595 * @param errorCode Standard ICU error code. Its input value must
jpayne@69 596 * pass the U_SUCCESS() test, or else the function returns
jpayne@69 597 * immediately. Check for U_FAILURE() on output or use with
jpayne@69 598 * function chaining. (See User Guide for details.)
jpayne@69 599 * @return first (the same object that was passed in)
jpayne@69 600 * @stable ICU 4.4
jpayne@69 601 */
jpayne@69 602 virtual UnicodeString &
jpayne@69 603 append(UnicodeString &first,
jpayne@69 604 const UnicodeString &second,
jpayne@69 605 UErrorCode &errorCode) const U_OVERRIDE;
jpayne@69 606
jpayne@69 607 /**
jpayne@69 608 * Gets the decomposition mapping of c.
jpayne@69 609 * For details see the base class documentation.
jpayne@69 610 *
jpayne@69 611 * This function is independent of the mode of the Normalizer2.
jpayne@69 612 * @param c code point
jpayne@69 613 * @param decomposition Output parameter: String object which will be set to c's
jpayne@69 614 * decomposition mapping, if there is one.
jpayne@69 615 * @return TRUE if c has a decomposition, otherwise FALSE
jpayne@69 616 * @stable ICU 4.6
jpayne@69 617 */
jpayne@69 618 virtual UBool
jpayne@69 619 getDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE;
jpayne@69 620
jpayne@69 621 /**
jpayne@69 622 * Gets the raw decomposition mapping of c.
jpayne@69 623 * For details see the base class documentation.
jpayne@69 624 *
jpayne@69 625 * This function is independent of the mode of the Normalizer2.
jpayne@69 626 * @param c code point
jpayne@69 627 * @param decomposition Output parameter: String object which will be set to c's
jpayne@69 628 * raw decomposition mapping, if there is one.
jpayne@69 629 * @return TRUE if c has a decomposition, otherwise FALSE
jpayne@69 630 * @stable ICU 49
jpayne@69 631 */
jpayne@69 632 virtual UBool
jpayne@69 633 getRawDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE;
jpayne@69 634
jpayne@69 635 /**
jpayne@69 636 * Performs pairwise composition of a and b and returns the composite if there is one.
jpayne@69 637 * For details see the base class documentation.
jpayne@69 638 *
jpayne@69 639 * This function is independent of the mode of the Normalizer2.
jpayne@69 640 * @param a A (normalization starter) code point.
jpayne@69 641 * @param b Another code point.
jpayne@69 642 * @return The non-negative composite code point if there is one; otherwise a negative value.
jpayne@69 643 * @stable ICU 49
jpayne@69 644 */
jpayne@69 645 virtual UChar32
jpayne@69 646 composePair(UChar32 a, UChar32 b) const U_OVERRIDE;
jpayne@69 647
jpayne@69 648 /**
jpayne@69 649 * Gets the combining class of c.
jpayne@69 650 * The default implementation returns 0
jpayne@69 651 * but all standard implementations return the Unicode Canonical_Combining_Class value.
jpayne@69 652 * @param c code point
jpayne@69 653 * @return c's combining class (an unsigned 8-bit value)
jpayne@69 654 * @stable ICU 49
jpayne@69 655 */
jpayne@69 656 virtual uint8_t
jpayne@69 657 getCombiningClass(UChar32 c) const U_OVERRIDE;
jpayne@69 658
jpayne@69 659 /**
jpayne@69 660 * Tests if the string is normalized.
jpayne@69 661 * For details see the Normalizer2 base class documentation.
jpayne@69 662 * @param s input string
jpayne@69 663 * @param errorCode Standard ICU error code. Its input value must
jpayne@69 664 * pass the U_SUCCESS() test, or else the function returns
jpayne@69 665 * immediately. Check for U_FAILURE() on output or use with
jpayne@69 666 * function chaining. (See User Guide for details.)
jpayne@69 667 * @return TRUE if s is normalized, otherwise FALSE
jpayne@69 668 * @stable ICU 4.4
jpayne@69 669 */
jpayne@69 670 virtual UBool
jpayne@69 671 isNormalized(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;
jpayne@69 672 /**
jpayne@69 673 * Tests if the UTF-8 string is normalized.
jpayne@69 674 * Internally, in cases where the quickCheck() method would return "maybe"
jpayne@69 675 * (which is only possible for the two COMPOSE modes) this method
jpayne@69 676 * resolves to "yes" or "no" to provide a definitive result,
jpayne@69 677 * at the cost of doing more work in those cases.
jpayne@69 678 *
jpayne@69 679 * This works for all normalization modes,
jpayne@69 680 * but it is currently optimized for UTF-8 only for "compose" modes,
jpayne@69 681 * such as for NFC, NFKC, and NFKC_Casefold
jpayne@69 682 * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
jpayne@69 683 * For other modes it currently converts to UTF-16 and calls isNormalized().
jpayne@69 684 *
jpayne@69 685 * @param s UTF-8 input string
jpayne@69 686 * @param errorCode Standard ICU error code. Its input value must
jpayne@69 687 * pass the U_SUCCESS() test, or else the function returns
jpayne@69 688 * immediately. Check for U_FAILURE() on output or use with
jpayne@69 689 * function chaining. (See User Guide for details.)
jpayne@69 690 * @return TRUE if s is normalized, otherwise FALSE
jpayne@69 691 * @stable ICU 60
jpayne@69 692 */
jpayne@69 693 virtual UBool
jpayne@69 694 isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const U_OVERRIDE;
jpayne@69 695 /**
jpayne@69 696 * Quickly tests if the string is normalized; the answer may be an inconclusive "maybe".
jpayne@69 697 * For details see the Normalizer2 base class documentation.
jpayne@69 698 * @param s input string
jpayne@69 699 * @param errorCode Standard ICU error code. Its input value must
jpayne@69 700 * pass the U_SUCCESS() test, or else the function returns
jpayne@69 701 * immediately. Check for U_FAILURE() on output or use with
jpayne@69 702 * function chaining. (See User Guide for details.)
jpayne@69 703 * @return UNormalizationCheckResult ("yes", "no", or "maybe")
jpayne@69 704 * @stable ICU 4.4
jpayne@69 705 */
jpayne@69 706 virtual UNormalizationCheckResult
jpayne@69 707 quickCheck(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;
jpayne@69 708 /**
jpayne@69 709 * Returns the end of the normalized substring of the input string.
jpayne@69 710 * For details see the Normalizer2 base class documentation.
jpayne@69 711 * @param s input string
jpayne@69 712 * @param errorCode Standard ICU error code. Its input value must
jpayne@69 713 * pass the U_SUCCESS() test, or else the function returns
jpayne@69 714 * immediately. Check for U_FAILURE() on output or use with
jpayne@69 715 * function chaining. (See User Guide for details.)
jpayne@69 716 * @return end index of the "yes" span (the normalized substring)
jpayne@69 717 * @stable ICU 4.4
jpayne@69 718 */
jpayne@69 719 virtual int32_t
jpayne@69 720 spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;
jpayne@69 721
jpayne@69 722 /**
jpayne@69 723 * Tests if the character always has a normalization boundary before it,
jpayne@69 724 * regardless of context.
jpayne@69 725 * For details see the Normalizer2 base class documentation.
jpayne@69 726 * @param c character to test
jpayne@69 727 * @return TRUE if c has a normalization boundary before it, otherwise FALSE
jpayne@69 728 * @stable ICU 4.4
jpayne@69 729 */
jpayne@69 730 virtual UBool hasBoundaryBefore(UChar32 c) const U_OVERRIDE;
jpayne@69 731
jpayne@69 732 /**
jpayne@69 733 * Tests if the character always has a normalization boundary after it,
jpayne@69 734 * regardless of context.
jpayne@69 735 * For details see the Normalizer2 base class documentation.
jpayne@69 736 * @param c character to test
jpayne@69 737 * @return TRUE if c has a normalization boundary after it, otherwise FALSE
jpayne@69 738 * @stable ICU 4.4
jpayne@69 739 */
jpayne@69 740 virtual UBool hasBoundaryAfter(UChar32 c) const U_OVERRIDE;
jpayne@69 741
jpayne@69 742 /**
jpayne@69 743 * Tests if the character is normalization-inert.
jpayne@69 744 * For details see the Normalizer2 base class documentation.
jpayne@69 745 * @param c character to test
jpayne@69 746 * @return TRUE if c is normalization-inert, otherwise FALSE
jpayne@69 747 * @stable ICU 4.4
jpayne@69 748 */
jpayne@69 749 virtual UBool isInert(UChar32 c) const U_OVERRIDE;
jpayne@69 750 private:
// Private overload of normalize() that takes an additional USetSpanCondition.
// NOTE(review): presumably the shared worker behind the public normalize(),
// with spanCondition choosing how the first span of src is matched against
// the filter set -- confirm against the implementation file.
jpayne@69 751 UnicodeString &
jpayne@69 752 normalize(const UnicodeString &src,
jpayne@69 753 UnicodeString &dest,
jpayne@69 754 USetSpanCondition spanCondition,
jpayne@69 755 UErrorCode &errorCode) const;
jpayne@69 756
// Private overload of normalizeUTF8() that takes the UTF-8 source as a
// pointer/length pair (rather than a StringPiece) plus a USetSpanCondition.
// NOTE(review): presumably the worker that the public normalizeUTF8()
// delegates to, with spanCondition selecting the kind of the first span --
// confirm against the implementation file.
jpayne@69 757 void
jpayne@69 758 normalizeUTF8(uint32_t options, const char *src, int32_t length,
jpayne@69 759 ByteSink &sink, Edits *edits,
jpayne@69 760 USetSpanCondition spanCondition,
jpayne@69 761 UErrorCode &errorCode) const;
jpayne@69 762
// Private overload of normalizeSecondAndAppend() with an extra doNormalize flag.
// NOTE(review): presumably the common implementation of the public
// normalizeSecondAndAppend() and append(), with doNormalize selecting whether
// second is normalized before it is appended -- confirm against the
// implementation file.
jpayne@69 763 UnicodeString &
jpayne@69 764 normalizeSecondAndAppend(UnicodeString &first,
jpayne@69 765 const UnicodeString &second,
jpayne@69 766 UBool doNormalize,
jpayne@69 767 UErrorCode &errorCode) const;
jpayne@69 768
// Both members are references: this object keeps no copy of either, so the
// referenced objects must remain valid for as long as this object is used.
// NOTE(review): presumably norm2 is the wrapped normalizer and set is the
// filter set -- confirm against the constructor.
jpayne@69 769 const Normalizer2 &norm2;
jpayne@69 770 const UnicodeSet &set;
jpayne@69 771 };
jpayne@69 772
jpayne@69 773 U_NAMESPACE_END
jpayne@69 774
jpayne@69 775 #endif // !UCONFIG_NO_NORMALIZATION
jpayne@69 776
jpayne@69 777 #endif /* U_SHOW_CPLUSPLUS_API */
jpayne@69 778
jpayne@69 779 #endif // __NORMALIZER2_H__