csp2: CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/include/unicode/normalizer2.h annotate

annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/include/unicode/normalizer2.h @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d

author	jpayne
date	Tue, 18 Mar 2025 17:55:14 -0400
parents
children

rev	line source
jpayne@69	1 // © 2016 and later: Unicode, Inc. and others.
jpayne@69	2 // License & terms of use: http://www.unicode.org/copyright.html
jpayne@69	3 /*
jpayne@69	4 *******************************************************************************
jpayne@69	5 *
jpayne@69	6 * Copyright (C) 2009-2013, International Business Machines
jpayne@69	7 * Corporation and others. All Rights Reserved.
jpayne@69	8 *
jpayne@69	9 *******************************************************************************
jpayne@69	10 * file name: normalizer2.h
jpayne@69	11 * encoding: UTF-8
jpayne@69	12 * tab size: 8 (not used)
jpayne@69	13 * indentation:4
jpayne@69	14 *
jpayne@69	15 * created on: 2009nov22
jpayne@69	16 * created by: Markus W. Scherer
jpayne@69	17 */
jpayne@69	18
jpayne@69	19 #ifndef __NORMALIZER2_H__
jpayne@69	20 #define __NORMALIZER2_H__
jpayne@69	21
jpayne@69	22 /**
jpayne@69	23 * \file
jpayne@69	24 * \brief C++ API: New API for Unicode Normalization.
jpayne@69	25 */
jpayne@69	26
jpayne@69	27 #include "unicode/utypes.h"
jpayne@69	28
jpayne@69	29 #if U_SHOW_CPLUSPLUS_API
jpayne@69	30
jpayne@69	31 #if !UCONFIG_NO_NORMALIZATION
jpayne@69	32
jpayne@69	33 #include "unicode/stringpiece.h"
jpayne@69	34 #include "unicode/uniset.h"
jpayne@69	35 #include "unicode/unistr.h"
jpayne@69	36 #include "unicode/unorm2.h"
jpayne@69	37
jpayne@69	38 U_NAMESPACE_BEGIN
jpayne@69	39
jpayne@69	40 class ByteSink;
jpayne@69	41
jpayne@69	42 /**
jpayne@69	43 * Unicode normalization functionality for standard Unicode normalization or
jpayne@69	44 * for using custom mapping tables.
jpayne@69	45 * All instances of this class are unmodifiable/immutable.
jpayne@69	46 * Instances returned by getInstance() are singletons that must not be deleted by the caller.
jpayne@69	47 * The Normalizer2 class is not intended for public subclassing.
jpayne@69	48 *
jpayne@69	49 * The primary functions are to produce a normalized string and to detect whether
jpayne@69	50 * a string is already normalized.
jpayne@69	51 * The most commonly used normalization forms are those defined in
jpayne@69	52 * http://www.unicode.org/unicode/reports/tr15/
jpayne@69	53 * However, this API supports additional normalization forms for specialized purposes.
jpayne@69	54 * For example, NFKC_Casefold is provided via getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, errorCode)
jpayne@69	55 * and can be used in implementations of UTS #46.
jpayne@69	56 *
jpayne@69	57 * Not only are the standard compose and decompose modes supplied,
jpayne@69	58 * but additional modes are provided as documented in the Mode enum.
jpayne@69	59 *
jpayne@69	60 * Some of the functions in this class identify normalization boundaries.
jpayne@69	61 * At a normalization boundary, the portions of the string
jpayne@69	62 * before it and starting from it do not interact and can be handled independently.
jpayne@69	63 *
jpayne@69	64 * The spanQuickCheckYes() function stops at a normalization boundary.
jpayne@69	65 * When the goal is a normalized string, then the text before the boundary
jpayne@69	66 * can be copied, and the remainder can be processed with normalizeSecondAndAppend().
jpayne@69	67 *
jpayne@69	68 * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether
jpayne@69	69 * a character is guaranteed to be at a normalization boundary,
jpayne@69	70 * regardless of context.
jpayne@69	71 * This is used for moving from one normalization boundary to the next
jpayne@69	72 * or preceding boundary, and for performing iterative normalization.
jpayne@69	73 *
jpayne@69	74 * Iterative normalization is useful when only a small portion of a
jpayne@69	75 * longer string needs to be processed.
jpayne@69	76 * For example, in ICU, iterative normalization is used by the NormalizationTransliterator
jpayne@69	77 * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart()
jpayne@69	78 * (to process only the substring for which sort key bytes are computed).
jpayne@69	79 *
jpayne@69	80 * The set of normalization boundaries returned by these functions may not be
jpayne@69	81 * complete: There may be more boundaries that could be returned.
jpayne@69	82 * Different functions may return different boundaries.
jpayne@69	83 * @stable ICU 4.4
jpayne@69	84 */
jpayne@69	85 class U_COMMON_API Normalizer2 : public UObject {
jpayne@69	86 public:
jpayne@69	87 /**
jpayne@69	88 * Destructor.
jpayne@69	89 * @stable ICU 4.4
jpayne@69	90 */
jpayne@69	91 ~Normalizer2();
jpayne@69	92
jpayne@69	93 /**
jpayne@69	94 * Returns a Normalizer2 instance for Unicode NFC normalization.
jpayne@69	95 * Same as getInstance(NULL, "nfc", UNORM2_COMPOSE, errorCode).
jpayne@69	96 * Returns an unmodifiable singleton instance. Do not delete it.
jpayne@69	97 * @param errorCode Standard ICU error code. Its input value must
jpayne@69	98 * pass the U_SUCCESS() test, or else the function returns
jpayne@69	99 * immediately. Check for U_FAILURE() on output or use with
jpayne@69	100 * function chaining. (See User Guide for details.)
jpayne@69	101 * @return the requested Normalizer2, if successful
jpayne@69	102 * @stable ICU 49
jpayne@69	103 */
jpayne@69	104 static const Normalizer2 *
jpayne@69	105 getNFCInstance(UErrorCode &errorCode);
jpayne@69	106
jpayne@69	107 /**
jpayne@69	108 * Returns a Normalizer2 instance for Unicode NFD normalization.
jpayne@69	109 * Same as getInstance(NULL, "nfc", UNORM2_DECOMPOSE, errorCode).
jpayne@69	110 * Returns an unmodifiable singleton instance. Do not delete it.
jpayne@69	111 * @param errorCode Standard ICU error code. Its input value must
jpayne@69	112 * pass the U_SUCCESS() test, or else the function returns
jpayne@69	113 * immediately. Check for U_FAILURE() on output or use with
jpayne@69	114 * function chaining. (See User Guide for details.)
jpayne@69	115 * @return the requested Normalizer2, if successful
jpayne@69	116 * @stable ICU 49
jpayne@69	117 */
jpayne@69	118 static const Normalizer2 *
jpayne@69	119 getNFDInstance(UErrorCode &errorCode);
jpayne@69	120
jpayne@69	121 /**
jpayne@69	122 * Returns a Normalizer2 instance for Unicode NFKC normalization.
jpayne@69	123 * Same as getInstance(NULL, "nfkc", UNORM2_COMPOSE, errorCode).
jpayne@69	124 * Returns an unmodifiable singleton instance. Do not delete it.
jpayne@69	125 * @param errorCode Standard ICU error code. Its input value must
jpayne@69	126 * pass the U_SUCCESS() test, or else the function returns
jpayne@69	127 * immediately. Check for U_FAILURE() on output or use with
jpayne@69	128 * function chaining. (See User Guide for details.)
jpayne@69	129 * @return the requested Normalizer2, if successful
jpayne@69	130 * @stable ICU 49
jpayne@69	131 */
jpayne@69	132 static const Normalizer2 *
jpayne@69	133 getNFKCInstance(UErrorCode &errorCode);
jpayne@69	134
jpayne@69	135 /**
jpayne@69	136 * Returns a Normalizer2 instance for Unicode NFKD normalization.
jpayne@69	137 * Same as getInstance(NULL, "nfkc", UNORM2_DECOMPOSE, errorCode).
jpayne@69	138 * Returns an unmodifiable singleton instance. Do not delete it.
jpayne@69	139 * @param errorCode Standard ICU error code. Its input value must
jpayne@69	140 * pass the U_SUCCESS() test, or else the function returns
jpayne@69	141 * immediately. Check for U_FAILURE() on output or use with
jpayne@69	142 * function chaining. (See User Guide for details.)
jpayne@69	143 * @return the requested Normalizer2, if successful
jpayne@69	144 * @stable ICU 49
jpayne@69	145 */
jpayne@69	146 static const Normalizer2 *
jpayne@69	147 getNFKDInstance(UErrorCode &errorCode);
jpayne@69	148
jpayne@69	149 /**
jpayne@69	150 * Returns a Normalizer2 instance for Unicode NFKC_Casefold normalization.
jpayne@69	151 * Same as getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, errorCode).
jpayne@69	152 * Returns an unmodifiable singleton instance. Do not delete it.
jpayne@69	153 * @param errorCode Standard ICU error code. Its input value must
jpayne@69	154 * pass the U_SUCCESS() test, or else the function returns
jpayne@69	155 * immediately. Check for U_FAILURE() on output or use with
jpayne@69	156 * function chaining. (See User Guide for details.)
jpayne@69	157 * @return the requested Normalizer2, if successful
jpayne@69	158 * @stable ICU 49
jpayne@69	159 */
jpayne@69	160 static const Normalizer2 *
jpayne@69	161 getNFKCCasefoldInstance(UErrorCode &errorCode);
jpayne@69	162
jpayne@69	163 /**
jpayne@69	164 * Returns a Normalizer2 instance which uses the specified data file
jpayne@69	165 * (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle)
jpayne@69	166 * and which composes or decomposes text according to the specified mode.
jpayne@69	167 * Returns an unmodifiable singleton instance. Do not delete it.
jpayne@69	168 *
jpayne@69	169 * Use packageName=NULL for data files that are part of ICU's own data.
jpayne@69	170 * Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD.
jpayne@69	171 * Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD.
jpayne@69	172 * Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold.
jpayne@69	173 *
jpayne@69	174 * @param packageName NULL for ICU built-in data, otherwise application data package name
jpayne@69	175 * @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file
jpayne@69	176 * @param mode normalization mode (compose or decompose etc.)
jpayne@69	177 * @param errorCode Standard ICU error code. Its input value must
jpayne@69	178 * pass the U_SUCCESS() test, or else the function returns
jpayne@69	179 * immediately. Check for U_FAILURE() on output or use with
jpayne@69	180 * function chaining. (See User Guide for details.)
jpayne@69	181 * @return the requested Normalizer2, if successful
jpayne@69	182 * @stable ICU 4.4
jpayne@69	183 */
jpayne@69	184 static const Normalizer2 *
jpayne@69	185 getInstance(const char *packageName,
jpayne@69	186 const char *name,
jpayne@69	187 UNormalization2Mode mode,
jpayne@69	188 UErrorCode &errorCode);
jpayne@69	189
jpayne@69	190 /**
jpayne@69	191 * Returns the normalized form of the source string as a new UnicodeString.
jpayne@69	192 * @param src source string
jpayne@69	193 * @param errorCode Standard ICU error code. Its input value must
jpayne@69	194 * pass the U_SUCCESS() test, or else the function returns
jpayne@69	195 * immediately. Check for U_FAILURE() on output or use with
jpayne@69	196 * function chaining. (See User Guide for details.)
jpayne@69	197 * @return normalized src, returned by value
jpayne@69	198 * @stable ICU 4.4
jpayne@69	199 */
jpayne@69	200 UnicodeString
jpayne@69	201 normalize(const UnicodeString &src, UErrorCode &errorCode) const {
jpayne@69	202 UnicodeString result;
jpayne@69	203 normalize(src, result, errorCode); // delegates to the three-argument overload declared below
jpayne@69	204 return result;
jpayne@69	205 }
jpayne@69	206 /**
jpayne@69	207 * Writes the normalized form of the source string to the destination string
jpayne@69	208 * (replacing its contents) and returns the destination string.
jpayne@69	209 * The source and destination strings must be different objects.
jpayne@69	210 * @param src source string
jpayne@69	211 * @param dest destination string; its contents are replaced with normalized src
jpayne@69	212 * @param errorCode Standard ICU error code. Its input value must
jpayne@69	213 * pass the U_SUCCESS() test, or else the function returns
jpayne@69	214 * immediately. Check for U_FAILURE() on output or use with
jpayne@69	215 * function chaining. (See User Guide for details.)
jpayne@69	216 * @return dest
jpayne@69	217 * @stable ICU 4.4
jpayne@69	218 */
jpayne@69	219 virtual UnicodeString &
jpayne@69	220 normalize(const UnicodeString &src,
jpayne@69	221 UnicodeString &dest,
jpayne@69	222 UErrorCode &errorCode) const = 0;
jpayne@69	223
jpayne@69	224 /**
jpayne@69	225 * Normalizes a UTF-8 string and optionally records how source substrings
jpayne@69	226 * relate to changed and unchanged result substrings.
jpayne@69	227 *
jpayne@69	228 * Currently implemented completely only for "compose" modes,
jpayne@69	229 * such as for NFC, NFKC, and NFKC_Casefold
jpayne@69	230 * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
jpayne@69	231 * Otherwise currently converts to & from UTF-16 and does not support edits.
jpayne@69	232 *
jpayne@69	233 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
jpayne@69	234 * @param src Source UTF-8 string.
jpayne@69	235 * @param sink A ByteSink to which the normalized UTF-8 result string is written.
jpayne@69	236 * sink.Flush() is called at the end.
jpayne@69	237 * @param edits Records edits for index mapping, working with styled text,
jpayne@69	238 * and getting only changes (if any).
jpayne@69	239 * The Edits contents are undefined if any error occurs.
jpayne@69	240 * This function calls edits->reset() first unless
jpayne@69	241 * options includes U_EDITS_NO_RESET. edits can be nullptr.
jpayne@69	242 * @param errorCode Standard ICU error code. Its input value must
jpayne@69	243 * pass the U_SUCCESS() test, or else the function returns
jpayne@69	244 * immediately. Check for U_FAILURE() on output or use with
jpayne@69	245 * function chaining. (See User Guide for details.)
jpayne@69	246 * @stable ICU 60
jpayne@69	247 */
jpayne@69	248 virtual void
jpayne@69	249 normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
jpayne@69	250 Edits *edits, UErrorCode &errorCode) const;
jpayne@69	251
jpayne@69	252 /**
jpayne@69	253 * Appends the normalized form of the second string to the first string
jpayne@69	254 * (merging them at the boundary) and returns the first string.
jpayne@69	255 * The result is normalized if the first string was normalized.
jpayne@69	256 * The first and second strings must be different objects.
jpayne@69	257 * @param first string, should be normalized
jpayne@69	258 * @param second string, will be normalized
jpayne@69	259 * @param errorCode Standard ICU error code. Its input value must
jpayne@69	260 * pass the U_SUCCESS() test, or else the function returns
jpayne@69	261 * immediately. Check for U_FAILURE() on output or use with
jpayne@69	262 * function chaining. (See User Guide for details.)
jpayne@69	263 * @return first
jpayne@69	264 * @stable ICU 4.4
jpayne@69	265 */
jpayne@69	266 virtual UnicodeString &
jpayne@69	267 normalizeSecondAndAppend(UnicodeString &first,
jpayne@69	268 const UnicodeString &second,
jpayne@69	269 UErrorCode &errorCode) const = 0;
jpayne@69	270 /**
jpayne@69	271 * Appends the second string to the first string
jpayne@69	272 * (merging them at the boundary) and returns the first string.
jpayne@69	273 * The result is normalized if both the strings were normalized.
jpayne@69	274 * The first and second strings must be different objects.
jpayne@69	275 * @param first string, should be normalized
jpayne@69	276 * @param second string, should be normalized
jpayne@69	277 * @param errorCode Standard ICU error code. Its input value must
jpayne@69	278 * pass the U_SUCCESS() test, or else the function returns
jpayne@69	279 * immediately. Check for U_FAILURE() on output or use with
jpayne@69	280 * function chaining. (See User Guide for details.)
jpayne@69	281 * @return first
jpayne@69	282 * @stable ICU 4.4
jpayne@69	283 */
jpayne@69	284 virtual UnicodeString &
jpayne@69	285 append(UnicodeString &first,
jpayne@69	286 const UnicodeString &second,
jpayne@69	287 UErrorCode &errorCode) const = 0;
jpayne@69	288
jpayne@69	289 /**
jpayne@69	290 * Gets the decomposition mapping of c.
jpayne@69	291 * Roughly equivalent to normalizing the String form of c
jpayne@69	292 * on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster. Unlike normalizing,
jpayne@69	293 * this function returns FALSE and does not write a string
jpayne@69	294 * if c does not have a decomposition mapping in this instance's data.
jpayne@69	295 * This function is independent of the mode of the Normalizer2.
jpayne@69	296 * @param c code point
jpayne@69	297 * @param decomposition String object which will be set to c's
jpayne@69	298 * decomposition mapping, if there is one.
jpayne@69	299 * @return TRUE if c has a decomposition, otherwise FALSE
jpayne@69	300 * @stable ICU 4.6
jpayne@69	301 */
jpayne@69	302 virtual UBool
jpayne@69	303 getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0;
jpayne@69	304
jpayne@69	305 /**
jpayne@69	306 * Gets the raw decomposition mapping of c.
jpayne@69	307 *
jpayne@69	308 * This is similar to the getDecomposition() method but returns the
jpayne@69	309 * raw decomposition mapping as specified in UnicodeData.txt or
jpayne@69	310 * (for custom data) in the mapping files processed by the gennorm2 tool.
jpayne@69	311 * By contrast, getDecomposition() returns the processed,
jpayne@69	312 * recursively-decomposed version of this mapping.
jpayne@69	313 *
jpayne@69	314 * When used on a standard NFKC Normalizer2 instance,
jpayne@69	315 * getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property.
jpayne@69	316 *
jpayne@69	317 * When used on a standard NFC Normalizer2 instance,
jpayne@69	318 * it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can);
jpayne@69	319 * in this case, the result contains either one or two code points (=1..4 char16_ts).
jpayne@69	320 *
jpayne@69	321 * This function is independent of the mode of the Normalizer2.
jpayne@69	322 * The default implementation returns FALSE.
jpayne@69	323 * @param c code point
jpayne@69	324 * @param decomposition String object which will be set to c's
jpayne@69	325 * raw decomposition mapping, if there is one.
jpayne@69	326 * @return TRUE if c has a decomposition, otherwise FALSE
jpayne@69	327 * @stable ICU 49
jpayne@69	328 */
jpayne@69	329 virtual UBool
jpayne@69	330 getRawDecomposition(UChar32 c, UnicodeString &decomposition) const;
jpayne@69	331
jpayne@69	332 /**
jpayne@69	333 * Performs pairwise composition of a & b and returns the composite if there is one.
jpayne@69	334 *
jpayne@69	335 * Returns a composite code point c only if c has a two-way mapping to a+b.
jpayne@69	336 * In standard Unicode normalization, this means that
jpayne@69	337 * c has a canonical decomposition to a+b
jpayne@69	338 * and c does not have the Full_Composition_Exclusion property.
jpayne@69	339 *
jpayne@69	340 * This function is independent of the mode of the Normalizer2.
jpayne@69	341 * The default implementation returns a negative value.
jpayne@69	342 * @param a A (normalization starter) code point.
jpayne@69	343 * @param b Another code point.
jpayne@69	344 * @return The non-negative composite code point if there is one; otherwise a negative value.
jpayne@69	345 * @stable ICU 49
jpayne@69	346 */
jpayne@69	347 virtual UChar32
jpayne@69	348 composePair(UChar32 a, UChar32 b) const;
jpayne@69	349
jpayne@69	350 /**
jpayne@69	351 * Gets the combining class of c.
jpayne@69	352 * The default implementation returns 0,
jpayne@69	353 * but all standard implementations return the Unicode Canonical_Combining_Class value.
jpayne@69	354 * @param c code point
jpayne@69	355 * @return c's combining class
jpayne@69	356 * @stable ICU 49
jpayne@69	357 */
jpayne@69	358 virtual uint8_t
jpayne@69	359 getCombiningClass(UChar32 c) const;
jpayne@69	360
jpayne@69	361 /**
jpayne@69	362 * Tests if the string is normalized.
jpayne@69	363 * Internally, in cases where the quickCheck() method would return "maybe"
jpayne@69	364 * (which is only possible for the two COMPOSE modes) this method
jpayne@69	365 * resolves to "yes" or "no" to provide a definitive result,
jpayne@69	366 * at the cost of doing more work in those cases.
jpayne@69	367 * @param s input string
jpayne@69	368 * @param errorCode Standard ICU error code. Its input value must
jpayne@69	369 * pass the U_SUCCESS() test, or else the function returns
jpayne@69	370 * immediately. Check for U_FAILURE() on output or use with
jpayne@69	371 * function chaining. (See User Guide for details.)
jpayne@69	372 * @return TRUE if s is normalized
jpayne@69	373 * @stable ICU 4.4
jpayne@69	374 */
jpayne@69	375 virtual UBool
jpayne@69	376 isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0;
jpayne@69	377 /**
jpayne@69	378 * Tests if the UTF-8 string is normalized.
jpayne@69	379 * Internally, in cases where the quickCheck() method would return "maybe"
jpayne@69	380 * (which is only possible for the two COMPOSE modes) this method
jpayne@69	381 * resolves to "yes" or "no" to provide a definitive result,
jpayne@69	382 * at the cost of doing more work in those cases.
jpayne@69	383 *
jpayne@69	384 * This works for all normalization modes,
jpayne@69	385 * but it is currently optimized for UTF-8 only for "compose" modes,
jpayne@69	386 * such as for NFC, NFKC, and NFKC_Casefold
jpayne@69	387 * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
jpayne@69	388 * For other modes it currently converts to UTF-16 and calls isNormalized().
jpayne@69	389 *
jpayne@69	390 * @param s UTF-8 input string
jpayne@69	391 * @param errorCode Standard ICU error code. Its input value must
jpayne@69	392 * pass the U_SUCCESS() test, or else the function returns
jpayne@69	393 * immediately. Check for U_FAILURE() on output or use with
jpayne@69	394 * function chaining. (See User Guide for details.)
jpayne@69	395 * @return TRUE if s is normalized
jpayne@69	396 * @stable ICU 60
jpayne@69	397 */
jpayne@69	398 virtual UBool
jpayne@69	399 isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const;
jpayne@69	400
jpayne@69	401
jpayne@69	402 /**
jpayne@69	403 * Tests if the string is normalized.
jpayne@69	404 * For the two COMPOSE modes, the result could be "maybe" in cases that
jpayne@69	405 * would take a little more work to resolve definitively.
jpayne@69	406 * Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster
jpayne@69	407 * combination of quick check + normalization, to avoid
jpayne@69	408 * re-checking the "yes" prefix.
jpayne@69	409 * @param s input string
jpayne@69	410 * @param errorCode Standard ICU error code. Its input value must
jpayne@69	411 * pass the U_SUCCESS() test, or else the function returns
jpayne@69	412 * immediately. Check for U_FAILURE() on output or use with
jpayne@69	413 * function chaining. (See User Guide for details.)
jpayne@69	414 * @return UNormalizationCheckResult
jpayne@69	415 * @stable ICU 4.4
jpayne@69	416 */
jpayne@69	417 virtual UNormalizationCheckResult
jpayne@69	418 quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0;
jpayne@69	419
jpayne@69	420 /**
jpayne@69	421 * Returns the end of the normalized substring of the input string.
jpayne@69	422 * In other words, with <code>end=spanQuickCheckYes(s, ec);</code>
jpayne@69	423 * the substring <code>UnicodeString(s, 0, end)</code>
jpayne@69	424 * will pass the quick check with a "yes" result.
jpayne@69	425 *
jpayne@69	426 * The returned end index is usually one or more characters before the
jpayne@69	427 * "no" or "maybe" character: The end index is at a normalization boundary.
jpayne@69	428 * (See the class documentation for more about normalization boundaries.)
jpayne@69	429 *
jpayne@69	430 * When the goal is a normalized string and most input strings are expected
jpayne@69	431 * to be normalized already, then call this method,
jpayne@69	432 * and if it returns a prefix shorter than the input string,
jpayne@69	433 * copy that prefix and use normalizeSecondAndAppend() for the remainder.
jpayne@69	434 * @param s input string
jpayne@69	435 * @param errorCode Standard ICU error code. Its input value must
jpayne@69	436 * pass the U_SUCCESS() test, or else the function returns
jpayne@69	437 * immediately. Check for U_FAILURE() on output or use with
jpayne@69	438 * function chaining. (See User Guide for details.)
jpayne@69	439 * @return "yes" span end index
jpayne@69	440 * @stable ICU 4.4
jpayne@69	441 */
jpayne@69	442 virtual int32_t
jpayne@69	443 spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0;
jpayne@69	444
jpayne@69	445 /**
jpayne@69	446 * Tests if the character always has a normalization boundary before it,
jpayne@69	447 * regardless of context.
jpayne@69	448 * If true, then the character does not normalization-interact with
jpayne@69	449 * preceding characters.
jpayne@69	450 * In other words, a string containing this character can be normalized
jpayne@69	451 * by processing portions before this character and starting from this
jpayne@69	452 * character independently.
jpayne@69	453 * This is used for iterative normalization. See the class documentation for details.
jpayne@69	454 * @param c character to test
jpayne@69	455 * @return TRUE if c has a normalization boundary before it
jpayne@69	456 * @stable ICU 4.4
jpayne@69	457 */
jpayne@69	458 virtual UBool hasBoundaryBefore(UChar32 c) const = 0;
jpayne@69	459
jpayne@69	460 /**
jpayne@69	461 * Tests if the character always has a normalization boundary after it,
jpayne@69	462 * regardless of context.
jpayne@69	463 * If true, then the character does not normalization-interact with
jpayne@69	464 * following characters.
jpayne@69	465 * In other words, a string containing this character can be normalized
jpayne@69	466 * by processing portions up to this character and after this
jpayne@69	467 * character independently.
jpayne@69	468 * This is used for iterative normalization. See the class documentation for details.
jpayne@69	469 * Note that this operation may be significantly slower than hasBoundaryBefore().
jpayne@69	470 * @param c character to test
jpayne@69	471 * @return TRUE if c has a normalization boundary after it
jpayne@69	472 * @stable ICU 4.4
jpayne@69	473 */
jpayne@69	474 virtual UBool hasBoundaryAfter(UChar32 c) const = 0;
jpayne@69	475
jpayne@69	476 /**
jpayne@69	477 * Tests if the character is normalization-inert.
jpayne@69	478 * If true, then the character does not change, nor normalization-interact with
jpayne@69	479 * preceding or following characters.
jpayne@69	480 * In other words, a string containing this character can be normalized
jpayne@69	481 * by processing portions before this character and after this
jpayne@69	482 * character independently.
jpayne@69	483 * This is used for iterative normalization. See the class documentation for details.
jpayne@69	484 * Note that this operation may be significantly slower than hasBoundaryBefore().
jpayne@69	485 * @param c character to test
jpayne@69	486 * @return TRUE if c is normalization-inert
jpayne@69	487 * @stable ICU 4.4
jpayne@69	488 */
jpayne@69	489 virtual UBool isInert(UChar32 c) const = 0;
jpayne@69	490 };
jpayne@69	491
jpayne@69	492 /**
jpayne@69	493 * Normalization filtered by a UnicodeSet.
jpayne@69	494 * Normalizes portions of the text contained in the filter set and leaves
jpayne@69	495 * portions not contained in the filter set unchanged.
jpayne@69	496 * Filtering is done via UnicodeSet::span(..., USET_SPAN_SIMPLE).
jpayne@69	497 * Not-in-the-filter text is treated as "is normalized" and "quick check yes".
jpayne@69	498 * This class implements all of (and only) the Normalizer2 API.
jpayne@69	499 * An instance of this class is unmodifiable/immutable but is constructed and
jpayne@69	500 * must be destructed by the owner.
jpayne@69	501 * @stable ICU 4.4
jpayne@69	502 */
jpayne@69	503 class U_COMMON_API FilteredNormalizer2 : public Normalizer2 {
jpayne@69	504 public:
jpayne@69	505 /**
jpayne@69	506 * Constructs a filtered normalizer wrapping any Normalizer2 instance
jpayne@69	507 * and a filter set.
jpayne@69	508 * Both are aliased and must not be modified or deleted while this object
jpayne@69	509 * is used.
jpayne@69	510 * The filter set should be frozen; otherwise the performance will suffer greatly.
jpayne@69	511 * @param n2 wrapped Normalizer2 instance
jpayne@69	512 * @param filterSet UnicodeSet which determines the characters to be normalized
jpayne@69	513 * @stable ICU 4.4
jpayne@69	514 */
jpayne@69	515 FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) :
jpayne@69	516 norm2(n2), set(filterSet) {}
jpayne@69	517
jpayne@69	518 /**
jpayne@69	519 * Destructor.
jpayne@69	520 * @stable ICU 4.4
jpayne@69	521 */
jpayne@69	522 ~FilteredNormalizer2();
jpayne@69	523
jpayne@69	524 /**
jpayne@69	525 * Writes the normalized form of the source string to the destination string
jpayne@69	526 * (replacing its contents) and returns the destination string.
jpayne@69	527 * The source and destination strings must be different objects.
jpayne@69	528 * @param src source string
jpayne@69	529 * @param dest destination string; its contents are replaced with normalized src
jpayne@69	530 * @param errorCode Standard ICU error code. Its input value must
jpayne@69	531 * pass the U_SUCCESS() test, or else the function returns
jpayne@69	532 * immediately. Check for U_FAILURE() on output or use with
jpayne@69	533 * function chaining. (See User Guide for details.)
jpayne@69	534 * @return dest
jpayne@69	535 * @stable ICU 4.4
jpayne@69	536 */
jpayne@69	537 virtual UnicodeString &
jpayne@69	538 normalize(const UnicodeString &src,
jpayne@69	539 UnicodeString &dest,
jpayne@69	540 UErrorCode &errorCode) const U_OVERRIDE;
jpayne@69	541
jpayne@69	542 /**
jpayne@69	543 * Normalizes a UTF-8 string and optionally records how source substrings
jpayne@69	544 * relate to changed and unchanged result substrings.
jpayne@69	545 *
jpayne@69	546 * Currently implemented completely only for "compose" modes,
jpayne@69	547 * such as for NFC, NFKC, and NFKC_Casefold
jpayne@69	548 * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
jpayne@69	549 * Otherwise currently converts to & from UTF-16 and does not support edits.
jpayne@69	550 *
jpayne@69	551 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET.
jpayne@69	552 * @param src Source UTF-8 string.
jpayne@69	553 * @param sink A ByteSink to which the normalized UTF-8 result string is written.
jpayne@69	554 * sink.Flush() is called at the end.
jpayne@69	555 * @param edits Records edits for index mapping, working with styled text,
jpayne@69	556 * and getting only changes (if any).
jpayne@69	557 * The Edits contents are undefined if any error occurs.
jpayne@69	558 * This function calls edits->reset() first unless
jpayne@69	559 * options includes U_EDITS_NO_RESET. edits can be nullptr.
jpayne@69	560 * @param errorCode Standard ICU error code. Its input value must
jpayne@69	561 * pass the U_SUCCESS() test, or else the function returns
jpayne@69	562 * immediately. Check for U_FAILURE() on output or use with
jpayne@69	563 * function chaining. (See User Guide for details.)
jpayne@69	564 * @stable ICU 60
jpayne@69	565 */
jpayne@69	566 virtual void
jpayne@69	567 normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink,
jpayne@69	568 Edits *edits, UErrorCode &errorCode) const U_OVERRIDE;
jpayne@69	569
jpayne@69	570 /**
jpayne@69	571 * Appends the normalized form of the second string to the first string
jpayne@69	572 * (merging them at the boundary) and returns the first string.
jpayne@69	573 * The result is normalized if the first string was normalized.
jpayne@69	574 * The first and second strings must be different objects.
jpayne@69	575 * @param first string, should be normalized
jpayne@69	576 * @param second string, will be normalized
jpayne@69	577 * @param errorCode Standard ICU error code. Its input value must
jpayne@69	578 * pass the U_SUCCESS() test, or else the function returns
jpayne@69	579 * immediately. Check for U_FAILURE() on output or use with
jpayne@69	580 * function chaining. (See User Guide for details.)
jpayne@69	581 * @return first
jpayne@69	582 * @stable ICU 4.4
jpayne@69	583 */
jpayne@69	584 virtual UnicodeString &
jpayne@69	585 normalizeSecondAndAppend(UnicodeString &first,
jpayne@69	586 const UnicodeString &second,
jpayne@69	587 UErrorCode &errorCode) const U_OVERRIDE;
jpayne@69	588 /**
jpayne@69	589 * Appends the second string to the first string
jpayne@69	590 * (merging them at the boundary) and returns the first string.
jpayne@69	591 * The result is normalized if both strings were normalized.
jpayne@69	592 * The first and second strings must be different objects.
jpayne@69	593 * @param first Destination string: it is appended to and returned. It should already be normalized.
jpayne@69	594 * @param second Source string (const, not modified) that is appended to first; it should already be normalized.
jpayne@69	595 * @param errorCode Standard ICU error code. Its input value must
jpayne@69	596 * pass the U_SUCCESS() test, or else the function returns
jpayne@69	597 * immediately. Check for U_FAILURE() on output or use with
jpayne@69	598 * function chaining. (See User Guide for details.)
jpayne@69	599 * @return first (the same object that was passed in)
jpayne@69	600 * @stable ICU 4.4
jpayne@69	601 */
jpayne@69	602 virtual UnicodeString &
jpayne@69	603 append(UnicodeString &first,
jpayne@69	604 const UnicodeString &second,
jpayne@69	605 UErrorCode &errorCode) const U_OVERRIDE;
jpayne@69	606
jpayne@69	607 /**
jpayne@69	608 * Gets the decomposition mapping of c.
jpayne@69	609 * For details see the base class documentation.
jpayne@69	610 *
jpayne@69	611 * This function is independent of the mode of the Normalizer2.
jpayne@69	612 * @param c code point
jpayne@69	613 * @param decomposition Output parameter: String object which will be set to c's
jpayne@69	614 * decomposition mapping, if there is one.
jpayne@69	615 * @return TRUE if c has a decomposition, otherwise FALSE
jpayne@69	616 * @stable ICU 4.6
jpayne@69	617 */
jpayne@69	618 virtual UBool
jpayne@69	619 getDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE;
jpayne@69	620
jpayne@69	621 /**
jpayne@69	622 * Gets the raw decomposition mapping of c.
jpayne@69	623 * For details see the base class documentation.
jpayne@69	624 *
jpayne@69	625 * This function is independent of the mode of the Normalizer2.
jpayne@69	626 * @param c code point
jpayne@69	627 * @param decomposition Output parameter: String object which will be set to c's
jpayne@69	628 * raw decomposition mapping, if there is one.
jpayne@69	629 * @return TRUE if c has a raw decomposition, otherwise FALSE
jpayne@69	630 * @stable ICU 49
jpayne@69	631 */
jpayne@69	632 virtual UBool
jpayne@69	633 getRawDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE;
jpayne@69	634
jpayne@69	635 /**
jpayne@69	636 * Performs pairwise composition of a and b and returns the composite if there is one.
jpayne@69	637 * For details see the base class documentation.
jpayne@69	638 *
jpayne@69	639 * This function is independent of the mode of the Normalizer2.
jpayne@69	640 * @param a A (normalization starter) code point.
jpayne@69	641 * @param b Another code point.
jpayne@69	642 * @return The non-negative composite code point if there is one; otherwise a negative value.
jpayne@69	643 * @stable ICU 49
jpayne@69	644 */
jpayne@69	645 virtual UChar32
jpayne@69	646 composePair(UChar32 a, UChar32 b) const U_OVERRIDE;
jpayne@69	647
jpayne@69	648 /**
jpayne@69	649 * Gets the combining class of c.
jpayne@69	650 * The default implementation returns 0
jpayne@69	651 * but all standard implementations return the Unicode Canonical_Combining_Class value.
jpayne@69	652 * @param c code point
jpayne@69	653 * @return c's combining class. NOTE(review): this declaration is an override (U_OVERRIDE), yet the "default implementation" sentence in this comment reads like base-class text; confirm in the implementation what this override actually returns.
jpayne@69	654 * @stable ICU 49
jpayne@69	655 */
jpayne@69	656 virtual uint8_t
jpayne@69	657 getCombiningClass(UChar32 c) const U_OVERRIDE;
jpayne@69	658
jpayne@69	659 /**
jpayne@69	660 * Tests if the string is normalized.
jpayne@69	661 * For details see the Normalizer2 base class documentation.
jpayne@69	662 * @param s input string (const, not modified); for UTF-8 input see isNormalizedUTF8()
jpayne@69	663 * @param errorCode Standard ICU error code. Its input value must
jpayne@69	664 * pass the U_SUCCESS() test, or else the function returns
jpayne@69	665 * immediately. Check for U_FAILURE() on output or use with
jpayne@69	666 * function chaining. (See User Guide for details.)
jpayne@69	667 * @return TRUE if s is normalized
jpayne@69	668 * @stable ICU 4.4
jpayne@69	669 */
jpayne@69	670 virtual UBool
jpayne@69	671 isNormalized(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;
jpayne@69	672 /**
jpayne@69	673 * Tests if the UTF-8 string is normalized.
jpayne@69	674 * Internally, in cases where the quickCheck() method would return "maybe"
jpayne@69	675 * (which is only possible for the two COMPOSE modes) this method
jpayne@69	676 * resolves to "yes" or "no" to provide a definitive result,
jpayne@69	677 * at the cost of doing more work in those cases.
jpayne@69	678 *
jpayne@69	679 * This works for all normalization modes,
jpayne@69	680 * but it is currently optimized for UTF-8 only for "compose" modes,
jpayne@69	681 * such as for NFC, NFKC, and NFKC_Casefold
jpayne@69	682 * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS).
jpayne@69	683 * For other modes it currently converts to UTF-16 and calls isNormalized().
jpayne@69	684 *
jpayne@69	685 * @param s UTF-8 input string (passed by value as a StringPiece)
jpayne@69	686 * @param errorCode Standard ICU error code. Its input value must
jpayne@69	687 * pass the U_SUCCESS() test, or else the function returns
jpayne@69	688 * immediately. Check for U_FAILURE() on output or use with
jpayne@69	689 * function chaining. (See User Guide for details.)
jpayne@69	690 * @return TRUE if s is normalized
jpayne@69	691 * @stable ICU 60
jpayne@69	692 */
jpayne@69	693 virtual UBool
jpayne@69	694 isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const U_OVERRIDE;
jpayne@69	695 /**
jpayne@69	696 * Tests if the string is normalized.
jpayne@69	697 * For details see the Normalizer2 base class documentation.
jpayne@69	698 * @param s input string (const, not modified)
jpayne@69	699 * @param errorCode Standard ICU error code. Its input value must
jpayne@69	700 * pass the U_SUCCESS() test, or else the function returns
jpayne@69	701 * immediately. Check for U_FAILURE() on output or use with
jpayne@69	702 * function chaining. (See User Guide for details.)
jpayne@69	703 * @return The UNormalizationCheckResult for s (see the Normalizer2 base class documentation for the possible values)
jpayne@69	704 * @stable ICU 4.4
jpayne@69	705 */
jpayne@69	706 virtual UNormalizationCheckResult
jpayne@69	707 quickCheck(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;
jpayne@69	708 /**
jpayne@69	709 * Returns the end of the normalized substring of the input string.
jpayne@69	710 * For details see the Normalizer2 base class documentation.
jpayne@69	711 * @param s input string (const, not modified)
jpayne@69	712 * @param errorCode Standard ICU error code. Its input value must
jpayne@69	713 * pass the U_SUCCESS() test, or else the function returns
jpayne@69	714 * immediately. Check for U_FAILURE() on output or use with
jpayne@69	715 * function chaining. (See User Guide for details.)
jpayne@69	716 * @return End index of the normalized ("yes") substring of s
jpayne@69	717 * @stable ICU 4.4
jpayne@69	718 */
jpayne@69	719 virtual int32_t
jpayne@69	720 spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;
jpayne@69	721
jpayne@69	722 /**
jpayne@69	723 * Tests if the character always has a normalization boundary before it,
jpayne@69	724 * regardless of context.
jpayne@69	725 * For details see the Normalizer2 base class documentation.
jpayne@69	726 * @param c code point to test
jpayne@69	727 * @return TRUE if c has a normalization boundary before it
jpayne@69	728 * @stable ICU 4.4
jpayne@69	729 */
jpayne@69	730 virtual UBool hasBoundaryBefore(UChar32 c) const U_OVERRIDE;
jpayne@69	731
jpayne@69	732 /**
jpayne@69	733 * Tests if the character always has a normalization boundary after it,
jpayne@69	734 * regardless of context.
jpayne@69	735 * For details see the Normalizer2 base class documentation.
jpayne@69	736 * @param c code point to test
jpayne@69	737 * @return TRUE if c has a normalization boundary after it
jpayne@69	738 * @stable ICU 4.4
jpayne@69	739 */
jpayne@69	740 virtual UBool hasBoundaryAfter(UChar32 c) const U_OVERRIDE;
jpayne@69	741
jpayne@69	742 /**
jpayne@69	743 * Tests if the character is normalization-inert.
jpayne@69	744 * For details see the Normalizer2 base class documentation.
jpayne@69	745 * @param c code point to test
jpayne@69	746 * @return TRUE if c is normalization-inert
jpayne@69	747 * @stable ICU 4.4
jpayne@69	748 */
jpayne@69	749 virtual UBool isInert(UChar32 c) const U_OVERRIDE;
jpayne@69	750 private:
jpayne@69	751 UnicodeString &  // Private normalize() helper that takes an explicit USetSpanCondition and returns a UnicodeString reference.
jpayne@69	752 normalize(const UnicodeString &src,  // const reference: the source string is not modified
jpayne@69	753 UnicodeString &dest,  // non-const reference (writable); NOTE(review): presumably receives the result and is the reference returned -- confirm in the implementation
jpayne@69	754 USetSpanCondition spanCondition,  // NOTE(review): presumably the span condition to start with when splitting src against the UnicodeSet member -- confirm
jpayne@69	755 UErrorCode &errorCode) const;  // UErrorCode reference, like the public methods of this class
jpayne@69	756
jpayne@69	757 void  // Private overload of normalizeUTF8(): takes a char pointer plus length instead of a StringPiece, and an explicit USetSpanCondition.
jpayne@69	758 normalizeUTF8(uint32_t options, const char *src, int32_t length,  // NOTE(review): src/length presumably delimit the UTF-8 input bytes -- confirm in the implementation
jpayne@69	759 ByteSink &sink, Edits *edits,  // same ByteSink & / Edits * parameter types as the public StringPiece overload
jpayne@69	760 USetSpanCondition spanCondition,  // NOTE(review): presumably the span condition to start with when splitting src against the UnicodeSet member -- confirm
jpayne@69	761 UErrorCode &errorCode) const;  // UErrorCode reference, like the public methods of this class
jpayne@69	762
jpayne@69	763 UnicodeString &  // Private overload of normalizeSecondAndAppend() with an extra UBool doNormalize flag.
jpayne@69	764 normalizeSecondAndAppend(UnicodeString &first,  // non-const reference (writable), as in the public overload
jpayne@69	765 const UnicodeString &second,  // const reference: not modified
jpayne@69	766 UBool doNormalize,  // NOTE(review): presumably TRUE for the public normalizeSecondAndAppend() and FALSE for append() -- confirm in the implementation
jpayne@69	767 UErrorCode &errorCode) const;  // UErrorCode reference, like the public methods of this class
jpayne@69	768
jpayne@69	769 const Normalizer2 &norm2;  // Reference member (no copy is stored), so the referenced Normalizer2 must outlive this object. NOTE(review): presumably the normalizer being wrapped -- confirm against the constructor.
jpayne@69	770 const UnicodeSet &set;  // Reference member (no copy is stored), so the referenced UnicodeSet must outlive this object. NOTE(review): presumably the filter set -- confirm against the constructor.
jpayne@69	771 };
jpayne@69	772
jpayne@69	773 U_NAMESPACE_END
jpayne@69	774
jpayne@69	775 #endif // !UCONFIG_NO_NORMALIZATION
jpayne@69	776
jpayne@69	777 #endif /* U_SHOW_CPLUSPLUS_API */
jpayne@69	778
jpayne@69	779 #endif // __NORMALIZER2_H__

Mercurial > repos > rliterman > csp2

annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/include/unicode/normalizer2.h @ 69:33d812a61356