Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/include/unicode/normalizer2.h @ 69:33d812a61356
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 17:55:14 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 69:33d812a61356 |
---|---|
1 // © 2016 and later: Unicode, Inc. and others. | |
2 // License & terms of use: http://www.unicode.org/copyright.html | |
3 /* | |
4 ******************************************************************************* | |
5 * | |
6 * Copyright (C) 2009-2013, International Business Machines | |
7 * Corporation and others. All Rights Reserved. | |
8 * | |
9 ******************************************************************************* | |
10 * file name: normalizer2.h | |
11 * encoding: UTF-8 | |
12 * tab size: 8 (not used) | |
13 * indentation:4 | |
14 * | |
15 * created on: 2009nov22 | |
16 * created by: Markus W. Scherer | |
17 */ | |
18 | |
19 #ifndef __NORMALIZER2_H__ | |
20 #define __NORMALIZER2_H__ | |
21 | |
22 /** | |
23 * \file | |
24 * \brief C++ API: New API for Unicode Normalization. | |
25 */ | |
26 | |
27 #include "unicode/utypes.h" | |
28 | |
29 #if U_SHOW_CPLUSPLUS_API | |
30 | |
31 #if !UCONFIG_NO_NORMALIZATION | |
32 | |
33 #include "unicode/stringpiece.h" | |
34 #include "unicode/uniset.h" | |
35 #include "unicode/unistr.h" | |
36 #include "unicode/unorm2.h" | |
37 | |
38 U_NAMESPACE_BEGIN | |
39 | |
40 class ByteSink; | |
41 | |
42 /** | |
43 * Unicode normalization functionality for standard Unicode normalization or | |
44 * for using custom mapping tables. | |
45 * All instances of this class are unmodifiable/immutable. | |
46 * Instances returned by getInstance() are singletons that must not be deleted by the caller. | |
47 * The Normalizer2 class is not intended for public subclassing. | |
48 * | |
49 * The primary functions are to produce a normalized string and to detect whether | |
50 * a string is already normalized. | |
51 * The most commonly used normalization forms are those defined in | |
52 * http://www.unicode.org/unicode/reports/tr15/ | |
53 * However, this API supports additional normalization forms for specialized purposes. | |
54 * For example, NFKC_Casefold is provided via getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, errorCode) | |
55 * and can be used in implementations of UTS #46. | |
56 * | |
57 * Not only are the standard compose and decompose modes supplied, | |
58 * but additional modes are provided as documented in the UNormalization2Mode enum. | |
59 * | |
60 * Some of the functions in this class identify normalization boundaries. | |
61 * At a normalization boundary, the portions of the string | |
62 * before it and starting from it do not interact and can be handled independently. | |
63 * | |
64 * The spanQuickCheckYes() stops at a normalization boundary. | |
65 * When the goal is a normalized string, then the text before the boundary | |
66 * can be copied, and the remainder can be processed with normalizeSecondAndAppend(). | |
67 * | |
68 * The hasBoundaryBefore(), hasBoundaryAfter() and isInert() functions test whether | |
69 * a character is guaranteed to be at a normalization boundary, | |
70 * regardless of context. | |
71 * This is used for moving from one normalization boundary to the next | |
72 * or preceding boundary, and for performing iterative normalization. | |
73 * | |
74 * Iterative normalization is useful when only a small portion of a | |
75 * longer string needs to be processed. | |
76 * For example, in ICU, iterative normalization is used by the NormalizationTransliterator | |
77 * (to avoid replacing already-normalized text) and ucol_nextSortKeyPart() | |
78 * (to process only the substring for which sort key bytes are computed). | |
79 * | |
80 * The set of normalization boundaries returned by these functions may not be | |
81 * complete: There may be more boundaries that could be returned. | |
82 * Different functions may return different boundaries. | |
83 * @stable ICU 4.4 | |
84 */ | |
85 class U_COMMON_API Normalizer2 : public UObject { | |
86 public: | |
87 /** | |
88 * Destructor. | |
89 * @stable ICU 4.4 | |
90 */ | |
91 ~Normalizer2(); | |
92 | |
93 /** | |
94 * Returns a Normalizer2 instance for Unicode NFC normalization. | |
95 * Same as getInstance(NULL, "nfc", UNORM2_COMPOSE, errorCode). | |
96 * Returns an unmodifiable singleton instance. Do not delete it. | |
97 * @param errorCode Standard ICU error code. Its input value must | |
98 * pass the U_SUCCESS() test, or else the function returns | |
99 * immediately. Check for U_FAILURE() on output or use with | |
100 * function chaining. (See User Guide for details.) | |
101 * @return the requested Normalizer2, if successful | |
102 * @stable ICU 49 | |
103 */ | |
104 static const Normalizer2 * | |
105 getNFCInstance(UErrorCode &errorCode); | |
106 | |
107 /** | |
108 * Returns a Normalizer2 instance for Unicode NFD normalization. | |
109 * Same as getInstance(NULL, "nfc", UNORM2_DECOMPOSE, errorCode). | |
110 * Returns an unmodifiable singleton instance. Do not delete it. | |
111 * @param errorCode Standard ICU error code. Its input value must | |
112 * pass the U_SUCCESS() test, or else the function returns | |
113 * immediately. Check for U_FAILURE() on output or use with | |
114 * function chaining. (See User Guide for details.) | |
115 * @return the requested Normalizer2, if successful | |
116 * @stable ICU 49 | |
117 */ | |
118 static const Normalizer2 * | |
119 getNFDInstance(UErrorCode &errorCode); | |
120 | |
121 /** | |
122 * Returns a Normalizer2 instance for Unicode NFKC normalization. | |
123 * Same as getInstance(NULL, "nfkc", UNORM2_COMPOSE, errorCode). | |
124 * Returns an unmodifiable singleton instance. Do not delete it. | |
125 * @param errorCode Standard ICU error code. Its input value must | |
126 * pass the U_SUCCESS() test, or else the function returns | |
127 * immediately. Check for U_FAILURE() on output or use with | |
128 * function chaining. (See User Guide for details.) | |
129 * @return the requested Normalizer2, if successful | |
130 * @stable ICU 49 | |
131 */ | |
132 static const Normalizer2 * | |
133 getNFKCInstance(UErrorCode &errorCode); | |
134 | |
135 /** | |
136 * Returns a Normalizer2 instance for Unicode NFKD normalization. | |
137 * Same as getInstance(NULL, "nfkc", UNORM2_DECOMPOSE, errorCode). | |
138 * Returns an unmodifiable singleton instance. Do not delete it. | |
139 * @param errorCode Standard ICU error code. Its input value must | |
140 * pass the U_SUCCESS() test, or else the function returns | |
141 * immediately. Check for U_FAILURE() on output or use with | |
142 * function chaining. (See User Guide for details.) | |
143 * @return the requested Normalizer2, if successful | |
144 * @stable ICU 49 | |
145 */ | |
146 static const Normalizer2 * | |
147 getNFKDInstance(UErrorCode &errorCode); | |
148 | |
149 /** | |
150 * Returns a Normalizer2 instance for Unicode NFKC_Casefold normalization. | |
151 * Same as getInstance(NULL, "nfkc_cf", UNORM2_COMPOSE, errorCode). | |
152 * Returns an unmodifiable singleton instance. Do not delete it. | |
153 * @param errorCode Standard ICU error code. Its input value must | |
154 * pass the U_SUCCESS() test, or else the function returns | |
155 * immediately. Check for U_FAILURE() on output or use with | |
156 * function chaining. (See User Guide for details.) | |
157 * @return the requested Normalizer2, if successful | |
158 * @stable ICU 49 | |
159 */ | |
160 static const Normalizer2 * | |
161 getNFKCCasefoldInstance(UErrorCode &errorCode); | |
162 | |
163 /** | |
164 * Returns a Normalizer2 instance which uses the specified data file | |
165 * (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle) | |
166 * and which composes or decomposes text according to the specified mode. | |
167 * Returns an unmodifiable singleton instance. Do not delete it. | |
168 * | |
169 * Use packageName=NULL for data files that are part of ICU's own data. | |
170 * Use name="nfc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFC/NFD. | |
171 * Use name="nfkc" and UNORM2_COMPOSE/UNORM2_DECOMPOSE for Unicode standard NFKC/NFKD. | |
172 * Use name="nfkc_cf" and UNORM2_COMPOSE for Unicode standard NFKC_CF=NFKC_Casefold. | |
173 * | |
174 * @param packageName NULL for ICU built-in data, otherwise application data package name | |
175 * @param name "nfc" or "nfkc" or "nfkc_cf" or name of custom data file | |
176 * @param mode normalization mode (compose or decompose etc.) | |
177 * @param errorCode Standard ICU error code. Its input value must | |
178 * pass the U_SUCCESS() test, or else the function returns | |
179 * immediately. Check for U_FAILURE() on output or use with | |
180 * function chaining. (See User Guide for details.) | |
181 * @return the requested Normalizer2, if successful | |
182 * @stable ICU 4.4 | |
183 */ | |
184 static const Normalizer2 * | |
185 getInstance(const char *packageName, | |
186 const char *name, | |
187 UNormalization2Mode mode, | |
188 UErrorCode &errorCode); | |
189 | |
190 /** | |
191 * Returns the normalized form of the source string. | |
192 * @param src source string | |
193 * @param errorCode Standard ICU error code. Its input value must | |
194 * pass the U_SUCCESS() test, or else the function returns | |
195 * immediately. Check for U_FAILURE() on output or use with | |
196 * function chaining. (See User Guide for details.) | |
197 * @return normalized src | |
198 * @stable ICU 4.4 | |
199 */ | |
200 UnicodeString | |
201 normalize(const UnicodeString &src, UErrorCode &errorCode) const { | |
202 UnicodeString result; /* fresh destination object, distinct from src as the three-argument overload requires */ | |
203 normalize(src, result, errorCode); /* forwards to the pure virtual normalize(src, dest, errorCode) */ | |
204 return result; /* handed back to the caller by value */ | |
205 } | |
206 /** | |
207 * Writes the normalized form of the source string to the destination string | |
208 * (replacing its contents) and returns the destination string. | |
209 * The source and destination strings must be different objects. | |
210 * @param src source string | |
211 * @param dest destination string; its contents are replaced with normalized src | |
212 * @param errorCode Standard ICU error code. Its input value must | |
213 * pass the U_SUCCESS() test, or else the function returns | |
214 * immediately. Check for U_FAILURE() on output or use with | |
215 * function chaining. (See User Guide for details.) | |
216 * @return dest | |
217 * @stable ICU 4.4 | |
218 */ | |
219 virtual UnicodeString & | |
220 normalize(const UnicodeString &src, | |
221 UnicodeString &dest, | |
222 UErrorCode &errorCode) const = 0; | |
223 | |
224 /** | |
225 * Normalizes a UTF-8 string and optionally records how source substrings | |
226 * relate to changed and unchanged result substrings. | |
227 * | |
228 * Currently implemented completely only for "compose" modes, | |
229 * such as for NFC, NFKC, and NFKC_Casefold | |
230 * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS). | |
231 * Otherwise currently converts to & from UTF-16 and does not support edits. | |
232 * | |
233 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET. | |
234 * @param src Source UTF-8 string. | |
235 * @param sink A ByteSink to which the normalized UTF-8 result string is written. | |
236 * sink.Flush() is called at the end. | |
237 * @param edits Records edits for index mapping, working with styled text, | |
238 * and getting only changes (if any). | |
239 * The contents of the Edits object are undefined if any error occurs. | |
240 * This function calls edits->reset() first unless | |
241 * options includes U_EDITS_NO_RESET. edits can be nullptr. | |
242 * @param errorCode Standard ICU error code. Its input value must | |
243 * pass the U_SUCCESS() test, or else the function returns | |
244 * immediately. Check for U_FAILURE() on output or use with | |
245 * function chaining. (See User Guide for details.) | |
246 * @stable ICU 60 | |
247 */ | |
248 virtual void | |
249 normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink, | |
250 Edits *edits, UErrorCode &errorCode) const; | |
251 | |
252 /** | |
253 * Appends the normalized form of the second string to the first string | |
254 * (merging them at the boundary) and returns the first string. | |
255 * The result is normalized if the first string was normalized. | |
256 * The first and second strings must be different objects. | |
257 * @param first string, should be normalized | |
258 * @param second string, will be normalized | |
259 * @param errorCode Standard ICU error code. Its input value must | |
260 * pass the U_SUCCESS() test, or else the function returns | |
261 * immediately. Check for U_FAILURE() on output or use with | |
262 * function chaining. (See User Guide for details.) | |
263 * @return first | |
264 * @stable ICU 4.4 | |
265 */ | |
266 virtual UnicodeString & | |
267 normalizeSecondAndAppend(UnicodeString &first, | |
268 const UnicodeString &second, | |
269 UErrorCode &errorCode) const = 0; | |
270 /** | |
271 * Appends the second string to the first string | |
272 * (merging them at the boundary) and returns the first string. | |
273 * The result is normalized if both the strings were normalized. | |
274 * The first and second strings must be different objects. | |
275 * @param first string, should be normalized | |
276 * @param second string, should be normalized | |
277 * @param errorCode Standard ICU error code. Its input value must | |
278 * pass the U_SUCCESS() test, or else the function returns | |
279 * immediately. Check for U_FAILURE() on output or use with | |
280 * function chaining. (See User Guide for details.) | |
281 * @return first | |
282 * @stable ICU 4.4 | |
283 */ | |
284 virtual UnicodeString & | |
285 append(UnicodeString &first, | |
286 const UnicodeString &second, | |
287 UErrorCode &errorCode) const = 0; | |
288 | |
289 /** | |
290 * Gets the decomposition mapping of c. | |
291 * Roughly equivalent to normalizing the String form of c | |
292 * on a UNORM2_DECOMPOSE Normalizer2 instance, but much faster, except that this function | |
293 * returns FALSE and does not write a string | |
294 * if c does not have a decomposition mapping in this instance's data. | |
295 * This function is independent of the mode of the Normalizer2. | |
296 * @param c code point | |
297 * @param decomposition String object which will be set to c's | |
298 * decomposition mapping, if there is one. | |
299 * @return TRUE if c has a decomposition, otherwise FALSE | |
300 * @stable ICU 4.6 | |
301 */ | |
302 virtual UBool | |
303 getDecomposition(UChar32 c, UnicodeString &decomposition) const = 0; | |
304 | |
305 /** | |
306 * Gets the raw decomposition mapping of c. | |
307 * | |
308 * This is similar to the getDecomposition() method but returns the | |
309 * raw decomposition mapping as specified in UnicodeData.txt or | |
310 * (for custom data) in the mapping files processed by the gennorm2 tool. | |
311 * By contrast, getDecomposition() returns the processed, | |
312 * recursively-decomposed version of this mapping. | |
313 * | |
314 * When used on a standard NFKC Normalizer2 instance, | |
315 * getRawDecomposition() returns the Unicode Decomposition_Mapping (dm) property. | |
316 * | |
317 * When used on a standard NFC Normalizer2 instance, | |
318 * it returns the Decomposition_Mapping only if the Decomposition_Type (dt) is Canonical (Can); | |
319 * in this case, the result contains either one or two code points (=1..4 char16_ts). | |
320 * | |
321 * This function is independent of the mode of the Normalizer2. | |
322 * The default implementation returns FALSE. | |
323 * @param c code point | |
324 * @param decomposition String object which will be set to c's | |
325 * raw decomposition mapping, if there is one. | |
326 * @return TRUE if c has a decomposition, otherwise FALSE | |
327 * @stable ICU 49 | |
328 */ | |
329 virtual UBool | |
330 getRawDecomposition(UChar32 c, UnicodeString &decomposition) const; | |
331 | |
332 /** | |
333 * Performs pairwise composition of a & b and returns the composite if there is one. | |
334 * | |
335 * Returns a composite code point c only if c has a two-way mapping to a+b. | |
336 * In standard Unicode normalization, this means that | |
337 * c has a canonical decomposition to a+b | |
338 * and c does not have the Full_Composition_Exclusion property. | |
339 * | |
340 * This function is independent of the mode of the Normalizer2. | |
341 * The default implementation returns a negative value. | |
342 * @param a A (normalization starter) code point. | |
343 * @param b Another code point. | |
344 * @return The non-negative composite code point if there is one; otherwise a negative value. | |
345 * @stable ICU 49 | |
346 */ | |
347 virtual UChar32 | |
348 composePair(UChar32 a, UChar32 b) const; | |
349 | |
350 /** | |
351 * Gets the combining class of c. | |
352 * The default implementation returns 0 | |
353 * but all standard implementations return the Unicode Canonical_Combining_Class value. | |
354 * @param c code point | |
355 * @return c's combining class | |
356 * @stable ICU 49 | |
357 */ | |
358 virtual uint8_t | |
359 getCombiningClass(UChar32 c) const; | |
360 | |
361 /** | |
362 * Tests if the string is normalized. | |
363 * Internally, in cases where the quickCheck() method would return "maybe" | |
364 * (which is only possible for the two COMPOSE modes) this method | |
365 * resolves to "yes" or "no" to provide a definitive result, | |
366 * at the cost of doing more work in those cases. | |
367 * @param s input string | |
368 * @param errorCode Standard ICU error code. Its input value must | |
369 * pass the U_SUCCESS() test, or else the function returns | |
370 * immediately. Check for U_FAILURE() on output or use with | |
371 * function chaining. (See User Guide for details.) | |
372 * @return TRUE if s is normalized | |
373 * @stable ICU 4.4 | |
374 */ | |
375 virtual UBool | |
376 isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0; | |
377 /** | |
378 * Tests if the UTF-8 string is normalized. | |
379 * Internally, in cases where the quickCheck() method would return "maybe" | |
380 * (which is only possible for the two COMPOSE modes) this method | |
381 * resolves to "yes" or "no" to provide a definitive result, | |
382 * at the cost of doing more work in those cases. | |
383 * | |
384 * This works for all normalization modes, | |
385 * but it is currently optimized for UTF-8 only for "compose" modes, | |
386 * such as for NFC, NFKC, and NFKC_Casefold | |
387 * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS). | |
388 * For other modes it currently converts to UTF-16 and calls isNormalized(). | |
389 * | |
390 * @param s UTF-8 input string | |
391 * @param errorCode Standard ICU error code. Its input value must | |
392 * pass the U_SUCCESS() test, or else the function returns | |
393 * immediately. Check for U_FAILURE() on output or use with | |
394 * function chaining. (See User Guide for details.) | |
395 * @return TRUE if s is normalized | |
396 * @stable ICU 60 | |
397 */ | |
398 virtual UBool | |
399 isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const; | |
400 | |
401 | |
402 /** | |
403 * Tests if the string is normalized. | |
404 * For the two COMPOSE modes, the result could be "maybe" in cases that | |
405 * would take a little more work to resolve definitively. | |
406 * Use spanQuickCheckYes() and normalizeSecondAndAppend() for a faster | |
407 * combination of quick check + normalization, to avoid | |
408 * re-checking the "yes" prefix. | |
409 * @param s input string | |
410 * @param errorCode Standard ICU error code. Its input value must | |
411 * pass the U_SUCCESS() test, or else the function returns | |
412 * immediately. Check for U_FAILURE() on output or use with | |
413 * function chaining. (See User Guide for details.) | |
414 * @return UNormalizationCheckResult | |
415 * @stable ICU 4.4 | |
416 */ | |
417 virtual UNormalizationCheckResult | |
418 quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0; | |
419 | |
420 /** | |
421 * Returns the end of the normalized substring of the input string. | |
422 * In other words, with <code>end=spanQuickCheckYes(s, ec);</code> | |
423 * the substring <code>UnicodeString(s, 0, end)</code> | |
424 * will pass the quick check with a "yes" result. | |
425 * | |
426 * The returned end index is usually one or more characters before the | |
427 * "no" or "maybe" character: The end index is at a normalization boundary. | |
428 * (See the class documentation for more about normalization boundaries.) | |
429 * | |
430 * When the goal is a normalized string and most input strings are expected | |
431 * to be normalized already, then call this method, | |
432 * and if it returns a prefix shorter than the input string, | |
433 * copy that prefix and use normalizeSecondAndAppend() for the remainder. | |
434 * @param s input string | |
435 * @param errorCode Standard ICU error code. Its input value must | |
436 * pass the U_SUCCESS() test, or else the function returns | |
437 * immediately. Check for U_FAILURE() on output or use with | |
438 * function chaining. (See User Guide for details.) | |
439 * @return "yes" span end index | |
440 * @stable ICU 4.4 | |
441 */ | |
442 virtual int32_t | |
443 spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0; | |
444 | |
445 /** | |
446 * Tests if the character always has a normalization boundary before it, | |
447 * regardless of context. | |
448 * If true, then the character does not normalization-interact with | |
449 * preceding characters. | |
450 * In other words, a string containing this character can be normalized | |
451 * by processing portions before this character and starting from this | |
452 * character independently. | |
453 * This is used for iterative normalization. See the class documentation for details. | |
454 * @param c character to test | |
455 * @return TRUE if c has a normalization boundary before it | |
456 * @stable ICU 4.4 | |
457 */ | |
458 virtual UBool hasBoundaryBefore(UChar32 c) const = 0; | |
459 | |
460 /** | |
461 * Tests if the character always has a normalization boundary after it, | |
462 * regardless of context. | |
463 * If true, then the character does not normalization-interact with | |
464 * following characters. | |
465 * In other words, a string containing this character can be normalized | |
466 * by processing portions up to this character and after this | |
467 * character independently. | |
468 * This is used for iterative normalization. See the class documentation for details. | |
469 * Note that this operation may be significantly slower than hasBoundaryBefore(). | |
470 * @param c character to test | |
471 * @return TRUE if c has a normalization boundary after it | |
472 * @stable ICU 4.4 | |
473 */ | |
474 virtual UBool hasBoundaryAfter(UChar32 c) const = 0; | |
475 | |
476 /** | |
477 * Tests if the character is normalization-inert. | |
478 * If true, then the character does not change, nor normalization-interact with | |
479 * preceding or following characters. | |
480 * In other words, a string containing this character can be normalized | |
481 * by processing portions before this character and after this | |
482 * character independently. | |
483 * This is used for iterative normalization. See the class documentation for details. | |
484 * Note that this operation may be significantly slower than hasBoundaryBefore(). | |
485 * @param c character to test | |
486 * @return TRUE if c is normalization-inert | |
487 * @stable ICU 4.4 | |
488 */ | |
489 virtual UBool isInert(UChar32 c) const = 0; | |
490 }; | |
491 | |
492 /** | |
493 * Normalization filtered by a UnicodeSet. | |
494 * Normalizes portions of the text contained in the filter set and leaves | |
495 * portions not contained in the filter set unchanged. | |
496 * Filtering is done via UnicodeSet::span(..., USET_SPAN_SIMPLE). | |
497 * Not-in-the-filter text is treated as "is normalized" and "quick check yes". | |
498 * This class implements all of (and only) the Normalizer2 API. | |
499 * An instance of this class is unmodifiable/immutable but is constructed and | |
500 * must be destructed by the owner. | |
501 * @stable ICU 4.4 | |
502 */ | |
503 class U_COMMON_API FilteredNormalizer2 : public Normalizer2 { | |
504 public: | |
505 /** | |
506 * Constructs a filtered normalizer wrapping any Normalizer2 instance | |
507 * and a filter set. | |
508 * Both are aliased and must not be modified or deleted while this object | |
509 * is used. | |
510 * The filter set should be frozen; otherwise the performance will suffer greatly. | |
511 * @param n2 wrapped Normalizer2 instance | |
512 * @param filterSet UnicodeSet which determines the characters to be normalized | |
513 * @stable ICU 4.4 | |
514 */ | |
515 FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet) : | |
516 norm2(n2), set(filterSet) {} | |
517 | |
518 /** | |
519 * Destructor. | |
520 * @stable ICU 4.4 | |
521 */ | |
522 ~FilteredNormalizer2(); | |
523 | |
524 /** | |
525 * Writes the normalized form of the source string to the destination string | |
526 * (replacing its contents) and returns the destination string. | |
527 * The source and destination strings must be different objects. | |
528 * @param src source string | |
529 * @param dest destination string; its contents are replaced with normalized src | |
530 * @param errorCode Standard ICU error code. Its input value must | |
531 * pass the U_SUCCESS() test, or else the function returns | |
532 * immediately. Check for U_FAILURE() on output or use with | |
533 * function chaining. (See User Guide for details.) | |
534 * @return dest | |
535 * @stable ICU 4.4 | |
536 */ | |
537 virtual UnicodeString & | |
538 normalize(const UnicodeString &src, | |
539 UnicodeString &dest, | |
540 UErrorCode &errorCode) const U_OVERRIDE; | |
541 | |
542 /** | |
543 * Normalizes a UTF-8 string and optionally records how source substrings | |
544 * relate to changed and unchanged result substrings. | |
545 * | |
546 * Currently implemented completely only for "compose" modes, | |
547 * such as for NFC, NFKC, and NFKC_Casefold | |
548 * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS). | |
549 * Otherwise currently converts to & from UTF-16 and does not support edits. | |
550 * | |
551 * @param options Options bit set, usually 0. See U_OMIT_UNCHANGED_TEXT and U_EDITS_NO_RESET. | |
552 * @param src Source UTF-8 string. | |
553 * @param sink A ByteSink to which the normalized UTF-8 result string is written. | |
554 * sink.Flush() is called at the end. | |
555 * @param edits Records edits for index mapping, working with styled text, | |
556 * and getting only changes (if any). | |
557 * The contents of the Edits object are undefined if any error occurs. | |
558 * This function calls edits->reset() first unless | |
559 * options includes U_EDITS_NO_RESET. edits can be nullptr. | |
560 * @param errorCode Standard ICU error code. Its input value must | |
561 * pass the U_SUCCESS() test, or else the function returns | |
562 * immediately. Check for U_FAILURE() on output or use with | |
563 * function chaining. (See User Guide for details.) | |
564 * @stable ICU 60 | |
565 */ | |
566 virtual void | |
567 normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink, | |
568 Edits *edits, UErrorCode &errorCode) const U_OVERRIDE; | |
569 | |
570 /** | |
571 * Appends the normalized form of the second string to the first string | |
572 * (merging them at the boundary) and returns the first string. | |
573 * The result is normalized if the first string was normalized. | |
574 * The first and second strings must be different objects. | |
575 * @param first string, should be normalized | |
576 * @param second string, will be normalized | |
577 * @param errorCode Standard ICU error code. Its input value must | |
578 * pass the U_SUCCESS() test, or else the function returns | |
579 * immediately. Check for U_FAILURE() on output or use with | |
580 * function chaining. (See User Guide for details.) | |
581 * @return first | |
582 * @stable ICU 4.4 | |
583 */ | |
584 virtual UnicodeString & | |
585 normalizeSecondAndAppend(UnicodeString &first, | |
586 const UnicodeString &second, | |
587 UErrorCode &errorCode) const U_OVERRIDE; | |
588 /** | |
589 * Appends the second string to the first string | |
590 * (merging them at the boundary) and returns the first string. | |
591 * The result is normalized if both the strings were normalized. | |
592 * The first and second strings must be different objects. | |
593 * @param first string, should be normalized | |
594 * @param second string, should be normalized | |
595 * @param errorCode Standard ICU error code. Its input value must | |
596 * pass the U_SUCCESS() test, or else the function returns | |
597 * immediately. Check for U_FAILURE() on output or use with | |
598 * function chaining. (See User Guide for details.) | |
599 * @return first | |
600 * @stable ICU 4.4 | |
601 */ | |
602 virtual UnicodeString & | |
603 append(UnicodeString &first, | |
604 const UnicodeString &second, | |
605 UErrorCode &errorCode) const U_OVERRIDE; | |
606 | |
607 /** | |
608 * Gets the decomposition mapping of c. | |
609 * For details see the base class documentation. | |
610 * | |
611 * This function is independent of the mode of the Normalizer2. | |
612 * @param c code point | |
613 * @param decomposition String object which will be set to c's | |
614 * decomposition mapping, if there is one. | |
615 * @return TRUE if c has a decomposition, otherwise FALSE | |
616 * @stable ICU 4.6 | |
617 */ | |
618 virtual UBool | |
619 getDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE; | |
620 | |
621 /** | |
622 * Gets the raw decomposition mapping of c. | |
623 * For details see the base class documentation. | |
624 * | |
625 * This function is independent of the mode of the Normalizer2. | |
626 * @param c code point | |
627 * @param decomposition String object which will be set to c's | |
628 * raw decomposition mapping, if there is one. | |
629 * @return TRUE if c has a decomposition, otherwise FALSE | |
630 * @stable ICU 49 | |
631 */ | |
632 virtual UBool | |
633 getRawDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE; | |
634 | |
635 /** | |
636 * Performs pairwise composition of a & b and returns the composite if there is one. | |
637 * For details see the base class documentation. | |
638 * | |
639 * This function is independent of the mode of the Normalizer2. | |
640 * @param a A (normalization starter) code point. | |
641 * @param b Another code point. | |
642 * @return The non-negative composite code point if there is one; otherwise a negative value. | |
643 * @stable ICU 49 | |
644 */ | |
645 virtual UChar32 | |
646 composePair(UChar32 a, UChar32 b) const U_OVERRIDE; | |
647 | |
648 /** | |
649 * Gets the combining class of c. | |
650 * The default implementation returns 0 | |
651 * but all standard implementations return the Unicode Canonical_Combining_Class value. | |
652 * @param c code point | |
653 * @return c's combining class | |
654 * @stable ICU 49 | |
655 */ | |
656 virtual uint8_t | |
657 getCombiningClass(UChar32 c) const U_OVERRIDE; | |
658 | |
659 /** | |
660 * Tests if the string is normalized. | |
661 * For details see the Normalizer2 base class documentation. | |
662 * @param s input string | |
663 * @param errorCode Standard ICU error code. Its input value must | |
664 * pass the U_SUCCESS() test, or else the function returns | |
665 * immediately. Check for U_FAILURE() on output or use with | |
666 * function chaining. (See User Guide for details.) | |
667 * @return TRUE if s is normalized | |
668 * @stable ICU 4.4 | |
669 */ | |
670 virtual UBool | |
671 isNormalized(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE; | |
672 /** | |
673 * Tests if the UTF-8 string is normalized. | |
674 * Internally, in cases where the quickCheck() method would return "maybe" | |
675 * (which is only possible for the two COMPOSE modes), this method | |
676 * resolves to "yes" or "no" to provide a definitive result, | |
677 * at the cost of doing more work in those cases. | |
678 * | |
679 * This works for all normalization modes, | |
680 * but it is currently optimized for UTF-8 only for "compose" modes, | |
681 * such as NFC, NFKC, and NFKC_Casefold | |
682 * (UNORM2_COMPOSE and UNORM2_COMPOSE_CONTIGUOUS). | |
683 * For other modes it currently converts to UTF-16 and calls isNormalized(). | |
684 * | |
685 * @param s UTF-8 input string, passed as a StringPiece | |
686 * @param errorCode Standard ICU error code. Its input value must | |
687 * pass the U_SUCCESS() test, or else the function returns | |
688 * immediately. Check for U_FAILURE() on output or use with | |
689 * function chaining. (See User Guide for details.) | |
690 * @return TRUE if s is normalized | |
691 * @stable ICU 60 | |
692 */ | |
693 virtual UBool | |
694 isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const U_OVERRIDE; | |
695 /** | |
696 * Quickly checks whether the string is normalized; unlike isNormalized(), the result is not a plain boolean. | |
697 * For details see the Normalizer2 base class documentation. | |
698 * @param s input string | |
699 * @param errorCode Standard ICU error code. Its input value must | |
700 * pass the U_SUCCESS() test, or else the function returns | |
701 * immediately. Check for U_FAILURE() on output or use with | |
702 * function chaining. (See User Guide for details.) | |
703 * @return UNormalizationCheckResult; may be an inconclusive "maybe" for the COMPOSE modes (see isNormalizedUTF8()) | |
704 * @stable ICU 4.4 | |
705 */ | |
706 virtual UNormalizationCheckResult | |
707 quickCheck(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE; | |
708 /** | |
709 * Returns the end of the normalized substring of the input string. | |
710 * For details see the Normalizer2 base class documentation. | |
711 * @param s input string | |
712 * @param errorCode Standard ICU error code. Its input value must | |
713 * pass the U_SUCCESS() test, or else the function returns | |
714 * immediately. Check for U_FAILURE() on output or use with | |
715 * function chaining. (See User Guide for details.) | |
716 * @return end index, within s, of the quick-check "yes" span | |
717 * @stable ICU 4.4 | |
718 */ | |
719 virtual int32_t | |
720 spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE; | |
721 | |
722 /** | |
723 * Tests if the character always has a normalization boundary before it, | |
724 * regardless of context. | |
725 * For details see the Normalizer2 base class documentation. | |
726 * @param c character (code point) to test | |
727 * @return TRUE if c always has a normalization boundary before it | |
728 * @stable ICU 4.4 | |
729 */ | |
730 virtual UBool hasBoundaryBefore(UChar32 c) const U_OVERRIDE; | |
731 | |
732 /** | |
733 * Tests if the character always has a normalization boundary after it, | |
734 * regardless of context. | |
735 * For details see the Normalizer2 base class documentation. | |
736 * @param c character (code point) to test | |
737 * @return TRUE if c always has a normalization boundary after it | |
738 * @stable ICU 4.4 | |
739 */ | |
740 virtual UBool hasBoundaryAfter(UChar32 c) const U_OVERRIDE; | |
741 | |
742 /** | |
743 * Tests if the character is normalization-inert. | |
744 * For details see the Normalizer2 base class documentation. | |
745 * @param c character (code point) to test | |
746 * @return TRUE if c is normalization-inert | |
747 * @stable ICU 4.4 | |
748 */ | |
749 virtual UBool isInert(UChar32 c) const U_OVERRIDE; | |
750 private: /* normalize(): private helper that takes a USetSpanCondition in addition to src, dest and errorCode. NOTE(review): presumably spanCondition selects how the first span of src is treated relative to the set member - confirm against the implementation. */ | |
751 UnicodeString & | |
752 normalize(const UnicodeString &src, | |
753 UnicodeString &dest, | |
754 USetSpanCondition spanCondition, | |
755 UErrorCode &errorCode) const; | |
756 /* normalizeUTF8(): private helper taking a raw UTF-8 buffer (src, length), an output ByteSink, an Edits pointer and a USetSpanCondition. NOTE(review): the meaning of options and whether edits may be null are not visible here - confirm against the implementation. */ | |
757 void | |
758 normalizeUTF8(uint32_t options, const char *src, int32_t length, | |
759 ByteSink &sink, Edits *edits, | |
760 USetSpanCondition spanCondition, | |
761 UErrorCode &errorCode) const; | |
762 /* normalizeSecondAndAppend(): private helper with an extra doNormalize flag. NOTE(review): presumably doNormalize chooses between normalizing `second` before appending it to `first` and appending it as-is - confirm against the implementation. */ | |
763 UnicodeString & | |
764 normalizeSecondAndAppend(UnicodeString &first, | |
765 const UnicodeString &second, | |
766 UBool doNormalize, | |
767 UErrorCode &errorCode) const; | |
768 /* Data members, both held by const reference: norm2 is a Normalizer2 and set is a UnicodeSet. NOTE(review): presumably the wrapped normalizer and its filter set, with lifetime managed by the creator of this object - confirm. */ | |
769 const Normalizer2 &norm2; | |
770 const UnicodeSet &set; | |
771 }; | |
772 | |
773 U_NAMESPACE_END | |
774 | |
775 #endif // !UCONFIG_NO_NORMALIZATION | |
776 | |
777 #endif /* U_SHOW_CPLUSPLUS_API */ | |
778 | |
779 #endif // __NORMALIZER2_H__ |