Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/include/unicode/stsearch.h @ 69:33d812a61356
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 17:55:14 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 69:33d812a61356 |
---|---|
1 // © 2016 and later: Unicode, Inc. and others. | |
2 // License & terms of use: http://www.unicode.org/copyright.html | |
3 /* | |
4 ********************************************************************** | |
5 * Copyright (C) 2001-2014 IBM and others. All rights reserved. | |
6 ********************************************************************** | |
7 * Date Name Description | |
8 * 03/22/2000 helena Creation. | |
9 ********************************************************************** | |
10 */ | |
11 | |
12 #ifndef STSEARCH_H | |
13 #define STSEARCH_H | |
14 | |
15 #include "unicode/utypes.h" | |
16 | |
17 #if U_SHOW_CPLUSPLUS_API | |
18 | |
19 /** | |
20 * \file | |
21 * \brief C++ API: Service for searching text based on RuleBasedCollator. | |
22 */ | |
23 | |
24 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION | |
25 | |
26 #include "unicode/tblcoll.h" | |
27 #include "unicode/coleitr.h" | |
28 #include "unicode/search.h" | |
29 | |
30 U_NAMESPACE_BEGIN | |
31 | |
32 /** | |
33 * | |
34 * <tt>StringSearch</tt> is a <tt>SearchIterator</tt> that provides | |
35 * language-sensitive text searching based on the comparison rules defined | |
36 * in a {@link RuleBasedCollator} object. | |
37 * StringSearch ensures that language eccentricity can be | |
38 * handled, e.g. for the German collator, characters ß and SS will be matched | |
39 * if case is chosen to be ignored. | |
40 * See the <a href="http://source.icu-project.org/repos/icu/icuhtml/trunk/design/collation/ICU_collation_design.htm"> | |
41 * "ICU Collation Design Document"</a> for more information. | |
42 * <p> | |
43 * There are 2 match options for selection:<br> | |
44 * Let S' be the sub-string of a text string S between the offsets start and | |
45 * end [start, end]. | |
46 * <br> | |
47 * A pattern string P matches a text string S at the offsets [start, end] | |
48 * if | |
49 * <pre> | |
50 * option 1. Some canonical equivalent of P matches some canonical equivalent | |
51 * of S' | |
52 * option 2. P matches S' and if P starts or ends with a combining mark, | |
53 * there exists no non-ignorable combining mark before or after S' | |
54 * in S respectively. | |
55 * </pre> | |
56 * Option 2. will be the default. | |
57 * <p> | |
58 * This search has APIs similar to that of other text iteration mechanisms | |
59 * such as the break iterators in <tt>BreakIterator</tt>. Using these | |
60 * APIs, it is easy to scan through text looking for all occurrences of | |
61 * a given pattern. This search iterator allows changing of direction by | |
62 * calling a <tt>reset</tt> followed by a <tt>next</tt> or <tt>previous</tt>. | |
63 * Though a direction change can occur without calling <tt>reset</tt> first, | |
64 * this operation comes with some speed penalty. | |
65 * Match results in the forward direction will match the result matches in | |
66 * the backwards direction in the reverse order. | |
67 * <p> | |
68 * <tt>SearchIterator</tt> provides APIs to specify the starting position | |
69 * within the text string to be searched, e.g. <tt>setOffset</tt>, | |
70 * <tt>preceding</tt> and <tt>following</tt>. Since the | |
71 * starting position will be set as it is specified, please take note that | |
72 * there are some danger points at which the search may render incorrect | |
73 * results: | |
74 * <ul> | |
75 * <li> The midst of a substring that requires normalization. | |
76 * <li> If the following match is to be found, the position should not be the | |
77 * second character which requires to be swapped with the preceding | |
78 * character. Vice versa, if the preceding match is to be found, | |
79 * position to search from should not be the first character which | |
80 * requires to be swapped with the next character. E.g. certain Thai and | |
81 * Lao characters require swapping. | |
82 * <li> If a following pattern match is to be found, any position within a | |
83 * contracting sequence except the first will fail. Vice versa if a | |
84 * preceding pattern match is to be found, an invalid starting point | |
85 * would be any character within a contracting sequence except the last. | |
86 * </ul> | |
87 * <p> | |
88 * A <tt>BreakIterator</tt> can be used if only matches at logical breaks are desired. | |
89 * Using a <tt>BreakIterator</tt> will only give you results that exactly match the | |
90 * boundaries given by the breakiterator. For instance the pattern "e" will | |
91 * not be found in the string "\u00e9" if a character break iterator is used. | |
92 * <p> | |
93 * Options are provided to handle overlapping matches. | |
94 * E.g. In English, overlapping matches produce the results 0 and 2 | |
95 * for the pattern "abab" in the text "ababab", whereas mutually | |
96 * exclusive matches only produce the result of 0. | |
97 * <p> | |
98 * Though collator attributes will be taken into consideration while | |
99 * performing matches, there are no APIs here for setting and getting the | |
100 * attributes. These attributes can be set by getting the collator | |
101 * from <tt>getCollator</tt> and using the APIs in <tt>coll.h</tt>. | |
102 * Lastly to update <tt>StringSearch</tt> to the new collator attributes, | |
103 * <tt>reset</tt> has to be called. | |
104 * <p> | |
105 * Restriction: <br> | |
106 * Currently there are no composite characters that consist of a | |
107 * character with combining class > 0 before a character with combining | |
108 * class == 0. However, if such a character exists in the future, | |
109 * <tt>StringSearch</tt> does not guarantee the results for option 1. | |
110 * <p> | |
111 * Consult the <tt>SearchIterator</tt> documentation for information on | |
112 * and examples of how to use instances of this class to implement text | |
113 * searching. | |
114 * <pre><code> | |
115 * UnicodeString target("The quick brown fox jumps over the lazy dog."); | |
116 * UnicodeString pattern("fox"); | |
117 * | |
118 * UErrorCode error = U_ZERO_ERROR; | |
119 * StringSearch iter(pattern, target, Locale::getUS(), NULL, error); | |
120 * for (int pos = iter.first(error); | |
121 * pos != USEARCH_DONE; | |
122 * pos = iter.next(error)) | |
123 * { | |
124 * printf("Found match at %d pos, length is %d\n", pos, iter.getMatchedLength()); | |
125 * } | |
126 * </code></pre> | |
127 * <p> | |
128 * Note, <tt>StringSearch</tt> is not to be subclassed. | |
129 * </p> | |
130 * @see SearchIterator | |
131 * @see RuleBasedCollator | |
132 * @since ICU 2.0 | |
133 */ | |
134 | |
135 class U_I18N_API StringSearch U_FINAL : public SearchIterator | |
136 { | |
137 public: | |
138 | |
139 // public constructors and destructors -------------------------------- | |
140 | |
141 /** | |
142 * Creating a <tt>StringSearch</tt> instance using the argument locale | |
143 * language rule set. A collator will be created in the process, which | |
144 * will be owned by this instance and will be deleted during | |
145 * destruction. | |
146 * @param pattern The text for which this object will search. | |
147 * @param text The text in which to search for the pattern. | |
148 * @param locale A locale which defines the language-sensitive | |
149 * comparison rules used to determine whether text in the | |
150 * pattern and target matches. | |
151 * @param breakiter A <tt>BreakIterator</tt> object used to constrain | |
152 * the matches that are found. Matches whose start and end | |
153 * indices in the target text are not boundaries as | |
154 * determined by the <tt>BreakIterator</tt> are | |
155 * ignored. If this behavior is not desired, | |
156 * <tt>NULL</tt> can be passed in instead. | |
157 * @param status for errors if any. If pattern or text is NULL, or if | |
158 * either the length of pattern or text is 0 then an | |
159 * U_ILLEGAL_ARGUMENT_ERROR is returned. | |
160 * @stable ICU 2.0 | |
161 */ | |
162 StringSearch(const UnicodeString &pattern, const UnicodeString &text, | |
163 const Locale &locale, | |
164 BreakIterator *breakiter, | |
165 UErrorCode &status); | |
166 | |
167 /** | |
168 * Creating a <tt>StringSearch</tt> instance using the argument collator | |
169 * language rule set. Note, user retains the ownership of this collator, | |
170 * it does not get destroyed during this instance's destruction. | |
171 * @param pattern The text for which this object will search. | |
172 * @param text The text in which to search for the pattern. | |
173 * @param coll A <tt>RuleBasedCollator</tt> object which defines | |
174 * the language-sensitive comparison rules used to | |
175 * determine whether text in the pattern and target | |
176 * matches. User is responsible for the clearing of this | |
177 * object. | |
178 * @param breakiter A <tt>BreakIterator</tt> object used to constrain | |
179 * the matches that are found. Matches whose start and end | |
180 * indices in the target text are not boundaries as | |
181 * determined by the <tt>BreakIterator</tt> are | |
182 * ignored. If this behavior is not desired, | |
183 * <tt>NULL</tt> can be passed in instead. | |
184 * @param status for errors if any. If either the length of pattern or | |
185 * text is 0 then an U_ILLEGAL_ARGUMENT_ERROR is returned. | |
186 * @stable ICU 2.0 | |
187 */ | |
188 StringSearch(const UnicodeString &pattern, | |
189 const UnicodeString &text, | |
190 RuleBasedCollator *coll, | |
191 BreakIterator *breakiter, | |
192 UErrorCode &status); | |
193 | |
194 /** | |
195 * Creating a <tt>StringSearch</tt> instance using the argument locale | |
196 * language rule set. A collator will be created in the process, which | |
197 * will be owned by this instance and will be deleted during | |
198 * destruction. | |
199 * <p> | |
200 * Note: No parsing of the text within the <tt>CharacterIterator</tt> | |
201 * will be done during searching for this version. The block of text | |
202 * in <tt>CharacterIterator</tt> will be used as it is. | |
203 * @param pattern The text for which this object will search. | |
204 * @param text The text iterator in which to search for the pattern. | |
205 * @param locale A locale which defines the language-sensitive | |
206 * comparison rules used to determine whether text in the | |
207 * pattern and target matches. The caller retains | |
208 * ownership of this object. | |
209 * @param breakiter A <tt>BreakIterator</tt> object used to constrain | |
210 * the matches that are found. Matches whose start and end | |
211 * indices in the target text are not boundaries as | |
212 * determined by the <tt>BreakIterator</tt> are | |
213 * ignored. If this behavior is not desired, | |
214 * <tt>NULL</tt> can be passed in instead. | |
215 * @param status for errors if any. If either the length of pattern or | |
216 * text is 0 then an U_ILLEGAL_ARGUMENT_ERROR is returned. | |
217 * @stable ICU 2.0 | |
218 */ | |
219 StringSearch(const UnicodeString &pattern, CharacterIterator &text, | |
220 const Locale &locale, | |
221 BreakIterator *breakiter, | |
222 UErrorCode &status); | |
223 | |
224 /** | |
225 * Creating a <tt>StringSearch</tt> instance using the argument collator | |
226 * language rule set. Note, user retains the ownership of this collator, | |
227 * it does not get destroyed during this instance's destruction. | |
228 * <p> | |
229 * Note: No parsing of the text within the <tt>CharacterIterator</tt> | |
230 * will be done during searching for this version. The block of text | |
231 * in <tt>CharacterIterator</tt> will be used as it is. | |
232 * @param pattern The text for which this object will search. | |
233 * @param text The text iterator in which to search for the pattern. | |
234 * @param coll A <tt>RuleBasedCollator</tt> object which defines | |
235 * the language-sensitive comparison rules used to | |
236 * determine whether text in the pattern and target | |
237 * matches. User is responsible for the clearing of this | |
238 * object. | |
239 * @param breakiter A <tt>BreakIterator</tt> object used to constrain | |
240 * the matches that are found. Matches whose start and end | |
241 * indices in the target text are not boundaries as | |
242 * determined by the <tt>BreakIterator</tt> are | |
243 * ignored. If this behavior is not desired, | |
244 * <tt>NULL</tt> can be passed in instead. | |
245 * @param status for errors if any. If either the length of pattern or | |
246 * text is 0 then an U_ILLEGAL_ARGUMENT_ERROR is returned. | |
247 * @stable ICU 2.0 | |
248 */ | |
249 StringSearch(const UnicodeString &pattern, CharacterIterator &text, | |
250 RuleBasedCollator *coll, | |
251 BreakIterator *breakiter, | |
252 UErrorCode &status); | |
253 | |
254 /** | |
255 * Copy constructor that creates a StringSearch instance with the same | |
256 * behavior, and iterating over the same text. | |
257 * @param that StringSearch instance to be copied. | |
258 * @stable ICU 2.0 | |
259 */ | |
260 StringSearch(const StringSearch &that); | |
261 | |
262 /** | |
263 * Destructor. Cleans up the search iterator data struct. | |
264 * If a collator is created in the constructor, it will be destroyed here. | |
265 * @stable ICU 2.0 | |
266 */ | |
267 virtual ~StringSearch(void); | |
268 | |
269 /** | |
270 * Clone this object. | |
271 * Clones can be used concurrently in multiple threads. | |
272 * If an error occurs, then NULL is returned. | |
273 * The caller must delete the clone. | |
274 * | |
275 * @return a clone of this object | |
276 * | |
277 * @see getDynamicClassID | |
278 * @stable ICU 2.8 | |
279 */ | |
280 StringSearch *clone() const; | |
281 | |
282 // operator overloading --------------------------------------------- | |
283 | |
284 /** | |
285 * Assignment operator. Sets this iterator to have the same behavior, | |
286 * and iterate over the same text, as the one passed in. | |
287 * @param that instance to be copied. | |
288 * @stable ICU 2.0 | |
289 */ | |
290 StringSearch & operator=(const StringSearch &that); | |
291 | |
292 /** | |
293 * Equality operator. | |
294 * @param that instance to be compared. | |
295 * @return TRUE if both instances have the same attributes, | |
296 * breakiterators, collators and iterate over the same text | |
297 * while looking for the same pattern. | |
298 * @stable ICU 2.0 | |
299 */ | |
300 virtual UBool operator==(const SearchIterator &that) const; | |
301 | |
302 // public get and set methods ---------------------------------------- | |
303 | |
304 /** | |
305 * Sets the index to point to the given position, and clears any state | |
306 * that's affected. | |
307 * <p> | |
308 * This method takes the argument index and sets the position in the text | |
309 * string accordingly without checking if the index is pointing to a | |
310 * valid starting point to begin searching. | |
311 * @param position within the text to be set. If position is less | |
312 * than or greater than the text range for searching, | |
313 * an U_INDEX_OUTOFBOUNDS_ERROR will be returned. | |
314 * @param status for errors if any occur | |
315 * @stable ICU 2.0 | |
316 */ | |
317 virtual void setOffset(int32_t position, UErrorCode &status); | |
318 | |
319 /** | |
320 * Return the current index in the text being searched. | |
321 * If the iteration has gone past the end of the text | |
322 * (or past the beginning for a backwards search), USEARCH_DONE | |
323 * is returned. | |
324 * @return current index in the text being searched. | |
325 * @stable ICU 2.0 | |
326 */ | |
327 virtual int32_t getOffset(void) const; | |
328 | |
329 /** | |
330 * Set the target text to be searched. | |
331 * Text iteration will hence begin at the start of the text string. | |
332 * This method is | |
333 * useful if you want to re-use an iterator to search for the same | |
334 * pattern within a different body of text. | |
335 * @param text text string to be searched | |
336 * @param status for errors if any. If the text length is 0 then an | |
337 * U_ILLEGAL_ARGUMENT_ERROR is returned. | |
338 * @stable ICU 2.0 | |
339 */ | |
340 virtual void setText(const UnicodeString &text, UErrorCode &status); | |
341 | |
342 /** | |
343 * Set the target text to be searched. | |
344 * Text iteration will hence begin at the start of the text string. | |
345 * This method is | |
346 * useful if you want to re-use an iterator to search for the same | |
347 * pattern within a different body of text. | |
348 * Note: No parsing of the text within the <tt>CharacterIterator</tt> | |
349 * will be done during searching for this version. The block of text | |
350 * in <tt>CharacterIterator</tt> will be used as it is. | |
351 * @param text text iterator over the text to be searched | |
352 * @param status for errors if any. If the text length is 0 then an | |
353 * U_ILLEGAL_ARGUMENT_ERROR is returned. | |
354 * @stable ICU 2.0 | |
355 */ | |
356 virtual void setText(CharacterIterator &text, UErrorCode &status); | |
357 | |
358 /** | |
359 * Gets the collator used for the language rules. | |
360 * <p> | |
361 * Caller may modify but <b>must not</b> delete the <tt>RuleBasedCollator</tt>! | |
362 * Modifications to this collator will affect the original collator passed in to | |
363 * the <tt>StringSearch</tt> constructor or to setCollator, if any. | |
364 * @return collator used for string search | |
365 * @stable ICU 2.0 | |
366 */ | |
367 RuleBasedCollator * getCollator() const; | |
368 | |
369 /** | |
370 * Sets the collator used for the language rules. User retains the | |
371 * ownership of this collator, thus the responsibility of deletion lies | |
372 * with the user. The iterator's position will not be changed by this method. | |
373 * @param coll collator | |
374 * @param status for errors if any | |
375 * @stable ICU 2.0 | |
376 */ | |
377 void setCollator(RuleBasedCollator *coll, UErrorCode &status); | |
378 | |
379 /** | |
380 * Sets the pattern used for matching. | |
381 * The iterator's position will not be changed by this method. | |
382 * @param pattern search pattern to be found | |
383 * @param status for errors if any. If the pattern length is 0 then an | |
384 * U_ILLEGAL_ARGUMENT_ERROR is returned. | |
385 * @stable ICU 2.0 | |
386 */ | |
387 void setPattern(const UnicodeString &pattern, UErrorCode &status); | |
388 | |
389 /** | |
390 * Gets the search pattern. | |
391 * @return pattern used for matching | |
392 * @stable ICU 2.0 | |
393 */ | |
394 const UnicodeString & getPattern() const; | |
395 | |
396 // public methods ---------------------------------------------------- | |
397 | |
398 /** | |
399 * Reset the iteration. | |
400 * Search will begin at the start of the text string if a forward | |
401 * iteration is initiated before a backwards iteration. Otherwise if | |
402 * a backwards iteration is initiated before a forwards iteration, the | |
403 * search will begin at the end of the text string. | |
404 * @stable ICU 2.0 | |
405 */ | |
406 virtual void reset(); | |
407 | |
408 /** | |
409 * Returns a copy of StringSearch with the same behavior, and | |
410 * iterating over the same text, as this one. Note that all data will be | |
411 * replicated, except for the user-specified collator and the | |
412 * breakiterator. | |
413 * @return cloned object | |
414 * @stable ICU 2.0 | |
415 */ | |
416 virtual StringSearch * safeClone() const; | |
417 | |
418 /** | |
419 * ICU "poor man's RTTI", returns a UClassID for the actual class. | |
420 * | |
421 * @stable ICU 2.2 | |
422 */ | |
423 virtual UClassID getDynamicClassID() const; | |
424 | |
425 /** | |
426 * ICU "poor man's RTTI", returns a UClassID for this class. | |
427 * | |
428 * @stable ICU 2.2 | |
429 */ | |
430 static UClassID U_EXPORT2 getStaticClassID(); | |
431 | |
432 protected: | |
433 | |
434 // protected method ------------------------------------------------- | |
435 | |
436 /** | |
437 * Search forward for matching text, starting at a given location. | |
438 * Clients should not call this method directly; instead they should | |
439 * call {@link SearchIterator#next }. | |
440 * <p> | |
441 * If a match is found, this method returns the index at which the match | |
442 * starts and calls {@link SearchIterator#setMatchLength } with the number | |
443 * of characters in the target text that make up the match. If no match | |
444 * is found, the method returns <tt>USEARCH_DONE</tt>. | |
445 * <p> | |
446 * The <tt>StringSearch</tt> is adjusted so that its current index | |
447 * (as returned by {@link #getOffset }) is the match position if one was | |
448 * found. | |
449 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and | |
450 * the <tt>StringSearch</tt> will be adjusted to the index USEARCH_DONE. | |
451 * @param position The index in the target text at which the search | |
452 * starts | |
453 * @param status for errors if any occur | |
454 * @return The index at which the matched text in the target starts, or | |
455 * USEARCH_DONE if no match was found. | |
456 * @stable ICU 2.0 | |
457 */ | |
458 virtual int32_t handleNext(int32_t position, UErrorCode &status); | |
459 | |
460 /** | |
461 * Search backward for matching text, starting at a given location. | |
462 * Clients should not call this method directly; instead they should | |
463 * call {@link SearchIterator#previous }. | |
464 * <p> | |
465 * If a match is found, this method returns the index at which the match | |
466 * starts and calls {@link SearchIterator#setMatchLength } with the number | |
467 * of characters in the target text that make up the match. If no match | |
468 * is found, the method returns <tt>USEARCH_DONE</tt>. | |
469 * <p> | |
470 * The <tt>StringSearch</tt> is adjusted so that its current index | |
471 * (as returned by {@link #getOffset }) is the match position if one was | |
472 * found. | |
473 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and | |
474 * the <tt>StringSearch</tt> will be adjusted to the index USEARCH_DONE. | |
475 * @param position The index in the target text at which the search | |
476 * starts. | |
477 * @param status for errors if any occur | |
478 * @return The index at which the matched text in the target starts, or | |
479 * USEARCH_DONE if no match was found. | |
480 * @stable ICU 2.0 | |
481 */ | |
482 virtual int32_t handlePrev(int32_t position, UErrorCode &status); | |
483 | |
484 private : | |
485 StringSearch(); // default constructor not implemented | |
486 | |
487 // private data members ---------------------------------------------- | |
488 | |
489 /** | |
490 * Pattern text | |
491 * @stable ICU 2.0 | |
492 */ | |
493 UnicodeString m_pattern_; | |
494 /** | |
495 * String search struct data | |
496 * @stable ICU 2.0 | |
497 */ | |
498 UStringSearch *m_strsrch_; | |
499 | |
500 }; | |
501 | |
502 U_NAMESPACE_END | |
503 | |
504 #endif /* #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION */ | |
505 | |
506 #endif /* U_SHOW_CPLUSPLUS_API */ | |
507 | |
508 #endif | |
509 |