Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/include/unicode/search.h @ 69:33d812a61356
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 17:55:14 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 69:33d812a61356 |
---|---|
1 // © 2016 and later: Unicode, Inc. and others. | |
2 // License & terms of use: http://www.unicode.org/copyright.html | |
3 /* | |
4 ********************************************************************** | |
5 * Copyright (C) 2001-2011 IBM and others. All rights reserved. | |
6 ********************************************************************** | |
7 * Date Name Description | |
8 * 03/22/2000 helena Creation. | |
9 ********************************************************************** | |
10 */ | |
11 | |
12 #ifndef SEARCH_H | |
13 #define SEARCH_H | |
14 | |
15 #include "unicode/utypes.h" | |
16 | |
17 #if U_SHOW_CPLUSPLUS_API | |
18 | |
19 /** | |
20 * \file | |
21 * \brief C++ API: SearchIterator object. | |
22 */ | |
23 | |
24 #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION | |
25 | |
26 #include "unicode/uobject.h" | |
27 #include "unicode/unistr.h" | |
28 #include "unicode/chariter.h" | |
29 #include "unicode/brkiter.h" | |
30 #include "unicode/usearch.h" | |
31 | |
32 /** | |
33 * @stable ICU 2.0 | |
34 */ | |
35 struct USearch; | |
36 /** | |
37 * @stable ICU 2.0 | |
38 */ | |
39 typedef struct USearch USearch; | |
40 | |
41 U_NAMESPACE_BEGIN | |
42 | |
43 /** | |
44 * | |
45 * <tt>SearchIterator</tt> is an abstract base class that provides | |
46 * methods to search for a pattern within a text string. Instances of | |
47 * <tt>SearchIterator</tt> maintain a current position and scan over the | |
48 * target text, returning the indices at which the pattern is matched and the length | |
49 * of each match. | |
50 * <p> | |
51 * <tt>SearchIterator</tt> defines a protocol for text searching. | |
52 * Subclasses provide concrete implementations of various search algorithms. | |
53 * For example, <tt>StringSearch</tt> implements language-sensitive pattern | |
54 * matching based on the comparison rules defined in a | |
55 * <tt>RuleBasedCollator</tt> object. | |
56 * <p> | |
57 * Other options for searching include using a BreakIterator to restrict | |
58 * the points at which matches are detected. | |
59 * <p> | |
60 * <tt>SearchIterator</tt> provides an API that is similar to that of | |
61 * other text iteration classes such as <tt>BreakIterator</tt>. Using | |
62 * this class, it is easy to scan through text looking for all occurrences of | |
63 * a given pattern. The following example uses a <tt>StringSearch</tt> | |
64 * object to find all instances of "fox" in the target string. Any other | |
65 * subclass of <tt>SearchIterator</tt> can be used in an identical | |
66 * manner. | |
67 * <pre><code> | |
68 * UnicodeString target("The quick brown fox jumped over the lazy fox"); | |
69 * UnicodeString pattern("fox"); | |
70 * | |
71 * SearchIterator *iter = new StringSearch(pattern, target); | |
72 * UErrorCode error = U_ZERO_ERROR; | |
73 * for (int pos = iter->first(error); pos != USEARCH_DONE; | |
74 * pos = iter->next(error)) { | |
75 * printf("Found match at %d pos, length is %d\n", pos, iter->getMatchedLength()); | |
76 * } | |
77 * </code></pre> | |
78 * | |
79 * @see StringSearch | |
80 * @see RuleBasedCollator | |
81 */ | |
82 class U_I18N_API SearchIterator : public UObject { | |
83 | |
84 public: | |
85 | |
86 // public constructors and destructors ------------------------------- | |
87 | |
88 /** | |
89 * Copy constructor that creates a SearchIterator instance with the same | |
90 * behavior, and iterating over the same text. | |
91 * @param other the SearchIterator instance to be copied. | |
92 * @stable ICU 2.0 | |
93 */ | |
94 SearchIterator(const SearchIterator &other); | |
95 | |
96 /** | |
97 * Destructor. Cleans up the search iterator data struct. | |
98 * @stable ICU 2.0 | |
99 */ | |
100 virtual ~SearchIterator(); | |
101 | |
102 // public get and set methods ---------------------------------------- | |
103 | |
104 /** | |
105 * Sets the index to point to the given position, and clears any state | |
106 * that's affected. | |
107 * <p> | |
108 * This method takes the argument index and sets the position in the text | |
109 * string accordingly without checking if the index is pointing to a | |
110 * valid starting point to begin searching. | |
111 * @param position within the text to be set. If position is less | |
112 * than or greater than the text range for searching, | |
113 * an U_INDEX_OUTOFBOUNDS_ERROR will be returned | |
114 * @param status for errors if it occurs | |
115 * @stable ICU 2.0 | |
116 */ | |
117 virtual void setOffset(int32_t position, UErrorCode &status) = 0; | |
118 | |
119 /** | |
120 * Return the current index in the text being searched. | |
121 * If the iteration has gone past the end of the text | |
122 * (or past the beginning for a backwards search), USEARCH_DONE | |
123 * is returned. | |
124 * @return current index in the text being searched. | |
125 * @stable ICU 2.0 | |
126 */ | |
127 virtual int32_t getOffset(void) const = 0; | |
128 | |
129 /** | |
130 * Sets the text searching attributes located in the enum | |
131 * USearchAttribute with values from the enum USearchAttributeValue. | |
132 * USEARCH_DEFAULT can be used for all attributes for resetting. | |
133 * @param attribute text attribute (enum USearchAttribute) to be set | |
134 * @param value text attribute value | |
135 * @param status for errors if it occurs | |
136 * @stable ICU 2.0 | |
137 */ | |
138 void setAttribute(USearchAttribute attribute, | |
139 USearchAttributeValue value, | |
140 UErrorCode &status); | |
141 | |
142 /** | |
143 * Gets the text searching attributes | |
144 * @param attribute text attribute (enum USearchAttribute) to be retrieved | |
145 * @return text attribute value | |
146 * @stable ICU 2.0 | |
147 */ | |
148 USearchAttributeValue getAttribute(USearchAttribute attribute) const; | |
149 | |
150 /** | |
151 * Returns the index to the match in the text string that was searched. | |
152 * This call returns a valid result only after a successful call to | |
153 * <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>. | |
154 * Just after construction, or after a searching method returns | |
155 * <tt>USEARCH_DONE</tt>, this method will return <tt>USEARCH_DONE</tt>. | |
156 * <p> | |
157 * Use getMatchedLength to get the matched string length. | |
158 * @return index of a substring within the text string that is being | |
159 * searched. | |
160 * @see #first | |
161 * @see #next | |
162 * @see #previous | |
163 * @see #last | |
164 * @stable ICU 2.0 | |
165 */ | |
166 int32_t getMatchedStart(void) const; | |
167 | |
168 /** | |
169 * Returns the length of text in the string which matches the search | |
170 * pattern. This call returns a valid result only after a successful call | |
171 * to <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>. | |
172 * Just after construction, or after a searching method returns | |
173 * <tt>USEARCH_DONE</tt>, this method will return 0. | |
174 * @return The length of the match in the target text, or 0 if there | |
175 * is no match currently. | |
176 * @see #first | |
177 * @see #next | |
178 * @see #previous | |
179 * @see #last | |
180 * @stable ICU 2.0 | |
181 */ | |
182 int32_t getMatchedLength(void) const; | |
183 | |
184 /** | |
185 * Returns the text that was matched by the most recent call to | |
186 * <tt>first</tt>, <tt>next</tt>, <tt>previous</tt>, or <tt>last</tt>. | |
187 * If the iterator is not pointing at a valid match (e.g. just after | |
188 * construction or after <tt>USEARCH_DONE</tt> has been returned), | |
189 * returns an empty string. | |
190 * @param result stores the matched string or an empty string if a match | |
191 * is not found. | |
192 * @see #first | |
193 * @see #next | |
194 * @see #previous | |
195 * @see #last | |
196 * @stable ICU 2.0 | |
197 */ | |
198 void getMatchedText(UnicodeString &result) const; | |
199 | |
200 /** | |
201 * Set the BreakIterator that will be used to restrict the points | |
202 * at which matches are detected. The user is responsible for deleting | |
203 * the breakiterator. | |
204 * @param breakiter A BreakIterator that will be used to restrict the | |
205 * points at which matches are detected. If a match is | |
206 * found, but the match's start or end index is not a | |
207 * boundary as determined by the <tt>BreakIterator</tt>, | |
208 * the match will be rejected and another will be searched | |
209 * for. If this parameter is <tt>NULL</tt>, no break | |
210 * detection is attempted. | |
211 * @param status for errors if it occurs | |
212 * @see BreakIterator | |
213 * @stable ICU 2.0 | |
214 */ | |
215 void setBreakIterator(BreakIterator *breakiter, UErrorCode &status); | |
216 | |
217 /** | |
218 * Returns the BreakIterator that is used to restrict the points at | |
219 * which matches are detected. This will be the same object that was | |
220 * passed to the constructor or to <tt>setBreakIterator</tt>. | |
221 * Note that <tt>NULL</tt> is a legal value; it means that break | |
222 * detection should not be attempted. | |
223 * @return BreakIterator used to restrict matchings. | |
224 * @see #setBreakIterator | |
225 * @stable ICU 2.0 | |
226 */ | |
227 const BreakIterator * getBreakIterator(void) const; | |
228 | |
229 /** | |
230 * Set the string text to be searched. Text iteration will hence begin at | |
231 * the start of the text string. This method is useful if you want to | |
232 * re-use an iterator to search for the same pattern within a different | |
233 * body of text. The user is responsible for deleting the text. | |
234 * @param text string to be searched. | |
235 * @param status for errors. If the text length is 0, | |
236 * an U_ILLEGAL_ARGUMENT_ERROR is returned. | |
237 * @stable ICU 2.0 | |
238 */ | |
239 virtual void setText(const UnicodeString &text, UErrorCode &status); | |
240 | |
241 /** | |
242 * Set the string text to be searched. Text iteration will hence begin at | |
243 * the start of the text string. This method is useful if you want to | |
244 * re-use an iterator to search for the same pattern within a different | |
245 * body of text. | |
246 * <p> | |
247 * Note: No parsing of the text within the <tt>CharacterIterator</tt> | |
248 * will be done during searching for this version. The block of text | |
249 * in <tt>CharacterIterator</tt> will be used as it is. | |
250 * The user is responsible for deleting the text. | |
251 * @param text string iterator to be searched. | |
252 * @param status for errors if any. If the text length is 0 then an | |
253 * U_ILLEGAL_ARGUMENT_ERROR is returned. | |
254 * @stable ICU 2.0 | |
255 */ | |
256 virtual void setText(CharacterIterator &text, UErrorCode &status); | |
257 | |
258 /** | |
259 * Return the string text to be searched. | |
260 * @return text string to be searched. | |
261 * @stable ICU 2.0 | |
262 */ | |
263 const UnicodeString & getText(void) const; | |
264 | |
265 // operator overloading ---------------------------------------------- | |
266 | |
267 /** | |
268 * Equality operator. | |
269 * @param that SearchIterator instance to be compared. | |
270 * @return TRUE if both SearchIterators are of the same class, have the | |
271 * same behavior, iterate over the same text and have the same | |
272 * attributes. FALSE otherwise. | |
273 * @stable ICU 2.0 | |
274 */ | |
275 virtual UBool operator==(const SearchIterator &that) const; | |
276 | |
277 /** | |
278 * Not-equal operator. | |
279 * @param that SearchIterator instance to be compared. | |
280 * @return FALSE if operator== returns TRUE, and vice versa. | |
281 * @stable ICU 2.0 | |
282 */ | |
283 UBool operator!=(const SearchIterator &that) const; | |
284 | |
285 // public methods ---------------------------------------------------- | |
286 | |
287 /** | |
288 * Returns a copy of SearchIterator with the same behavior, and | |
289 * iterating over the same text, as this one. Note that all data will be | |
290 * replicated, except for the text string to be searched. | |
291 * @return cloned object | |
292 * @stable ICU 2.0 | |
293 */ | |
294 virtual SearchIterator* safeClone(void) const = 0; | |
295 | |
296 /** | |
297 * Returns the first index at which the string text matches the search | |
298 * pattern. The iterator is adjusted so that its current index (as | |
299 * returned by <tt>getOffset</tt>) is the match position if one | |
300 * was found. | |
301 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and | |
302 * the iterator will be adjusted to the index USEARCH_DONE | |
303 * @param status for errors if it occurs | |
304 * @return The character index of the first match, or | |
305 * <tt>USEARCH_DONE</tt> if there are no matches. | |
306 * @see #getOffset | |
307 * @stable ICU 2.0 | |
308 */ | |
309 int32_t first(UErrorCode &status); | |
310 | |
311 /** | |
312 * Returns the first index equal or greater than <tt>position</tt> at which the | |
313 * string text matches the search pattern. The iterator is adjusted so | |
314 * that its current index (as returned by <tt>getOffset</tt>) is the | |
315 * match position if one was found. | |
316 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and the | |
317 * iterator will be adjusted to the index <tt>USEARCH_DONE</tt>. | |
318 * @param position where search is to start from. If position is less | |
319 * than or greater than the text range for searching, | |
320 * an U_INDEX_OUTOFBOUNDS_ERROR will be returned | |
321 * @param status for errors if it occurs | |
322 * @return The character index of the first match following | |
323 * <tt>position</tt>, or <tt>USEARCH_DONE</tt> if there are no | |
324 * matches. | |
325 * @see #getOffset | |
326 * @stable ICU 2.0 | |
327 */ | |
328 int32_t following(int32_t position, UErrorCode &status); | |
329 | |
330 /** | |
331 * Returns the last index in the target text at which it matches the | |
332 * search pattern. The iterator is adjusted so that its current index | |
333 * (as returned by <tt>getOffset</tt>) is the match position if one was | |
334 * found. | |
335 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and | |
336 * the iterator will be adjusted to the index USEARCH_DONE. | |
337 * @param status for errors if it occurs | |
338 * @return The index of the last match, or <tt>USEARCH_DONE</tt> if | |
339 * there are no matches. | |
340 * @see #getOffset | |
341 * @stable ICU 2.0 | |
342 */ | |
343 int32_t last(UErrorCode &status); | |
344 | |
345 /** | |
346 * Returns the first index less than <tt>position</tt> at which the string | |
347 * text matches the search pattern. The iterator is adjusted so that its | |
348 * current index (as returned by <tt>getOffset</tt>) is the match | |
349 * position if one was found. If a match is not found, | |
350 * <tt>USEARCH_DONE</tt> will be returned and the iterator will be | |
351 * adjusted to the index USEARCH_DONE | |
352 * <p> | |
353 * When <tt>USEARCH_OVERLAP</tt> option is off, the last index of the | |
354 * result match is always less than <tt>position</tt>. | |
355 * When <tt>USEARCH_OVERLAP</tt> is on, the result match may span across | |
356 * <tt>position</tt>. | |
357 * | |
358 * @param position where search is to start from. If position is less | |
359 * than or greater than the text range for searching, | |
360 * an U_INDEX_OUTOFBOUNDS_ERROR will be returned | |
361 * @param status for errors if it occurs | |
362 * @return The character index of the first match preceding | |
363 * <tt>position</tt>, or <tt>USEARCH_DONE</tt> if there are | |
364 * no matches. | |
365 * @see #getOffset | |
366 * @stable ICU 2.0 | |
367 */ | |
368 int32_t preceding(int32_t position, UErrorCode &status); | |
369 | |
370 /** | |
371 * Returns the index of the next point at which the text matches the | |
372 * search pattern, starting from the current position. | |
373 * The iterator is adjusted so that its current index (as returned by | |
374 * <tt>getOffset</tt>) is the match position if one was found. | |
375 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and | |
376 * the iterator will be adjusted to a position after the end of the text | |
377 * string. | |
378 * @param status for errors if it occurs | |
379 * @return The index of the next match after the current position, | |
380 * or <tt>USEARCH_DONE</tt> if there are no more matches. | |
381 * @see #getOffset | |
382 * @stable ICU 2.0 | |
383 */ | |
384 int32_t next(UErrorCode &status); | |
385 | |
386 /** | |
387 * Returns the index of the previous point at which the string text | |
388 * matches the search pattern, starting at the current position. | |
389 * The iterator is adjusted so that its current index (as returned by | |
390 * <tt>getOffset</tt>) is the match position if one was found. | |
391 * If a match is not found, <tt>USEARCH_DONE</tt> will be returned and | |
392 * the iterator will be adjusted to the index USEARCH_DONE | |
393 * @param status for errors if it occurs | |
394 * @return The index of the previous match before the current position, | |
395 * or <tt>USEARCH_DONE</tt> if there are no more matches. | |
396 * @see #getOffset | |
397 * @stable ICU 2.0 | |
398 */ | |
399 int32_t previous(UErrorCode &status); | |
400 | |
401 /** | |
402 * Resets the iteration. | |
403 * Search will begin at the start of the text string if a forward | |
404 * iteration is initiated before a backwards iteration. Otherwise if a | |
405 * backwards iteration is initiated before a forwards iteration, the | |
406 * search will begin at the end of the text string. | |
407 * @stable ICU 2.0 | |
408 */ | |
409 virtual void reset(); | |
410 | |
411 protected: | |
412 // protected data members --------------------------------------------- | |
413 | |
414 /** | |
415 * C search data struct | |
416 * @stable ICU 2.0 | |
417 */ | |
418 USearch *m_search_; | |
419 | |
420 /** | |
421 * Break iterator. | |
422 * Currently the C++ breakiterator does not have getRules etc to reproduce | |
423 * another in C. Hence we keep the original around and do the verification | |
424 * at the end of the match. The user is responsible for deleting this | |
425 * break iterator. | |
426 * @stable ICU 2.0 | |
427 */ | |
428 BreakIterator *m_breakiterator_; | |
429 | |
430 /** | |
431 * Unicode string version of the search text | |
432 * @stable ICU 2.0 | |
433 */ | |
434 UnicodeString m_text_; | |
435 | |
436 // protected constructors and destructors ----------------------------- | |
437 | |
438 /** | |
439 * Default constructor. | |
440 * Initializes data to the default values. | |
441 * @stable ICU 2.0 | |
442 */ | |
443 SearchIterator(); | |
444 | |
445 /** | |
446 * Constructor for use by subclasses. | |
447 * @param text The target text to be searched. | |
448 * @param breakiter A {@link BreakIterator} that is used to restrict the | |
449 * points at which matches are detected. If | |
450 * <tt>handleNext</tt> or <tt>handlePrev</tt> finds a | |
451 * match, but the match's start or end index is not a | |
452 * boundary as determined by the <tt>BreakIterator</tt>, | |
453 * the match is rejected and <tt>handleNext</tt> or | |
454 * <tt>handlePrev</tt> is called again. If this parameter | |
455 * is <tt>NULL</tt>, no break detection is attempted. | |
456 * @see #handleNext | |
457 * @see #handlePrev | |
458 * @stable ICU 2.0 | |
459 */ | |
460 SearchIterator(const UnicodeString &text, | |
461 BreakIterator *breakiter = NULL); | |
462 | |
463 /** | |
464 * Constructor for use by subclasses. | |
465 * <p> | |
466 * Note: No parsing of the text within the <tt>CharacterIterator</tt> | |
467 * will be done during searching for this version. The block of text | |
468 * in <tt>CharacterIterator</tt> will be used as it is. | |
469 * @param text The target text to be searched. | |
470 * @param breakiter A {@link BreakIterator} that is used to restrict the | |
471 * points at which matches are detected. If | |
472 * <tt>handleNext</tt> or <tt>handlePrev</tt> finds a | |
473 * match, but the match's start or end index is not a | |
474 * boundary as determined by the <tt>BreakIterator</tt>, | |
475 * the match is rejected and <tt>handleNext</tt> or | |
476 * <tt>handlePrev</tt> is called again. If this parameter | |
477 * is <tt>NULL</tt>, no break detection is attempted. | |
478 * @see #handleNext | |
479 * @see #handlePrev | |
480 * @stable ICU 2.0 | |
481 */ | |
482 SearchIterator(CharacterIterator &text, BreakIterator *breakiter = NULL); | |
483 | |
484 // protected methods -------------------------------------------------- | |
485 | |
486 /** | |
487 * Assignment operator. Sets this iterator to have the same behavior, | |
488 * and iterate over the same text, as the one passed in. | |
489 * @param that instance to be copied. | |
490 * @stable ICU 2.0 | |
491 */ | |
492 SearchIterator & operator=(const SearchIterator &that); | |
493 | |
494 /** | |
495 * Abstract method which subclasses override to provide the mechanism | |
496 * for finding the next match in the target text. This allows different | |
497 * subclasses to provide different search algorithms. | |
498 * <p> | |
499 * If a match is found, the implementation should return the index at | |
500 * which the match starts and should call | |
501 * <tt>setMatchLength</tt> with the number of characters | |
502 * in the target text that make up the match. If no match is found, the | |
503 * method should return USEARCH_DONE. | |
504 * <p> | |
505 * @param position The index in the target text at which the search | |
506 * should start. | |
507 * @param status for error codes if it occurs. | |
508 * @return index at which the match starts, else if match is not found | |
509 * USEARCH_DONE is returned | |
510 * @see #setMatchLength | |
511 * @stable ICU 2.0 | |
512 */ | |
513 virtual int32_t handleNext(int32_t position, UErrorCode &status) | |
514 = 0; | |
515 | |
516 /** | |
517 * Abstract method which subclasses override to provide the mechanism for | |
518 * finding the previous match in the target text. This allows different | |
519 * subclasses to provide different search algorithms. | |
520 * <p> | |
521 * If a match is found, the implementation should return the index at | |
522 * which the match starts and should call | |
523 * <tt>setMatchLength</tt> with the number of characters | |
524 * in the target text that make up the match. If no match is found, the | |
525 * method should return USEARCH_DONE. | |
526 * <p> | |
527 * @param position The index in the target text at which the search | |
528 * should start. | |
529 * @param status for error codes if it occurs. | |
530 * @return index at which the match starts, else if match is not found | |
531 * USEARCH_DONE is returned | |
532 * @see #setMatchLength | |
533 * @stable ICU 2.0 | |
534 */ | |
535 virtual int32_t handlePrev(int32_t position, UErrorCode &status) | |
536 = 0; | |
537 | |
538 /** | |
539 * Sets the length of the currently matched string in the text string to | |
540 * be searched. | |
541 * Subclasses' <tt>handleNext</tt> and <tt>handlePrev</tt> | |
542 * methods should call this when they find a match in the target text. | |
543 * @param length length of the matched text. | |
544 * @see #handleNext | |
545 * @see #handlePrev | |
546 * @stable ICU 2.0 | |
547 */ | |
548 virtual void setMatchLength(int32_t length); | |
549 | |
550 /** | |
551 * Sets the offset of the currently matched string in the text string to | |
552 * be searched. | |
553 * Subclasses' <tt>handleNext</tt> and <tt>handlePrev</tt> | |
554 * methods should call this when they find a match in the target text. | |
555 * @param position start offset of the matched text. | |
556 * @see #handleNext | |
557 * @see #handlePrev | |
558 * @stable ICU 2.0 | |
559 */ | |
560 virtual void setMatchStart(int32_t position); | |
561 | |
562 /** | |
563 * sets match not found | |
564 * @stable ICU 2.0 | |
565 */ | |
566 void setMatchNotFound(); | |
567 }; | |
568 | |
569 inline UBool SearchIterator::operator!=(const SearchIterator &that) const | |
570 { | |
571 return !operator==(that); | |
572 } | |
573 U_NAMESPACE_END | |
574 | |
575 #endif /* #if !UCONFIG_NO_COLLATION && !UCONFIG_NO_BREAK_ITERATION */ | |
576 | |
577 #endif /* U_SHOW_CPLUSPLUS_API */ | |
578 | |
579 #endif | |
580 |