Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/include/unicode/rbbi.h @ 69:33d812a61356
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 17:55:14 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 69:33d812a61356 |
---|---|
1 // © 2016 and later: Unicode, Inc. and others. | |
2 // License & terms of use: http://www.unicode.org/copyright.html | |
3 /* | |
4 *************************************************************************** | |
5 * Copyright (C) 1999-2016 International Business Machines Corporation * | |
6 * and others. All rights reserved. * | |
7 *************************************************************************** | |
8 | |
9 ********************************************************************** | |
10 * Date Name Description | |
11 * 10/22/99 alan Creation. | |
12 * 11/11/99 rgillam Complete port from Java. | |
13 ********************************************************************** | |
14 */ | |
15 | |
16 #ifndef RBBI_H | |
17 #define RBBI_H | |
18 | |
19 #include "unicode/utypes.h" | |
20 | |
21 #if U_SHOW_CPLUSPLUS_API | |
22 | |
23 /** | |
24 * \file | |
25 * \brief C++ API: Rule Based Break Iterator | |
26 */ | |
27 | |
28 #if !UCONFIG_NO_BREAK_ITERATION | |
29 | |
30 #include "unicode/brkiter.h" | |
31 #include "unicode/udata.h" | |
32 #include "unicode/parseerr.h" | |
33 #include "unicode/schriter.h" | |
34 | |
35 U_NAMESPACE_BEGIN | |
36 | |
37 /** @internal */ | |
38 class LanguageBreakEngine; | |
39 struct RBBIDataHeader; | |
40 class RBBIDataWrapper; | |
41 class UnhandledEngine; | |
42 class UStack; | |
43 | |
44 /** | |
45 * | |
46 * A subclass of BreakIterator whose behavior is specified using a list of rules. | |
47 * <p>Instances of this class are most commonly created by the factory methods of | |
48 * BreakIterator::createWordInstance(), BreakIterator::createLineInstance(), etc., | |
49 * and then used via the abstract API in class BreakIterator</p> | |
50 * | |
51 * <p>See the ICU User Guide for information on Break Iterator Rules.</p> | |
52 * | |
53 * <p>This class is not intended to be subclassed.</p> | |
54 */ | |
55 class U_COMMON_API RuleBasedBreakIterator /*U_FINAL*/ : public BreakIterator { | |
56 | |
57 private: | |
58 /** | |
59 * The UText through which this BreakIterator accesses the text | |
60 * @internal (private) | |
61 */ | |
62 UText fText; | |
63 | |
64 #ifndef U_HIDE_INTERNAL_API | |
65 public: | |
66 #endif /* U_HIDE_INTERNAL_API */ | |
67 /** | |
68 * The rule data for this BreakIterator instance. | |
69 * Not for general use; Public only for testing purposes. | |
70 * @internal | |
71 */ | |
72 RBBIDataWrapper *fData; | |
73 private: | |
74 | |
75 /** | |
76 * The current position of the iterator. Pinned, 0 <= fPosition <= text.length. | |
77 * Never has the value UBRK_DONE (-1). | |
78 */ | |
79 int32_t fPosition; | |
80 | |
81 /** | |
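82 * Info from the state table indicating which rules caused the most recent boundary; an output of handleNext(). | |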
83 */ | |
84 int32_t fRuleStatusIndex; | |
85 | |
86 /** | |
87 * Cache of previously determined boundary positions. | |
88 */ | |
89 class BreakCache; | |
90 BreakCache *fBreakCache; | |
91 | |
92 /** | |
93 * Cache of boundary positions within a region of text that has been | |
94 * sub-divided by dictionary based breaking. | |
95 */ | |
96 class DictionaryCache; | |
97 DictionaryCache *fDictionaryCache; | |
98 | |
99 /** | |
100 * | |
101 * If present, UStack of LanguageBreakEngine objects that might handle | |
102 * dictionary characters. Searched from top to bottom to find an object to | |
103 * handle a given character. | |
104 * @internal (private) | |
105 */ | |
106 UStack *fLanguageBreakEngines; | |
107 | |
108 /** | |
109 * | |
110 * If present, the special LanguageBreakEngine used for handling | |
111 * characters that are in the dictionary set, but not handled by any | |
112 * LanguageBreakEngine. | |
113 * @internal (private) | |
114 */ | |
115 UnhandledEngine *fUnhandledBreakEngine; | |
116 | |
117 /** | |
118 * Counter for the number of characters encountered with the "dictionary" | |
119 * flag set. | |
120 * @internal (private) | |
121 */ | |
122 uint32_t fDictionaryCharCount; | |
123 | |
124 /** | |
125 * A character iterator that refers to the same text as the UText, above. | |
126 * Only included for compatibility with old API, which was based on CharacterIterators. | |
127 * Value may be adopted from outside, or one of fSCharIter or fDCharIter, below. | |
128 */ | |
129 CharacterIterator *fCharIter; | |
130 | |
131 /** | |
132 * When the input text is provided by a UnicodeString, this will point to | |
133 * a characterIterator that wraps that data. Needed only for the | |
134 * implementation of getText(), a backwards compatibility issue. | |
135 */ | |
136 StringCharacterIterator fSCharIter; | |
137 | |
138 /** | |
139 * True when iteration has run off the end, and iterator functions should return UBRK_DONE. | |
140 */ | |
141 UBool fDone; | |
142 | |
143 //======================================================================= | |
144 // constructors | |
145 //======================================================================= | |
146 | |
147 /** | |
148 * Constructor from a flattened set of RBBI data in malloced memory. | |
149 * RuleBasedBreakIterators built from a custom set of rules | |
150 * are created via this constructor; the rules are compiled | |
151 * into memory, then the break iterator is constructed here. | |
152 * | |
153 * The break iterator adopts the memory, and will | |
154 * free it when done. | |
155 * @internal (private) | |
156 */ | |
157 RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status); | |
158 | |
159 /** @internal */ | |
160 friend class RBBIRuleBuilder; | |
161 /** @internal */ | |
162 friend class BreakIterator; | |
163 | |
164 public: | |
165 | |
166 /** Default constructor. Creates an empty shell of an iterator, with no | |
167 * rules or text to iterate over. Object can subsequently be assigned to. | |
168 * @stable ICU 2.2 | |
169 */ | |
170 RuleBasedBreakIterator(); | |
171 | |
172 /** | |
173 * Copy constructor. Will produce a break iterator with the same behavior, | |
174 * and which iterates over the same text, as the one passed in. | |
175 * @param that The RuleBasedBreakIterator passed to be copied | |
176 * @stable ICU 2.0 | |
177 */ | |
178 RuleBasedBreakIterator(const RuleBasedBreakIterator& that); | |
179 | |
180 /** | |
181 * Construct a RuleBasedBreakIterator from a set of rules supplied as a string. | |
182 * @param rules The break rules to be used. | |
183 * @param parseError In the event of a syntax error in the rules, provides the location | |
184 * within the rules of the problem. | |
185 * @param status Information on any errors encountered. | |
186 * @stable ICU 2.2 | |
187 */ | |
188 RuleBasedBreakIterator( const UnicodeString &rules, | |
189 UParseError &parseError, | |
190 UErrorCode &status); | |
191 | |
192 /** | |
193 * Construct a RuleBasedBreakIterator from a set of precompiled binary rules. | |
194 * Binary rules are obtained from RuleBasedBreakIterator::getBinaryRules(). | |
195 * Construction of a break iterator in this way is substantially faster than | |
196 * construction from source rules. | |
197 * | |
198 * Ownership of the storage containing the compiled rules remains with the | |
199 * caller of this function. The compiled rules must not be modified or | |
200 * deleted during the life of the break iterator. | |
201 * | |
202 * The compiled rules are not compatible across different major versions of ICU. | |
203 * The compiled rules are compatible only between machines with the same | |
204 * byte ordering (little or big endian) and the same base character set family | |
205 * (ASCII or EBCDIC). | |
206 * | |
207 * @see #getBinaryRules | |
208 * @param compiledRules A pointer to the compiled break rules to be used. | |
209 * @param ruleLength The length of the compiled break rules, in bytes. This | |
210 * corresponds to the length value produced by getBinaryRules(). | |
211 * @param status Information on any errors encountered, including invalid | |
212 * binary rules. | |
213 * @stable ICU 4.8 | |
214 */ | |
215 RuleBasedBreakIterator(const uint8_t *compiledRules, | |
216 uint32_t ruleLength, | |
217 UErrorCode &status); | |
218 | |
219 /** | |
220 * This constructor uses the udata interface to create a BreakIterator | |
221 * whose internal tables live in a memory-mapped file. "image" is an | |
222 * ICU UDataMemory handle for the pre-compiled break iterator tables. | |
223 * @param image handle to the memory image for the break iterator data. | |
224 * Ownership of the UDataMemory handle passes to the Break Iterator, | |
225 * which will be responsible for closing it when it is no longer needed. | |
226 * @param status Information on any errors encountered. | |
227 * @see udata_open | |
228 * @see #getBinaryRules | |
229 * @stable ICU 2.8 | |
230 */ | |
231 RuleBasedBreakIterator(UDataMemory* image, UErrorCode &status); | |
232 | |
233 /** | |
234 * Destructor | |
235 * @stable ICU 2.0 | |
236 */ | |
237 virtual ~RuleBasedBreakIterator(); | |
238 | |
239 /** | |
240 * Assignment operator. Sets this iterator to have the same behavior, | |
241 * and iterate over the same text, as the one passed in. | |
242 * @param that The RuleBasedBreakIterator passed in | |
243 * @return a reference to the RuleBasedBreakIterator that was assigned to | |
244 * @stable ICU 2.0 | |
245 */ | |
246 RuleBasedBreakIterator& operator=(const RuleBasedBreakIterator& that); | |
247 | |
248 /** | |
249 * Equality operator. Returns TRUE if both BreakIterators are of the | |
250 * same class, have the same behavior, and iterate over the same text. | |
251 * @param that The BreakIterator to be compared for equality | |
252 * @return TRUE if both BreakIterators are of the | |
253 * same class, have the same behavior, and iterate over the same text. | |
254 * @stable ICU 2.0 | |
255 */ | |
256 virtual UBool operator==(const BreakIterator& that) const; | |
257 | |
258 /** | |
259 * Not-equal operator. If operator== returns TRUE, this returns FALSE, | |
260 * and vice versa. | |
261 * @param that The BreakIterator to be compared for inequality | |
262 * @return TRUE if the two BreakIterators are not the same. | |
263 * @stable ICU 2.0 | |
264 */ | |
265 inline UBool operator!=(const BreakIterator& that) const; | |
266 | |
267 /** | |
268 * Returns a newly-constructed RuleBasedBreakIterator with the same | |
269 * behavior, and iterating over the same text, as this one. | |
270 * Differs from the copy constructor in that it is polymorphic, and | |
271 * will correctly clone (copy) a derived class. | |
272 * clone() is thread safe. Multiple threads may simultaneously | |
273 * clone the same source break iterator. | |
274 * @return a newly-constructed RuleBasedBreakIterator | |
275 * @stable ICU 2.0 | |
276 */ | |
277 virtual RuleBasedBreakIterator* clone() const; | |
278 | |
279 /** | |
280 * Compute a hash code for this BreakIterator | |
281 * @return A hash code | |
282 * @stable ICU 2.0 | |
283 */ | |
284 virtual int32_t hashCode(void) const; | |
285 | |
286 /** | |
287 * Returns the description used to create this iterator | |
288 * @return the description used to create this iterator | |
289 * @stable ICU 2.0 | |
290 */ | |
291 virtual const UnicodeString& getRules(void) const; | |
292 | |
293 //======================================================================= | |
294 // BreakIterator overrides | |
295 //======================================================================= | |
296 | |
297 /** | |
298 * <p> | |
299 * Return a CharacterIterator over the text being analyzed. | |
300 * The returned character iterator is owned by the break iterator, and must | |
301 * not be deleted by the caller. Repeated calls to this function may | |
302 * return the same CharacterIterator. | |
303 * </p> | |
304 * <p> | |
305 * The returned character iterator must not be used concurrently with | |
306 * the break iterator. If concurrent operation is needed, clone the | |
307 * returned character iterator first and operate on the clone. | |
308 * </p> | |
309 * <p> | |
310 * When the break iterator is operating on text supplied via a UText, | |
311 * this function will fail. Lacking any way to signal failures, it | |
312 * returns a CharacterIterator containing no text. | |
313 * The function getUText() provides similar functionality, | |
314 * is reliable, and is more efficient. | |
315 * </p> | |
316 * | |
317 * TODO: deprecate this function? | |
318 * | |
319 * @return An iterator over the text being analyzed. | |
320 * @stable ICU 2.0 | |
321 */ | |
322 virtual CharacterIterator& getText(void) const; | |
323 | |
324 | |
325 /** | |
326 * Get a UText for the text being analyzed. | |
327 * The returned UText is a shallow clone of the UText used internally | |
328 * by the break iterator implementation. It can safely be used to | |
329 * access the text without impacting any break iterator operations, | |
330 * but the underlying text itself must not be altered. | |
331 * | |
332 * @param fillIn A UText to be filled in. If NULL, a new UText will be | |
333 * allocated to hold the result. | |
334 * @param status receives any error codes. | |
335 * @return The current UText for this break iterator. If an input | |
336 * UText was provided, it will always be returned. | |
337 * @stable ICU 3.4 | |
338 */ | |
339 virtual UText *getUText(UText *fillIn, UErrorCode &status) const; | |
340 | |
341 /** | |
342 * Set the iterator to analyze a new piece of text. This function resets | |
343 * the current iteration position to the beginning of the text. | |
344 * @param newText An iterator over the text to analyze. The BreakIterator | |
345 * takes ownership of the character iterator. The caller MUST NOT delete it! | |
346 * @stable ICU 2.0 | |
347 */ | |
348 virtual void adoptText(CharacterIterator* newText); | |
349 | |
350 /** | |
351 * Set the iterator to analyze a new piece of text. This function resets | |
352 * the current iteration position to the beginning of the text. | |
353 * | |
354 * The BreakIterator will retain a reference to the supplied string. | |
355 * The caller must not modify or delete the text while the BreakIterator | |
356 * retains the reference. | |
357 * | |
358 * @param newText The text to analyze. | |
359 * @stable ICU 2.0 | |
360 */ | |
361 virtual void setText(const UnicodeString& newText); | |
362 | |
363 /** | |
364 * Reset the break iterator to operate over the text represented by | |
365 * the UText. The iterator position is reset to the start. | |
366 * | |
367 * This function makes a shallow clone of the supplied UText. This means | |
368 * that the caller is free to immediately close or otherwise reuse the | |
369 * UText that was passed as a parameter, but that the underlying text itself | |
370 * must not be altered while being referenced by the break iterator. | |
371 * | |
372 * @param text The UText used to change the text. | |
373 * @param status Receives any error codes. | |
374 * @stable ICU 3.4 | |
375 */ | |
376 virtual void setText(UText *text, UErrorCode &status); | |
377 | |
378 /** | |
379 * Sets the current iteration position to the beginning of the text, position zero. | |
380 * @return The offset of the beginning of the text, zero. | |
381 * @stable ICU 2.0 | |
382 */ | |
383 virtual int32_t first(void); | |
384 | |
385 /** | |
386 * Sets the current iteration position to the end of the text. | |
387 * @return The text's past-the-end offset. | |
388 * @stable ICU 2.0 | |
389 */ | |
390 virtual int32_t last(void); | |
391 | |
392 /** | |
393 * Advances the iterator either forward or backward the specified number of steps. | |
394 * Negative values move backward, and positive values move forward. This is | |
395 * equivalent to repeatedly calling next() or previous(). | |
396 * @param n The number of steps to move. The sign indicates the direction | |
397 * (negative is backwards, and positive is forwards). | |
398 * @return The character offset of the boundary position n boundaries away from | |
399 * the current one. | |
400 * @stable ICU 2.0 | |
401 */ | |
402 virtual int32_t next(int32_t n); | |
403 | |
404 /** | |
405 * Advances the iterator to the next boundary position. | |
406 * @return The position of the first boundary after this one. | |
407 * @stable ICU 2.0 | |
408 */ | |
409 virtual int32_t next(void); | |
410 | |
411 /** | |
412 * Moves the iterator backwards, to the last boundary preceding this one. | |
413 * @return The position of the last boundary position preceding this one. | |
414 * @stable ICU 2.0 | |
415 */ | |
416 virtual int32_t previous(void); | |
417 | |
418 /** | |
419 * Sets the iterator to refer to the first boundary position following | |
420 * the specified position. | |
421 * @param offset The position from which to begin searching for a break position. | |
422 * @return The position of the first break after the specified offset. | |
423 * @stable ICU 2.0 | |
424 */ | |
425 virtual int32_t following(int32_t offset); | |
426 | |
427 /** | |
428 * Sets the iterator to refer to the last boundary position before the | |
429 * specified position. | |
430 * @param offset The position to begin searching for a break from. | |
431 * @return The position of the last boundary before the starting position. | |
432 * @stable ICU 2.0 | |
433 */ | |
434 virtual int32_t preceding(int32_t offset); | |
435 | |
436 /** | |
437 * Returns true if the specified position is a boundary position. As a side | |
438 * effect, leaves the iterator pointing to the first boundary position at | |
439 * or after "offset". | |
440 * @param offset the offset to check. | |
441 * @return True if "offset" is a boundary position. | |
442 * @stable ICU 2.0 | |
443 */ | |
444 virtual UBool isBoundary(int32_t offset); | |
445 | |
446 /** | |
447 * Returns the current iteration position. Note that UBRK_DONE is never | |
448 * returned from this function; if iteration has run to the end of a | |
449 * string, current() will return the length of the string while | |
450 * next() will return UBRK_DONE. | |
451 * @return The current iteration position. | |
452 * @stable ICU 2.0 | |
453 */ | |
454 virtual int32_t current(void) const; | |
455 | |
456 | |
457 /** | |
458 * Return the status tag from the break rule that determined the boundary at | |
459 * the current iteration position. For break rules that do not specify a | |
460 * status, a default value of 0 is returned. If more than one break rule | |
461 * would cause a boundary to be located at some position in the text, | |
462 * the numerically largest of the applicable status values is returned. | |
463 * <p> | |
464 * Of the standard types of ICU break iterators, only word break and | |
465 * line break provide status values. The values are defined in | |
466 * the header file ubrk.h. For Word breaks, the status allows distinguishing between words | |
467 * that contain alphabetic letters, "words" that appear to be numbers, | |
468 * punctuation and spaces, words containing ideographic characters, and | |
469 * more. For Line Break, the status distinguishes between hard (mandatory) breaks | |
470 * and soft (potential) break positions. | |
471 * <p> | |
472 * <code>getRuleStatus()</code> can be called after obtaining a boundary | |
473 * position from <code>next()</code>, <code>previous()</code>, or | |
474 * any other break iterator function that returns a boundary position. | |
475 * <p> | |
476 * Note that <code>getRuleStatus()</code> returns the value corresponding to | |
477 * <code>current()</code> index even after <code>next()</code> has returned DONE. | |
478 * <p> | |
479 * When creating custom break rules, one is free to define whatever | |
480 * status values may be convenient for the application. | |
481 * <p> | |
482 * @return the status from the break rule that determined the boundary | |
483 * at the current iteration position. | |
484 * | |
485 * @see UWordBreak | |
486 * @stable ICU 2.2 | |
487 */ | |
488 virtual int32_t getRuleStatus() const; | |
489 | |
490 /** | |
491 * Get the status (tag) values from the break rule(s) that determined the boundary | |
492 * at the current iteration position. | |
493 * <p> | |
494 * The returned status value(s) are stored into an array provided by the caller. | |
495 * The values are stored in sorted (ascending) order. | |
496 * If the capacity of the output array is insufficient to hold the data, | |
497 * the output will be truncated to the available length, and a | |
498 * U_BUFFER_OVERFLOW_ERROR will be signaled. | |
499 * | |
500 * @param fillInVec an array to be filled in with the status values. | |
501 * @param capacity the length of the supplied vector. A length of zero causes | |
502 * the function to return the number of status values, in the | |
503 * normal way, without attempting to store any values. | |
504 * @param status receives error codes. | |
505 * @return The number of rule status values from the rules that determined | |
506 * the boundary at the current iteration position. | |
507 * In the event of a U_BUFFER_OVERFLOW_ERROR, the return value | |
508 * is the total number of status values that were available, | |
509 * not the reduced number that were actually returned. | |
510 * @see getRuleStatus | |
511 * @stable ICU 3.0 | |
512 */ | |
513 virtual int32_t getRuleStatusVec(int32_t *fillInVec, int32_t capacity, UErrorCode &status); | |
514 | |
515 /** | |
516 * Returns a unique class ID POLYMORPHICALLY. Pure virtual override. | |
517 * This method is to implement a simple version of RTTI, since not all | |
518 * C++ compilers support genuine RTTI. Polymorphic operator==() and | |
519 * clone() methods call this method. | |
520 * | |
521 * @return The class ID for this object. All objects of a | |
522 * given class have the same class ID. Objects of | |
523 * other classes have different class IDs. | |
524 * @stable ICU 2.0 | |
525 */ | |
526 virtual UClassID getDynamicClassID(void) const; | |
527 | |
528 /** | |
529 * Returns the class ID for this class. This is useful only for | |
530 * comparing to a return value from getDynamicClassID(). For example: | |
531 * | |
532 * Base* polymorphic_pointer = createPolymorphicObject(); | |
533 * if (polymorphic_pointer->getDynamicClassID() == | |
534 * Derived::getStaticClassID()) ... | |
535 * | |
536 * @return The class ID for all objects of this class. | |
537 * @stable ICU 2.0 | |
538 */ | |
539 static UClassID U_EXPORT2 getStaticClassID(void); | |
540 | |
541 #ifndef U_FORCE_HIDE_DEPRECATED_API | |
542 /** | |
543 * Deprecated functionality. Use clone() instead. | |
544 * | |
545 * Create a clone (copy) of this break iterator in memory provided | |
546 * by the caller. The idea is to increase performance by avoiding | |
547 * a storage allocation. Use of this function is NOT RECOMMENDED. | |
548 * Performance gains are minimal, and correct buffer management is | |
549 * tricky. Use clone() instead. | |
550 * | |
551 * @param stackBuffer The pointer to the memory into which the cloned object | |
552 * should be placed. If NULL, allocate heap memory | |
553 * for the cloned object. | |
554 * @param BufferSize The size of the buffer. If zero, return the required | |
555 * buffer size, but do not clone the object. If the | |
556 * size was too small (but not zero), allocate heap | |
557 * storage for the cloned object. | |
558 * | |
559 * @param status Error status. U_SAFECLONE_ALLOCATED_WARNING will be | |
560 * returned if the provided buffer was too small, and | |
561 * the clone was therefore put on the heap. | |
562 * | |
563 * @return Pointer to the clone object. This may differ from the stackBuffer | |
564 * address if the byte alignment of the stack buffer was not suitable | |
565 * or if the stackBuffer was too small to hold the clone. | |
566 * @deprecated ICU 52. Use clone() instead. | |
567 */ | |
568 virtual RuleBasedBreakIterator *createBufferClone(void *stackBuffer, | |
569 int32_t &BufferSize, | |
570 UErrorCode &status); | |
571 #endif // U_FORCE_HIDE_DEPRECATED_API | |
572 | |
573 /** | |
574 * Return the binary form of compiled break rules, | |
575 * which can then be used to create a new break iterator at some | |
576 * time in the future. Creating a break iterator from pre-compiled rules | |
577 * is much faster than building one from the source form of the | |
578 * break rules. | |
579 * | |
580 * The binary data can only be used with the same version of ICU | |
581 * and on the same platform type (processor endian-ness) | |
582 * | |
583 * @param length Returns the length of the binary data. (Out parameter.) | |
584 * | |
585 * @return A pointer to the binary (compiled) rule data. The storage | |
586 * belongs to the RuleBasedBreakIterator object, not the | |
587 * caller, and must not be modified or deleted. | |
588 * @stable ICU 4.8 | |
589 */ | |
590 virtual const uint8_t *getBinaryRules(uint32_t &length); | |
591 | |
592 /** | |
593 * Set the subject text string upon which the break iterator is operating | |
594 * without changing any other aspect of the matching state. | |
595 * The new and previous text strings must have the same content. | |
596 * | |
597 * This function is intended for use in environments where ICU is operating on | |
598 * strings that may move around in memory. It provides a mechanism for notifying | |
599 * ICU that the string has been relocated, and providing a new UText to access the | |
600 * string in its new position. | |
601 * | |
602 * Note that the break iterator implementation never copies the underlying text | |
603 * of a string being processed, but always operates directly on the original text | |
604 * provided by the user. Refreshing simply drops the references to the old text | |
605 * and replaces them with references to the new. | |
606 * | |
607 * Caution: this function is normally used only by very specialized, | |
608 * system-level code. One example use case is with garbage collection that moves | |
609 * the text in memory. | |
610 * | |
611 * @param input The new (moved) text string. | |
612 * @param status Receives errors detected by this function. | |
613 * @return *this | |
614 * | |
615 * @stable ICU 49 | |
616 */ | |
617 virtual RuleBasedBreakIterator &refreshInputText(UText *input, UErrorCode &status); | |
618 | |
619 | |
620 private: | |
621 //======================================================================= | |
622 // implementation | |
623 //======================================================================= | |
624 /** | |
625 * Dumps caches and performs other actions associated with a complete change | |
626 * in text or iteration position. | |
627 * @internal (private) | |
628 */ | |
629 void reset(void); | |
630 | |
631 /** | |
632 * Common initialization function, used by constructors and bufferClone. | |
633 * @internal (private) | |
634 */ | |
635 void init(UErrorCode &status); | |
636 | |
637 /** | |
638 * Iterate backwards from an arbitrary position in the input text using the | |
639 * synthesized Safe Reverse rules. | |
640 * This locates a "Safe Position" from which the forward break rules | |
641 * will operate correctly. A Safe Position is not necessarily a boundary itself. | |
642 * | |
643 * @param fromPosition the position in the input text to begin the iteration. | |
644 * @internal (private) | |
645 */ | |
646 int32_t handleSafePrevious(int32_t fromPosition); | |
647 | |
648 /** | |
649 * Find a rule-based boundary by running the state machine. | |
650 * Input | |
651 * fPosition, the position in the text to begin from. | |
652 * Output | |
653 * fPosition: the boundary following the starting position. | |
654 * fDictionaryCharCount the number of dictionary characters encountered. | |
655 * If > 0, the segment will be further subdivided | |
656 * fRuleStatusIndex Info from the state table indicating which rules caused the boundary. | |
657 * | |
658 * @internal (private) | |
659 */ | |
660 int32_t handleNext(); | |
661 | |
662 | |
663 /** | |
664 * This function returns the appropriate LanguageBreakEngine for a | |
665 * given character c. | |
666 * @param c A character in the dictionary set | |
667 * @internal (private) | |
668 */ | |
669 const LanguageBreakEngine *getLanguageBreakEngine(UChar32 c); | |
670 | |
671 public: | |
672 #ifndef U_HIDE_INTERNAL_API | |
673 /** | |
674 * Debugging function only. | |
675 * @internal | |
676 */ | |
677 void dumpCache(); | |
678 | |
679 /** | |
680 * Debugging function only. | |
681 * @internal | |
682 */ | |
683 void dumpTables(); | |
684 | |
685 #endif /* U_HIDE_INTERNAL_API */ | |
686 }; | |
687 | |
688 //------------------------------------------------------------------------------ | |
689 // | |
690 // Inline Function Definitions | |
691 // operator!= is defined as the logical negation of operator==(that). | |
692 //------------------------------------------------------------------------------ | |
693 | |
694 inline UBool RuleBasedBreakIterator::operator!=(const BreakIterator& that) const { | |
695 return !operator==(that); | |
696 } | |
697 | |
698 U_NAMESPACE_END | |
699 | |
700 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ | |
701 | |
702 #endif /* U_SHOW_CPLUSPLUS_API */ | |
703 | |
704 #endif |