annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/include/unicode/rbbi.h @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
rev   line source
jpayne@69 1 // © 2016 and later: Unicode, Inc. and others.
jpayne@69 2 // License & terms of use: http://www.unicode.org/copyright.html
jpayne@69 3 /*
jpayne@69 4 ***************************************************************************
jpayne@69 5 * Copyright (C) 1999-2016 International Business Machines Corporation *
jpayne@69 6 * and others. All rights reserved. *
jpayne@69 7 ***************************************************************************
jpayne@69 8
jpayne@69 9 **********************************************************************
jpayne@69 10 * Date Name Description
jpayne@69 11 * 10/22/99 alan Creation.
jpayne@69 12 * 11/11/99 rgillam Complete port from Java.
jpayne@69 13 **********************************************************************
jpayne@69 14 */
jpayne@69 15
jpayne@69 16 #ifndef RBBI_H
jpayne@69 17 #define RBBI_H
jpayne@69 18
jpayne@69 19 #include "unicode/utypes.h"
jpayne@69 20
jpayne@69 21 #if U_SHOW_CPLUSPLUS_API
jpayne@69 22
jpayne@69 23 /**
jpayne@69 24 * \file
jpayne@69 25 * \brief C++ API: Rule Based Break Iterator
jpayne@69 26 */
jpayne@69 27
jpayne@69 28 #if !UCONFIG_NO_BREAK_ITERATION
jpayne@69 29
jpayne@69 30 #include "unicode/brkiter.h"
jpayne@69 31 #include "unicode/udata.h"
jpayne@69 32 #include "unicode/parseerr.h"
jpayne@69 33 #include "unicode/schriter.h"
jpayne@69 34
jpayne@69 35 U_NAMESPACE_BEGIN
jpayne@69 36
jpayne@69 37 /** @internal */
jpayne@69 38 class LanguageBreakEngine;
jpayne@69 39 struct RBBIDataHeader;
jpayne@69 40 class RBBIDataWrapper;
jpayne@69 41 class UnhandledEngine;
jpayne@69 42 class UStack;
jpayne@69 43
jpayne@69 44 /**
jpayne@69 45 *
jpayne@69 46 * A subclass of BreakIterator whose behavior is specified using a list of rules.
jpayne@69 47 * <p>Instances of this class are most commonly created by the factory methods of
jpayne@69 48 * BreakIterator::createWordInstance(), BreakIterator::createLineInstance(), etc.,
jpayne@69 49 * and then used via the abstract API in class BreakIterator</p>
jpayne@69 50 *
jpayne@69 51 * <p>See the ICU User Guide for information on Break Iterator Rules.</p>
jpayne@69 52 *
jpayne@69 53 * <p>This class is not intended to be subclassed.</p>
jpayne@69 54 */
jpayne@69 55 class U_COMMON_API RuleBasedBreakIterator /*U_FINAL*/ : public BreakIterator {
jpayne@69 56
jpayne@69 57 private:
jpayne@69 58 /**
jpayne@69 59 * The UText through which this BreakIterator accesses the text
jpayne@69 60 * @internal (private)
jpayne@69 61 */
jpayne@69 62 UText fText;
jpayne@69 63
jpayne@69 64 #ifndef U_HIDE_INTERNAL_API
jpayne@69 65 public:
jpayne@69 66 #endif /* U_HIDE_INTERNAL_API */
jpayne@69 67 /**
jpayne@69 68 * The rule data for this BreakIterator instance.
jpayne@69 69 * Not for general use; Public only for testing purposes.
jpayne@69 70 * @internal
jpayne@69 71 */
jpayne@69 72 RBBIDataWrapper *fData;
jpayne@69 73 private:
jpayne@69 74
jpayne@69 75 /**
jpayne@69 76 * The current position of the iterator. Pinned, 0 <= fPosition <= text.length.
jpayne@69 77 * Never has the value UBRK_DONE (-1).
jpayne@69 78 */
jpayne@69 79 int32_t fPosition;
jpayne@69 80
jpayne@69 81 /**
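jpayne@69 82 * Info from the state table indicating which rules caused the boundary (an output of handleNext()). TODO: document in full.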
jpayne@69 83 */
jpayne@69 84 int32_t fRuleStatusIndex;
jpayne@69 85
jpayne@69 86 /**
jpayne@69 87 * Cache of previously determined boundary positions.
jpayne@69 88 */
jpayne@69 89 class BreakCache;
jpayne@69 90 BreakCache *fBreakCache;
jpayne@69 91
jpayne@69 92 /**
jpayne@69 93 * Cache of boundary positions within a region of text that has been
jpayne@69 94 * sub-divided by dictionary based breaking.
jpayne@69 95 */
jpayne@69 96 class DictionaryCache;
jpayne@69 97 DictionaryCache *fDictionaryCache;
jpayne@69 98
jpayne@69 99 /**
jpayne@69 100 *
jpayne@69 101 * If present, UStack of LanguageBreakEngine objects that might handle
jpayne@69 102 * dictionary characters. Searched from top to bottom to find an object to
jpayne@69 103 * handle a given character.
jpayne@69 104 * @internal (private)
jpayne@69 105 */
jpayne@69 106 UStack *fLanguageBreakEngines;
jpayne@69 107
jpayne@69 108 /**
jpayne@69 109 *
jpayne@69 110 * If present, the special LanguageBreakEngine used for handling
jpayne@69 111 * characters that are in the dictionary set, but not handled by any
jpayne@69 112 * LanguageBreakEngine.
jpayne@69 113 * @internal (private)
jpayne@69 114 */
jpayne@69 115 UnhandledEngine *fUnhandledBreakEngine;
jpayne@69 116
jpayne@69 117 /**
jpayne@69 118 * Counter for the number of characters encountered with the "dictionary"
jpayne@69 119 * flag set.
jpayne@69 120 * @internal (private)
jpayne@69 121 */
jpayne@69 122 uint32_t fDictionaryCharCount;
jpayne@69 123
jpayne@69 124 /**
jpayne@69 125 * A character iterator that refers to the same text as the UText, above.
jpayne@69 126 * Only included for compatibility with old API, which was based on CharacterIterators.
jpayne@69 127 * Value may be adopted from outside, or may point to fSCharIter, below.
jpayne@69 128 */
jpayne@69 129 CharacterIterator *fCharIter;
jpayne@69 130
jpayne@69 131 /**
jpayne@69 132 * When the input text is provided by a UnicodeString, this will point to
jpayne@69 133 * a CharacterIterator that wraps that data. Needed only for the
jpayne@69 134 * implementation of getText(), a backwards compatibility issue.
jpayne@69 135 */
jpayne@69 136 StringCharacterIterator fSCharIter;
jpayne@69 137
jpayne@69 138 /**
jpayne@69 139 * True when iteration has run off the end, and iterator functions should return UBRK_DONE.
jpayne@69 140 */
jpayne@69 141 UBool fDone;
jpayne@69 142
jpayne@69 143 //=======================================================================
jpayne@69 144 // constructors
jpayne@69 145 //=======================================================================
jpayne@69 146
jpayne@69 147 /**
jpayne@69 148 * Constructor from a flattened set of RBBI data in malloced memory.
jpayne@69 149 * RuleBasedBreakIterators built from a custom set of rules
jpayne@69 150 * are created via this constructor; the rules are compiled
jpayne@69 151 * into memory, then the break iterator is constructed here.
jpayne@69 152 *
jpayne@69 153 * The break iterator adopts the memory, and will
jpayne@69 154 * free it when done.
jpayne@69 155 * @internal (private)
jpayne@69 156 */
jpayne@69 157 RuleBasedBreakIterator(RBBIDataHeader* data, UErrorCode &status);
jpayne@69 158
jpayne@69 159 /** @internal */
jpayne@69 160 friend class RBBIRuleBuilder;
jpayne@69 161 /** @internal */
jpayne@69 162 friend class BreakIterator;
jpayne@69 163
jpayne@69 164 public:
jpayne@69 165
jpayne@69 166 /** Default constructor. Creates an empty shell of an iterator, with no
jpayne@69 167 * rules or text to iterate over. Object can subsequently be assigned to.
jpayne@69 168 * @stable ICU 2.2
jpayne@69 169 */
jpayne@69 170 RuleBasedBreakIterator();
jpayne@69 171
jpayne@69 172 /**
jpayne@69 173 * Copy constructor. Will produce a break iterator with the same behavior,
jpayne@69 174 * and which iterates over the same text, as the one passed in.
jpayne@69 175 * @param that The RuleBasedBreakIterator passed to be copied
jpayne@69 176 * @stable ICU 2.0
jpayne@69 177 */
jpayne@69 178 RuleBasedBreakIterator(const RuleBasedBreakIterator& that);
jpayne@69 179
jpayne@69 180 /**
jpayne@69 181 * Construct a RuleBasedBreakIterator from a set of rules supplied as a string.
jpayne@69 182 * @param rules The break rules to be used.
jpayne@69 183 * @param parseError In the event of a syntax error in the rules, provides the location
jpayne@69 184 * within the rules of the problem.
jpayne@69 185 * @param status Information on any errors encountered.
jpayne@69 186 * @stable ICU 2.2
jpayne@69 187 */
jpayne@69 188 RuleBasedBreakIterator( const UnicodeString &rules,
jpayne@69 189 UParseError &parseError,
jpayne@69 190 UErrorCode &status);
jpayne@69 191
jpayne@69 192 /**
jpayne@69 193 * Construct a RuleBasedBreakIterator from a set of precompiled binary rules.
jpayne@69 194 * Binary rules are obtained from RuleBasedBreakIterator::getBinaryRules().
jpayne@69 195 * Construction of a break iterator in this way is substantially faster than
jpayne@69 196 * construction from source rules.
jpayne@69 197 *
jpayne@69 198 * Ownership of the storage containing the compiled rules remains with the
jpayne@69 199 * caller of this function. The compiled rules must not be modified or
jpayne@69 200 * deleted during the life of the break iterator.
jpayne@69 201 *
jpayne@69 202 * The compiled rules are not compatible across different major versions of ICU.
jpayne@69 203 * The compiled rules are compatible only between machines with the same
jpayne@69 204 * byte ordering (little or big endian) and the same base character set family
jpayne@69 205 * (ASCII or EBCDIC).
jpayne@69 206 *
jpayne@69 207 * @see #getBinaryRules
jpayne@69 208 * @param compiledRules A pointer to the compiled break rules to be used.
jpayne@69 209 * @param ruleLength The length of the compiled break rules, in bytes. This
jpayne@69 210 * corresponds to the length value produced by getBinaryRules().
jpayne@69 211 * @param status Information on any errors encountered, including invalid
jpayne@69 212 * binary rules.
jpayne@69 213 * @stable ICU 4.8
jpayne@69 214 */
jpayne@69 215 RuleBasedBreakIterator(const uint8_t *compiledRules,
jpayne@69 216 uint32_t ruleLength,
jpayne@69 217 UErrorCode &status);
jpayne@69 218
jpayne@69 219 /**
jpayne@69 220 * This constructor uses the udata interface to create a BreakIterator
jpayne@69 221 * whose internal tables live in a memory-mapped file. "image" is an
jpayne@69 222 * ICU UDataMemory handle for the pre-compiled break iterator tables.
jpayne@69 223 * @param image handle to the memory image for the break iterator data.
jpayne@69 224 * Ownership of the UDataMemory handle passes to the Break Iterator,
jpayne@69 225 * which will be responsible for closing it when it is no longer needed.
jpayne@69 226 * @param status Information on any errors encountered.
jpayne@69 227 * @see udata_open
jpayne@69 228 * @see #getBinaryRules
jpayne@69 229 * @stable ICU 2.8
jpayne@69 230 */
jpayne@69 231 RuleBasedBreakIterator(UDataMemory* image, UErrorCode &status);
jpayne@69 232
jpayne@69 233 /**
jpayne@69 234 * Destructor
jpayne@69 235 * @stable ICU 2.0
jpayne@69 236 */
jpayne@69 237 virtual ~RuleBasedBreakIterator();
jpayne@69 238
jpayne@69 239 /**
jpayne@69 240 * Assignment operator. Sets this iterator to have the same behavior,
jpayne@69 241 * and iterate over the same text, as the one passed in.
jpayne@69 242 * @param that The RuleBasedBreakIterator passed in
jpayne@69 243 * @return a reference to this RuleBasedBreakIterator
jpayne@69 244 * @stable ICU 2.0
jpayne@69 245 */
jpayne@69 246 RuleBasedBreakIterator& operator=(const RuleBasedBreakIterator& that);
jpayne@69 247
jpayne@69 248 /**
jpayne@69 249 * Equality operator. Returns TRUE if both BreakIterators are of the
jpayne@69 250 * same class, have the same behavior, and iterate over the same text.
jpayne@69 251 * @param that The BreakIterator to be compared for equality
jpayne@69 252 * @return TRUE if both BreakIterators are of the
jpayne@69 253 * same class, have the same behavior, and iterate over the same text.
jpayne@69 254 * @stable ICU 2.0
jpayne@69 255 */
jpayne@69 256 virtual UBool operator==(const BreakIterator& that) const;
jpayne@69 257
jpayne@69 258 /**
jpayne@69 259 * Not-equal operator. If operator== returns TRUE, this returns FALSE,
jpayne@69 260 * and vice versa.
jpayne@69 261 * @param that The BreakIterator to be compared for inequality
jpayne@69 262 * @return TRUE if the two BreakIterators are not equal.
jpayne@69 263 * @stable ICU 2.0
jpayne@69 264 */
jpayne@69 265 inline UBool operator!=(const BreakIterator& that) const;
jpayne@69 266
jpayne@69 267 /**
jpayne@69 268 * Returns a newly-constructed RuleBasedBreakIterator with the same
jpayne@69 269 * behavior, and iterating over the same text, as this one.
jpayne@69 270 * Differs from the copy constructor in that it is polymorphic, and
jpayne@69 271 * will correctly clone (copy) a derived class.
jpayne@69 272 * clone() is thread safe. Multiple threads may simultaneously
jpayne@69 273 * clone the same source break iterator.
jpayne@69 274 * @return a newly-constructed RuleBasedBreakIterator
jpayne@69 275 * @stable ICU 2.0
jpayne@69 276 */
jpayne@69 277 virtual RuleBasedBreakIterator* clone() const;
jpayne@69 278
jpayne@69 279 /**
jpayne@69 280 * Compute a hash code for this BreakIterator
jpayne@69 281 * @return A hash code
jpayne@69 282 * @stable ICU 2.0
jpayne@69 283 */
jpayne@69 284 virtual int32_t hashCode(void) const;
jpayne@69 285
jpayne@69 286 /**
jpayne@69 287 * Returns the description used to create this iterator
jpayne@69 288 * @return the description used to create this iterator
jpayne@69 289 * @stable ICU 2.0
jpayne@69 290 */
jpayne@69 291 virtual const UnicodeString& getRules(void) const;
jpayne@69 292
jpayne@69 293 //=======================================================================
jpayne@69 294 // BreakIterator overrides
jpayne@69 295 //=======================================================================
jpayne@69 296
jpayne@69 297 /**
jpayne@69 298 * <p>
jpayne@69 299 * Return a CharacterIterator over the text being analyzed.
jpayne@69 300 * The returned character iterator is owned by the break iterator, and must
jpayne@69 301 * not be deleted by the caller. Repeated calls to this function may
jpayne@69 302 * return the same CharacterIterator.
jpayne@69 303 * </p>
jpayne@69 304 * <p>
jpayne@69 305 * The returned character iterator must not be used concurrently with
jpayne@69 306 * the break iterator. If concurrent operation is needed, clone the
jpayne@69 307 * returned character iterator first and operate on the clone.
jpayne@69 308 * </p>
jpayne@69 309 * <p>
jpayne@69 310 * When the break iterator is operating on text supplied via a UText,
jpayne@69 311 * this function will fail. Lacking any way to signal failures, it
jpayne@69 312 * returns a CharacterIterator containing no text.
jpayne@69 313 * The function getUText() provides similar functionality,
jpayne@69 314 * is reliable, and is more efficient.
jpayne@69 315 * </p>
jpayne@69 316 *
jpayne@69 317 * TODO: deprecate this function?
jpayne@69 318 *
jpayne@69 319 * @return An iterator over the text being analyzed.
jpayne@69 320 * @stable ICU 2.0
jpayne@69 321 */
jpayne@69 322 virtual CharacterIterator& getText(void) const;
jpayne@69 323
jpayne@69 324
jpayne@69 325 /**
jpayne@69 326 * Get a UText for the text being analyzed.
jpayne@69 327 * The returned UText is a shallow clone of the UText used internally
jpayne@69 328 * by the break iterator implementation. It can safely be used to
jpayne@69 329 * access the text without impacting any break iterator operations,
jpayne@69 330 * but the underlying text itself must not be altered.
jpayne@69 331 *
jpayne@69 332 * @param fillIn A UText to be filled in. If NULL, a new UText will be
jpayne@69 333 * allocated to hold the result.
jpayne@69 334 * @param status receives any error codes.
jpayne@69 335 * @return The current UText for this break iterator. If an input
jpayne@69 336 * UText was provided, it will always be returned.
jpayne@69 337 * @stable ICU 3.4
jpayne@69 338 */
jpayne@69 339 virtual UText *getUText(UText *fillIn, UErrorCode &status) const;
jpayne@69 340
jpayne@69 341 /**
jpayne@69 342 * Set the iterator to analyze a new piece of text. This function resets
jpayne@69 343 * the current iteration position to the beginning of the text.
jpayne@69 344 * @param newText An iterator over the text to analyze. The BreakIterator
jpayne@69 345 * takes ownership of the character iterator. The caller MUST NOT delete it!
jpayne@69 346 * @stable ICU 2.0
jpayne@69 347 */
jpayne@69 348 virtual void adoptText(CharacterIterator* newText);
jpayne@69 349
jpayne@69 350 /**
jpayne@69 351 * Set the iterator to analyze a new piece of text. This function resets
jpayne@69 352 * the current iteration position to the beginning of the text.
jpayne@69 353 *
jpayne@69 354 * The BreakIterator will retain a reference to the supplied string.
jpayne@69 355 * The caller must not modify or delete the text while the BreakIterator
jpayne@69 356 * retains the reference.
jpayne@69 357 *
jpayne@69 358 * @param newText The text to analyze.
jpayne@69 359 * @stable ICU 2.0
jpayne@69 360 */
jpayne@69 361 virtual void setText(const UnicodeString& newText);
jpayne@69 362
jpayne@69 363 /**
jpayne@69 364 * Reset the break iterator to operate over the text represented by
jpayne@69 365 * the UText. The iterator position is reset to the start.
jpayne@69 366 *
jpayne@69 367 * This function makes a shallow clone of the supplied UText. This means
jpayne@69 368 * that the caller is free to immediately close or otherwise reuse the
jpayne@69 369 * UText that was passed as a parameter, but that the underlying text itself
jpayne@69 370 * must not be altered while being referenced by the break iterator.
jpayne@69 371 *
jpayne@69 372 * @param text The UText used to change the text.
jpayne@69 373 * @param status Receives any error codes.
jpayne@69 374 * @stable ICU 3.4
jpayne@69 375 */
jpayne@69 376 virtual void setText(UText *text, UErrorCode &status);
jpayne@69 377
jpayne@69 378 /**
jpayne@69 379 * Sets the current iteration position to the beginning of the text, position zero.
jpayne@69 380 * @return The offset of the beginning of the text, zero.
jpayne@69 381 * @stable ICU 2.0
jpayne@69 382 */
jpayne@69 383 virtual int32_t first(void);
jpayne@69 384
jpayne@69 385 /**
jpayne@69 386 * Sets the current iteration position to the end of the text.
jpayne@69 387 * @return The text's past-the-end offset.
jpayne@69 388 * @stable ICU 2.0
jpayne@69 389 */
jpayne@69 390 virtual int32_t last(void);
jpayne@69 391
jpayne@69 392 /**
jpayne@69 393 * Advances the iterator either forward or backward the specified number of steps.
jpayne@69 394 * Negative values move backward, and positive values move forward. This is
jpayne@69 395 * equivalent to repeatedly calling next() or previous().
jpayne@69 396 * @param n The number of steps to move. The sign indicates the direction
jpayne@69 397 * (negative is backwards, and positive is forwards).
jpayne@69 398 * @return The character offset of the boundary position n boundaries away from
jpayne@69 399 * the current one.
jpayne@69 400 * @stable ICU 2.0
jpayne@69 401 */
jpayne@69 402 virtual int32_t next(int32_t n);
jpayne@69 403
jpayne@69 404 /**
jpayne@69 405 * Advances the iterator to the next boundary position.
jpayne@69 406 * @return The position of the first boundary after this one.
jpayne@69 407 * @stable ICU 2.0
jpayne@69 408 */
jpayne@69 409 virtual int32_t next(void);
jpayne@69 410
jpayne@69 411 /**
jpayne@69 412 * Moves the iterator backwards, to the last boundary preceding this one.
jpayne@69 413 * @return The position of the last boundary position preceding this one.
jpayne@69 414 * @stable ICU 2.0
jpayne@69 415 */
jpayne@69 416 virtual int32_t previous(void);
jpayne@69 417
jpayne@69 418 /**
jpayne@69 419 * Sets the iterator to refer to the first boundary position following
jpayne@69 420 * the specified position.
jpayne@69 421 * @param offset The position from which to begin searching for a break position.
jpayne@69 422 * @return The position of the first break after the specified position.
jpayne@69 423 * @stable ICU 2.0
jpayne@69 424 */
jpayne@69 425 virtual int32_t following(int32_t offset);
jpayne@69 426
jpayne@69 427 /**
jpayne@69 428 * Sets the iterator to refer to the last boundary position before the
jpayne@69 429 * specified position.
jpayne@69 430 * @param offset The position to begin searching for a break from.
jpayne@69 431 * @return The position of the last boundary before the starting position.
jpayne@69 432 * @stable ICU 2.0
jpayne@69 433 */
jpayne@69 434 virtual int32_t preceding(int32_t offset);
jpayne@69 435
jpayne@69 436 /**
jpayne@69 437 * Returns true if the specified position is a boundary position. As a side
jpayne@69 438 * effect, leaves the iterator pointing to the first boundary position at
jpayne@69 439 * or after "offset".
jpayne@69 440 * @param offset the offset to check.
jpayne@69 441 * @return True if "offset" is a boundary position.
jpayne@69 442 * @stable ICU 2.0
jpayne@69 443 */
jpayne@69 444 virtual UBool isBoundary(int32_t offset);
jpayne@69 445
jpayne@69 446 /**
jpayne@69 447 * Returns the current iteration position. Note that UBRK_DONE is never
jpayne@69 448 * returned from this function; if iteration has run to the end of a
jpayne@69 449 * string, current() will return the length of the string while
jpayne@69 450 * next() will return UBRK_DONE.
jpayne@69 451 * @return The current iteration position.
jpayne@69 452 * @stable ICU 2.0
jpayne@69 453 */
jpayne@69 454 virtual int32_t current(void) const;
jpayne@69 455
jpayne@69 456
jpayne@69 457 /**
jpayne@69 458 * Return the status tag from the break rule that determined the boundary at
jpayne@69 459 * the current iteration position. For break rules that do not specify a
jpayne@69 460 * status, a default value of 0 is returned. If more than one break rule
jpayne@69 461 * would cause a boundary to be located at some position in the text,
jpayne@69 462 * the numerically largest of the applicable status values is returned.
jpayne@69 463 * <p>
jpayne@69 464 * Of the standard types of ICU break iterators, only word break and
jpayne@69 465 * line break provide status values. The values are defined in
jpayne@69 466 * the header file ubrk.h. For Word breaks, the status allows distinguishing between words
jpayne@69 467 * that contain alphabetic letters, "words" that appear to be numbers,
jpayne@69 468 * punctuation and spaces, words containing ideographic characters, and
jpayne@69 469 * more. For Line Break, the status distinguishes between hard (mandatory) breaks
jpayne@69 470 * and soft (potential) break positions.
jpayne@69 471 * <p>
jpayne@69 472 * <code>getRuleStatus()</code> can be called after obtaining a boundary
jpayne@69 473 * position from <code>next()</code>, <code>previous()</code>, or
jpayne@69 474 * any other break iterator function that returns a boundary position.
jpayne@69 475 * <p>
jpayne@69 476 * Note that <code>getRuleStatus()</code> returns the value corresponding to
jpayne@69 477 * <code>current()</code> index even after <code>next()</code> has returned DONE.
jpayne@69 478 * <p>
jpayne@69 479 * When creating custom break rules, one is free to define whatever
jpayne@69 480 * status values may be convenient for the application.
jpayne@69 481 * <p>
jpayne@69 482 * @return the status from the break rule that determined the boundary
jpayne@69 483 * at the current iteration position.
jpayne@69 484 *
jpayne@69 485 * @see UWordBreak
jpayne@69 486 * @stable ICU 2.2
jpayne@69 487 */
jpayne@69 488 virtual int32_t getRuleStatus() const;
jpayne@69 489
jpayne@69 490 /**
jpayne@69 491 * Get the status (tag) values from the break rule(s) that determined the boundary
jpayne@69 492 * at the current iteration position.
jpayne@69 493 * <p>
jpayne@69 494 * The returned status value(s) are stored into an array provided by the caller.
jpayne@69 495 * The values are stored in sorted (ascending) order.
jpayne@69 496 * If the capacity of the output array is insufficient to hold the data,
jpayne@69 497 * the output will be truncated to the available length, and a
jpayne@69 498 * U_BUFFER_OVERFLOW_ERROR will be signaled.
jpayne@69 499 *
jpayne@69 500 * @param fillInVec an array to be filled in with the status values.
jpayne@69 501 * @param capacity the length of the supplied vector. A length of zero causes
jpayne@69 502 * the function to return the number of status values, in the
jpayne@69 503 * normal way, without attempting to store any values.
jpayne@69 504 * @param status receives error codes.
jpayne@69 505 * @return The number of rule status values from the rules that determined
jpayne@69 506 * the boundary at the current iteration position.
jpayne@69 507 * In the event of a U_BUFFER_OVERFLOW_ERROR, the return value
jpayne@69 508 * is the total number of status values that were available,
jpayne@69 509 * not the reduced number that were actually returned.
jpayne@69 510 * @see getRuleStatus
jpayne@69 511 * @stable ICU 3.0
jpayne@69 512 */
jpayne@69 513 virtual int32_t getRuleStatusVec(int32_t *fillInVec, int32_t capacity, UErrorCode &status);
jpayne@69 514
jpayne@69 515 /**
jpayne@69 516 * Returns a unique class ID POLYMORPHICALLY. Pure virtual override.
jpayne@69 517 * This method is to implement a simple version of RTTI, since not all
jpayne@69 518 * C++ compilers support genuine RTTI. Polymorphic operator==() and
jpayne@69 519 * clone() methods call this method.
jpayne@69 520 *
jpayne@69 521 * @return The class ID for this object. All objects of a
jpayne@69 522 * given class have the same class ID. Objects of
jpayne@69 523 * other classes have different class IDs.
jpayne@69 524 * @stable ICU 2.0
jpayne@69 525 */
jpayne@69 526 virtual UClassID getDynamicClassID(void) const;
jpayne@69 527
jpayne@69 528 /**
jpayne@69 529 * Returns the class ID for this class. This is useful only for
jpayne@69 530 * comparing to a return value from getDynamicClassID(). For example:
jpayne@69 531 *
jpayne@69 532 * Base* polymorphic_pointer = createPolymorphicObject();
jpayne@69 533 * if (polymorphic_pointer->getDynamicClassID() ==
jpayne@69 534 * Derived::getStaticClassID()) ...
jpayne@69 535 *
jpayne@69 536 * @return The class ID for all objects of this class.
jpayne@69 537 * @stable ICU 2.0
jpayne@69 538 */
jpayne@69 539 static UClassID U_EXPORT2 getStaticClassID(void);
jpayne@69 540
jpayne@69 541 #ifndef U_FORCE_HIDE_DEPRECATED_API
jpayne@69 542 /**
jpayne@69 543 * Deprecated functionality. Use clone() instead.
jpayne@69 544 *
jpayne@69 545 * Create a clone (copy) of this break iterator in memory provided
jpayne@69 546 * by the caller. The idea is to increase performance by avoiding
jpayne@69 547 * a storage allocation. Use of this function is NOT RECOMMENDED.
jpayne@69 548 * Performance gains are minimal, and correct buffer management is
jpayne@69 549 * tricky. Use clone() instead.
jpayne@69 550 *
jpayne@69 551 * @param stackBuffer The pointer to the memory into which the cloned object
jpayne@69 552 * should be placed. If NULL, allocate heap memory
jpayne@69 553 * for the cloned object.
jpayne@69 554 * @param BufferSize The size of the buffer. If zero, return the required
jpayne@69 555 * buffer size, but do not clone the object. If the
jpayne@69 556 * size was too small (but not zero), allocate heap
jpayne@69 557 * storage for the cloned object.
jpayne@69 558 *
jpayne@69 559 * @param status Error status. U_SAFECLONE_ALLOCATED_WARNING will be
jpayne@69 560 * returned if the provided buffer was too small, and
jpayne@69 561 * the clone was therefore put on the heap.
jpayne@69 562 *
jpayne@69 563 * @return Pointer to the clone object. This may differ from the stackBuffer
jpayne@69 564 * address if the byte alignment of the stack buffer was not suitable
jpayne@69 565 * or if the stackBuffer was too small to hold the clone.
jpayne@69 566 * @deprecated ICU 52. Use clone() instead.
jpayne@69 567 */
jpayne@69 568 virtual RuleBasedBreakIterator *createBufferClone(void *stackBuffer,
jpayne@69 569 int32_t &BufferSize,
jpayne@69 570 UErrorCode &status);
jpayne@69 571 #endif // U_FORCE_HIDE_DEPRECATED_API
jpayne@69 572
jpayne@69 573 /**
jpayne@69 574 * Return the binary form of compiled break rules,
jpayne@69 575 * which can then be used to create a new break iterator at some
jpayne@69 576 * time in the future. Creating a break iterator from pre-compiled rules
jpayne@69 577 * is much faster than building one from the source form of the
jpayne@69 578 * break rules.
jpayne@69 579 *
jpayne@69 580 * The binary data can only be used with the same version of ICU
jpayne@69 581 * and on the same platform type (processor endian-ness)
jpayne@69 582 *
jpayne@69 583 * @param length Returns the length of the binary data. (Out parameter.)
jpayne@69 584 *
jpayne@69 585 * @return A pointer to the binary (compiled) rule data. The storage
jpayne@69 586 * belongs to the RuleBasedBreakIterator object, not the
jpayne@69 587 * caller, and must not be modified or deleted.
jpayne@69 588 * @stable ICU 4.8
jpayne@69 589 */
jpayne@69 590 virtual const uint8_t *getBinaryRules(uint32_t &length);
jpayne@69 591
jpayne@69 592 /**
jpayne@69 593 * Set the subject text string upon which the break iterator is operating
jpayne@69 594 * without changing any other aspect of the matching state.
jpayne@69 595 * The new and previous text strings must have the same content.
jpayne@69 596 *
jpayne@69 597 * This function is intended for use in environments where ICU is operating on
jpayne@69 598 * strings that may move around in memory. It provides a mechanism for notifying
jpayne@69 599 * ICU that the string has been relocated, and providing a new UText to access the
jpayne@69 600 * string in its new position.
jpayne@69 601 *
jpayne@69 602 * Note that the break iterator implementation never copies the underlying text
jpayne@69 603 * of a string being processed, but always operates directly on the original text
jpayne@69 604 * provided by the user. Refreshing simply drops the references to the old text
jpayne@69 605 * and replaces them with references to the new.
jpayne@69 606 *
jpayne@69 607 * Caution: this function is normally used only by very specialized,
jpayne@69 608 * system-level code. One example use case is with garbage collection that moves
jpayne@69 609 * the text in memory.
jpayne@69 610 *
jpayne@69 611 * @param input The new (moved) text string.
jpayne@69 612 * @param status Receives errors detected by this function.
jpayne@69 613 * @return *this
jpayne@69 614 *
jpayne@69 615 * @stable ICU 49
jpayne@69 616 */
jpayne@69 617 virtual RuleBasedBreakIterator &refreshInputText(UText *input, UErrorCode &status);
jpayne@69 618
jpayne@69 619
jpayne@69 620 private:
jpayne@69 621 //=======================================================================
jpayne@69 622 // implementation
jpayne@69 623 //=======================================================================
jpayne@69 624 /**
jpayne@69 625 * Dumps caches and performs other actions associated with a complete change
jpayne@69 626 * in text or iteration position.
jpayne@69 627 * @internal (private)
jpayne@69 628 */
jpayne@69 629 void reset(void);
jpayne@69 630
jpayne@69 631 /**
jpayne@69 632 * Common initialization function, used by constructors and createBufferClone().
jpayne@69 633 * @internal (private)
jpayne@69 634 */
jpayne@69 635 void init(UErrorCode &status);
jpayne@69 636
jpayne@69 637 /**
jpayne@69 638 * Iterate backwards from an arbitrary position in the input text using the
jpayne@69 639 * synthesized Safe Reverse rules.
jpayne@69 640 * This locates a "Safe Position" from which the forward break rules
jpayne@69 641 * will operate correctly. A Safe Position is not necessarily a boundary itself.
jpayne@69 642 *
jpayne@69 643 * @param fromPosition the position in the input text to begin the iteration.
jpayne@69 644 * @internal (private)
jpayne@69 645 */
jpayne@69 646 int32_t handleSafePrevious(int32_t fromPosition);
jpayne@69 647
jpayne@69 648 /**
jpayne@69 649 * Find a rule-based boundary by running the state machine.
jpayne@69 650 * Input
jpayne@69 651 * fPosition, the position in the text to begin from.
jpayne@69 652 * Output
jpayne@69 653 * fPosition: the boundary following the starting position.
jpayne@69 654 * fDictionaryCharCount the number of dictionary characters encountered.
jpayne@69 655 * If > 0, the segment will be further subdivided
jpayne@69 656 * fRuleStatusIndex Info from the state table indicating which rules caused the boundary.
jpayne@69 657 *
jpayne@69 658 * @internal (private)
jpayne@69 659 */
jpayne@69 660 int32_t handleNext();
jpayne@69 661
jpayne@69 662
jpayne@69 663 /**
jpayne@69 664 * This function returns the appropriate LanguageBreakEngine for a
jpayne@69 665 * given character c.
jpayne@69 666 * @param c A character in the dictionary set
jpayne@69 667 * @internal (private)
jpayne@69 668 */
jpayne@69 669 const LanguageBreakEngine *getLanguageBreakEngine(UChar32 c);
jpayne@69 670
jpayne@69 671 public:
jpayne@69 672 #ifndef U_HIDE_INTERNAL_API
jpayne@69 673 /**
jpayne@69 674 * Debugging function only.
jpayne@69 675 * @internal
jpayne@69 676 */
jpayne@69 677 void dumpCache();
jpayne@69 678
jpayne@69 679 /**
jpayne@69 680 * Debugging function only.
jpayne@69 681 * @internal
jpayne@69 682 */
jpayne@69 683 void dumpTables();
jpayne@69 684
jpayne@69 685 #endif /* U_HIDE_INTERNAL_API */
jpayne@69 686 };
jpayne@69 687
jpayne@69 688 //------------------------------------------------------------------------------
jpayne@69 689 //
jpayne@69 690 // Inline Functions Definitions ...
jpayne@69 691 //
jpayne@69 692 //------------------------------------------------------------------------------
jpayne@69 693
jpayne@69 694 inline UBool RuleBasedBreakIterator::operator!=(const BreakIterator& that) const {
jpayne@69 695 return !operator==(that); // logical negation of this class's virtual operator==() for the same argument
jpayne@69 696 }
jpayne@69 697
jpayne@69 698 U_NAMESPACE_END
jpayne@69 699
jpayne@69 700 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */
jpayne@69 701
jpayne@69 702 #endif /* U_SHOW_CPLUSPLUS_API */
jpayne@69 703
jpayne@69 704 #endif