annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/include/unicode/ucharstrie.h @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
rev   line source
jpayne@69 1 // © 2016 and later: Unicode, Inc. and others.
jpayne@69 2 // License & terms of use: http://www.unicode.org/copyright.html
jpayne@69 3 /*
jpayne@69 4 *******************************************************************************
jpayne@69 5 * Copyright (C) 2010-2012, International Business Machines
jpayne@69 6 * Corporation and others. All Rights Reserved.
jpayne@69 7 *******************************************************************************
jpayne@69 8 * file name: ucharstrie.h
jpayne@69 9 * encoding: UTF-8
jpayne@69 10 * tab size: 8 (not used)
jpayne@69 11 * indentation:4
jpayne@69 12 *
jpayne@69 13 * created on: 2010nov14
jpayne@69 14 * created by: Markus W. Scherer
jpayne@69 15 */
jpayne@69 16
jpayne@69 17 #ifndef __UCHARSTRIE_H__
jpayne@69 18 #define __UCHARSTRIE_H__
jpayne@69 19
jpayne@69 20 /**
jpayne@69 21 * \file
jpayne@69 22 * \brief C++ API: Trie for mapping Unicode strings (or 16-bit-unit sequences)
jpayne@69 23 * to integer values.
jpayne@69 24 */
jpayne@69 25
jpayne@69 26 #include "unicode/utypes.h"
jpayne@69 27
jpayne@69 28 #if U_SHOW_CPLUSPLUS_API
jpayne@69 29
jpayne@69 30 #include "unicode/unistr.h"
jpayne@69 31 #include "unicode/uobject.h"
jpayne@69 32 #include "unicode/ustringtrie.h"
jpayne@69 33
jpayne@69 34 U_NAMESPACE_BEGIN
jpayne@69 35
jpayne@69 36 class Appendable;
jpayne@69 37 class UCharsTrieBuilder;
jpayne@69 38 class UVector32;
jpayne@69 39
jpayne@69 40 /**
jpayne@69 41 * Light-weight, non-const reader class for a UCharsTrie.
jpayne@69 42 * Traverses a char16_t-serialized data structure with minimal state,
jpayne@69 43 * for mapping strings (16-bit-unit sequences) to non-negative integer values.
jpayne@69 44 *
jpayne@69 45 * This class owns the serialized trie data only if it was constructed by
jpayne@69 46 * the builder's build() method.
jpayne@69 47 * The public constructor and the copy constructor only alias the data (only copy the pointer).
jpayne@69 48 * There is no assignment operator.
jpayne@69 49 *
jpayne@69 50 * This class is not intended for public subclassing.
jpayne@69 51 * @stable ICU 4.8
jpayne@69 52 */
jpayne@69 53 class U_COMMON_API UCharsTrie : public UMemory {
jpayne@69 54 public:
jpayne@69 55 /**
jpayne@69 56 * Constructs a UCharsTrie reader instance.
jpayne@69 57 *
jpayne@69 58 * The trieUChars must contain a copy of a char16_t sequence from the UCharsTrieBuilder,
jpayne@69 59 * starting with the first char16_t of that sequence.
jpayne@69 60 * The UCharsTrie object will not read more char16_ts than
jpayne@69 61 * the UCharsTrieBuilder generated in the corresponding build() call.
jpayne@69 62 *
jpayne@69 63 * The array is not copied/cloned and must not be modified while
jpayne@69 64 * the UCharsTrie object is in use.
jpayne@69 65 *
jpayne@69 66 * @param trieUChars The char16_t array that contains the serialized trie.
jpayne@69 67 * @stable ICU 4.8
jpayne@69 68 */
jpayne@69 69 UCharsTrie(ConstChar16Ptr trieUChars)
jpayne@69 70 : ownedArray_(NULL), uchars_(trieUChars),
jpayne@69 71 pos_(uchars_), remainingMatchLength_(-1) {}
jpayne@69 72
jpayne@69 73 /**
jpayne@69 74 * Destructor.
jpayne@69 75 * @stable ICU 4.8
jpayne@69 76 */
jpayne@69 77 ~UCharsTrie();
jpayne@69 78
jpayne@69 79 /**
jpayne@69 80 * Copy constructor, copies the other trie reader object and its state,
jpayne@69 81 * but not the char16_t array which will be shared. (Shallow copy.)
jpayne@69 82 * @param other Another UCharsTrie object.
jpayne@69 83 * @stable ICU 4.8
jpayne@69 84 */
jpayne@69 85 UCharsTrie(const UCharsTrie &other)
jpayne@69 86 : ownedArray_(NULL), uchars_(other.uchars_),
jpayne@69 87 pos_(other.pos_), remainingMatchLength_(other.remainingMatchLength_) {}
jpayne@69 88
jpayne@69 89 /**
jpayne@69 90 * Resets this trie to its initial state.
jpayne@69 91 * @return *this
jpayne@69 92 * @stable ICU 4.8
jpayne@69 93 */
jpayne@69 94 UCharsTrie &reset() {
jpayne@69 95 pos_=uchars_;
jpayne@69 96 remainingMatchLength_=-1;
jpayne@69 97 return *this;
jpayne@69 98 }
jpayne@69 99
jpayne@69 100 #ifndef U_HIDE_DRAFT_API
jpayne@69 101 /**
jpayne@69 102 * Returns the state of this trie as a 64-bit integer.
jpayne@69 103 * The state value is never 0.
jpayne@69 104 *
jpayne@69 105 * @return opaque state value
jpayne@69 106 * @see resetToState64
jpayne@69 107 * @draft ICU 65
jpayne@69 108 */
jpayne@69 109 uint64_t getState64() const {
jpayne@69 110 return (static_cast<uint64_t>(remainingMatchLength_ + 2) << kState64RemainingShift) |
jpayne@69 111 (uint64_t)(pos_ - uchars_);
jpayne@69 112 }
jpayne@69 113
jpayne@69 114 /**
jpayne@69 115 * Resets this trie to the saved state.
jpayne@69 116 * Unlike resetToState(State), the 64-bit state value
jpayne@69 117 * must be from getState64() from the same trie object or
jpayne@69 118 * from one initialized the exact same way.
jpayne@69 119 * Because of no validation, this method is faster.
jpayne@69 120 *
jpayne@69 121 * @param state The opaque trie state value from getState64().
jpayne@69 122 * @return *this
jpayne@69 123 * @see getState64
jpayne@69 124 * @see resetToState
jpayne@69 125 * @see reset
jpayne@69 126 * @draft ICU 65
jpayne@69 127 */
jpayne@69 128 UCharsTrie &resetToState64(uint64_t state) {
jpayne@69 129 remainingMatchLength_ = static_cast<int32_t>(state >> kState64RemainingShift) - 2;
jpayne@69 130 pos_ = uchars_ + (state & kState64PosMask);
jpayne@69 131 return *this;
jpayne@69 132 }
jpayne@69 133 #endif /* U_HIDE_DRAFT_API */
jpayne@69 134
jpayne@69 135 /**
jpayne@69 136 * UCharsTrie state object, for saving a trie's current state
jpayne@69 137 * and resetting the trie back to this state later.
jpayne@69 138 * @stable ICU 4.8
jpayne@69 139 */
jpayne@69 140 class State : public UMemory {
jpayne@69 141 public:
jpayne@69 142 /**
jpayne@69 143 * Constructs an empty State.
jpayne@69 144 * @stable ICU 4.8
jpayne@69 145 */
jpayne@69 146 State() { uchars=NULL; }
jpayne@69 147 private:
jpayne@69 148 friend class UCharsTrie;
jpayne@69 149
jpayne@69 150 const char16_t *uchars;
jpayne@69 151 const char16_t *pos;
jpayne@69 152 int32_t remainingMatchLength;
jpayne@69 153 };
jpayne@69 154
jpayne@69 155 /**
jpayne@69 156 * Saves the state of this trie.
jpayne@69 157 * @param state The State object to hold the trie's state.
jpayne@69 158 * @return *this
jpayne@69 159 * @see resetToState
jpayne@69 160 * @stable ICU 4.8
jpayne@69 161 */
jpayne@69 162 const UCharsTrie &saveState(State &state) const {
jpayne@69 163 state.uchars=uchars_;
jpayne@69 164 state.pos=pos_;
jpayne@69 165 state.remainingMatchLength=remainingMatchLength_;
jpayne@69 166 return *this;
jpayne@69 167 }
jpayne@69 168
jpayne@69 169 /**
jpayne@69 170 * Resets this trie to the saved state.
jpayne@69 171 * If the state object contains no state, or the state of a different trie,
jpayne@69 172 * then this trie remains unchanged.
jpayne@69 173 * @param state The State object which holds a saved trie state.
jpayne@69 174 * @return *this
jpayne@69 175 * @see saveState
jpayne@69 176 * @see reset
jpayne@69 177 * @stable ICU 4.8
jpayne@69 178 */
jpayne@69 179 UCharsTrie &resetToState(const State &state) {
jpayne@69 180 if(uchars_==state.uchars && uchars_!=NULL) {
jpayne@69 181 pos_=state.pos;
jpayne@69 182 remainingMatchLength_=state.remainingMatchLength;
jpayne@69 183 }
jpayne@69 184 return *this;
jpayne@69 185 }
jpayne@69 186
jpayne@69 187 /**
jpayne@69 188 * Determines whether the string so far matches, whether it has a value,
jpayne@69 189 * and whether another input char16_t can continue a matching string.
jpayne@69 190 * @return The match/value Result.
jpayne@69 191 * @stable ICU 4.8
jpayne@69 192 */
jpayne@69 193 UStringTrieResult current() const;
jpayne@69 194
jpayne@69 195 /**
jpayne@69 196 * Traverses the trie from the initial state for this input char16_t.
jpayne@69 197 * Equivalent to reset().next(uchar).
jpayne@69 198 * @param uchar Input char value. Values below 0 and above 0xffff will never match.
jpayne@69 199 * @return The match/value Result.
jpayne@69 200 * @stable ICU 4.8
jpayne@69 201 */
jpayne@69 202 inline UStringTrieResult first(int32_t uchar) {
jpayne@69 203 remainingMatchLength_=-1;
jpayne@69 204 return nextImpl(uchars_, uchar);
jpayne@69 205 }
jpayne@69 206
jpayne@69 207 /**
jpayne@69 208 * Traverses the trie from the initial state for the
jpayne@69 209 * one or two UTF-16 code units for this input code point.
jpayne@69 210 * Equivalent to reset().nextForCodePoint(cp).
jpayne@69 211 * @param cp A Unicode code point 0..0x10ffff.
jpayne@69 212 * @return The match/value Result.
jpayne@69 213 * @stable ICU 4.8
jpayne@69 214 */
jpayne@69 215 UStringTrieResult firstForCodePoint(UChar32 cp);
jpayne@69 216
jpayne@69 217 /**
jpayne@69 218 * Traverses the trie from the current state for this input char16_t.
jpayne@69 219 * @param uchar Input char value. Values below 0 and above 0xffff will never match.
jpayne@69 220 * @return The match/value Result.
jpayne@69 221 * @stable ICU 4.8
jpayne@69 222 */
jpayne@69 223 UStringTrieResult next(int32_t uchar);
jpayne@69 224
jpayne@69 225 /**
jpayne@69 226 * Traverses the trie from the current state for the
jpayne@69 227 * one or two UTF-16 code units for this input code point.
jpayne@69 228 * @param cp A Unicode code point 0..0x10ffff.
jpayne@69 229 * @return The match/value Result.
jpayne@69 230 * @stable ICU 4.8
jpayne@69 231 */
jpayne@69 232 UStringTrieResult nextForCodePoint(UChar32 cp);
jpayne@69 233
jpayne@69 234 /**
jpayne@69 235 * Traverses the trie from the current state for this string.
jpayne@69 236 * Equivalent to
jpayne@69 237 * \code
jpayne@69 238 * Result result=current();
jpayne@69 239 * for(each c in s)
jpayne@69 240 * if(!USTRINGTRIE_HAS_NEXT(result)) return USTRINGTRIE_NO_MATCH;
jpayne@69 241 * result=next(c);
jpayne@69 242 * return result;
jpayne@69 243 * \endcode
jpayne@69 244 * @param s A string. Can be NULL if length is 0.
jpayne@69 245 * @param length The length of the string. Can be -1 if NUL-terminated.
jpayne@69 246 * @return The match/value Result.
jpayne@69 247 * @stable ICU 4.8
jpayne@69 248 */
jpayne@69 249 UStringTrieResult next(ConstChar16Ptr s, int32_t length);
jpayne@69 250
jpayne@69 251 /**
jpayne@69 252 * Returns a matching string's value if called immediately after
jpayne@69 253 * current()/first()/next() returned USTRINGTRIE_INTERMEDIATE_VALUE or USTRINGTRIE_FINAL_VALUE.
jpayne@69 254 * getValue() can be called multiple times.
jpayne@69 255 *
jpayne@69 256 * Do not call getValue() after USTRINGTRIE_NO_MATCH or USTRINGTRIE_NO_VALUE!
jpayne@69 257 * @return The value for the string so far.
jpayne@69 258 * @stable ICU 4.8
jpayne@69 259 */
jpayne@69 260 inline int32_t getValue() const {
jpayne@69 261 const char16_t *pos=pos_;
jpayne@69 262 int32_t leadUnit=*pos++;
jpayne@69 263 // U_ASSERT(leadUnit>=kMinValueLead);
jpayne@69 264 return leadUnit&kValueIsFinal ?
jpayne@69 265 readValue(pos, leadUnit&0x7fff) : readNodeValue(pos, leadUnit);
jpayne@69 266 }
jpayne@69 267
jpayne@69 268 /**
jpayne@69 269 * Determines whether all strings reachable from the current state
jpayne@69 270 * map to the same value.
jpayne@69 271 * @param uniqueValue Receives the unique value, if this function returns TRUE.
jpayne@69 272 * (output-only)
jpayne@69 273 * @return TRUE if all strings reachable from the current state
jpayne@69 274 * map to the same value.
jpayne@69 275 * @stable ICU 4.8
jpayne@69 276 */
jpayne@69 277 inline UBool hasUniqueValue(int32_t &uniqueValue) const {
jpayne@69 278 const char16_t *pos=pos_;
jpayne@69 279 // Skip the rest of a pending linear-match node.
jpayne@69 280 return pos!=NULL && findUniqueValue(pos+remainingMatchLength_+1, FALSE, uniqueValue);
jpayne@69 281 }
jpayne@69 282
jpayne@69 283 /**
jpayne@69 284 * Finds each char16_t which continues the string from the current state.
jpayne@69 285 * That is, each char16_t c for which it would be next(c)!=USTRINGTRIE_NO_MATCH now.
jpayne@69 286 * @param out Each next char16_t is appended to this object.
jpayne@69 287 * @return the number of char16_ts which continue the string from here
jpayne@69 288 * @stable ICU 4.8
jpayne@69 289 */
jpayne@69 290 int32_t getNextUChars(Appendable &out) const;
jpayne@69 291
jpayne@69 292 /**
jpayne@69 293 * Iterator for all of the (string, value) pairs in a UCharsTrie.
jpayne@69 294 * @stable ICU 4.8
jpayne@69 295 */
jpayne@69 296 class U_COMMON_API Iterator : public UMemory {
jpayne@69 297 public:
jpayne@69 298 /**
jpayne@69 299 * Iterates from the root of a char16_t-serialized UCharsTrie.
jpayne@69 300 * @param trieUChars The trie char16_ts.
jpayne@69 301 * @param maxStringLength If 0, the iterator returns full strings.
jpayne@69 302 * Otherwise, the iterator returns strings with this maximum length.
jpayne@69 303 * @param errorCode Standard ICU error code. Its input value must
jpayne@69 304 * pass the U_SUCCESS() test, or else the function returns
jpayne@69 305 * immediately. Check for U_FAILURE() on output or use with
jpayne@69 306 * function chaining. (See User Guide for details.)
jpayne@69 307 * @stable ICU 4.8
jpayne@69 308 */
jpayne@69 309 Iterator(ConstChar16Ptr trieUChars, int32_t maxStringLength, UErrorCode &errorCode);
jpayne@69 310
jpayne@69 311 /**
jpayne@69 312 * Iterates from the current state of the specified UCharsTrie.
jpayne@69 313 * @param trie The trie whose state will be copied for iteration.
jpayne@69 314 * @param maxStringLength If 0, the iterator returns full strings.
jpayne@69 315 * Otherwise, the iterator returns strings with this maximum length.
jpayne@69 316 * @param errorCode Standard ICU error code. Its input value must
jpayne@69 317 * pass the U_SUCCESS() test, or else the function returns
jpayne@69 318 * immediately. Check for U_FAILURE() on output or use with
jpayne@69 319 * function chaining. (See User Guide for details.)
jpayne@69 320 * @stable ICU 4.8
jpayne@69 321 */
jpayne@69 322 Iterator(const UCharsTrie &trie, int32_t maxStringLength, UErrorCode &errorCode);
jpayne@69 323
jpayne@69 324 /**
jpayne@69 325 * Destructor.
jpayne@69 326 * @stable ICU 4.8
jpayne@69 327 */
jpayne@69 328 ~Iterator();
jpayne@69 329
jpayne@69 330 /**
jpayne@69 331 * Resets this iterator to its initial state.
jpayne@69 332 * @return *this
jpayne@69 333 * @stable ICU 4.8
jpayne@69 334 */
jpayne@69 335 Iterator &reset();
jpayne@69 336
jpayne@69 337 /**
jpayne@69 338 * @return TRUE if there are more elements.
jpayne@69 339 * @stable ICU 4.8
jpayne@69 340 */
jpayne@69 341 UBool hasNext() const;
jpayne@69 342
jpayne@69 343 /**
jpayne@69 344 * Finds the next (string, value) pair if there is one.
jpayne@69 345 *
jpayne@69 346 * If the string is truncated to the maximum length and does not
jpayne@69 347 * have a real value, then the value is set to -1.
jpayne@69 348 * In this case, this "not a real value" is indistinguishable from
jpayne@69 349 * a real value of -1.
jpayne@69 350 * @param errorCode Standard ICU error code. Its input value must
jpayne@69 351 * pass the U_SUCCESS() test, or else the function returns
jpayne@69 352 * immediately. Check for U_FAILURE() on output or use with
jpayne@69 353 * function chaining. (See User Guide for details.)
jpayne@69 354 * @return TRUE if there is another element.
jpayne@69 355 * @stable ICU 4.8
jpayne@69 356 */
jpayne@69 357 UBool next(UErrorCode &errorCode);
jpayne@69 358
jpayne@69 359 /**
jpayne@69 360 * @return The string for the last successful next().
jpayne@69 361 * @stable ICU 4.8
jpayne@69 362 */
jpayne@69 363 const UnicodeString &getString() const { return str_; }
jpayne@69 364 /**
jpayne@69 365 * @return The value for the last successful next().
jpayne@69 366 * @stable ICU 4.8
jpayne@69 367 */
jpayne@69 368 int32_t getValue() const { return value_; }
jpayne@69 369
jpayne@69 370 private:
jpayne@69 371 UBool truncateAndStop() {
jpayne@69 372 pos_=NULL;
jpayne@69 373 value_=-1; // no real value for str
jpayne@69 374 return TRUE;
jpayne@69 375 }
jpayne@69 376
jpayne@69 377 const char16_t *branchNext(const char16_t *pos, int32_t length, UErrorCode &errorCode);
jpayne@69 378
jpayne@69 379 const char16_t *uchars_;
jpayne@69 380 const char16_t *pos_;
jpayne@69 381 const char16_t *initialPos_;
jpayne@69 382 int32_t remainingMatchLength_;
jpayne@69 383 int32_t initialRemainingMatchLength_;
jpayne@69 384 UBool skipValue_; // Skip intermediate value which was already delivered.
jpayne@69 385
jpayne@69 386 UnicodeString str_;
jpayne@69 387 int32_t maxLength_;
jpayne@69 388 int32_t value_;
jpayne@69 389
jpayne@69 390 // The stack stores pairs of integers for backtracking to another
jpayne@69 391 // outbound edge of a branch node.
jpayne@69 392 // The first integer is an offset from uchars_.
jpayne@69 393 // The second integer has the str_.length() from before the node in bits 15..0,
jpayne@69 394 // and the remaining branch length in bits 31..16.
jpayne@69 395 // (We could store the remaining branch length minus 1 in bits 30..16 and not use the sign bit,
jpayne@69 396 // but the code looks more confusing that way.)
jpayne@69 397 UVector32 *stack_;
jpayne@69 398 };
jpayne@69 399
jpayne@69 400 private:
jpayne@69 401 friend class UCharsTrieBuilder;
jpayne@69 402
jpayne@69 403 /**
jpayne@69 404 * Constructs a UCharsTrie reader instance.
jpayne@69 405 * Unlike the public constructor which just aliases an array,
jpayne@69 406 * this constructor adopts the builder's array.
jpayne@69 407 * This constructor is only called by the builder.
jpayne@69 408 */
jpayne@69 409 UCharsTrie(char16_t *adoptUChars, const char16_t *trieUChars)
jpayne@69 410 : ownedArray_(adoptUChars), uchars_(trieUChars),
jpayne@69 411 pos_(uchars_), remainingMatchLength_(-1) {}
jpayne@69 412
jpayne@69 413 // No assignment operator.
jpayne@69 414 UCharsTrie &operator=(const UCharsTrie &other);
jpayne@69 415
jpayne@69 416 inline void stop() {
jpayne@69 417 pos_=NULL;
jpayne@69 418 }
jpayne@69 419
jpayne@69 420 // Reads a compact 32-bit integer.
jpayne@69 421 // pos is already after the leadUnit, and the lead unit has bit 15 reset.
jpayne@69 422 static inline int32_t readValue(const char16_t *pos, int32_t leadUnit) {
jpayne@69 423 int32_t value;
jpayne@69 424 if(leadUnit<kMinTwoUnitValueLead) {
jpayne@69 425 value=leadUnit;
jpayne@69 426 } else if(leadUnit<kThreeUnitValueLead) {
jpayne@69 427 value=((leadUnit-kMinTwoUnitValueLead)<<16)|*pos;
jpayne@69 428 } else {
jpayne@69 429 value=(pos[0]<<16)|pos[1];
jpayne@69 430 }
jpayne@69 431 return value;
jpayne@69 432 }
jpayne@69 433 static inline const char16_t *skipValue(const char16_t *pos, int32_t leadUnit) {
jpayne@69 434 if(leadUnit>=kMinTwoUnitValueLead) {
jpayne@69 435 if(leadUnit<kThreeUnitValueLead) {
jpayne@69 436 ++pos;
jpayne@69 437 } else {
jpayne@69 438 pos+=2;
jpayne@69 439 }
jpayne@69 440 }
jpayne@69 441 return pos;
jpayne@69 442 }
jpayne@69 443 static inline const char16_t *skipValue(const char16_t *pos) {
jpayne@69 444 int32_t leadUnit=*pos++;
jpayne@69 445 return skipValue(pos, leadUnit&0x7fff);
jpayne@69 446 }
jpayne@69 447
jpayne@69 448 static inline int32_t readNodeValue(const char16_t *pos, int32_t leadUnit) {
jpayne@69 449 // U_ASSERT(kMinValueLead<=leadUnit && leadUnit<kValueIsFinal);
jpayne@69 450 int32_t value;
jpayne@69 451 if(leadUnit<kMinTwoUnitNodeValueLead) {
jpayne@69 452 value=(leadUnit>>6)-1;
jpayne@69 453 } else if(leadUnit<kThreeUnitNodeValueLead) {
jpayne@69 454 value=(((leadUnit&0x7fc0)-kMinTwoUnitNodeValueLead)<<10)|*pos;
jpayne@69 455 } else {
jpayne@69 456 value=(pos[0]<<16)|pos[1];
jpayne@69 457 }
jpayne@69 458 return value;
jpayne@69 459 }
jpayne@69 460 static inline const char16_t *skipNodeValue(const char16_t *pos, int32_t leadUnit) {
jpayne@69 461 // U_ASSERT(kMinValueLead<=leadUnit && leadUnit<kValueIsFinal);
jpayne@69 462 if(leadUnit>=kMinTwoUnitNodeValueLead) {
jpayne@69 463 if(leadUnit<kThreeUnitNodeValueLead) {
jpayne@69 464 ++pos;
jpayne@69 465 } else {
jpayne@69 466 pos+=2;
jpayne@69 467 }
jpayne@69 468 }
jpayne@69 469 return pos;
jpayne@69 470 }
jpayne@69 471
jpayne@69 472 static inline const char16_t *jumpByDelta(const char16_t *pos) {
jpayne@69 473 int32_t delta=*pos++;
jpayne@69 474 if(delta>=kMinTwoUnitDeltaLead) {
jpayne@69 475 if(delta==kThreeUnitDeltaLead) {
jpayne@69 476 delta=(pos[0]<<16)|pos[1];
jpayne@69 477 pos+=2;
jpayne@69 478 } else {
jpayne@69 479 delta=((delta-kMinTwoUnitDeltaLead)<<16)|*pos++;
jpayne@69 480 }
jpayne@69 481 }
jpayne@69 482 return pos+delta;
jpayne@69 483 }
jpayne@69 484
jpayne@69 485 static const char16_t *skipDelta(const char16_t *pos) {
jpayne@69 486 int32_t delta=*pos++;
jpayne@69 487 if(delta>=kMinTwoUnitDeltaLead) {
jpayne@69 488 if(delta==kThreeUnitDeltaLead) {
jpayne@69 489 pos+=2;
jpayne@69 490 } else {
jpayne@69 491 ++pos;
jpayne@69 492 }
jpayne@69 493 }
jpayne@69 494 return pos;
jpayne@69 495 }
jpayne@69 496
jpayne@69 497 static inline UStringTrieResult valueResult(int32_t node) {
jpayne@69 498 return (UStringTrieResult)(USTRINGTRIE_INTERMEDIATE_VALUE-(node>>15));
jpayne@69 499 }
jpayne@69 500
jpayne@69 501 // Handles a branch node for both next(uchar) and next(string).
jpayne@69 502 UStringTrieResult branchNext(const char16_t *pos, int32_t length, int32_t uchar);
jpayne@69 503
jpayne@69 504 // Requires remainingLength_<0.
jpayne@69 505 UStringTrieResult nextImpl(const char16_t *pos, int32_t uchar);
jpayne@69 506
jpayne@69 507 // Helper functions for hasUniqueValue().
jpayne@69 508 // Recursively finds a unique value (or whether there is not a unique one)
jpayne@69 509 // from a branch.
jpayne@69 510 static const char16_t *findUniqueValueFromBranch(const char16_t *pos, int32_t length,
jpayne@69 511 UBool haveUniqueValue, int32_t &uniqueValue);
jpayne@69 512 // Recursively finds a unique value (or whether there is not a unique one)
jpayne@69 513 // starting from a position on a node lead unit.
jpayne@69 514 static UBool findUniqueValue(const char16_t *pos, UBool haveUniqueValue, int32_t &uniqueValue);
jpayne@69 515
jpayne@69 516 // Helper functions for getNextUChars().
jpayne@69 517 // getNextUChars() when pos is on a branch node.
jpayne@69 518 static void getNextBranchUChars(const char16_t *pos, int32_t length, Appendable &out);
jpayne@69 519
jpayne@69 520 // UCharsTrie data structure
jpayne@69 521 //
jpayne@69 522 // The trie consists of a series of char16_t-serialized nodes for incremental
jpayne@69 523 // Unicode string/char16_t sequence matching. (char16_t=16-bit unsigned integer)
jpayne@69 524 // The root node is at the beginning of the trie data.
jpayne@69 525 //
jpayne@69 526 // Types of nodes are distinguished by their node lead unit ranges.
jpayne@69 527 // After each node, except a final-value node, another node follows to
jpayne@69 528 // encode match values or continue matching further units.
jpayne@69 529 //
jpayne@69 530 // Node types:
jpayne@69 531 // - Final-value node: Stores a 32-bit integer in a compact, variable-length format.
jpayne@69 532 // The value is for the string/char16_t sequence so far.
jpayne@69 533 // - Match node, optionally with an intermediate value in a different compact format.
jpayne@69 534 // The value, if present, is for the string/char16_t sequence so far.
jpayne@69 535 //
jpayne@69 536 // Aside from the value, which uses the node lead unit's high bits:
jpayne@69 537 //
jpayne@69 538 // - Linear-match node: Matches a number of units.
jpayne@69 539 // - Branch node: Branches to other nodes according to the current input unit.
jpayne@69 540 // The node unit is the length of the branch (number of units to select from)
jpayne@69 541 // minus 1. It is followed by a sub-node:
jpayne@69 542 // - If the length is at most kMaxBranchLinearSubNodeLength, then
jpayne@69 543 // there are length-1 (key, value) pairs and then one more comparison unit.
jpayne@69 544 // If one of the key units matches, then the value is either a final value for
jpayne@69 545 // the string so far, or a "jump" delta to the next node.
jpayne@69 546 // If the last unit matches, then matching continues with the next node.
jpayne@69 547 // (Values have the same encoding as final-value nodes.)
jpayne@69 548 // - If the length is greater than kMaxBranchLinearSubNodeLength, then
jpayne@69 549 // there is one unit and one "jump" delta.
jpayne@69 550 // If the input unit is less than the sub-node unit, then "jump" by delta to
jpayne@69 551 // the next sub-node which will have a length of length/2.
jpayne@69 552 // (The delta has its own compact encoding.)
jpayne@69 553 // Otherwise, skip the "jump" delta to the next sub-node
jpayne@69 554 // which will have a length of length-length/2.
jpayne@69 555
jpayne@69 556 // Match-node lead unit values, after masking off intermediate-value bits:
jpayne@69 557
jpayne@69 558 // 0000..002f: Branch node. If node!=0 then the length is node+1, otherwise
jpayne@69 559 // the length is one more than the next unit.
jpayne@69 560
jpayne@69 561 // For a branch sub-node with at most this many entries, we drop down
jpayne@69 562 // to a linear search.
jpayne@69 563 static const int32_t kMaxBranchLinearSubNodeLength=5;
jpayne@69 564
jpayne@69 565 // 0030..003f: Linear-match node, match 1..16 units and continue reading the next node.
jpayne@69 566 static const int32_t kMinLinearMatch=0x30;
jpayne@69 567 static const int32_t kMaxLinearMatchLength=0x10;
jpayne@69 568
jpayne@69 569 // Match-node lead unit bits 14..6 for the optional intermediate value.
jpayne@69 570 // If these bits are 0, then there is no intermediate value.
jpayne@69 571 // Otherwise, see the *NodeValue* constants below.
jpayne@69 572 static const int32_t kMinValueLead=kMinLinearMatch+kMaxLinearMatchLength; // 0x0040
jpayne@69 573 static const int32_t kNodeTypeMask=kMinValueLead-1; // 0x003f
jpayne@69 574
jpayne@69 575 // A final-value node has bit 15 set.
jpayne@69 576 static const int32_t kValueIsFinal=0x8000;
jpayne@69 577
jpayne@69 578 // Compact value: After testing and masking off bit 15, use the following thresholds.
jpayne@69 579 static const int32_t kMaxOneUnitValue=0x3fff;
jpayne@69 580
jpayne@69 581 static const int32_t kMinTwoUnitValueLead=kMaxOneUnitValue+1; // 0x4000
jpayne@69 582 static const int32_t kThreeUnitValueLead=0x7fff;
jpayne@69 583
jpayne@69 584 static const int32_t kMaxTwoUnitValue=((kThreeUnitValueLead-kMinTwoUnitValueLead)<<16)-1; // 0x3ffeffff
jpayne@69 585
jpayne@69 586 // Compact intermediate-value integer, lead unit shared with a branch or linear-match node.
jpayne@69 587 static const int32_t kMaxOneUnitNodeValue=0xff;
jpayne@69 588 static const int32_t kMinTwoUnitNodeValueLead=kMinValueLead+((kMaxOneUnitNodeValue+1)<<6); // 0x4040
jpayne@69 589 static const int32_t kThreeUnitNodeValueLead=0x7fc0;
jpayne@69 590
jpayne@69 591 static const int32_t kMaxTwoUnitNodeValue=
jpayne@69 592 ((kThreeUnitNodeValueLead-kMinTwoUnitNodeValueLead)<<10)-1; // 0xfdffff
jpayne@69 593
jpayne@69 594 // Compact delta integers.
jpayne@69 595 static const int32_t kMaxOneUnitDelta=0xfbff;
jpayne@69 596 static const int32_t kMinTwoUnitDeltaLead=kMaxOneUnitDelta+1; // 0xfc00
jpayne@69 597 static const int32_t kThreeUnitDeltaLead=0xffff;
jpayne@69 598
jpayne@69 599 static const int32_t kMaxTwoUnitDelta=((kThreeUnitDeltaLead-kMinTwoUnitDeltaLead)<<16)-1; // 0x03feffff
jpayne@69 600
jpayne@69 601 // For getState64():
jpayne@69 602 // The remainingMatchLength_ is -1..14=(kMaxLinearMatchLength=0x10)-2
jpayne@69 603 // so we need at least 5 bits for that.
jpayne@69 604 // We add 2 to store it as a positive value 1..16=kMaxLinearMatchLength.
jpayne@69 605 static constexpr int32_t kState64RemainingShift = 59;
jpayne@69 606 static constexpr uint64_t kState64PosMask = (UINT64_C(1) << kState64RemainingShift) - 1;
jpayne@69 607
jpayne@69 608 char16_t *ownedArray_;
jpayne@69 609
jpayne@69 610 // Fixed value referencing the UCharsTrie words.
jpayne@69 611 const char16_t *uchars_;
jpayne@69 612
jpayne@69 613 // Iterator variables.
jpayne@69 614
jpayne@69 615 // Pointer to next trie unit to read. NULL if no more matches.
jpayne@69 616 const char16_t *pos_;
jpayne@69 617 // Remaining length of a linear-match node, minus 1. Negative if not in such a node.
jpayne@69 618 int32_t remainingMatchLength_;
jpayne@69 619 };
jpayne@69 620
jpayne@69 621 U_NAMESPACE_END
jpayne@69 622
jpayne@69 623 #endif /* U_SHOW_CPLUSPLUS_API */
jpayne@69 624
jpayne@69 625 #endif // __UCHARSTRIE_H__