jpayne@69: // © 2016 and later: Unicode, Inc. and others. jpayne@69: // License & terms of use: http://www.unicode.org/copyright.html jpayne@69: /* jpayne@69: ******************************************************************************* jpayne@69: * Copyright (C) 2010-2012, International Business Machines jpayne@69: * Corporation and others. All Rights Reserved. jpayne@69: ******************************************************************************* jpayne@69: * file name: bytestrie.h jpayne@69: * encoding: UTF-8 jpayne@69: * tab size: 8 (not used) jpayne@69: * indentation:4 jpayne@69: * jpayne@69: * created on: 2010sep25 jpayne@69: * created by: Markus W. Scherer jpayne@69: */ jpayne@69: jpayne@69: #ifndef __BYTESTRIE_H__ jpayne@69: #define __BYTESTRIE_H__ jpayne@69: jpayne@69: /** jpayne@69: * \file jpayne@69: * \brief C++ API: Trie for mapping byte sequences to integer values. jpayne@69: */ jpayne@69: jpayne@69: #include "unicode/utypes.h" jpayne@69: jpayne@69: #if U_SHOW_CPLUSPLUS_API jpayne@69: jpayne@69: #include "unicode/stringpiece.h" jpayne@69: #include "unicode/uobject.h" jpayne@69: #include "unicode/ustringtrie.h" jpayne@69: jpayne@69: U_NAMESPACE_BEGIN jpayne@69: jpayne@69: class ByteSink; jpayne@69: class BytesTrieBuilder; jpayne@69: class CharString; jpayne@69: class UVector32; jpayne@69: jpayne@69: /** jpayne@69: * Light-weight, non-const reader class for a BytesTrie. jpayne@69: * Traverses a byte-serialized data structure with minimal state, jpayne@69: * for mapping byte sequences to non-negative integer values. jpayne@69: * jpayne@69: * This class owns the serialized trie data only if it was constructed by jpayne@69: * the builder's build() method. jpayne@69: * The public constructor and the copy constructor only alias the data (only copy the pointer). jpayne@69: * There is no assignment operator. jpayne@69: * jpayne@69: * This class is not intended for public subclassing. jpayne@69: * @stable ICU 4.8 jpayne@69: */ jpayne@69: class U_COMMON_API BytesTrie : public UMemory { jpayne@69: public: jpayne@69: /** jpayne@69: * Constructs a BytesTrie reader instance. jpayne@69: * jpayne@69: * The trieBytes must contain a copy of a byte sequence from the BytesTrieBuilder, jpayne@69: * starting with the first byte of that sequence. jpayne@69: * The BytesTrie object will not read more bytes than jpayne@69: * the BytesTrieBuilder generated in the corresponding build() call. jpayne@69: * jpayne@69: * The array is not copied/cloned and must not be modified while jpayne@69: * the BytesTrie object is in use. jpayne@69: * jpayne@69: * @param trieBytes The byte array that contains the serialized trie. jpayne@69: * @stable ICU 4.8 jpayne@69: */ jpayne@69: BytesTrie(const void *trieBytes) jpayne@69: : ownedArray_(NULL), bytes_(static_cast(trieBytes)), jpayne@69: pos_(bytes_), remainingMatchLength_(-1) {} jpayne@69: jpayne@69: /** jpayne@69: * Destructor. jpayne@69: * @stable ICU 4.8 jpayne@69: */ jpayne@69: ~BytesTrie(); jpayne@69: jpayne@69: /** jpayne@69: * Copy constructor, copies the other trie reader object and its state, jpayne@69: * but not the byte array which will be shared. (Shallow copy.) jpayne@69: * @param other Another BytesTrie object. jpayne@69: * @stable ICU 4.8 jpayne@69: */ jpayne@69: BytesTrie(const BytesTrie &other) jpayne@69: : ownedArray_(NULL), bytes_(other.bytes_), jpayne@69: pos_(other.pos_), remainingMatchLength_(other.remainingMatchLength_) {} jpayne@69: jpayne@69: /** jpayne@69: * Resets this trie to its initial state. jpayne@69: * @return *this jpayne@69: * @stable ICU 4.8 jpayne@69: */ jpayne@69: BytesTrie &reset() { jpayne@69: pos_=bytes_; jpayne@69: remainingMatchLength_=-1; jpayne@69: return *this; jpayne@69: } jpayne@69: jpayne@69: #ifndef U_HIDE_DRAFT_API jpayne@69: /** jpayne@69: * Returns the state of this trie as a 64-bit integer. jpayne@69: * The state value is never 0. jpayne@69: * jpayne@69: * @return opaque state value jpayne@69: * @see resetToState64 jpayne@69: * @draft ICU 65 jpayne@69: */ jpayne@69: uint64_t getState64() const { jpayne@69: return (static_cast(remainingMatchLength_ + 2) << kState64RemainingShift) | jpayne@69: (uint64_t)(pos_ - bytes_); jpayne@69: } jpayne@69: jpayne@69: /** jpayne@69: * Resets this trie to the saved state. jpayne@69: * Unlike resetToState(State), the 64-bit state value jpayne@69: * must be from getState64() from the same trie object or jpayne@69: * from one initialized the exact same way. jpayne@69: * Because of no validation, this method is faster. jpayne@69: * jpayne@69: * @param state The opaque trie state value from getState64(). jpayne@69: * @return *this jpayne@69: * @see getState64 jpayne@69: * @see resetToState jpayne@69: * @see reset jpayne@69: * @draft ICU 65 jpayne@69: */ jpayne@69: BytesTrie &resetToState64(uint64_t state) { jpayne@69: remainingMatchLength_ = static_cast(state >> kState64RemainingShift) - 2; jpayne@69: pos_ = bytes_ + (state & kState64PosMask); jpayne@69: return *this; jpayne@69: } jpayne@69: #endif /* U_HIDE_DRAFT_API */ jpayne@69: jpayne@69: /** jpayne@69: * BytesTrie state object, for saving a trie's current state jpayne@69: * and resetting the trie back to this state later. jpayne@69: * @stable ICU 4.8 jpayne@69: */ jpayne@69: class State : public UMemory { jpayne@69: public: jpayne@69: /** jpayne@69: * Constructs an empty State. jpayne@69: * @stable ICU 4.8 jpayne@69: */ jpayne@69: State() { bytes=NULL; } jpayne@69: private: jpayne@69: friend class BytesTrie; jpayne@69: jpayne@69: const uint8_t *bytes; jpayne@69: const uint8_t *pos; jpayne@69: int32_t remainingMatchLength; jpayne@69: }; jpayne@69: jpayne@69: /** jpayne@69: * Saves the state of this trie. jpayne@69: * @param state The State object to hold the trie's state. jpayne@69: * @return *this jpayne@69: * @see resetToState jpayne@69: * @stable ICU 4.8 jpayne@69: */ jpayne@69: const BytesTrie &saveState(State &state) const { jpayne@69: state.bytes=bytes_; jpayne@69: state.pos=pos_; jpayne@69: state.remainingMatchLength=remainingMatchLength_; jpayne@69: return *this; jpayne@69: } jpayne@69: jpayne@69: /** jpayne@69: * Resets this trie to the saved state. jpayne@69: * If the state object contains no state, or the state of a different trie, jpayne@69: * then this trie remains unchanged. jpayne@69: * @param state The State object which holds a saved trie state. jpayne@69: * @return *this jpayne@69: * @see saveState jpayne@69: * @see reset jpayne@69: * @stable ICU 4.8 jpayne@69: */ jpayne@69: BytesTrie &resetToState(const State &state) { jpayne@69: if(bytes_==state.bytes && bytes_!=NULL) { jpayne@69: pos_=state.pos; jpayne@69: remainingMatchLength_=state.remainingMatchLength; jpayne@69: } jpayne@69: return *this; jpayne@69: } jpayne@69: jpayne@69: /** jpayne@69: * Determines whether the byte sequence so far matches, whether it has a value, jpayne@69: * and whether another input byte can continue a matching byte sequence. jpayne@69: * @return The match/value Result. jpayne@69: * @stable ICU 4.8 jpayne@69: */ jpayne@69: UStringTrieResult current() const; jpayne@69: jpayne@69: /** jpayne@69: * Traverses the trie from the initial state for this input byte. jpayne@69: * Equivalent to reset().next(inByte). jpayne@69: * @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff. jpayne@69: * Values below -0x100 and above 0xff will never match. jpayne@69: * @return The match/value Result. jpayne@69: * @stable ICU 4.8 jpayne@69: */ jpayne@69: inline UStringTrieResult first(int32_t inByte) { jpayne@69: remainingMatchLength_=-1; jpayne@69: if(inByte<0) { jpayne@69: inByte+=0x100; jpayne@69: } jpayne@69: return nextImpl(bytes_, inByte); jpayne@69: } jpayne@69: jpayne@69: /** jpayne@69: * Traverses the trie from the current state for this input byte. jpayne@69: * @param inByte Input byte value. Values -0x100..-1 are treated like 0..0xff. jpayne@69: * Values below -0x100 and above 0xff will never match. jpayne@69: * @return The match/value Result. jpayne@69: * @stable ICU 4.8 jpayne@69: */ jpayne@69: UStringTrieResult next(int32_t inByte); jpayne@69: jpayne@69: /** jpayne@69: * Traverses the trie from the current state for this byte sequence. jpayne@69: * Equivalent to jpayne@69: * \code jpayne@69: * Result result=current(); jpayne@69: * for(each c in s) jpayne@69: * if(!USTRINGTRIE_HAS_NEXT(result)) return USTRINGTRIE_NO_MATCH; jpayne@69: * result=next(c); jpayne@69: * return result; jpayne@69: * \endcode jpayne@69: * @param s A string or byte sequence. Can be NULL if length is 0. jpayne@69: * @param length The length of the byte sequence. Can be -1 if NUL-terminated. jpayne@69: * @return The match/value Result. jpayne@69: * @stable ICU 4.8 jpayne@69: */ jpayne@69: UStringTrieResult next(const char *s, int32_t length); jpayne@69: jpayne@69: /** jpayne@69: * Returns a matching byte sequence's value if called immediately after jpayne@69: * current()/first()/next() returned USTRINGTRIE_INTERMEDIATE_VALUE or USTRINGTRIE_FINAL_VALUE. jpayne@69: * getValue() can be called multiple times. jpayne@69: * jpayne@69: * Do not call getValue() after USTRINGTRIE_NO_MATCH or USTRINGTRIE_NO_VALUE! jpayne@69: * @return The value for the byte sequence so far. jpayne@69: * @stable ICU 4.8 jpayne@69: */ jpayne@69: inline int32_t getValue() const { jpayne@69: const uint8_t *pos=pos_; jpayne@69: int32_t leadByte=*pos++; jpayne@69: // U_ASSERT(leadByte>=kMinValueLead); jpayne@69: return readValue(pos, leadByte>>1); jpayne@69: } jpayne@69: jpayne@69: /** jpayne@69: * Determines whether all byte sequences reachable from the current state jpayne@69: * map to the same value. jpayne@69: * @param uniqueValue Receives the unique value, if this function returns TRUE. jpayne@69: * (output-only) jpayne@69: * @return TRUE if all byte sequences reachable from the current state jpayne@69: * map to the same value. jpayne@69: * @stable ICU 4.8 jpayne@69: */ jpayne@69: inline UBool hasUniqueValue(int32_t &uniqueValue) const { jpayne@69: const uint8_t *pos=pos_; jpayne@69: // Skip the rest of a pending linear-match node. jpayne@69: return pos!=NULL && findUniqueValue(pos+remainingMatchLength_+1, FALSE, uniqueValue); jpayne@69: } jpayne@69: jpayne@69: /** jpayne@69: * Finds each byte which continues the byte sequence from the current state. jpayne@69: * That is, each byte b for which it would be next(b)!=USTRINGTRIE_NO_MATCH now. jpayne@69: * @param out Each next byte is appended to this object. jpayne@69: * (Only uses the out.Append(s, length) method.) jpayne@69: * @return the number of bytes which continue the byte sequence from here jpayne@69: * @stable ICU 4.8 jpayne@69: */ jpayne@69: int32_t getNextBytes(ByteSink &out) const; jpayne@69: jpayne@69: /** jpayne@69: * Iterator for all of the (byte sequence, value) pairs in a BytesTrie. jpayne@69: * @stable ICU 4.8 jpayne@69: */ jpayne@69: class U_COMMON_API Iterator : public UMemory { jpayne@69: public: jpayne@69: /** jpayne@69: * Iterates from the root of a byte-serialized BytesTrie. jpayne@69: * @param trieBytes The trie bytes. jpayne@69: * @param maxStringLength If 0, the iterator returns full strings/byte sequences. jpayne@69: * Otherwise, the iterator returns strings with this maximum length. jpayne@69: * @param errorCode Standard ICU error code. Its input value must jpayne@69: * pass the U_SUCCESS() test, or else the function returns jpayne@69: * immediately. Check for U_FAILURE() on output or use with jpayne@69: * function chaining. (See User Guide for details.) jpayne@69: * @stable ICU 4.8 jpayne@69: */ jpayne@69: Iterator(const void *trieBytes, int32_t maxStringLength, UErrorCode &errorCode); jpayne@69: jpayne@69: /** jpayne@69: * Iterates from the current state of the specified BytesTrie. jpayne@69: * @param trie The trie whose state will be copied for iteration. jpayne@69: * @param maxStringLength If 0, the iterator returns full strings/byte sequences. jpayne@69: * Otherwise, the iterator returns strings with this maximum length. jpayne@69: * @param errorCode Standard ICU error code. Its input value must jpayne@69: * pass the U_SUCCESS() test, or else the function returns jpayne@69: * immediately. Check for U_FAILURE() on output or use with jpayne@69: * function chaining. (See User Guide for details.) jpayne@69: * @stable ICU 4.8 jpayne@69: */ jpayne@69: Iterator(const BytesTrie &trie, int32_t maxStringLength, UErrorCode &errorCode); jpayne@69: jpayne@69: /** jpayne@69: * Destructor. jpayne@69: * @stable ICU 4.8 jpayne@69: */ jpayne@69: ~Iterator(); jpayne@69: jpayne@69: /** jpayne@69: * Resets this iterator to its initial state. jpayne@69: * @return *this jpayne@69: * @stable ICU 4.8 jpayne@69: */ jpayne@69: Iterator &reset(); jpayne@69: jpayne@69: /** jpayne@69: * @return TRUE if there are more elements. jpayne@69: * @stable ICU 4.8 jpayne@69: */ jpayne@69: UBool hasNext() const; jpayne@69: jpayne@69: /** jpayne@69: * Finds the next (byte sequence, value) pair if there is one. jpayne@69: * jpayne@69: * If the byte sequence is truncated to the maximum length and does not jpayne@69: * have a real value, then the value is set to -1. jpayne@69: * In this case, this "not a real value" is indistinguishable from jpayne@69: * a real value of -1. jpayne@69: * @param errorCode Standard ICU error code. Its input value must jpayne@69: * pass the U_SUCCESS() test, or else the function returns jpayne@69: * immediately. Check for U_FAILURE() on output or use with jpayne@69: * function chaining. (See User Guide for details.) jpayne@69: * @return TRUE if there is another element. jpayne@69: * @stable ICU 4.8 jpayne@69: */ jpayne@69: UBool next(UErrorCode &errorCode); jpayne@69: jpayne@69: /** jpayne@69: * @return The NUL-terminated byte sequence for the last successful next(). jpayne@69: * @stable ICU 4.8 jpayne@69: */ jpayne@69: StringPiece getString() const; jpayne@69: /** jpayne@69: * @return The value for the last successful next(). jpayne@69: * @stable ICU 4.8 jpayne@69: */ jpayne@69: int32_t getValue() const { return value_; } jpayne@69: jpayne@69: private: jpayne@69: UBool truncateAndStop(); jpayne@69: jpayne@69: const uint8_t *branchNext(const uint8_t *pos, int32_t length, UErrorCode &errorCode); jpayne@69: jpayne@69: const uint8_t *bytes_; jpayne@69: const uint8_t *pos_; jpayne@69: const uint8_t *initialPos_; jpayne@69: int32_t remainingMatchLength_; jpayne@69: int32_t initialRemainingMatchLength_; jpayne@69: jpayne@69: CharString *str_; jpayne@69: int32_t maxLength_; jpayne@69: int32_t value_; jpayne@69: jpayne@69: // The stack stores pairs of integers for backtracking to another jpayne@69: // outbound edge of a branch node. jpayne@69: // The first integer is an offset from bytes_. jpayne@69: // The second integer has the str_->length() from before the node in bits 15..0, jpayne@69: // and the remaining branch length in bits 24..16. (Bits 31..25 are unused.) jpayne@69: // (We could store the remaining branch length minus 1 in bits 23..16 and not use bits 31..24, jpayne@69: // but the code looks more confusing that way.) jpayne@69: UVector32 *stack_; jpayne@69: }; jpayne@69: jpayne@69: private: jpayne@69: friend class BytesTrieBuilder; jpayne@69: jpayne@69: /** jpayne@69: * Constructs a BytesTrie reader instance. jpayne@69: * Unlike the public constructor which just aliases an array, jpayne@69: * this constructor adopts the builder's array. jpayne@69: * This constructor is only called by the builder. jpayne@69: */ jpayne@69: BytesTrie(void *adoptBytes, const void *trieBytes) jpayne@69: : ownedArray_(static_cast(adoptBytes)), jpayne@69: bytes_(static_cast(trieBytes)), jpayne@69: pos_(bytes_), remainingMatchLength_(-1) {} jpayne@69: jpayne@69: // No assignment operator. jpayne@69: BytesTrie &operator=(const BytesTrie &other); jpayne@69: jpayne@69: inline void stop() { jpayne@69: pos_=NULL; jpayne@69: } jpayne@69: jpayne@69: // Reads a compact 32-bit integer. jpayne@69: // pos is already after the leadByte, and the lead byte is already shifted right by 1. jpayne@69: static int32_t readValue(const uint8_t *pos, int32_t leadByte); jpayne@69: static inline const uint8_t *skipValue(const uint8_t *pos, int32_t leadByte) { jpayne@69: // U_ASSERT(leadByte>=kMinValueLead); jpayne@69: if(leadByte>=(kMinTwoByteValueLead<<1)) { jpayne@69: if(leadByte<(kMinThreeByteValueLead<<1)) { jpayne@69: ++pos; jpayne@69: } else if(leadByte<(kFourByteValueLead<<1)) { jpayne@69: pos+=2; jpayne@69: } else { jpayne@69: pos+=3+((leadByte>>1)&1); jpayne@69: } jpayne@69: } jpayne@69: return pos; jpayne@69: } jpayne@69: static inline const uint8_t *skipValue(const uint8_t *pos) { jpayne@69: int32_t leadByte=*pos++; jpayne@69: return skipValue(pos, leadByte); jpayne@69: } jpayne@69: jpayne@69: // Reads a jump delta and jumps. jpayne@69: static const uint8_t *jumpByDelta(const uint8_t *pos); jpayne@69: jpayne@69: static inline const uint8_t *skipDelta(const uint8_t *pos) { jpayne@69: int32_t delta=*pos++; jpayne@69: if(delta>=kMinTwoByteDeltaLead) { jpayne@69: if(delta>8)+1; // 0x6c jpayne@69: static const int32_t kFourByteValueLead=0x7e; jpayne@69: jpayne@69: // A little more than Unicode code points. (0x11ffff) jpayne@69: static const int32_t kMaxThreeByteValue=((kFourByteValueLead-kMinThreeByteValueLead)<<16)-1; jpayne@69: jpayne@69: static const int32_t kFiveByteValueLead=0x7f; jpayne@69: jpayne@69: // Compact delta integers. jpayne@69: static const int32_t kMaxOneByteDelta=0xbf; jpayne@69: static const int32_t kMinTwoByteDeltaLead=kMaxOneByteDelta+1; // 0xc0 jpayne@69: static const int32_t kMinThreeByteDeltaLead=0xf0; jpayne@69: static const int32_t kFourByteDeltaLead=0xfe; jpayne@69: static const int32_t kFiveByteDeltaLead=0xff; jpayne@69: jpayne@69: static const int32_t kMaxTwoByteDelta=((kMinThreeByteDeltaLead-kMinTwoByteDeltaLead)<<8)-1; // 0x2fff jpayne@69: static const int32_t kMaxThreeByteDelta=((kFourByteDeltaLead-kMinThreeByteDeltaLead)<<16)-1; // 0xdffff jpayne@69: jpayne@69: // For getState64(): jpayne@69: // The remainingMatchLength_ is -1..14=(kMaxLinearMatchLength=0x10)-2 jpayne@69: // so we need at least 5 bits for that. jpayne@69: // We add 2 to store it as a positive value 1..16=kMaxLinearMatchLength. jpayne@69: static constexpr int32_t kState64RemainingShift = 59; jpayne@69: static constexpr uint64_t kState64PosMask = (UINT64_C(1) << kState64RemainingShift) - 1; jpayne@69: jpayne@69: uint8_t *ownedArray_; jpayne@69: jpayne@69: // Fixed value referencing the BytesTrie bytes. jpayne@69: const uint8_t *bytes_; jpayne@69: jpayne@69: // Iterator variables. jpayne@69: jpayne@69: // Pointer to next trie byte to read. NULL if no more matches. jpayne@69: const uint8_t *pos_; jpayne@69: // Remaining length of a linear-match node, minus 1. Negative if not in such a node. jpayne@69: int32_t remainingMatchLength_; jpayne@69: }; jpayne@69: jpayne@69: U_NAMESPACE_END jpayne@69: jpayne@69: #endif /* U_SHOW_CPLUSPLUS_API */ jpayne@69: jpayne@69: #endif // __BYTESTRIE_H__