annotate CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/include/unicode/regex.h @ 69:33d812a61356

planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author jpayne
date Tue, 18 Mar 2025 17:55:14 -0400
parents
children
rev   line source
jpayne@69 1 // © 2016 and later: Unicode, Inc. and others.
jpayne@69 2 // License & terms of use: http://www.unicode.org/copyright.html
jpayne@69 3 /*
jpayne@69 4 **********************************************************************
jpayne@69 5 * Copyright (C) 2002-2016, International Business Machines
jpayne@69 6 * Corporation and others. All Rights Reserved.
jpayne@69 7 **********************************************************************
jpayne@69 8 * file name: regex.h
jpayne@69 9 * encoding: UTF-8
jpayne@69 10 * indentation:4
jpayne@69 11 *
jpayne@69 12 * created on: 2002oct22
jpayne@69 13 * created by: Andy Heninger
jpayne@69 14 *
jpayne@69 15 * ICU Regular Expressions, API for C++
jpayne@69 16 */
jpayne@69 17
jpayne@69 18 #ifndef REGEX_H
jpayne@69 19 #define REGEX_H
jpayne@69 20
jpayne@69 21 //#define REGEX_DEBUG
jpayne@69 22
jpayne@69 23 /**
jpayne@69 24 * \file
jpayne@69 25 * \brief C++ API: Regular Expressions
jpayne@69 26 *
jpayne@69 27 * The ICU API for processing regular expressions consists of two classes,
jpayne@69 28 * `RegexPattern` and `RegexMatcher`.
jpayne@69 29 * `RegexPattern` objects represent a pre-processed, or compiled
jpayne@69 30 * regular expression. They are created from a regular expression pattern string,
jpayne@69 31 * and can be used to create `RegexMatcher` objects for the pattern.
jpayne@69 32 *
jpayne@69 33 * Class `RegexMatcher` bundles together a regular expression
jpayne@69 34 * pattern and a target string to which the search pattern will be applied.
jpayne@69 35 * `RegexMatcher` includes API for doing plain find or search
jpayne@69 36 * operations, for search and replace operations, and for obtaining detailed
jpayne@69 37 * information about bounds of a match.
jpayne@69 38 *
jpayne@69 39 * Note that by constructing `RegexMatcher` objects directly from regular
jpayne@69 40 * expression pattern strings application code can be simplified and the explicit
jpayne@69 41 * need for `RegexPattern` objects can usually be eliminated.
jpayne@69 42 *
jpayne@69 43 */
jpayne@69 44
jpayne@69 45 #include "unicode/utypes.h"
jpayne@69 46
jpayne@69 47 #if U_SHOW_CPLUSPLUS_API
jpayne@69 48
jpayne@69 49 #if !UCONFIG_NO_REGULAR_EXPRESSIONS
jpayne@69 50
jpayne@69 51 #include "unicode/uobject.h"
jpayne@69 52 #include "unicode/unistr.h"
jpayne@69 53 #include "unicode/utext.h"
jpayne@69 54 #include "unicode/parseerr.h"
jpayne@69 55
jpayne@69 56 #include "unicode/uregex.h"
jpayne@69 57
jpayne@69 58 // Forward Declarations
jpayne@69 59
jpayne@69 60 struct UHashtable;
jpayne@69 61
jpayne@69 62 U_NAMESPACE_BEGIN
jpayne@69 63
jpayne@69 64 struct Regex8BitSet;
jpayne@69 65 class RegexCImpl;
jpayne@69 66 class RegexMatcher;
jpayne@69 67 class RegexPattern;
jpayne@69 68 struct REStackFrame;
jpayne@69 69 class BreakIterator;
jpayne@69 70 class UnicodeSet;
jpayne@69 71 class UVector;
jpayne@69 72 class UVector32;
jpayne@69 73 class UVector64;
jpayne@69 74
jpayne@69 75
jpayne@69 76 /**
jpayne@69 77 * Class `RegexPattern` represents a compiled regular expression. It includes
jpayne@69 78 * factory methods for creating a RegexPattern object from the source (string) form
jpayne@69 79 * of a regular expression, methods for creating RegexMatchers that allow the pattern
jpayne@69 80 * to be applied to input text, and a few convenience methods for simple common
jpayne@69 81 * uses of regular expressions.
jpayne@69 82 *
jpayne@69 83 * Class RegexPattern is not intended to be subclassed.
jpayne@69 84 *
jpayne@69 85 * @stable ICU 2.4
jpayne@69 86 */
jpayne@69 87 class U_I18N_API RegexPattern U_FINAL : public UObject {
jpayne@69 88 public:
jpayne@69 89
jpayne@69 90 /**
jpayne@69 91 * Default constructor. Create a RegexPattern object that refers to no actual
jpayne@69 92 * pattern. Not normally needed; RegexPattern objects are usually
jpayne@69 93 * created using the factory method `compile()`.
jpayne@69 94 *
jpayne@69 95 * @stable ICU 2.4
jpayne@69 96 */
jpayne@69 97 RegexPattern();
jpayne@69 98
jpayne@69 99 /**
jpayne@69 100 * Copy Constructor. Create a new RegexPattern object that is equivalent
jpayne@69 101 * to the source object.
jpayne@69 102 * @param source the pattern object to be copied.
jpayne@69 103 * @stable ICU 2.4
jpayne@69 104 */
jpayne@69 105 RegexPattern(const RegexPattern &source);
jpayne@69 106
jpayne@69 107 /**
jpayne@69 108 * Destructor. Note that a RegexPattern object must persist so long as any
jpayne@69 109 * RegexMatcher objects that were created from the RegexPattern are active.
jpayne@69 110 * @stable ICU 2.4
jpayne@69 111 */
jpayne@69 112 virtual ~RegexPattern();
jpayne@69 113
jpayne@69 114 /**
jpayne@69 115 * Comparison operator. Two RegexPattern objects are considered equal if they
jpayne@69 116 * were constructed from identical source patterns using the same #URegexpFlag
jpayne@69 117 * settings.
jpayne@69 118 * @param that a RegexPattern object to compare with "this".
jpayne@69 119 * @return TRUE if the objects are equivalent.
jpayne@69 120 * @stable ICU 2.4
jpayne@69 121 */
jpayne@69 122 UBool operator==(const RegexPattern& that) const;
jpayne@69 123
jpayne@69 124 /**
jpayne@69 125 * Inequality operator. Two RegexPattern objects are considered equal if they
jpayne@69 126 * were constructed from identical source patterns using the same #URegexpFlag
jpayne@69 127 * settings.
jpayne@69 128 * @param that a RegexPattern object to compare with "this".
jpayne@69 129 * @return TRUE if the objects are different.
jpayne@69 130 * @stable ICU 2.4
jpayne@69 131 */
jpayne@69 132 inline UBool operator!=(const RegexPattern& that) const {return ! operator ==(that);}
jpayne@69 133
jpayne@69 134 /**
jpayne@69 135 * Assignment operator. After assignment, this RegexPattern will behave identically
jpayne@69 136 * to the source object.
jpayne@69 137 * @stable ICU 2.4
jpayne@69 138 */
jpayne@69 139 RegexPattern &operator =(const RegexPattern &source);
jpayne@69 140
jpayne@69 141 /**
jpayne@69 142 * Create an exact copy of this RegexPattern object. Since RegexPattern is not
jpayne@69 143 * intended to be subclassed, <code>clone()</code> and copy construction are
jpayne@69 144 * equivalent operations.
jpayne@69 145 * @return the copy of this RegexPattern
jpayne@69 146 * @stable ICU 2.4
jpayne@69 147 */
jpayne@69 148 virtual RegexPattern *clone() const;
jpayne@69 149
jpayne@69 150
jpayne@69 151 /**
jpayne@69 152 * Compiles the regular expression in string form into a RegexPattern
jpayne@69 153 * object. These compile methods, rather than the constructors, are the usual
jpayne@69 154 * way that RegexPattern objects are created.
jpayne@69 155 *
jpayne@69 156 * Note that RegexPattern objects must not be deleted while RegexMatcher
jpayne@69 157 * objects created from the pattern are active. RegexMatchers keep a pointer
jpayne@69 158 * back to their pattern, so premature deletion of the pattern is a
jpayne@69 159 * catastrophic error.
jpayne@69 160 *
jpayne@69 161 * All #URegexpFlag pattern match mode flags are set to their default values.
jpayne@69 162 *
jpayne@69 163 * Note that it is often more convenient to construct a RegexMatcher directly
jpayne@69 164 * from a pattern string rather than separately compiling the pattern and
jpayne@69 165 * then creating a RegexMatcher object from the pattern.
jpayne@69 166 *
jpayne@69 167 * @param regex The regular expression to be compiled.
jpayne@69 168 * @param pe Receives the position (line and column numbers) of any error
jpayne@69 169 * within the regular expression.
jpayne@69 170 * @param status A reference to a UErrorCode to receive any errors.
jpayne@69 171 * @return A RegexPattern object for the compiled pattern.
jpayne@69 172 *
jpayne@69 173 * @stable ICU 2.4
jpayne@69 174 */
jpayne@69 175 static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
jpayne@69 176 UParseError &pe,
jpayne@69 177 UErrorCode &status);
jpayne@69 178
jpayne@69 179 /**
jpayne@69 180 * Compiles the regular expression in string form into a RegexPattern
jpayne@69 181 * object. These compile methods, rather than the constructors, are the usual
jpayne@69 182 * way that RegexPattern objects are created.
jpayne@69 183 *
jpayne@69 184 * Note that RegexPattern objects must not be deleted while RegexMatcher
jpayne@69 185 * objects created from the pattern are active. RegexMatchers keep a pointer
jpayne@69 186 * back to their pattern, so premature deletion of the pattern is a
jpayne@69 187 * catastrophic error.
jpayne@69 188 *
jpayne@69 189 * All #URegexpFlag pattern match mode flags are set to their default values.
jpayne@69 190 *
jpayne@69 191 * Note that it is often more convenient to construct a RegexMatcher directly
jpayne@69 192 * from a pattern string rather than separately compiling the pattern and
jpayne@69 193 * then creating a RegexMatcher object from the pattern.
jpayne@69 194 *
jpayne@69 195 * @param regex The regular expression to be compiled. Note, the text referred
jpayne@69 196 * to by this UText must not be deleted during the lifetime of the
jpayne@69 197 * RegexPattern object or any RegexMatcher object created from it.
jpayne@69 198 * @param pe Receives the position (line and column numbers) of any error
jpayne@69 199 * within the regular expression.
jpayne@69 200 * @param status A reference to a UErrorCode to receive any errors.
jpayne@69 201 * @return A RegexPattern object for the compiled pattern.
jpayne@69 202 *
jpayne@69 203 * @stable ICU 4.6
jpayne@69 204 */
jpayne@69 205 static RegexPattern * U_EXPORT2 compile( UText *regex,
jpayne@69 206 UParseError &pe,
jpayne@69 207 UErrorCode &status);
jpayne@69 208
jpayne@69 209 /**
jpayne@69 210 * Compiles the regular expression in string form into a RegexPattern
jpayne@69 211 * object using the specified #URegexpFlag match mode flags. These compile methods,
jpayne@69 212 * rather than the constructors, are the usual way that RegexPattern objects
jpayne@69 213 * are created.
jpayne@69 214 *
jpayne@69 215 * Note that RegexPattern objects must not be deleted while RegexMatcher
jpayne@69 216 * objects created from the pattern are active. RegexMatchers keep a pointer
jpayne@69 217 * back to their pattern, so premature deletion of the pattern is a
jpayne@69 218 * catastrophic error.
jpayne@69 219 *
jpayne@69 220 * Note that it is often more convenient to construct a RegexMatcher directly
jpayne@69 221 * from a pattern string rather than separately compiling the pattern and
jpayne@69 222 * then creating a RegexMatcher object from the pattern.
jpayne@69 223 *
jpayne@69 224 * @param regex The regular expression to be compiled.
jpayne@69 225 * @param flags The #URegexpFlag match mode flags to be used, e.g. #UREGEX_CASE_INSENSITIVE.
jpayne@69 226 * @param pe Receives the position (line and column numbers) of any error
jpayne@69 227 * within the regular expression.
jpayne@69 228 * @param status A reference to a UErrorCode to receive any errors.
jpayne@69 229 * @return A RegexPattern object for the compiled pattern.
jpayne@69 230 *
jpayne@69 231 * @stable ICU 2.4
jpayne@69 232 */
jpayne@69 233 static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
jpayne@69 234 uint32_t flags,
jpayne@69 235 UParseError &pe,
jpayne@69 236 UErrorCode &status);
jpayne@69 237
jpayne@69 238 /**
jpayne@69 239 * Compiles the regular expression in string form into a RegexPattern
jpayne@69 240 * object using the specified #URegexpFlag match mode flags. These compile methods,
jpayne@69 241 * rather than the constructors, are the usual way that RegexPattern objects
jpayne@69 242 * are created.
jpayne@69 243 *
jpayne@69 244 * Note that RegexPattern objects must not be deleted while RegexMatcher
jpayne@69 245 * objects created from the pattern are active. RegexMatchers keep a pointer
jpayne@69 246 * back to their pattern, so premature deletion of the pattern is a
jpayne@69 247 * catastrophic error.
jpayne@69 248 *
jpayne@69 249 * Note that it is often more convenient to construct a RegexMatcher directly
jpayne@69 250 * from a pattern string rather than separately compiling the pattern and
jpayne@69 251 * then creating a RegexMatcher object from the pattern.
jpayne@69 252 *
jpayne@69 253 * @param regex The regular expression to be compiled. Note, the text referred
jpayne@69 254 * to by this UText must not be deleted during the lifetime of the
jpayne@69 255 * RegexPattern object or any RegexMatcher object created from it.
jpayne@69 256 * @param flags The #URegexpFlag match mode flags to be used, e.g. #UREGEX_CASE_INSENSITIVE.
jpayne@69 257 * @param pe Receives the position (line and column numbers) of any error
jpayne@69 258 * within the regular expression.
jpayne@69 259 * @param status A reference to a UErrorCode to receive any errors.
jpayne@69 260 * @return A RegexPattern object for the compiled pattern.
jpayne@69 261 *
jpayne@69 262 * @stable ICU 4.6
jpayne@69 263 */
jpayne@69 264 static RegexPattern * U_EXPORT2 compile( UText *regex,
jpayne@69 265 uint32_t flags,
jpayne@69 266 UParseError &pe,
jpayne@69 267 UErrorCode &status);
jpayne@69 268
jpayne@69 269 /**
jpayne@69 270 * Compiles the regular expression in string form into a RegexPattern
jpayne@69 271 * object using the specified #URegexpFlag match mode flags. These compile methods,
jpayne@69 272 * rather than the constructors, are the usual way that RegexPattern objects
jpayne@69 273 * are created.
jpayne@69 274 *
jpayne@69 275 * Note that RegexPattern objects must not be deleted while RegexMatcher
jpayne@69 276 * objects created from the pattern are active. RegexMatchers keep a pointer
jpayne@69 277 * back to their pattern, so premature deletion of the pattern is a
jpayne@69 278 * catastrophic error.
jpayne@69 279 *
jpayne@69 280 * Note that it is often more convenient to construct a RegexMatcher directly
jpayne@69 281 * from a pattern string rather than separately compiling the pattern and
jpayne@69 282 * then creating a RegexMatcher object from the pattern.
jpayne@69 283 *
jpayne@69 284 * @param regex The regular expression to be compiled.
jpayne@69 285 * @param flags The #URegexpFlag match mode flags to be used, e.g. #UREGEX_CASE_INSENSITIVE.
jpayne@69 286 * @param status A reference to a UErrorCode to receive any errors.
jpayne@69 287 * @return A RegexPattern object for the compiled pattern.
jpayne@69 288 *
jpayne@69 289 * @stable ICU 2.6
jpayne@69 290 */
jpayne@69 291 static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
jpayne@69 292 uint32_t flags,
jpayne@69 293 UErrorCode &status);
jpayne@69 294
jpayne@69 295 /**
jpayne@69 296 * Compiles the regular expression in string form into a RegexPattern
jpayne@69 297 * object using the specified #URegexpFlag match mode flags. These compile methods,
jpayne@69 298 * rather than the constructors, are the usual way that RegexPattern objects
jpayne@69 299 * are created.
jpayne@69 300 *
jpayne@69 301 * Note that RegexPattern objects must not be deleted while RegexMatcher
jpayne@69 302 * objects created from the pattern are active. RegexMatchers keep a pointer
jpayne@69 303 * back to their pattern, so premature deletion of the pattern is a
jpayne@69 304 * catastrophic error.
jpayne@69 305 *
jpayne@69 306 * Note that it is often more convenient to construct a RegexMatcher directly
jpayne@69 307 * from a pattern string rather than separately compiling the pattern and
jpayne@69 308 * then creating a RegexMatcher object from the pattern.
jpayne@69 309 *
jpayne@69 310 * @param regex The regular expression to be compiled. Note, the text referred
jpayne@69 311 * to by this UText must not be deleted during the lifetime of the
jpayne@69 312 * RegexPattern object or any RegexMatcher object created from it.
jpayne@69 313 * @param flags The #URegexpFlag match mode flags to be used, e.g. #UREGEX_CASE_INSENSITIVE.
jpayne@69 314 * @param status A reference to a UErrorCode to receive any errors.
jpayne@69 315 * @return A RegexPattern object for the compiled pattern.
jpayne@69 316 *
jpayne@69 317 * @stable ICU 4.6
jpayne@69 318 */
jpayne@69 319 static RegexPattern * U_EXPORT2 compile( UText *regex,
jpayne@69 320 uint32_t flags,
jpayne@69 321 UErrorCode &status);
jpayne@69 322
jpayne@69 323 /**
jpayne@69 324 * Get the #URegexpFlag match mode flags that were used when compiling this pattern.
jpayne@69 325 * @return the #URegexpFlag match mode flags
jpayne@69 326 * @stable ICU 2.4
jpayne@69 327 */
jpayne@69 328 virtual uint32_t flags() const;
jpayne@69 329
jpayne@69 330 /**
jpayne@69 331 * Creates a RegexMatcher that will match the given input against this pattern. The
jpayne@69 332 * RegexMatcher can then be used to perform match, find or replace operations
jpayne@69 333 * on the input. Note that a RegexPattern object must not be deleted while
jpayne@69 334 * RegexMatchers created from it still exist and might possibly be used again.
jpayne@69 335 *
jpayne@69 336 * The matcher will retain a reference to the supplied input string, and all regexp
jpayne@69 337 * pattern matching operations happen directly on this original string. It is
jpayne@69 338 * critical that the string not be altered or deleted before use by the regular
jpayne@69 339 * expression operations is complete.
jpayne@69 340 *
jpayne@69 341 * @param input The input string to which the regular expression will be applied.
jpayne@69 342 * @param status A reference to a UErrorCode to receive any errors.
jpayne@69 343 * @return A RegexMatcher object for this pattern and input.
jpayne@69 344 *
jpayne@69 345 * @stable ICU 2.4
jpayne@69 346 */
jpayne@69 347 virtual RegexMatcher *matcher(const UnicodeString &input,
jpayne@69 348 UErrorCode &status) const;
jpayne@69 349
jpayne@69 350 private:
jpayne@69 351 /**
jpayne@69 352 * Cause a compilation error if an application accidentally attempts to
jpayne@69 353 * create a matcher with a (char16_t *) string as input rather than
jpayne@69 354 * a UnicodeString. Avoids a dangling reference to a temporary string.
jpayne@69 355 *
jpayne@69 356 * To efficiently work with char16_t *strings, wrap the data in a UnicodeString
jpayne@69 357 * using one of the aliasing constructors, such as
jpayne@69 358 * `UnicodeString(UBool isTerminated, const char16_t *text, int32_t textLength);`
jpayne@69 359 * or in a UText, using
jpayne@69 360 * `utext_openUChars(UText *ut, const char16_t *text, int64_t textLength, UErrorCode *status);`
jpayne@69 361 *
jpayne@69 362 */
jpayne@69 363 RegexMatcher *matcher(const char16_t *input,
jpayne@69 364 UErrorCode &status) const;
jpayne@69 365 public:
jpayne@69 366
jpayne@69 367
jpayne@69 368 /**
jpayne@69 369 * Creates a RegexMatcher that will match against this pattern. The
jpayne@69 370 * RegexMatcher can be used to perform match, find or replace operations.
jpayne@69 371 * Note that a RegexPattern object must not be deleted while
jpayne@69 372 * RegexMatchers created from it still exist and might possibly be used again.
jpayne@69 373 *
jpayne@69 374 * @param status A reference to a UErrorCode to receive any errors.
jpayne@69 375 * @return A RegexMatcher object for this pattern.
jpayne@69 376 *
jpayne@69 377 * @stable ICU 2.6
jpayne@69 378 */
jpayne@69 379 virtual RegexMatcher *matcher(UErrorCode &status) const;
jpayne@69 380
jpayne@69 381
jpayne@69 382 /**
jpayne@69 383 * Test whether a string matches a regular expression. This convenience function
jpayne@69 384 * both compiles the regular expression and applies it in a single operation.
jpayne@69 385 * Note that if the same pattern needs to be applied repeatedly, this method will be
jpayne@69 386 * less efficient than creating and reusing a RegexMatcher object.
jpayne@69 387 *
jpayne@69 388 * @param regex The regular expression
jpayne@69 389 * @param input The string data to be matched
jpayne@69 390 * @param pe Receives the position of any syntax errors within the regular expression
jpayne@69 391 * @param status A reference to a UErrorCode to receive any errors.
jpayne@69 392 * @return True if the regular expression exactly matches the full input string.
jpayne@69 393 *
jpayne@69 394 * @stable ICU 2.4
jpayne@69 395 */
jpayne@69 396 static UBool U_EXPORT2 matches(const UnicodeString &regex,
jpayne@69 397 const UnicodeString &input,
jpayne@69 398 UParseError &pe,
jpayne@69 399 UErrorCode &status);
jpayne@69 400
jpayne@69 401 /**
jpayne@69 402 * Test whether a string matches a regular expression. This convenience function
jpayne@69 403 * both compiles the regular expression and applies it in a single operation.
jpayne@69 404 * Note that if the same pattern needs to be applied repeatedly, this method will be
jpayne@69 405 * less efficient than creating and reusing a RegexMatcher object.
jpayne@69 406 *
jpayne@69 407 * @param regex The regular expression
jpayne@69 408 * @param input The string data to be matched
jpayne@69 409 * @param pe Receives the position of any syntax errors within the regular expression
jpayne@69 410 * @param status A reference to a UErrorCode to receive any errors.
jpayne@69 411 * @return True if the regular expression exactly matches the full input string.
jpayne@69 412 *
jpayne@69 413 * @stable ICU 4.6
jpayne@69 414 */
jpayne@69 415 static UBool U_EXPORT2 matches(UText *regex,
jpayne@69 416 UText *input,
jpayne@69 417 UParseError &pe,
jpayne@69 418 UErrorCode &status);
jpayne@69 419
jpayne@69 420 /**
jpayne@69 421 * Returns the regular expression from which this pattern was compiled. This method will work
jpayne@69 422 * even if the pattern was compiled from a UText.
jpayne@69 423 *
jpayne@69 424 * Note: If the pattern was originally compiled from a UText, and that UText was modified,
jpayne@69 425 * the returned string may no longer reflect the RegexPattern object.
jpayne@69 426 * @stable ICU 2.4
jpayne@69 427 */
jpayne@69 428 virtual UnicodeString pattern() const;
jpayne@69 429
jpayne@69 430
jpayne@69 431 /**
jpayne@69 432 * Returns the regular expression from which this pattern was compiled. This method will work
jpayne@69 433 * even if the pattern was compiled from a UnicodeString.
jpayne@69 434 *
jpayne@69 435 * Note: This is the original input, not a clone. If the pattern was originally compiled from a
jpayne@69 436 * UText, and that UText was modified, the returned UText may no longer reflect the RegexPattern
jpayne@69 437 * object.
jpayne@69 438 *
jpayne@69 439 * @stable ICU 4.6
jpayne@69 440 */
jpayne@69 441 virtual UText *patternText(UErrorCode &status) const;
jpayne@69 442
jpayne@69 443
jpayne@69 444 /**
jpayne@69 445 * Get the group number corresponding to a named capture group.
jpayne@69 446 * The returned number can be used with any function that accesses
jpayne@69 447 * capture groups by number.
jpayne@69 448 *
jpayne@69 449 * The function returns an error status if the specified name does not
jpayne@69 450 * appear in the pattern.
jpayne@69 451 *
jpayne@69 452 * @param groupName The capture group name.
jpayne@69 453 * @param status A UErrorCode to receive any errors.
jpayne@69 454 *
jpayne@69 455 * @stable ICU 55
jpayne@69 456 */
jpayne@69 457 virtual int32_t groupNumberFromName(const UnicodeString &groupName, UErrorCode &status) const;
jpayne@69 458
jpayne@69 459
jpayne@69 460 /**
jpayne@69 461 * Get the group number corresponding to a named capture group.
jpayne@69 462 * The returned number can be used with any function that accesses
jpayne@69 463 * capture groups by number.
jpayne@69 464 *
jpayne@69 465 * The function returns an error status if the specified name does not
jpayne@69 466 * appear in the pattern.
jpayne@69 467 *
jpayne@69 468 * @param groupName The capture group name,
jpayne@69 469 * platform invariant characters only.
jpayne@69 470 * @param nameLength The length of the name, or -1 if the name is
jpayne@69 471 * nul-terminated.
jpayne@69 472 * @param status A UErrorCode to receive any errors.
jpayne@69 473 *
jpayne@69 474 * @stable ICU 55
jpayne@69 475 */
jpayne@69 476 virtual int32_t groupNumberFromName(const char *groupName, int32_t nameLength, UErrorCode &status) const;
jpayne@69 477
jpayne@69 478
jpayne@69 479 /**
jpayne@69 480 * Split a string into fields. Somewhat like split() from Perl or Java.
jpayne@69 481 * Pattern matches identify delimiters that separate the input
jpayne@69 482 * into fields. The input data between the delimiters becomes the
jpayne@69 483 * fields themselves.
jpayne@69 484 *
jpayne@69 485 * If the delimiter pattern includes capture groups, the captured text will
jpayne@69 486 * also appear in the destination array of output strings, interspersed
jpayne@69 487 * with the fields. This is similar to Perl, but differs from Java,
jpayne@69 488 * which ignores the presence of capture groups in the pattern.
jpayne@69 489 *
jpayne@69 490 * Trailing empty fields will always be returned, assuming sufficient
jpayne@69 491 * destination capacity. This differs from the default behavior for Java
jpayne@69 492 * and Perl where trailing empty fields are not returned.
jpayne@69 493 *
jpayne@69 494 * The number of strings produced by the split operation is returned.
jpayne@69 495 * This count includes the strings from capture groups in the delimiter pattern.
jpayne@69 496 * This behavior differs from Java, which ignores capture groups.
jpayne@69 497 *
jpayne@69 498 * For the best performance on split() operations,
jpayne@69 499 * <code>RegexMatcher::split</code> is preferable to this function.
jpayne@69 500 *
jpayne@69 501 * @param input The string to be split into fields. The field delimiters
jpayne@69 502 * match the pattern (in the "this" object)
jpayne@69 503 * @param dest An array of UnicodeStrings to receive the results of the split.
jpayne@69 504 * This is an array of actual UnicodeString objects, not an
jpayne@69 505 * array of pointers to strings. Local (stack based) arrays can
jpayne@69 506 * work well here.
jpayne@69 507 * @param destCapacity The number of elements in the destination array.
jpayne@69 508 * If the number of fields found is less than destCapacity, the
jpayne@69 509 * extra strings in the destination array are not altered.
jpayne@69 510 * If the number of destination strings is less than the number
jpayne@69 511 * of fields, the trailing part of the input string, including any
jpayne@69 512 * field delimiters, is placed in the last destination string.
jpayne@69 513 * @param status A reference to a UErrorCode to receive any errors.
jpayne@69 514 * @return The number of fields into which the input string was split.
jpayne@69 515 * @stable ICU 2.4
jpayne@69 516 */
jpayne@69 517 virtual int32_t split(const UnicodeString &input,
jpayne@69 518 UnicodeString dest[],
jpayne@69 519 int32_t destCapacity,
jpayne@69 520 UErrorCode &status) const;
jpayne@69 521
jpayne@69 522
jpayne@69 523 /**
jpayne@69 524 * Split a string into fields. Somewhat like %split() from Perl or Java.
jpayne@69 525 * Pattern matches identify delimiters that separate the input
jpayne@69 526 * into fields. The input data between the delimiters becomes the
jpayne@69 527 * fields themselves.
jpayne@69 528 *
jpayne@69 529 * If the delimiter pattern includes capture groups, the captured text will
jpayne@69 530 * also appear in the destination array of output strings, interspersed
jpayne@69 531 * with the fields. This is similar to Perl, but differs from Java,
jpayne@69 532 * which ignores the presence of capture groups in the pattern.
jpayne@69 533 *
jpayne@69 534 * Trailing empty fields will always be returned, assuming sufficient
jpayne@69 535 * destination capacity. This differs from the default behavior for Java
jpayne@69 536 * and Perl where trailing empty fields are not returned.
jpayne@69 537 *
jpayne@69 538 * The number of strings produced by the split operation is returned.
jpayne@69 539 * This count includes the strings from capture groups in the delimiter pattern.
jpayne@69 540 * This behavior differs from Java, which ignores capture groups.
jpayne@69 541 *
jpayne@69 542 * For the best performance on split() operations,
jpayne@69 543 * `RegexMatcher::split()` is preferable to this function.
jpayne@69 544 *
jpayne@69 545 * @param input The string to be split into fields. The field delimiters
jpayne@69 546 * match the pattern (in the "this" object)
jpayne@69 547 * @param dest An array of mutable UText structs to receive the results of the split.
jpayne@69 548 * If a field is NULL, a new UText is allocated to contain the results for
jpayne@69 549 * that field. This new UText is not guaranteed to be mutable.
jpayne@69 550 * @param destCapacity The number of elements in the destination array.
jpayne@69 551 * If the number of fields found is less than destCapacity, the
jpayne@69 552 * extra strings in the destination array are not altered.
jpayne@69 553 * If the number of destination strings is less than the number
jpayne@69 554 * of fields, the trailing part of the input string, including any
jpayne@69 555 * field delimiters, is placed in the last destination string.
jpayne@69 556 * @param status A reference to a UErrorCode to receive any errors.
jpayne@69 557 * @return The number of destination strings used.
jpayne@69 558 *
jpayne@69 559 * @stable ICU 4.6
jpayne@69 560 */
jpayne@69 561 virtual int32_t split(UText *input,
jpayne@69 562 UText *dest[],
jpayne@69 563 int32_t destCapacity,
jpayne@69 564 UErrorCode &status) const;
jpayne@69 565
jpayne@69 566
jpayne@69 567 /**
jpayne@69 568 * ICU "poor man's RTTI", returns a UClassID for the actual class.
jpayne@69 569 *
jpayne@69 570 * @stable ICU 2.4
jpayne@69 571 */
jpayne@69 572 virtual UClassID getDynamicClassID() const;
jpayne@69 573
jpayne@69 574 /**
jpayne@69 575 * ICU "poor man's RTTI", returns a UClassID for this class.
jpayne@69 576 *
jpayne@69 577 * @stable ICU 2.4
jpayne@69 578 */
jpayne@69 579 static UClassID U_EXPORT2 getStaticClassID();
jpayne@69 580
jpayne@69 581 private:
jpayne@69 582 //
jpayne@69 583 // Implementation Data
jpayne@69 584 //
jpayne@69 585 UText *fPattern; // The original pattern string.
jpayne@69 586 UnicodeString *fPatternString; // The original pattern UnicodeString if relevant
jpayne@69 587 uint32_t fFlags; // The flags used when compiling the pattern.
jpayne@69 588 //
jpayne@69 589 UVector64 *fCompiledPat; // The compiled pattern p-code.
jpayne@69 590 UnicodeString fLiteralText; // Any literal string data from the pattern,
jpayne@69 591 // after un-escaping, for use during the match.
jpayne@69 592
jpayne@69 593 UVector *fSets; // Any UnicodeSets referenced from the pattern.
jpayne@69 594 Regex8BitSet *fSets8; // (and fast sets for latin-1 range.)
jpayne@69 595
jpayne@69 596
jpayne@69 597 UErrorCode fDeferredStatus; // status if some prior error has left this
jpayne@69 598 // RegexPattern in an unusable state.
jpayne@69 599
jpayne@69 600 int32_t fMinMatchLen; // Minimum Match Length. All matches will have length
jpayne@69 601 // >= this value. For some patterns, this calculated
jpayne@69 602 // value may be less than the true shortest
jpayne@69 603 // possible match.
jpayne@69 604
jpayne@69 605 int32_t fFrameSize; // Size of a state stack frame in the
jpayne@69 606 // execution engine.
jpayne@69 607
jpayne@69 608 int32_t fDataSize; // The size of the data needed by the pattern that
jpayne@69 609 // does not go on the state stack, but has just
jpayne@69 610 // a single copy per matcher.
jpayne@69 611
jpayne@69 612 UVector32 *fGroupMap; // Map from capture group number to position of
jpayne@69 613 // the group's variables in the matcher stack frame.
jpayne@69 614
jpayne@69 615 int32_t fStartType; // Info on how a match must start.
jpayne@69 616 int32_t fInitialStringIdx; // NOTE(review): presumably the index of the initial literal string within fLiteralText - confirm.
jpayne@69 617 int32_t fInitialStringLen;
jpayne@69 618 UnicodeSet *fInitialChars;
jpayne@69 619 UChar32 fInitialChar;
jpayne@69 620 Regex8BitSet *fInitialChars8;
jpayne@69 621 UBool fNeedsAltInput;
jpayne@69 622
jpayne@69 623 UHashtable *fNamedCaptureMap; // Map from capture group names to numbers.
jpayne@69 624
jpayne@69 625 friend class RegexCompile;
jpayne@69 626 friend class RegexMatcher;
jpayne@69 627 friend class RegexCImpl;
jpayne@69 628
jpayne@69 629 //
jpayne@69 630 // Implementation Methods
jpayne@69 631 //
jpayne@69 632 void init(); // Common initialization, for use by constructors.
jpayne@69 633 bool initNamedCaptureMap(); // Lazy init for fNamedCaptureMap.
jpayne@69 634 void zap(); // Common cleanup
jpayne@69 635
jpayne@69 636 void dumpOp(int32_t index) const;
jpayne@69 637
jpayne@69 638 public:
jpayne@69 639 #ifndef U_HIDE_INTERNAL_API
jpayne@69 640 /**
jpayne@69 641 * Dump a compiled pattern. Internal debug function.
jpayne@69 642 * @internal
jpayne@69 643 */
jpayne@69 644 void dumpPattern() const;
jpayne@69 645 #endif /* U_HIDE_INTERNAL_API */
jpayne@69 646 };
jpayne@69 647
jpayne@69 648
jpayne@69 649
jpayne@69 650 /**
jpayne@69 651 * class RegexMatcher bundles together a regular expression pattern and
jpayne@69 652 * input text to which the expression can be applied. It includes methods
jpayne@69 653 * for testing for matches, and for find and replace operations.
jpayne@69 654 *
jpayne@69 655 * <p>Class RegexMatcher is not intended to be subclassed.</p>
jpayne@69 656 *
jpayne@69 657 * @stable ICU 2.4
jpayne@69 658 */
jpayne@69 659 class U_I18N_API RegexMatcher U_FINAL : public UObject {
jpayne@69 660 public:
jpayne@69 661
jpayne@69 662 /**
jpayne@69 663 * Construct a RegexMatcher for a regular expression.
jpayne@69 664 * This is a convenience method that avoids the need to explicitly create
jpayne@69 665 * a RegexPattern object. Note that if several RegexMatchers need to be
jpayne@69 666 * created for the same expression, it will be more efficient to
jpayne@69 667 * separately create and cache a RegexPattern object, and use
jpayne@69 668 * its matcher() method to create the RegexMatcher objects.
jpayne@69 669 *
jpayne@69 670 * @param regexp The Regular Expression to be compiled.
jpayne@69 671 * @param flags #URegexpFlag options, such as #UREGEX_CASE_INSENSITIVE.
jpayne@69 672 * @param status Any errors are reported by setting this UErrorCode variable.
jpayne@69 673 * @stable ICU 2.6
jpayne@69 674 */
jpayne@69 675 RegexMatcher(const UnicodeString &regexp, uint32_t flags, UErrorCode &status);
jpayne@69 676
jpayne@69 677 /**
jpayne@69 678 * Construct a RegexMatcher for a regular expression.
jpayne@69 679 * This is a convenience method that avoids the need to explicitly create
jpayne@69 680 * a RegexPattern object. Note that if several RegexMatchers need to be
jpayne@69 681 * created for the same expression, it will be more efficient to
jpayne@69 682 * separately create and cache a RegexPattern object, and use
jpayne@69 683 * its matcher() method to create the RegexMatcher objects.
jpayne@69 684 *
jpayne@69 685 * @param regexp The regular expression to be compiled.
jpayne@69 686 * @param flags #URegexpFlag options, such as #UREGEX_CASE_INSENSITIVE.
jpayne@69 687 * @param status Any errors are reported by setting this UErrorCode variable.
jpayne@69 688 *
jpayne@69 689 * @stable ICU 4.6
jpayne@69 690 */
jpayne@69 691 RegexMatcher(UText *regexp, uint32_t flags, UErrorCode &status);
jpayne@69 692
jpayne@69 693 /**
jpayne@69 694 * Construct a RegexMatcher for a regular expression.
jpayne@69 695 * This is a convenience method that avoids the need to explicitly create
jpayne@69 696 * a RegexPattern object. Note that if several RegexMatchers need to be
jpayne@69 697 * created for the same expression, it will be more efficient to
jpayne@69 698 * separately create and cache a RegexPattern object, and use
jpayne@69 699 * its matcher() method to create the RegexMatcher objects.
jpayne@69 700 *
jpayne@69 701 * The matcher will retain a reference to the supplied input string, and all regexp
jpayne@69 702 * pattern matching operations happen directly on the original string. It is
jpayne@69 703 * critical that the string not be altered or deleted before use by the regular
jpayne@69 704 * expression operations is complete.
jpayne@69 705 *
jpayne@69 706 * @param regexp The Regular Expression to be compiled.
jpayne@69 707 * @param input The string to match. The matcher retains a reference to the
jpayne@69 708 * caller's string; no copy is made.
jpayne@69 709 * @param flags #URegexpFlag options, such as #UREGEX_CASE_INSENSITIVE.
jpayne@69 710 * @param status Any errors are reported by setting this UErrorCode variable.
jpayne@69 711 * @stable ICU 2.6
jpayne@69 712 */
jpayne@69 713 RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
jpayne@69 714 uint32_t flags, UErrorCode &status);
jpayne@69 715
jpayne@69 716 /**
jpayne@69 717 * Construct a RegexMatcher for a regular expression.
jpayne@69 718 * This is a convenience method that avoids the need to explicitly create
jpayne@69 719 * a RegexPattern object. Note that if several RegexMatchers need to be
jpayne@69 720 * created for the same expression, it will be more efficient to
jpayne@69 721 * separately create and cache a RegexPattern object, and use
jpayne@69 722 * its matcher() method to create the RegexMatcher objects.
jpayne@69 723 *
jpayne@69 724 * The matcher will make a shallow clone of the supplied input text, and all regexp
jpayne@69 725 * pattern matching operations happen on this clone. While read-only operations on
jpayne@69 726 * the supplied text are permitted, it is critical that the underlying string not be
jpayne@69 727 * altered or deleted before use by the regular expression operations is complete.
jpayne@69 728 *
jpayne@69 729 * @param regexp The Regular Expression to be compiled.
jpayne@69 730 * @param input The string to match. The matcher retains a shallow clone of the text.
jpayne@69 731 * @param flags #URegexpFlag options, such as #UREGEX_CASE_INSENSITIVE.
jpayne@69 732 * @param status Any errors are reported by setting this UErrorCode variable.
jpayne@69 733 *
jpayne@69 734 * @stable ICU 4.6
jpayne@69 735 */
jpayne@69 736 RegexMatcher(UText *regexp, UText *input,
jpayne@69 737 uint32_t flags, UErrorCode &status);
jpayne@69 738
jpayne@69 739 private:
jpayne@69 740 /**
jpayne@69 741 * Cause a compilation error if an application accidentally attempts to
jpayne@69 742 * create a matcher with a (char16_t *) string as input rather than
jpayne@69 743 * a UnicodeString. Avoids a dangling reference to a temporary string.
jpayne@69 744 *
jpayne@69 745 * To efficiently work with char16_t *strings, wrap the data in a UnicodeString
jpayne@69 746 * using one of the aliasing constructors, such as
jpayne@69 747 * `UnicodeString(UBool isTerminated, const char16_t *text, int32_t textLength);`
jpayne@69 748 * or in a UText, using
jpayne@69 749 * `utext_openUChars(UText *ut, const char16_t *text, int64_t textLength, UErrorCode *status);`
jpayne@69 750 */
jpayne@69 751 RegexMatcher(const UnicodeString &regexp, const char16_t *input,
jpayne@69 752 uint32_t flags, UErrorCode &status);
jpayne@69 753 public:
jpayne@69 754
jpayne@69 755
jpayne@69 756 /**
jpayne@69 757 * Destructor.
jpayne@69 758 *
jpayne@69 759 * @stable ICU 2.4
jpayne@69 760 */
jpayne@69 761 virtual ~RegexMatcher();
jpayne@69 762
jpayne@69 763
jpayne@69 764 /**
jpayne@69 765 * Attempts to match the entire input region against the pattern.
jpayne@69 766 * @param status A reference to a UErrorCode to receive any errors.
jpayne@69 767 * @return TRUE if there is a match
jpayne@69 768 * @stable ICU 2.4
jpayne@69 769 */
jpayne@69 770 virtual UBool matches(UErrorCode &status);
jpayne@69 771
jpayne@69 772
jpayne@69 773 /**
jpayne@69 774 * Resets the matcher, then attempts to match the input beginning
jpayne@69 775 * at the specified startIndex, and extending to the end of the input.
jpayne@69 776 * The input region is reset to include the entire input string.
jpayne@69 777 * A successful match must extend to the end of the input.
jpayne@69 778 * @param startIndex The input string (native) index at which to begin matching.
jpayne@69 779 * @param status A reference to a UErrorCode to receive any errors.
jpayne@69 780 * @return TRUE if there is a match
jpayne@69 781 * @stable ICU 2.8
jpayne@69 782 */
jpayne@69 783 virtual UBool matches(int64_t startIndex, UErrorCode &status);
jpayne@69 784
jpayne@69 785
jpayne@69 786 /**
jpayne@69 787 * Attempts to match the input string, starting from the beginning of the region,
jpayne@69 788 * against the pattern. Like the matches() method, this function
jpayne@69 789 * always starts at the beginning of the input region;
jpayne@69 790 * unlike that function, it does not require that the entire region be matched.
jpayne@69 791 *
jpayne@69 792 * If the match succeeds then more information can be obtained via the start(),
jpayne@69 793 * end(), and group() functions.
jpayne@69 794 *
jpayne@69 795 * @param status A reference to a UErrorCode to receive any errors.
jpayne@69 796 * @return TRUE if there is a match at the start of the input string.
jpayne@69 797 * @stable ICU 2.4
jpayne@69 798 */
jpayne@69 799 virtual UBool lookingAt(UErrorCode &status);
jpayne@69 800
jpayne@69 801
jpayne@69 802 /**
jpayne@69 803 * Attempts to match the input string, starting from the specified index, against the pattern.
jpayne@69 804 * The match may be of any length, and is not required to extend to the end
jpayne@69 805 * of the input string. Contrast with matches().
jpayne@69 806 *
jpayne@69 807 * If the match succeeds then more information can be obtained via the start(),
jpayne@69 808 * end(), and group() functions.
jpayne@69 809 *
jpayne@69 810 * @param startIndex The input string (native) index at which to begin matching.
jpayne@69 811 * @param status A reference to a UErrorCode to receive any errors.
jpayne@69 812 * @return TRUE if there is a match.
jpayne@69 813 * @stable ICU 2.8
jpayne@69 814 */
jpayne@69 815 virtual UBool lookingAt(int64_t startIndex, UErrorCode &status);
jpayne@69 816
jpayne@69 817
jpayne@69 818 /**
jpayne@69 819 * Find the next pattern match in the input string.
jpayne@69 820 * The find begins searching the input at the location following the end of
jpayne@69 821 * the previous match, or at the start of the string if there is no previous match.
jpayne@69 822 * If a match is found, `start()`, `end()` and `group()`
jpayne@69 823 * will provide more information regarding the match.
jpayne@69 824 * Note that if the input string is changed by the application,
jpayne@69 825 * use find(startPos, status) instead of find(), because the saved starting
jpayne@69 826 * position may not be valid with the altered input string.
jpayne@69 827 * @return TRUE if a match is found.
jpayne@69 828 * @stable ICU 2.4
jpayne@69 829 */
jpayne@69 830 virtual UBool find();
jpayne@69 831
jpayne@69 832
jpayne@69 833 /**
jpayne@69 834 * Find the next pattern match in the input string.
jpayne@69 835 * The find begins searching the input at the location following the end of
jpayne@69 836 * the previous match, or at the start of the string if there is no previous match.
jpayne@69 837 * If a match is found, `start()`, `end()` and `group()`
jpayne@69 838 * will provide more information regarding the match.
jpayne@69 839 *
jpayne@69 840 * Note that if the input string is changed by the application,
jpayne@69 841 * use find(startPos, status) instead of find(), because the saved starting
jpayne@69 842 * position may not be valid with the altered input string.
jpayne@69 843 * @param status A reference to a UErrorCode to receive any errors.
jpayne@69 844 * @return TRUE if a match is found.
jpayne@69 845 * @stable ICU 55
jpayne@69 846 */
jpayne@69 847 virtual UBool find(UErrorCode &status);
jpayne@69 848
jpayne@69 849 /**
jpayne@69 850 * Resets this RegexMatcher and then attempts to find the next substring of the
jpayne@69 851 * input string that matches the pattern, starting at the specified index.
jpayne@69 852 *
jpayne@69 853 * @param start The (native) index in the input string to begin the search.
jpayne@69 854 * @param status A reference to a UErrorCode to receive any errors.
jpayne@69 855 * @return TRUE if a match is found.
jpayne@69 856 * @stable ICU 2.4
jpayne@69 857 */
jpayne@69 858 virtual UBool find(int64_t start, UErrorCode &status);
jpayne@69 859
jpayne@69 860
jpayne@69 861 /**
jpayne@69 862 * Returns a string containing the text matched by the previous match.
jpayne@69 863 * If the pattern can match an empty string, an empty string may be returned.
jpayne@69 864 * @param status A reference to a UErrorCode to receive any errors.
jpayne@69 865 * Possible errors are U_REGEX_INVALID_STATE if no match
jpayne@69 866 * has been attempted or the last match failed.
jpayne@69 867 * @return a string containing the matched input text.
jpayne@69 868 * @stable ICU 2.4
jpayne@69 869 */
jpayne@69 870 virtual UnicodeString group(UErrorCode &status) const;
jpayne@69 871
jpayne@69 872
jpayne@69 873 /**
jpayne@69 874 * Returns a string containing the text captured by the given group
jpayne@69 875 * during the previous match operation. Group(0) is the entire match.
jpayne@69 876 *
jpayne@69 877 * A zero length string is returned both for capture groups that did not
jpayne@69 878 * participate in the match and for actual zero length matches.
jpayne@69 879 * To distinguish between these two cases use the function start(),
jpayne@69 880 * which returns -1 for non-participating groups.
jpayne@69 881 *
jpayne@69 882 * @param groupNum the capture group number
jpayne@69 883 * @param status A reference to a UErrorCode to receive any errors.
jpayne@69 884 * Possible errors are U_REGEX_INVALID_STATE if no match
jpayne@69 885 * has been attempted or the last match failed and
jpayne@69 886 * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number.
jpayne@69 887 * @return the captured text
jpayne@69 888 * @stable ICU 2.4
jpayne@69 889 */
jpayne@69 890 virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const;
jpayne@69 891
jpayne@69 892 /**
jpayne@69 893 * Returns the number of capturing groups in this matcher's pattern.
jpayne@69 894 * @return the number of capture groups
jpayne@69 895 * @stable ICU 2.4
jpayne@69 896 */
jpayne@69 897 virtual int32_t groupCount() const;
jpayne@69 898
jpayne@69 899
jpayne@69 900 /**
jpayne@69 901 * Returns a shallow clone of the entire live input string with the UText current native index
jpayne@69 902 * set to the beginning of the requested group.
jpayne@69 903 *
jpayne@69 904 * @param dest The UText into which the input should be cloned, or NULL to create a new UText
jpayne@69 905 * @param group_len A reference to receive the length of the desired capture group
jpayne@69 906 * @param status A reference to a UErrorCode to receive any errors.
jpayne@69 907 * Possible errors are U_REGEX_INVALID_STATE if no match
jpayne@69 908 * has been attempted or the last match failed and
jpayne@69 909 * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number.
jpayne@69 910 * @return dest if non-NULL, a shallow copy of the input text otherwise
jpayne@69 911 *
jpayne@69 912 * @stable ICU 4.6
jpayne@69 913 */
jpayne@69 914 virtual UText *group(UText *dest, int64_t &group_len, UErrorCode &status) const;
jpayne@69 915
jpayne@69 916 /**
jpayne@69 917 * Returns a shallow clone of the entire live input string with the UText current native index
jpayne@69 918 * set to the beginning of the requested group.
jpayne@69 919 *
jpayne@69 920 * A group length of zero is returned both for capture groups that did not
jpayne@69 921 * participate in the match and for actual zero length matches.
jpayne@69 922 * To distinguish between these two cases use the function start(),
jpayne@69 923 * which returns -1 for non-participating groups.
jpayne@69 924 *
jpayne@69 925 * @param groupNum The capture group number.
jpayne@69 926 * @param dest The UText into which the input should be cloned, or NULL to create a new UText.
jpayne@69 927 * @param group_len A reference to receive the length of the desired capture group
jpayne@69 928 * @param status A reference to a UErrorCode to receive any errors.
jpayne@69 929 * Possible errors are U_REGEX_INVALID_STATE if no match
jpayne@69 930 * has been attempted or the last match failed and
jpayne@69 931 * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number.
jpayne@69 932 * @return dest if non-NULL, a shallow copy of the input text otherwise
jpayne@69 933 *
jpayne@69 934 * @stable ICU 4.6
jpayne@69 935 */
jpayne@69 936 virtual UText *group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const;
jpayne@69 937
jpayne@69 938 /**
jpayne@69 939 * Returns the index in the input string of the start of the text matched
jpayne@69 940 * during the previous match operation.
jpayne@69 941 * @param status a reference to a UErrorCode to receive any errors.
jpayne@69 942 * @return The (native) position in the input string of the start of the last match.
jpayne@69 943 * @stable ICU 2.4
jpayne@69 944 */
jpayne@69 945 virtual int32_t start(UErrorCode &status) const;
jpayne@69 946
jpayne@69 947 /**
jpayne@69 948 * Returns the index in the input string of the start of the text matched
jpayne@69 949 * during the previous match operation.
jpayne@69 950 * @param status a reference to a UErrorCode to receive any errors.
jpayne@69 951 * @return The (native) position in the input string of the start of the last match.
jpayne@69 952 * @stable ICU 4.6
jpayne@69 953 */
jpayne@69 954 virtual int64_t start64(UErrorCode &status) const;
jpayne@69 955
jpayne@69 956
jpayne@69 957 /**
jpayne@69 958 * Returns the index in the input string of the start of the text matched by the
jpayne@69 959 * specified capture group during the previous match operation. Return -1 if
jpayne@69 960 * the capture group exists in the pattern, but was not part of the last match.
jpayne@69 961 *
jpayne@69 962 * @param group the capture group number
jpayne@69 963 * @param status A reference to a UErrorCode to receive any errors. Possible
jpayne@69 964 * errors are U_REGEX_INVALID_STATE if no match has been
jpayne@69 965 * attempted or the last match failed, and
jpayne@69 966 * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number
jpayne@69 967 * @return the (native) start position of substring matched by the specified group.
jpayne@69 968 * @stable ICU 2.4
jpayne@69 969 */
jpayne@69 970 virtual int32_t start(int32_t group, UErrorCode &status) const;
jpayne@69 971
jpayne@69 972 /**
jpayne@69 973 * Returns the index in the input string of the start of the text matched by the
jpayne@69 974 * specified capture group during the previous match operation. Return -1 if
jpayne@69 975 * the capture group exists in the pattern, but was not part of the last match.
jpayne@69 976 *
jpayne@69 977 * @param group the capture group number.
jpayne@69 978 * @param status A reference to a UErrorCode to receive any errors. Possible
jpayne@69 979 * errors are U_REGEX_INVALID_STATE if no match has been
jpayne@69 980 * attempted or the last match failed, and
jpayne@69 981 * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number.
jpayne@69 982 * @return the (native) start position of substring matched by the specified group.
jpayne@69 983 * @stable ICU 4.6
jpayne@69 984 */
jpayne@69 985 virtual int64_t start64(int32_t group, UErrorCode &status) const;
jpayne@69 986
jpayne@69 987 /**
jpayne@69 988 * Returns the index in the input string of the first character following the
jpayne@69 989 * text matched during the previous match operation.
jpayne@69 990 *
jpayne@69 991 * @param status A reference to a UErrorCode to receive any errors. Possible
jpayne@69 992 * errors are U_REGEX_INVALID_STATE if no match has been
jpayne@69 993 * attempted or the last match failed.
jpayne@69 994 * @return the index of the last character matched, plus one.
jpayne@69 995 * The index value returned is a native index, corresponding to
jpayne@69 996 * code units for the underlying encoding type, for example,
jpayne@69 997 * a byte index for UTF-8.
jpayne@69 998 * @stable ICU 2.4
jpayne@69 999 */
jpayne@69 1000 virtual int32_t end(UErrorCode &status) const;
jpayne@69 1001
jpayne@69 1002 /**
jpayne@69 1003 * Returns the index in the input string of the first character following the
jpayne@69 1004 * text matched during the previous match operation.
jpayne@69 1005 *
jpayne@69 1006 * @param status A reference to a UErrorCode to receive any errors. Possible
jpayne@69 1007 * errors are U_REGEX_INVALID_STATE if no match has been
jpayne@69 1008 * attempted or the last match failed.
jpayne@69 1009 * @return the index of the last character matched, plus one.
jpayne@69 1010 * The index value returned is a native index, corresponding to
jpayne@69 1011 * code units for the underlying encoding type, for example,
jpayne@69 1012 * a byte index for UTF-8.
jpayne@69 1013 * @stable ICU 4.6
jpayne@69 1014 */
jpayne@69 1015 virtual int64_t end64(UErrorCode &status) const;
jpayne@69 1016
jpayne@69 1017
jpayne@69 1018 /**
jpayne@69 1019 * Returns the index in the input string of the character following the
jpayne@69 1020 * text matched by the specified capture group during the previous match operation.
jpayne@69 1021 *
jpayne@69 1022 * @param group the capture group number
jpayne@69 1023 * @param status A reference to a UErrorCode to receive any errors. Possible
jpayne@69 1024 * errors are U_REGEX_INVALID_STATE if no match has been
jpayne@69 1025 * attempted or the last match failed and
jpayne@69 1026 * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number
jpayne@69 1027 * @return the index of the first character following the text
jpayne@69 1028 * captured by the specified group during the previous match operation.
jpayne@69 1029 * Return -1 if the capture group exists in the pattern but was not part of the match.
jpayne@69 1030 * The index value returned is a native index, corresponding to
jpayne@69 1031 * code units for the underlying encoding type, for example,
jpayne@69 1032 * a byte index for UTF-8.
jpayne@69 1033 * @stable ICU 2.4
jpayne@69 1034 */
jpayne@69 1035 virtual int32_t end(int32_t group, UErrorCode &status) const;
jpayne@69 1036
jpayne@69 1037 /**
jpayne@69 1038 * Returns the index in the input string of the character following the
jpayne@69 1039 * text matched by the specified capture group during the previous match operation.
jpayne@69 1040 *
jpayne@69 1041 * @param group the capture group number
jpayne@69 1042 * @param status A reference to a UErrorCode to receive any errors. Possible
jpayne@69 1043 * errors are U_REGEX_INVALID_STATE if no match has been
jpayne@69 1044 * attempted or the last match failed and
jpayne@69 1045 * U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number
jpayne@69 1046 * @return the index of the first character following the text
jpayne@69 1047 * captured by the specified group during the previous match operation.
jpayne@69 1048 * Return -1 if the capture group exists in the pattern but was not part of the match.
jpayne@69 1049 * The index value returned is a native index, corresponding to
jpayne@69 1050 * code units for the underlying encoding type, for example,
jpayne@69 1051 * a byte index for UTF-8.
jpayne@69 1052 * @stable ICU 4.6
jpayne@69 1053 */
jpayne@69 1054 virtual int64_t end64(int32_t group, UErrorCode &status) const;
jpayne@69 1055
jpayne@69 1056 /**
jpayne@69 1057 * Resets this matcher. The effect is to remove any memory of previous matches,
jpayne@69 1058 * and to cause subsequent find() operations to begin at the beginning of
jpayne@69 1059 * the input string.
jpayne@69 1060 *
jpayne@69 1061 * @return this RegexMatcher.
jpayne@69 1062 * @stable ICU 2.4
jpayne@69 1063 */
jpayne@69 1064 virtual RegexMatcher &reset();
jpayne@69 1065
jpayne@69 1066
jpayne@69 1067 /**
jpayne@69 1068 * Resets this matcher, and sets the current input position.
jpayne@69 1069 * The effect is to remove any memory of previous matches,
jpayne@69 1070 * and to cause subsequent find() operations to begin at
jpayne@69 1071 * the specified (native) position in the input string.
jpayne@69 1072 *
jpayne@69 1073 * The matcher's region is reset to its default, which is the entire
jpayne@69 1074 * input string.
jpayne@69 1075 *
jpayne@69 1076 * An alternative to this function is to set a match region
jpayne@69 1077 * beginning at the desired index.
jpayne@69 1078 *
jpayne@69 1079 * @return this RegexMatcher.
jpayne@69 1080 * @stable ICU 2.8
jpayne@69 1081 */
jpayne@69 1082 virtual RegexMatcher &reset(int64_t index, UErrorCode &status);
jpayne@69 1083
jpayne@69 1084
jpayne@69 1085 /**
jpayne@69 1086 * Resets this matcher with a new input string. This allows instances of RegexMatcher
jpayne@69 1087 * to be reused, which is more efficient than creating a new RegexMatcher for
jpayne@69 1088 * each input string to be processed.
jpayne@69 1089 * @param input The new string on which subsequent pattern matches will operate.
jpayne@69 1090 * The matcher retains a reference to the caller's string, and operates
jpayne@69 1091 * directly on that. Ownership of the string remains with the caller.
jpayne@69 1092 * Because no copy of the string is made, it is essential that the
jpayne@69 1093 * caller not delete the string until after regexp operations on it
jpayne@69 1094 * are done.
jpayne@69 1095 * Note that while a reset on the matcher with an input string that is then
jpayne@69 1096 * modified across/during matcher operations may be supported currently for UnicodeString,
jpayne@69 1097 * this was not originally intended behavior, and support for this is not guaranteed
jpayne@69 1098 * in upcoming versions of ICU.
jpayne@69 1099 * @return this RegexMatcher.
jpayne@69 1100 * @stable ICU 2.4
jpayne@69 1101 */
jpayne@69 1102 virtual RegexMatcher &reset(const UnicodeString &input);
jpayne@69 1103
jpayne@69 1104
jpayne@69 1105 /**
jpayne@69 1106 * Resets this matcher with a new input string. This allows instances of RegexMatcher
jpayne@69 1107 * to be reused, which is more efficient than creating a new RegexMatcher for
jpayne@69 1108 * each input string to be processed.
jpayne@69 1109 * @param input The new string on which subsequent pattern matches will operate.
jpayne@69 1110 * The matcher makes a shallow clone of the given text; ownership of the
jpayne@69 1111 * original string remains with the caller. Because no deep copy of the
jpayne@69 1112 * text is made, it is essential that the caller not modify the string
jpayne@69 1113 * until after regexp operations on it are done.
jpayne@69 1114 * @return this RegexMatcher.
jpayne@69 1115 *
jpayne@69 1116 * @stable ICU 4.6
jpayne@69 1117 */
jpayne@69 1118 virtual RegexMatcher &reset(UText *input);
jpayne@69 1119
jpayne@69 1120
jpayne@69 1121 /**
jpayne@69 1122 * Set the subject text string upon which the regular expression is looking for matches
jpayne@69 1123 * without changing any other aspect of the matching state.
jpayne@69 1124 * The new and previous text strings must have the same content.
jpayne@69 1125 *
jpayne@69 1126 * This function is intended for use in environments where ICU is operating on
jpayne@69 1127 * strings that may move around in memory. It provides a mechanism for notifying
jpayne@69 1128 * ICU that the string has been relocated, and providing a new UText to access the
jpayne@69 1129 * string in its new position.
jpayne@69 1130 *
jpayne@69 1131 * Note that the regular expression implementation never copies the underlying text
jpayne@69 1132 * of a string being matched, but always operates directly on the original text
jpayne@69 1133 * provided by the user. Refreshing simply drops the references to the old text
jpayne@69 1134 * and replaces them with references to the new.
jpayne@69 1135 *
jpayne@69 1136 * Caution: this function is normally used only by very specialized,
jpayne@69 1137 * system-level code. One example use case is with garbage collection that moves
jpayne@69 1138 * the text in memory.
jpayne@69 1139 *
jpayne@69 1140 * @param input The new (moved) text string.
jpayne@69 1141 * @param status Receives errors detected by this function.
jpayne@69 1142 *
jpayne@69 1143 * @stable ICU 4.8
jpayne@69 1144 */
jpayne@69 1145 virtual RegexMatcher &refreshInputText(UText *input, UErrorCode &status);
jpayne@69 1146
jpayne@69 1147 private:
jpayne@69 1148 /**
jpayne@69 1149 * Cause a compilation error if an application accidentally attempts to
jpayne@69 1150 * reset a matcher with a (char16_t *) string as input rather than
jpayne@69 1151 * a UnicodeString. Avoids a dangling reference to a temporary string.
jpayne@69 1152 *
jpayne@69 1153 * To efficiently work with char16_t *strings, wrap the data in a UnicodeString
jpayne@69 1154 * using one of the aliasing constructors, such as
jpayne@69 1155 * `UnicodeString(UBool isTerminated, const char16_t *text, int32_t textLength);`
jpayne@69 1156 * or in a UText, using
jpayne@69 1157 * `utext_openUChars(UText *ut, const char16_t *text, int64_t textLength, UErrorCode *status);`
jpayne@69 1158 *
jpayne@69 1159 */
jpayne@69 1160 RegexMatcher &reset(const char16_t *input);
jpayne@69 1161 public:
jpayne@69 1162
jpayne@69 1163 /**
jpayne@69 1164 * Returns the input string being matched. Ownership of the string belongs to
jpayne@69 1165 * the matcher; it should not be altered or deleted. This method will work even if the input
jpayne@69 1166 * was originally supplied as a UText.
jpayne@69 1167 * @return the input string
jpayne@69 1168 * @stable ICU 2.4
jpayne@69 1169 */
jpayne@69 1170 virtual const UnicodeString &input() const;
jpayne@69 1171
jpayne@69 1172 /**
jpayne@69 1173 * Returns the input string being matched. This is the live input text; it should not be
jpayne@69 1174 * altered or deleted. This method will work even if the input was originally supplied as
jpayne@69 1175 * a UnicodeString.
jpayne@69 1176 * @return the input text
jpayne@69 1177 *
jpayne@69 1178 * @stable ICU 4.6
jpayne@69 1179 */
jpayne@69 1180 virtual UText *inputText() const;
jpayne@69 1181
jpayne@69 1182 /**
jpayne@69 1183 * Returns the input string being matched, either by copying it into the provided
jpayne@69 1184 * UText parameter or by returning a shallow clone of the live input. Note that copying
jpayne@69 1185 * the entire input may cause significant performance and memory issues.
jpayne@69 1186 * @param dest The UText into which the input should be copied, or NULL to create a new UText
jpayne@69 1187 * @param status error code
jpayne@69 1188 * @return dest if non-NULL, a shallow copy of the input text otherwise
jpayne@69 1189 *
jpayne@69 1190 * @stable ICU 4.6
jpayne@69 1191 */
jpayne@69 1192 virtual UText *getInput(UText *dest, UErrorCode &status) const;
jpayne@69 1193
jpayne@69 1194
jpayne@69 1195 /** Sets the limits of this matcher's region.
jpayne@69 1196 * The region is the part of the input string that will be searched to find a match.
jpayne@69 1197 * Invoking this method resets the matcher, and then sets the region to start
jpayne@69 1198 * at the index specified by the start parameter and end at the index specified
jpayne@69 1199 * by the end parameter.
jpayne@69 1200 *
jpayne@69 1201 * Depending on the transparency and anchoring being used (see useTransparentBounds
jpayne@69 1202 * and useAnchoringBounds), certain constructs such as anchors may behave differently
jpayne@69 1203 * at or around the boundaries of the region.
jpayne@69 1204 *
jpayne@69 1205 * The function will fail if start is greater than limit, or if either index
jpayne@69 1206 * is less than zero or greater than the length of the string being matched.
jpayne@69 1207 *
jpayne@69 1208 * @param start The (native) index to begin searches at.
jpayne@69 1209 * @param limit The index to end searches at (exclusive).
jpayne@69 1210 * @param status A reference to a UErrorCode to receive any errors.
jpayne@69 1211 * @stable ICU 4.0
jpayne@69 1212 */
jpayne@69 1213 virtual RegexMatcher &region(int64_t start, int64_t limit, UErrorCode &status);
jpayne@69 1214
jpayne@69 1215 /**
jpayne@69 1216 * Identical to region(start, limit, status) but also allows a start position without
jpayne@69 1217 * resetting the region state.
jpayne@69 1218 * @param regionStart The region start
jpayne@69 1219 * @param regionLimit the limit of the region
jpayne@69 1220 * @param startIndex The (native) index within the region bounds at which to begin searches.
jpayne@69 1221 * @param status A reference to a UErrorCode to receive any errors.
jpayne@69 1222 * If startIndex is not within the specified region bounds,
jpayne@69 1223 * U_INDEX_OUTOFBOUNDS_ERROR is returned.
jpayne@69 1224 * @stable ICU 4.6
jpayne@69 1225 */
jpayne@69 1226 virtual RegexMatcher &region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status);
jpayne@69 1227
jpayne@69 1228 /**
jpayne@69 1229 * Reports the start index of this matcher's region. The searches this matcher
jpayne@69 1230 * conducts are limited to finding matches within regionStart (inclusive) and
jpayne@69 1231 * regionEnd (exclusive).
jpayne@69 1232 *
jpayne@69 1233 * @return The starting (native) index of this matcher's region.
jpayne@69 1234 * @stable ICU 4.0
jpayne@69 1235 */
jpayne@69 1236 virtual int32_t regionStart() const;
jpayne@69 1237
jpayne@69 1238 /**
jpayne@69 1239 * Reports the start index of this matcher's region. The searches this matcher
jpayne@69 1240 * conducts are limited to finding matches within regionStart (inclusive) and
jpayne@69 1241 * regionEnd (exclusive).
jpayne@69 1242 *
jpayne@69 1243 * @return The starting (native) index of this matcher's region.
jpayne@69 1244 * @stable ICU 4.6
jpayne@69 1245 */
jpayne@69 1246 virtual int64_t regionStart64() const;
jpayne@69 1247
jpayne@69 1248
jpayne@69 1249 /**
jpayne@69 1250 * Reports the end (limit) index (exclusive) of this matcher's region. The searches
jpayne@69 1251 * this matcher conducts are limited to finding matches within regionStart
jpayne@69 1252 * (inclusive) and regionEnd (exclusive).
jpayne@69 1253 *
jpayne@69 1254 * @return The ending point (native) of this matcher's region.
jpayne@69 1255 * @stable ICU 4.0
jpayne@69 1256 */
jpayne@69 1257 virtual int32_t regionEnd() const;
jpayne@69 1258
jpayne@69 1259 /**
jpayne@69 1260 * Reports the end (limit) index (exclusive) of this matcher's region. The searches
jpayne@69 1261 * this matcher conducts are limited to finding matches within regionStart
jpayne@69 1262 * (inclusive) and regionEnd (exclusive).
jpayne@69 1263 *
jpayne@69 1264 * @return The ending point (native) of this matcher's region.
jpayne@69 1265 * @stable ICU 4.6
jpayne@69 1266 */
jpayne@69 1267 virtual int64_t regionEnd64() const;
jpayne@69 1268
jpayne@69 1269 /**
jpayne@69 1270 * Queries the transparency of region bounds for this matcher.
jpayne@69 1271 * See useTransparentBounds for a description of transparent and opaque bounds.
jpayne@69 1272 * By default, a matcher uses opaque region boundaries.
jpayne@69 1273 *
jpayne@69 1274 * @return TRUE if this matcher is using transparent bounds, FALSE if it is using opaque bounds.
jpayne@69 1275 * @stable ICU 4.0
jpayne@69 1276 */
jpayne@69 1277 virtual UBool hasTransparentBounds() const;
jpayne@69 1278
jpayne@69 1279 /**
jpayne@69 1280 * Sets the transparency of region bounds for this matcher.
jpayne@69 1281 * Invoking this function with an argument of true will set this matcher to use transparent bounds.
jpayne@69 1282 * If the boolean argument is false, then opaque bounds will be used.
jpayne@69 1283 *
jpayne@69 1284 * Using transparent bounds, the boundaries of this matcher's region are transparent
jpayne@69 1285 * to lookahead, lookbehind, and boundary matching constructs. Those constructs can
jpayne@69 1286 * see text beyond the boundaries of the region while checking for a match.
jpayne@69 1287 *
jpayne@69 1288 * With opaque bounds, no text outside of the matcher's region is visible to lookahead,
jpayne@69 1289 * lookbehind, and boundary matching constructs.
jpayne@69 1290 *
jpayne@69 1291 * By default, a matcher uses opaque bounds.
jpayne@69 1292 *
jpayne@69 1293 * @param b TRUE for transparent bounds; FALSE for opaque bounds
jpayne@69 1294 * @return This Matcher;
jpayne@69 1295 * @stable ICU 4.0
jpayne@69 1296 **/
jpayne@69 1297 virtual RegexMatcher &useTransparentBounds(UBool b);
jpayne@69 1298
jpayne@69 1299
jpayne@69 1300 /**
jpayne@69 1301 * Return true if this matcher is using anchoring bounds.
jpayne@69 1302 * By default, matchers use anchoring region bounds.
jpayne@69 1303 *
jpayne@69 1304 * @return TRUE if this matcher is using anchoring bounds.
jpayne@69 1305 * @stable ICU 4.0
jpayne@69 1306 */
jpayne@69 1307 virtual UBool hasAnchoringBounds() const;
jpayne@69 1308
jpayne@69 1309
jpayne@69 1310 /**
jpayne@69 1311 * Set whether this matcher is using Anchoring Bounds for its region.
jpayne@69 1312 * With anchoring bounds, pattern anchors such as ^ and $ will match at the start
jpayne@69 1313 * and end of the region. Without Anchoring Bounds, anchors will only match at
jpayne@69 1314 * the positions they would in the complete text.
jpayne@69 1315 *
jpayne@69 1316 * Anchoring Bounds are the default for regions.
jpayne@69 1317 *
jpayne@69 1318 * @param b TRUE to enable anchoring bounds; FALSE to disable them.
jpayne@69 1319 * @return This Matcher
jpayne@69 1320 * @stable ICU 4.0
jpayne@69 1321 */
jpayne@69 1322 virtual RegexMatcher &useAnchoringBounds(UBool b);
jpayne@69 1323
jpayne@69 1324
jpayne@69 1325 /**
jpayne@69 1326 * Return TRUE if the most recent matching operation attempted to access
jpayne@69 1327 * additional input beyond the available input text.
jpayne@69 1328 * In this case, additional input text could change the results of the match.
jpayne@69 1329 *
jpayne@69 1330 * hitEnd() is defined for both successful and unsuccessful matches.
jpayne@69 1331 * In either case hitEnd() will return TRUE if the end of the text was
jpayne@69 1332 * reached at any point during the matching process.
jpayne@69 1333 *
jpayne@69 1334 * @return TRUE if the most recent match hit the end of input
jpayne@69 1335 * @stable ICU 4.0
jpayne@69 1336 */
jpayne@69 1337 virtual UBool hitEnd() const;
jpayne@69 1338
jpayne@69 1339 /**
jpayne@69 1340 * Return TRUE if the most recent match succeeded and additional input could cause
jpayne@69 1341 * it to fail. If this method returns false and a match was found, then more input
jpayne@69 1342 * might change the match but the match won't be lost. If a match was not found,
jpayne@69 1343 * then requireEnd has no meaning.
jpayne@69 1344 *
jpayne@69 1345 * @return TRUE if more input could cause the most recent match to no longer match.
jpayne@69 1346 * @stable ICU 4.0
jpayne@69 1347 */
jpayne@69 1348 virtual UBool requireEnd() const;
jpayne@69 1349
jpayne@69 1350
jpayne@69 1351 /**
jpayne@69 1352 * Returns the pattern that is interpreted by this matcher.
jpayne@69 1353 * @return the RegexPattern for this RegexMatcher
jpayne@69 1354 * @stable ICU 2.4
jpayne@69 1355 */
jpayne@69 1356 virtual const RegexPattern &pattern() const;
jpayne@69 1357
jpayne@69 1358
jpayne@69 1359 /**
jpayne@69 1360 * Replaces every substring of the input that matches the pattern
jpayne@69 1361 * with the given replacement string. This is a convenience function that
jpayne@69 1362 * provides a complete find-and-replace-all operation.
jpayne@69 1363 *
jpayne@69 1364 * This method first resets this matcher. It then scans the input string
jpayne@69 1365 * looking for matches of the pattern. Input that is not part of any
jpayne@69 1366 * match is left unchanged; each match is replaced in the result by the
jpayne@69 1367 * replacement string. The replacement string may contain references to
jpayne@69 1368 * capture groups.
jpayne@69 1369 *
jpayne@69 1370 * @param replacement a string containing the replacement text.
jpayne@69 1371 * @param status a reference to a UErrorCode to receive any errors.
jpayne@69 1372 * @return a string containing the results of the find and replace.
jpayne@69 1373 * @stable ICU 2.4
jpayne@69 1374 */
jpayne@69 1375 virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status);
jpayne@69 1376
jpayne@69 1377
jpayne@69 1378 /**
jpayne@69 1379 * Replaces every substring of the input that matches the pattern
jpayne@69 1380 * with the given replacement string. This is a convenience function that
jpayne@69 1381 * provides a complete find-and-replace-all operation.
jpayne@69 1382 *
jpayne@69 1383 * This method first resets this matcher. It then scans the input string
jpayne@69 1384 * looking for matches of the pattern. Input that is not part of any
jpayne@69 1385 * match is left unchanged; each match is replaced in the result by the
jpayne@69 1386 * replacement string. The replacement string may contain references to
jpayne@69 1387 * capture groups.
jpayne@69 1388 *
jpayne@69 1389 * @param replacement a string containing the replacement text.
jpayne@69 1390 * @param dest a mutable UText in which the results are placed.
jpayne@69 1391 * If NULL, a new UText will be created (which may not be mutable).
jpayne@69 1392 * @param status a reference to a UErrorCode to receive any errors.
jpayne@69 1393 * @return a UText containing the results of the find and replace.
jpayne@69 1394 * If a pre-allocated UText was provided, it will always be used and returned.
jpayne@69 1395 *
jpayne@69 1396 * @stable ICU 4.6
jpayne@69 1397 */
jpayne@69 1398 virtual UText *replaceAll(UText *replacement, UText *dest, UErrorCode &status);
jpayne@69 1399
jpayne@69 1400
jpayne@69 1401 /**
jpayne@69 1402 * Replaces the first substring of the input that matches
jpayne@69 1403 * the pattern with the replacement string. This is a convenience
jpayne@69 1404 * function that provides a complete find-and-replace operation.
jpayne@69 1405 *
jpayne@69 1406 * This function first resets this RegexMatcher. It then scans the input string
jpayne@69 1407 * looking for a match of the pattern. Input that is not part
jpayne@69 1408 * of the match is appended directly to the result string; the match is replaced
jpayne@69 1409 * in the result by the replacement string. The replacement string may contain
jpayne@69 1410 * references to captured groups.
jpayne@69 1411 *
jpayne@69 1412 * The state of the matcher (the position at which a subsequent find()
jpayne@69 1413 * would begin) after completing a replaceFirst() is not specified. The
jpayne@69 1414 * RegexMatcher should be reset before doing additional find() operations.
jpayne@69 1415 *
jpayne@69 1416 * @param replacement a string containing the replacement text.
jpayne@69 1417 * @param status a reference to a UErrorCode to receive any errors.
jpayne@69 1418 * @return a string containing the results of the find and replace.
jpayne@69 1419 * @stable ICU 2.4
jpayne@69 1420 */
jpayne@69 1421 virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status);
jpayne@69 1422
jpayne@69 1423
jpayne@69 1424 /**
jpayne@69 1425 * Replaces the first substring of the input that matches
jpayne@69 1426 * the pattern with the replacement string. This is a convenience
jpayne@69 1427 * function that provides a complete find-and-replace operation.
jpayne@69 1428 *
jpayne@69 1429 * This function first resets this RegexMatcher. It then scans the input string
jpayne@69 1430 * looking for a match of the pattern. Input that is not part
jpayne@69 1431 * of the match is appended directly to the result string; the match is replaced
jpayne@69 1432 * in the result by the replacement string. The replacement string may contain
jpayne@69 1433 * references to captured groups.
jpayne@69 1434 *
jpayne@69 1435 * The state of the matcher (the position at which a subsequent find()
jpayne@69 1436 * would begin) after completing a replaceFirst() is not specified. The
jpayne@69 1437 * RegexMatcher should be reset before doing additional find() operations.
jpayne@69 1438 *
jpayne@69 1439 * @param replacement a string containing the replacement text.
jpayne@69 1440 * @param dest a mutable UText in which the results are placed.
jpayne@69 1441 * If NULL, a new UText will be created (which may not be mutable).
jpayne@69 1442 * @param status a reference to a UErrorCode to receive any errors.
jpayne@69 1443 * @return a UText containing the results of the find and replace.
jpayne@69 1444 * If a pre-allocated UText was provided, it will always be used and returned.
jpayne@69 1445 *
jpayne@69 1446 * @stable ICU 4.6
jpayne@69 1447 */
jpayne@69 1448 virtual UText *replaceFirst(UText *replacement, UText *dest, UErrorCode &status);
jpayne@69 1449
jpayne@69 1450
jpayne@69 1451 /**
jpayne@69 1452 * Implements a replace operation intended to be used as part of an
jpayne@69 1453 * incremental find-and-replace.
jpayne@69 1454 *
jpayne@69 1455 * The input string, starting from the end of the previous replacement and ending at
jpayne@69 1456 * the start of the current match, is appended to the destination string. Then the
jpayne@69 1457 * replacement string is appended to the output string,
jpayne@69 1458 * including handling any substitutions of captured text.
jpayne@69 1459 *
jpayne@69 1460 * For simple, prepackaged, non-incremental find-and-replace
jpayne@69 1461 * operations, see replaceFirst() or replaceAll().
jpayne@69 1462 *
jpayne@69 1463 * @param dest A UnicodeString to which the results of the find-and-replace are appended.
jpayne@69 1464 * @param replacement A UnicodeString that provides the text to be substituted for
jpayne@69 1465 * the input text that matched the regexp pattern. The replacement
jpayne@69 1466 * text may contain references to captured text from the
jpayne@69 1467 * input.
jpayne@69 1468 * @param status A reference to a UErrorCode to receive any errors. Possible
jpayne@69 1469 * errors are U_REGEX_INVALID_STATE if no match has been
jpayne@69 1470 * attempted or the last match failed, and U_INDEX_OUTOFBOUNDS_ERROR
jpayne@69 1471 * if the replacement text specifies a capture group that
jpayne@69 1472 * does not exist in the pattern.
jpayne@69 1473 *
jpayne@69 1474 * @return this RegexMatcher
jpayne@69 1475 * @stable ICU 2.4
jpayne@69 1476 *
jpayne@69 1477 */
jpayne@69 1478 virtual RegexMatcher &appendReplacement(UnicodeString &dest,
jpayne@69 1479 const UnicodeString &replacement, UErrorCode &status);
jpayne@69 1480
jpayne@69 1481
jpayne@69 1482 /**
jpayne@69 1483 * Implements a replace operation intended to be used as part of an
jpayne@69 1484 * incremental find-and-replace.
jpayne@69 1485 *
jpayne@69 1486 * The input string, starting from the end of the previous replacement and ending at
jpayne@69 1487 * the start of the current match, is appended to the destination string. Then the
jpayne@69 1488 * replacement string is appended to the output string,
jpayne@69 1489 * including handling any substitutions of captured text.
jpayne@69 1490 *
jpayne@69 1491 * For simple, prepackaged, non-incremental find-and-replace
jpayne@69 1492 * operations, see replaceFirst() or replaceAll().
jpayne@69 1493 *
jpayne@69 1494 * @param dest A mutable UText to which the results of the find-and-replace are appended.
jpayne@69 1495 * Must not be NULL.
jpayne@69 1496 * @param replacement A UText that provides the text to be substituted for
jpayne@69 1497 * the input text that matched the regexp pattern. The replacement
jpayne@69 1498 * text may contain references to captured text from the input.
jpayne@69 1499 * @param status A reference to a UErrorCode to receive any errors. Possible
jpayne@69 1500 * errors are U_REGEX_INVALID_STATE if no match has been
jpayne@69 1501 * attempted or the last match failed, and U_INDEX_OUTOFBOUNDS_ERROR
jpayne@69 1502 * if the replacement text specifies a capture group that
jpayne@69 1503 * does not exist in the pattern.
jpayne@69 1504 *
jpayne@69 1505 * @return this RegexMatcher
jpayne@69 1506 *
jpayne@69 1507 * @stable ICU 4.6
jpayne@69 1508 */
jpayne@69 1509 virtual RegexMatcher &appendReplacement(UText *dest,
jpayne@69 1510 UText *replacement, UErrorCode &status);
jpayne@69 1511
jpayne@69 1512
jpayne@69 1513 /**
jpayne@69 1514 * As the final step in a find-and-replace operation, append the remainder
jpayne@69 1515 * of the input string, starting at the position following the last appendReplacement(),
jpayne@69 1516 * to the destination string. `appendTail()` is intended to be invoked after one
jpayne@69 1517 * or more invocations of the `RegexMatcher::appendReplacement()`.
jpayne@69 1518 *
jpayne@69 1519 * @param dest A UnicodeString to which the results of the find-and-replace are appended.
jpayne@69 1520 * @return the destination string.
jpayne@69 1521 * @stable ICU 2.4
jpayne@69 1522 */
jpayne@69 1523 virtual UnicodeString &appendTail(UnicodeString &dest);
jpayne@69 1524
jpayne@69 1525
jpayne@69 1526 /**
jpayne@69 1527 * As the final step in a find-and-replace operation, append the remainder
jpayne@69 1528 * of the input string, starting at the position following the last appendReplacement(),
jpayne@69 1529 * to the destination string. `appendTail()` is intended to be invoked after one
jpayne@69 1530 * or more invocations of the `RegexMatcher::appendReplacement()`.
jpayne@69 1531 *
jpayne@69 1532 * @param dest A mutable UText to which the results of the find-and-replace are appended.
jpayne@69 1533 * Must not be NULL.
jpayne@69 1534 * @param status error code
jpayne@69 1535 * @return the destination UText.
jpayne@69 1536 *
jpayne@69 1537 * @stable ICU 4.6
jpayne@69 1538 */
jpayne@69 1539 virtual UText *appendTail(UText *dest, UErrorCode &status);
jpayne@69 1540
jpayne@69 1541
jpayne@69 1542 /**
jpayne@69 1543 * Split a string into fields. Somewhat like %split() from Perl.
jpayne@69 1544 * The pattern matches identify delimiters that separate the input
jpayne@69 1545 * into fields. The input data between the matches becomes the
jpayne@69 1546 * fields themselves.
jpayne@69 1547 *
jpayne@69 1548 * @param input The string to be split into fields. The field delimiters
jpayne@69 1549 * match the pattern (in the "this" object). This matcher
jpayne@69 1550 * will be reset to this input string.
jpayne@69 1551 * @param dest An array of UnicodeStrings to receive the results of the split.
jpayne@69 1552 * This is an array of actual UnicodeString objects, not an
jpayne@69 1553 * array of pointers to strings. Local (stack based) arrays can
jpayne@69 1554 * work well here.
jpayne@69 1555 * @param destCapacity The number of elements in the destination array.
jpayne@69 1556 * If the number of fields found is less than destCapacity, the
jpayne@69 1557 * extra strings in the destination array are not altered.
jpayne@69 1558 * If the number of destination strings is less than the number
jpayne@69 1559 * of fields, the trailing part of the input string, including any
jpayne@69 1560 * field delimiters, is placed in the last destination string.
jpayne@69 1561 * @param status A reference to a UErrorCode to receive any errors.
jpayne@69 1562 * @return The number of fields into which the input string was split.
jpayne@69 1563 * @stable ICU 2.6
jpayne@69 1564 */
jpayne@69 1565 virtual int32_t split(const UnicodeString &input,
jpayne@69 1566 UnicodeString dest[],
jpayne@69 1567 int32_t destCapacity,
jpayne@69 1568 UErrorCode &status);
jpayne@69 1569
jpayne@69 1570
jpayne@69 1571 /**
jpayne@69 1572 * Split a string into fields. Somewhat like %split() from Perl.
jpayne@69 1573 * The pattern matches identify delimiters that separate the input
jpayne@69 1574 * into fields. The input data between the matches becomes the
jpayne@69 1575 * fields themselves.
jpayne@69 1576 *
jpayne@69 1577 * @param input The string to be split into fields. The field delimiters
jpayne@69 1578 * match the pattern (in the "this" object). This matcher
jpayne@69 1579 * will be reset to this input string.
jpayne@69 1580 * @param dest An array of mutable UText structs to receive the results of the split.
jpayne@69 1581 * If a field is NULL, a new UText is allocated to contain the results for
jpayne@69 1582 * that field. This new UText is not guaranteed to be mutable.
jpayne@69 1583 * @param destCapacity The number of elements in the destination array.
jpayne@69 1584 * If the number of fields found is less than destCapacity, the
jpayne@69 1585 * extra strings in the destination array are not altered.
jpayne@69 1586 * If the number of destination strings is less than the number
jpayne@69 1587 * of fields, the trailing part of the input string, including any
jpayne@69 1588 * field delimiters, is placed in the last destination string.
jpayne@69 1589 * @param status A reference to a UErrorCode to receive any errors.
jpayne@69 1590 * @return The number of fields into which the input string was split.
jpayne@69 1591 *
jpayne@69 1592 * @stable ICU 4.6
jpayne@69 1593 */
jpayne@69 1594 virtual int32_t split(UText *input,
jpayne@69 1595 UText *dest[],
jpayne@69 1596 int32_t destCapacity,
jpayne@69 1597 UErrorCode &status);
jpayne@69 1598
jpayne@69 1599 /**
jpayne@69 1600 * Set a processing time limit for match operations with this Matcher.
jpayne@69 1601 *
jpayne@69 1602 * Some patterns, when matching certain strings, can run in exponential time.
jpayne@69 1603 * For practical purposes, the match operation may appear to be in an
jpayne@69 1604 * infinite loop.
jpayne@69 1605 * When a limit is set a match operation will fail with an error if the
jpayne@69 1606 * limit is exceeded.
jpayne@69 1607 *
jpayne@69 1608 * The units of the limit are steps of the match engine.
jpayne@69 1609 * Correspondence with actual processor time will depend on the speed
jpayne@69 1610 * of the processor and the details of the specific pattern, but will
jpayne@69 1611 * typically be on the order of milliseconds.
jpayne@69 1612 *
jpayne@69 1613 * By default, the matching time is not limited.
jpayne@69 1614 *
jpayne@69 1615 *
jpayne@69 1616 * @param limit The limit value, or 0 for no limit.
jpayne@69 1617 * @param status A reference to a UErrorCode to receive any errors.
jpayne@69 1618 * @stable ICU 4.0
jpayne@69 1619 */
jpayne@69 1620 virtual void setTimeLimit(int32_t limit, UErrorCode &status);
jpayne@69 1621
jpayne@69 1622 /**
jpayne@69 1623 * Get the time limit, if any, for match operations made with this Matcher.
jpayne@69 1624 *
jpayne@69 1625 * @return the maximum allowed time for a match, in units of processing steps.
jpayne@69 1626 * @stable ICU 4.0
jpayne@69 1627 */
jpayne@69 1628 virtual int32_t getTimeLimit() const;
jpayne@69 1629
jpayne@69 1630 /**
jpayne@69 1631 * Set the amount of heap storage available for use by the match backtracking stack.
jpayne@69 1632 * The matcher is also reset, discarding any results from previous matches.
jpayne@69 1633 *
jpayne@69 1634 * ICU uses a backtracking regular expression engine, with the backtrack stack
jpayne@69 1635 * maintained on the heap. This function sets the limit to the amount of memory
jpayne@69 1636 * that can be used for this purpose. A backtracking stack overflow will
jpayne@69 1637 * result in an error from the match operation that caused it.
jpayne@69 1638 *
jpayne@69 1639 * A limit is desirable because a malicious or poorly designed pattern can use
jpayne@69 1640 * excessive memory, potentially crashing the process. A limit is enabled
jpayne@69 1641 * by default.
jpayne@69 1642 *
jpayne@69 1643 * @param limit The maximum size, in bytes, of the matching backtrack stack.
jpayne@69 1644 * A value of zero means no limit.
jpayne@69 1645 * The limit must be greater than or equal to zero.
jpayne@69 1646 *
jpayne@69 1647 * @param status A reference to a UErrorCode to receive any errors.
jpayne@69 1648 *
jpayne@69 1649 * @stable ICU 4.0
jpayne@69 1650 */
jpayne@69 1651 virtual void setStackLimit(int32_t limit, UErrorCode &status);
jpayne@69 1652
jpayne@69 1653 /**
jpayne@69 1654 * Get the size of the heap storage available for use by the back tracking stack.
jpayne@69 1655 *
jpayne@69 1656 * @return the maximum backtracking stack size, in bytes, or zero if the
jpayne@69 1657 * stack size is unlimited.
jpayne@69 1658 * @stable ICU 4.0
jpayne@69 1659 */
jpayne@69 1660 virtual int32_t getStackLimit() const;
jpayne@69 1661
jpayne@69 1662
jpayne@69 1663 /**
jpayne@69 1664 * Set a callback function for use with this Matcher.
jpayne@69 1665 * During matching operations the function will be called periodically,
jpayne@69 1666 * giving the application the opportunity to terminate a long-running
jpayne@69 1667 * match.
jpayne@69 1668 *
jpayne@69 1669 * @param callback A pointer to the user-supplied callback function.
jpayne@69 1670 * @param context User context pointer. The value supplied at the
jpayne@69 1671 * time the callback function is set will be saved
jpayne@69 1672 * and passed to the callback each time that it is called.
jpayne@69 1673 * @param status A reference to a UErrorCode to receive any errors.
jpayne@69 1674 * @stable ICU 4.0
jpayne@69 1675 */
jpayne@69 1676 virtual void setMatchCallback(URegexMatchCallback *callback,
jpayne@69 1677 const void *context,
jpayne@69 1678 UErrorCode &status);
jpayne@69 1679
jpayne@69 1680
jpayne@69 1681 /**
jpayne@69 1682 * Get the callback function for this Matcher.
jpayne@69 1683 *
jpayne@69 1684 * @param callback Out parameter, receives a pointer to the user-supplied
jpayne@69 1685 * callback function.
jpayne@69 1686 * @param context Out parameter, receives the user context pointer that
jpayne@69 1687 * was set when setMatchCallback() was called.
jpayne@69 1688 * @param status A reference to a UErrorCode to receive any errors.
jpayne@69 1689 * @stable ICU 4.0
jpayne@69 1690 */
jpayne@69 1691 virtual void getMatchCallback(URegexMatchCallback *&callback,
jpayne@69 1692 const void *&context,
jpayne@69 1693 UErrorCode &status);
jpayne@69 1694
jpayne@69 1695
jpayne@69 1696 /**
jpayne@69 1697 * Set a progress callback function for use with find operations on this Matcher.
jpayne@69 1698 * During find operations, the callback will be invoked after each return from a
jpayne@69 1699 * match attempt, giving the application the opportunity to terminate a long-running
jpayne@69 1700 * find operation.
jpayne@69 1701 *
jpayne@69 1702 * @param callback A pointer to the user-supplied callback function.
jpayne@69 1703 * @param context User context pointer. The value supplied at the
jpayne@69 1704 * time the callback function is set will be saved
jpayne@69 1705 * and passed to the callback each time that it is called.
jpayne@69 1706 * @param status A reference to a UErrorCode to receive any errors.
jpayne@69 1707 * @stable ICU 4.6
jpayne@69 1708 */
jpayne@69 1709 virtual void setFindProgressCallback(URegexFindProgressCallback *callback,
jpayne@69 1710 const void *context,
jpayne@69 1711 UErrorCode &status);
jpayne@69 1712
jpayne@69 1713
jpayne@69 1714 /**
jpayne@69 1715 * Get the find progress callback function for this Matcher.
jpayne@69 1716 *
jpayne@69 1717 * @param callback Out parameter, receives a pointer to the user-supplied
jpayne@69 1718 * callback function.
jpayne@69 1719 * @param context Out parameter, receives the user context pointer that
jpayne@69 1720 * was set when setFindProgressCallback() was called.
jpayne@69 1721 * @param status A reference to a UErrorCode to receive any errors.
jpayne@69 1722 * @stable ICU 4.6
jpayne@69 1723 */
jpayne@69 1724 virtual void getFindProgressCallback(URegexFindProgressCallback *&callback,
jpayne@69 1725 const void *&context,
jpayne@69 1726 UErrorCode &status);
jpayne@69 1727
jpayne@69 1728 #ifndef U_HIDE_INTERNAL_API
jpayne@69 1729 /**
jpayne@69 1730 * setTrace Debug function, enable/disable tracing of the matching engine.
jpayne@69 1731 * For internal ICU development use only. DO NOT USE!!!!
jpayne@69 1732 * @internal
jpayne@69 1733 */
jpayne@69 1734 void setTrace(UBool state);
jpayne@69 1735 #endif /* U_HIDE_INTERNAL_API */
jpayne@69 1736
jpayne@69 1737 /**
jpayne@69 1738 * ICU "poor man's RTTI", returns a UClassID for this class.
jpayne@69 1739 *
jpayne@69 1740 * @stable ICU 2.2
jpayne@69 1741 */
jpayne@69 1742 static UClassID U_EXPORT2 getStaticClassID();
jpayne@69 1743
jpayne@69 1744 /**
jpayne@69 1745 * ICU "poor man's RTTI", returns a UClassID for the actual class.
jpayne@69 1746 *
jpayne@69 1747 * @stable ICU 2.2
jpayne@69 1748 */
jpayne@69 1749 virtual UClassID getDynamicClassID() const;
jpayne@69 1750
jpayne@69 1751 private:
jpayne@69 1752 // Constructors and other object boilerplate are private.
jpayne@69 1753 // Instances of RegexMatcher can not be assigned, copied, cloned, etc.
jpayne@69 1754 RegexMatcher(); // default constructor not implemented
jpayne@69 1755 RegexMatcher(const RegexPattern *pat);
jpayne@69 1756 RegexMatcher(const RegexMatcher &other);
jpayne@69 1757 RegexMatcher &operator =(const RegexMatcher &rhs);
jpayne@69 1758 void init(UErrorCode &status); // Common initialization
jpayne@69 1759 void init2(UText *t, UErrorCode &e); // Common initialization, part 2.
jpayne@69 1760
jpayne@69 1761 friend class RegexPattern;
jpayne@69 1762 friend class RegexCImpl;
jpayne@69 1763 public:
jpayne@69 1764 #ifndef U_HIDE_INTERNAL_API
jpayne@69 1765 /** @internal */
jpayne@69 1766 void resetPreserveRegion(); // Reset matcher state, but preserve any region.
jpayne@69 1767 #endif /* U_HIDE_INTERNAL_API */
jpayne@69 1768 private:
jpayne@69 1769
jpayne@69 1770 //
jpayne@69 1771 // MatchAt This is the internal interface to the match engine itself.
jpayne@69 1772 // Match status comes back in matcher member variables.
jpayne@69 1773 //
jpayne@69 1774 void MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status);
jpayne@69 1775 inline void backTrack(int64_t &inputIdx, int32_t &patIdx);
jpayne@69 1776 UBool isWordBoundary(int64_t pos); // perform Perl-like \b test
jpayne@69 1777 UBool isUWordBoundary(int64_t pos, UErrorCode &status); // perform RBBI based \b test
jpayne@69 1778 // Find a grapheme cluster boundary using a break iterator. For handling \X in regexes.
jpayne@69 1779 int64_t followingGCBoundary(int64_t pos, UErrorCode &status);
jpayne@69 1780 REStackFrame *resetStack();
jpayne@69 1781 inline REStackFrame *StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status);
jpayne@69 1782 void IncrementTime(UErrorCode &status);
jpayne@69 1783
jpayne@69 1784 // Call user find callback function, if set. Return TRUE if operation should be interrupted.
jpayne@69 1785 inline UBool findProgressInterrupt(int64_t matchIndex, UErrorCode &status);
jpayne@69 1786
jpayne@69 1787 int64_t appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const;
jpayne@69 1788
jpayne@69 1789 UBool findUsingChunk(UErrorCode &status);
jpayne@69 1790 void MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status);
jpayne@69 1791 UBool isChunkWordBoundary(int32_t pos);
jpayne@69 1792
jpayne@69 1793 const RegexPattern *fPattern;
jpayne@69 1794 RegexPattern *fPatternOwned; // Non-NULL if this matcher owns the pattern, and
jpayne@69 1795 // should delete it when through.
jpayne@69 1796
jpayne@69 1797 const UnicodeString *fInput; // The string being matched. Only used for input()
jpayne@69 1798 UText *fInputText; // The text being matched. Is never NULL.
jpayne@69 1799 UText *fAltInputText; // A shallow copy of the text being matched.
jpayne@69 1800 // Only created if the pattern contains backreferences.
jpayne@69 1801 int64_t fInputLength; // Full length of the input text.
jpayne@69 1802 int32_t fFrameSize; // The size of a frame in the backtrack stack.
jpayne@69 1803
jpayne@69 1804 int64_t fRegionStart; // Start of the input region, default = 0.
jpayne@69 1805 int64_t fRegionLimit; // End of input region, default to input.length.
jpayne@69 1806
jpayne@69 1807 int64_t fAnchorStart; // Region bounds for anchoring operations (^ or $).
jpayne@69 1808 int64_t fAnchorLimit; // See useAnchoringBounds
jpayne@69 1809
jpayne@69 1810 int64_t fLookStart; // Region bounds for look-ahead/behind and
jpayne@69 1811 int64_t fLookLimit; // other boundary tests. See
jpayne@69 1812 // useTransparentBounds
jpayne@69 1813
jpayne@69 1814 int64_t fActiveStart; // Currently active bounds for matching.
jpayne@69 1815 int64_t fActiveLimit; // Usually is the same as region, but
jpayne@69 1816 // is changed to fLookStart/Limit when
jpayne@69 1817 // entering look around regions.
jpayne@69 1818
jpayne@69 1819 UBool fTransparentBounds; // True if using transparent bounds.
jpayne@69 1820 UBool fAnchoringBounds; // True if using anchoring bounds.
jpayne@69 1821
jpayne@69 1822 UBool fMatch; // True if the last attempted match was successful.
jpayne@69 1823 int64_t fMatchStart; // Position of the start of the most recent match
jpayne@69 1824 int64_t fMatchEnd; // First position after the end of the most recent match
jpayne@69 1825 // Zero if no previous match, even when a region
jpayne@69 1826 // is active.
jpayne@69 1827 int64_t fLastMatchEnd; // First position after the end of the previous match,
jpayne@69 1828 // or -1 if there was no previous match.
jpayne@69 1829 int64_t fAppendPosition; // First position after the end of the previous
jpayne@69 1830 // appendReplacement(). As described by the
jpayne@69 1831 // JavaDoc for Java Matcher, where it is called
jpayne@69 1832 // "append position"
jpayne@69 1833 UBool fHitEnd; // True if the last match touched the end of input.
jpayne@69 1834 UBool fRequireEnd; // True if the last match required end-of-input
jpayne@69 1835 // (matched $ or \Z)
jpayne@69 1836
jpayne@69 1837 UVector64 *fStack;
jpayne@69 1838 REStackFrame *fFrame; // After finding a match, the last active stack frame,
jpayne@69 1839 // which will contain the capture group results.
jpayne@69 1840 // NOT valid while match engine is running.
jpayne@69 1841
jpayne@69 1842 int64_t *fData; // Data area for use by the compiled pattern.
jpayne@69 1843 int64_t fSmallData[8]; // Use this for data if it's enough.
jpayne@69 1844
jpayne@69 1845 int32_t fTimeLimit; // Max time (in arbitrary steps) to let the
jpayne@69 1846 // match engine run. Zero for unlimited.
jpayne@69 1847
jpayne@69 1848 int32_t fTime; // Match time, accumulates while matching.
jpayne@69 1849 int32_t fTickCounter; // Low bits counter for time. Counts down StateSaves.
jpayne@69 1850 // Kept separately from fTime to keep as much
jpayne@69 1851 // code as possible out of the inline
jpayne@69 1852 // StateSave function.
jpayne@69 1853
jpayne@69 1854 int32_t fStackLimit; // Maximum memory size to use for the backtrack
jpayne@69 1855 // stack, in bytes. Zero for unlimited.
jpayne@69 1856
jpayne@69 1857 URegexMatchCallback *fCallbackFn; // Pointer to match progress callback funct.
jpayne@69 1858 // NULL if there is no callback.
jpayne@69 1859 const void *fCallbackContext; // User Context ptr for callback function.
jpayne@69 1860
jpayne@69 1861 URegexFindProgressCallback *fFindProgressCallbackFn; // Pointer to find progress callback funct.
jpayne@69 1862 // NULL if there is no callback.
jpayne@69 1863 const void *fFindProgressCallbackContext; // User Context ptr for callback function.
jpayne@69 1864
jpayne@69 1865
jpayne@69 1866 UBool fInputUniStrMaybeMutable; // Set when fInputText wraps a UnicodeString that may be mutable - compatibility.
jpayne@69 1867
jpayne@69 1868 UBool fTraceDebug; // Set true for debug tracing of match engine.
jpayne@69 1869
jpayne@69 1870 UErrorCode fDeferredStatus; // Save error state that cannot be immediately
jpayne@69 1871 // reported, or that permanently disables this matcher.
jpayne@69 1872
jpayne@69 1873 BreakIterator *fWordBreakItr;
jpayne@69 1874 BreakIterator *fGCBreakItr;
jpayne@69 1875 };
jpayne@69 1876
jpayne@69 1877 U_NAMESPACE_END
jpayne@69 1878 #endif // UCONFIG_NO_REGULAR_EXPRESSIONS
jpayne@69 1879
jpayne@69 1880 #endif /* U_SHOW_CPLUSPLUS_API */
jpayne@69 1881
jpayne@69 1882 #endif