Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/include/unicode/uregex.h @ 69:33d812a61356
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 17:55:14 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 69:33d812a61356 |
---|---|
1 // © 2016 and later: Unicode, Inc. and others. | |
2 // License & terms of use: http://www.unicode.org/copyright.html | |
3 /* | |
4 ********************************************************************** | |
5 * Copyright (C) 2004-2016, International Business Machines | |
6 * Corporation and others. All Rights Reserved. | |
7 ********************************************************************** | |
8 * file name: uregex.h | |
9 * encoding: UTF-8 | |
10 * indentation:4 | |
11 * | |
12 * created on: 2004mar09 | |
13 * created by: Andy Heninger | |
14 * | |
15 * ICU Regular Expressions, API for C | |
16 */ | |
17 | |
18 /** | |
19 * \file | |
20 * \brief C API: Regular Expressions | |
21 * | |
22 * <p>This is a C wrapper around the C++ RegexPattern and RegexMatcher classes.</p> | |
23 */ | |
24 | |
25 #ifndef UREGEX_H | |
26 #define UREGEX_H | |
27 | |
28 #include "unicode/utext.h" | |
29 #include "unicode/utypes.h" | |
30 | |
31 #if !UCONFIG_NO_REGULAR_EXPRESSIONS | |
32 | |
33 #include "unicode/localpointer.h" | |
34 #include "unicode/parseerr.h" | |
35 | |
36 struct URegularExpression; | |
37 /** | |
38 * Structure representing a compiled regular expression, plus the results | |
39 * of a match operation. | |
40 * @stable ICU 3.0 | |
41 */ | |
42 typedef struct URegularExpression URegularExpression; | |
43 | |
44 | |
45 /** | |
46 * Constants for Regular Expression Match Modes. | |
47 * @stable ICU 2.4 | |
48 */ | |
49 typedef enum URegexpFlag{ | |
50 | |
51 #ifndef U_HIDE_DRAFT_API | |
52 /** Forces normalization of pattern and strings. | |
53 Not implemented yet, just a placeholder, hence draft. | |
54 @draft ICU 2.4 */ | |
55 UREGEX_CANON_EQ = 128, | |
56 #endif /* U_HIDE_DRAFT_API */ | |
57 /** Enable case insensitive matching. @stable ICU 2.4 */ | |
58 UREGEX_CASE_INSENSITIVE = 2, | |
59 | |
60 /** Allow white space and comments within patterns @stable ICU 2.4 */ | |
61 UREGEX_COMMENTS = 4, | |
62 | |
63 /** If set, '.' matches line terminators, otherwise '.' matching stops at line end. | |
64 * @stable ICU 2.4 */ | |
65 UREGEX_DOTALL = 32, | |
66 | |
67 /** If set, treat the entire pattern as a literal string. | |
68 * Metacharacters or escape sequences in the input sequence will be given | |
69 * no special meaning. | |
70 * | |
71 * The flag UREGEX_CASE_INSENSITIVE retains its impact | |
72 * on matching when used in conjunction with this flag. | |
73 * The other flags become superfluous. | |
74 * | |
75 * @stable ICU 4.0 | |
76 */ | |
77 UREGEX_LITERAL = 16, | |
78 | |
79 /** Control behavior of "$" and "^" | |
80 * If set, recognize line terminators within string, | |
81 * otherwise, match only at start and end of input string. | |
82 * @stable ICU 2.4 */ | |
83 UREGEX_MULTILINE = 8, | |
84 | |
85 /** Unix-only line endings. | |
86 * When this mode is enabled, only \\u000a is recognized as a line ending | |
87 * in the behavior of ., ^, and $. | |
88 * @stable ICU 4.0 | |
89 */ | |
90 UREGEX_UNIX_LINES = 1, | |
91 | |
92 /** Unicode word boundaries. | |
93 * If set, \b uses the Unicode TR 29 definition of word boundaries. | |
94 * Warning: Unicode word boundaries are quite different from | |
95 * traditional regular expression word boundaries. See | |
96 * http://unicode.org/reports/tr29/#Word_Boundaries | |
97 * @stable ICU 2.8 | |
98 */ | |
99 UREGEX_UWORD = 256, | |
100 | |
101 /** Error on Unrecognized backslash escapes. | |
102 * If set, fail with an error on patterns that contain | |
103 * backslash-escaped ASCII letters without a known special | |
104 * meaning. If this flag is not set, these | |
105 * escaped letters represent themselves. | |
106 * @stable ICU 4.0 | |
107 */ | |
108 UREGEX_ERROR_ON_UNKNOWN_ESCAPES = 512 | |
109 | |
110 } URegexpFlag; | |
111 | |
112 /** | |
113 * Open (compile) an ICU regular expression. Compiles the regular expression in | |
114 * string form into an internal representation using the specified match mode flags. | |
115 * The resulting regular expression handle can then be used to perform various | |
116 * matching operations. | |
117 * | |
118 * | |
119 * @param pattern The Regular Expression pattern to be compiled. | |
120 * @param patternLength The length of the pattern, or -1 if the pattern is | |
121 * NUL terminated. | |
122 * @param flags Flags that alter the default matching behavior for | |
123 * the regular expression, UREGEX_CASE_INSENSITIVE, for | |
124 * example. For default behavior, set this parameter to zero. | |
125 * See <code>enum URegexpFlag</code>. All desired flags | |
126 * are bitwise-ORed together. | |
127 * @param pe Receives the position (line and column numbers) of any syntax | |
128 * error within the source regular expression string. If this | |
129 * information is not wanted, pass NULL for this parameter. | |
130 * @param status Receives errors detected by this function. | |
131 * @stable ICU 3.0 | |
132 * | |
133 */ | |
134 U_STABLE URegularExpression * U_EXPORT2 | |
135 uregex_open( const UChar *pattern, | |
136 int32_t patternLength, | |
137 uint32_t flags, | |
138 UParseError *pe, | |
139 UErrorCode *status); | |
140 | |
141 /** | |
142 * Open (compile) an ICU regular expression. Compiles the regular expression in | |
143 * string form into an internal representation using the specified match mode flags. | |
144 * The resulting regular expression handle can then be used to perform various | |
145 * matching operations. | |
146 * <p> | |
147 * The contents of the pattern UText will be extracted and saved. Ownership of the | |
148 * UText struct itself remains with the caller. This is to match the behavior of | |
149 * uregex_open(). | |
150 * | |
151 * @param pattern The Regular Expression pattern to be compiled. | |
152 * @param flags Flags that alter the default matching behavior for | |
153 * the regular expression, UREGEX_CASE_INSENSITIVE, for | |
154 * example. For default behavior, set this parameter to zero. | |
155 * See <code>enum URegexpFlag</code>. All desired flags | |
156 * are bitwise-ORed together. | |
157 * @param pe Receives the position (line and column numbers) of any syntax | |
158 * error within the source regular expression string. If this | |
159 * information is not wanted, pass NULL for this parameter. | |
160 * @param status Receives errors detected by this function. | |
161 * | |
162 * @stable ICU 4.6 | |
163 */ | |
164 U_STABLE URegularExpression * U_EXPORT2 | |
165 uregex_openUText(UText *pattern, | |
166 uint32_t flags, | |
167 UParseError *pe, | |
168 UErrorCode *status); | |
169 | |
170 #if !UCONFIG_NO_CONVERSION | |
171 /** | |
172 * Open (compile) an ICU regular expression. The resulting regular expression | |
173 * handle can then be used to perform various matching operations. | |
174 * <p> | |
175 * This function is the same as uregex_open, except that the pattern | |
176 * is supplied as an 8 bit char * string in the default code page. | |
177 * | |
178 * @param pattern The Regular Expression pattern to be compiled, | |
179 * NUL terminated. | |
180 * @param flags Flags that alter the default matching behavior for | |
181 * the regular expression, UREGEX_CASE_INSENSITIVE, for | |
182 * example. For default behavior, set this parameter to zero. | |
183 * See <code>enum URegexpFlag</code>. All desired flags | |
184 * are bitwise-ORed together. | |
185 * @param pe Receives the position (line and column numbers) of any syntax | |
186 * error within the source regular expression string. If this | |
187 * information is not wanted, pass NULL for this parameter. | |
188 * @param status Receives errors detected by this function. | |
189 * @return The URegularExpression object representing the compiled | |
190 * pattern. | |
191 * | |
192 * @stable ICU 3.0 | |
193 */ | |
194 U_STABLE URegularExpression * U_EXPORT2 | |
195 uregex_openC( const char *pattern, | |
196 uint32_t flags, | |
197 UParseError *pe, | |
198 UErrorCode *status); | |
199 #endif | |
200 | |
201 | |
202 | |
203 /** | |
204 * Close the regular expression, recovering all resources (memory) it | |
205 * was holding. | |
206 * | |
207 * @param regexp The regular expression to be closed. | |
208 * @stable ICU 3.0 | |
209 */ | |
210 U_STABLE void U_EXPORT2 | |
211 uregex_close(URegularExpression *regexp); | |
212 | |
213 #if U_SHOW_CPLUSPLUS_API | |
214 | |
215 U_NAMESPACE_BEGIN | |
216 | |
217 /** | |
218 * \class LocalURegularExpressionPointer | |
219 * "Smart pointer" class, closes a URegularExpression via uregex_close(). | |
220 * For most methods see the LocalPointerBase base class. | |
221 * | |
222 * @see LocalPointerBase | |
223 * @see LocalPointer | |
224 * @stable ICU 4.4 | |
225 */ | |
226 U_DEFINE_LOCAL_OPEN_POINTER(LocalURegularExpressionPointer, URegularExpression, uregex_close); | |
227 | |
228 U_NAMESPACE_END | |
229 | |
230 #endif | |
231 | |
232 /** | |
233 * Make a copy of a compiled regular expression. Cloning a regular | |
234 * expression is faster than opening a second instance from the source | |
235 * form of the expression, and requires less memory. | |
236 * <p> | |
237 * Note that the current input string and the position of any matched text | |
238 * within it are not cloned; only the pattern itself and the | |
239 * match mode flags are copied. | |
240 * <p> | |
241 * Cloning can be particularly useful to threaded applications that perform | |
242 * multiple match operations in parallel. Each concurrent RE | |
243 * operation requires its own instance of a URegularExpression. | |
244 * | |
245 * @param regexp The compiled regular expression to be cloned. | |
246 * @param status Receives indication of any errors encountered | |
247 * @return the cloned copy of the compiled regular expression. | |
248 * @stable ICU 3.0 | |
249 */ | |
250 U_STABLE URegularExpression * U_EXPORT2 | |
251 uregex_clone(const URegularExpression *regexp, UErrorCode *status); | |
252 | |
253 /** | |
254 * Returns a pointer to the source form of the pattern for this regular expression. | |
255 * This function will work even if the pattern was originally specified as a UText. | |
256 * | |
257 * @param regexp The compiled regular expression. | |
258 * @param patLength This output parameter will be set to the length of the | |
259 * pattern string. A NULL pointer may be used here if the | |
260 * pattern length is not needed, as would be the case if | |
261 * the pattern is known in advance to be a NUL terminated | |
262 * string. | |
263 * @param status Receives errors detected by this function. | |
264 * @return a pointer to the pattern string. The storage for the string is | |
265 * owned by the regular expression object, and must not be | |
266 * altered or deleted by the application. The returned string | |
267 * will remain valid until the regular expression is closed. | |
268 * @stable ICU 3.0 | |
269 */ | |
270 U_STABLE const UChar * U_EXPORT2 | |
271 uregex_pattern(const URegularExpression *regexp, | |
272 int32_t *patLength, | |
273 UErrorCode *status); | |
274 | |
275 /** | |
276 * Returns the source text of the pattern for this regular expression. | |
277 * This function will work even if the pattern was originally specified as a UChar string. | |
278 * | |
279 * @param regexp The compiled regular expression. | |
280 * @param status Receives errors detected by this function. | |
281 * @return the pattern text. The storage for the text is owned by the regular expression | |
282 * object, and must not be altered or deleted. | |
283 * | |
284 * @stable ICU 4.6 | |
285 */ | |
286 U_STABLE UText * U_EXPORT2 | |
287 uregex_patternUText(const URegularExpression *regexp, | |
288 UErrorCode *status); | |
289 | |
290 /** | |
291 * Get the match mode flags that were specified when compiling this regular expression. | |
292 * @param status Receives errors detected by this function. | |
293 * @param regexp The compiled regular expression. | |
294 * @return The match mode flags | |
295 * @see URegexpFlag | |
296 * @stable ICU 3.0 | |
297 */ | |
298 U_STABLE int32_t U_EXPORT2 | |
299 uregex_flags(const URegularExpression *regexp, | |
300 UErrorCode *status); | |
301 | |
302 | |
303 /** | |
304 * Set the subject text string upon which the regular expression will look for matches. | |
305 * This function may be called any number of times, allowing the regular | |
306 * expression pattern to be applied to different strings. | |
307 * <p> | |
308 * Regular expression matching operations work directly on the application's | |
309 * string data. No copy is made. The subject string data must not be | |
310 * altered after calling this function until after all regular expression | |
311 * operations involving this string data are completed. | |
312 * <p> | |
313 * Zero length strings are permitted. In this case, no subsequent match | |
314 * operation will dereference the text string pointer. | |
315 * | |
316 * @param regexp The compiled regular expression. | |
317 * @param text The subject text string. | |
318 * @param textLength The length of the subject text, or -1 if the string | |
319 * is NUL terminated. | |
320 * @param status Receives errors detected by this function. | |
321 * @stable ICU 3.0 | |
322 */ | |
323 U_STABLE void U_EXPORT2 | |
324 uregex_setText(URegularExpression *regexp, | |
325 const UChar *text, | |
326 int32_t textLength, | |
327 UErrorCode *status); | |
328 | |
329 | |
330 /** | |
331 * Set the subject text string upon which the regular expression will look for matches. | |
332 * This function may be called any number of times, allowing the regular | |
333 * expression pattern to be applied to different strings. | |
334 * <p> | |
335 * Regular expression matching operations work directly on the application's | |
336 * string data; only a shallow clone is made. The subject string data must not be | |
337 * altered after calling this function until after all regular expression | |
338 * operations involving this string data are completed. | |
339 * | |
340 * @param regexp The compiled regular expression. | |
341 * @param text The subject text string. | |
342 * @param status Receives errors detected by this function. | |
343 * | |
344 * @stable ICU 4.6 | |
345 */ | |
346 U_STABLE void U_EXPORT2 | |
347 uregex_setUText(URegularExpression *regexp, | |
348 UText *text, | |
349 UErrorCode *status); | |
350 | |
351 /** | |
352 * Get the subject text that is currently associated with this | |
353 * regular expression object. If the input was supplied using uregex_setText(), | |
354 * that pointer will be returned. Otherwise, the characters in the input will | |
355 * be extracted to a buffer and returned. In either case, ownership remains | |
356 * with the regular expression object. | |
357 * | |
358 * This function will work even if the input was originally specified as a UText. | |
359 * | |
360 * @param regexp The compiled regular expression. | |
361 * @param textLength The length of the string is returned in this output parameter. | |
362 * A NULL pointer may be used here if the | |
363 * text length is not needed, as would be the case if | |
364 * the text is known in advance to be a NUL terminated | |
365 * string. | |
366 * @param status Receives errors detected by this function. | |
367 * @return Pointer to the subject text string currently associated with | |
368 * this regular expression. | |
369 * @stable ICU 3.0 | |
370 */ | |
371 U_STABLE const UChar * U_EXPORT2 | |
372 uregex_getText(URegularExpression *regexp, | |
373 int32_t *textLength, | |
374 UErrorCode *status); | |
375 | |
376 /** | |
377 * Get the subject text that is currently associated with this | |
378 * regular expression object. | |
379 * | |
380 * This function will work even if the input was originally specified as a UChar string. | |
381 * | |
382 * @param regexp The compiled regular expression. | |
383 * @param dest A mutable UText in which to store the current input. | |
384 * If NULL, a new UText will be created as an immutable shallow clone | |
385 * of the actual input string. | |
386 * @param status Receives errors detected by this function. | |
387 * @return The subject text currently associated with this regular expression. | |
388 * If a pre-allocated UText was provided, it will always be used and returned. | |
389 * | |
390 * @stable ICU 4.6 | |
391 */ | |
392 U_STABLE UText * U_EXPORT2 | |
393 uregex_getUText(URegularExpression *regexp, | |
394 UText *dest, | |
395 UErrorCode *status); | |
396 | |
397 /** | |
398 * Set the subject text string upon which the regular expression is looking for matches | |
399 * without changing any other aspect of the matching state. | |
400 * The new and previous text strings must have the same content. | |
401 * | |
402 * This function is intended for use in environments where ICU is operating on | |
403 * strings that may move around in memory. It provides a mechanism for notifying | |
404 * ICU that the string has been relocated, and providing a new UText to access the | |
405 * string in its new position. | |
406 * | |
407 * Note that the regular expression implementation never copies the underlying text | |
408 * of a string being matched, but always operates directly on the original text | |
409 * provided by the user. Refreshing simply drops the references to the old text | |
410 * and replaces them with references to the new. | |
411 * | |
412 * Caution: this function is normally used only by very specialized | |
413 * system-level code. One example use case is with garbage collection | |
414 * that moves the text in memory. | |
415 * | |
416 * @param regexp The compiled regular expression. | |
417 * @param text The new (moved) text string. | |
418 * @param status Receives errors detected by this function. | |
419 * | |
420 * @stable ICU 4.8 | |
421 */ | |
422 U_STABLE void U_EXPORT2 | |
423 uregex_refreshUText(URegularExpression *regexp, | |
424 UText *text, | |
425 UErrorCode *status); | |
426 | |
427 /** | |
428 * Attempts to match the input string against the pattern. | |
429 * To succeed, the match must extend to the end of the string, | |
430 * or cover the complete match region. | |
431 * | |
432 * If startIndex >= zero the match operation starts at the specified | |
433 * index and must extend to the end of the input string. Any region | |
434 * that has been specified is reset. | |
435 * | |
436 * If startIndex == -1 the match must cover the input region, or the entire | |
437 * input string if no region has been set. This directly corresponds to | |
438 * Matcher.matches() in Java. | |
439 * | |
440 * @param regexp The compiled regular expression. | |
441 * @param startIndex The input string (native) index at which to begin matching, or -1 | |
442 * to match the input Region. | |
443 * @param status Receives errors detected by this function. | |
444 * @return TRUE if there is a match | |
445 * @stable ICU 3.0 | |
446 */ | |
447 U_STABLE UBool U_EXPORT2 | |
448 uregex_matches(URegularExpression *regexp, | |
449 int32_t startIndex, | |
450 UErrorCode *status); | |
451 | |
452 /** | |
453 * 64bit version of uregex_matches. | |
454 * Attempts to match the input string against the pattern. | |
455 * To succeed, the match must extend to the end of the string, | |
456 * or cover the complete match region. | |
457 * | |
458 * If startIndex >= zero the match operation starts at the specified | |
459 * index and must extend to the end of the input string. Any region | |
460 * that has been specified is reset. | |
461 * | |
462 * If startIndex == -1 the match must cover the input region, or the entire | |
463 * input string if no region has been set. This directly corresponds to | |
464 * Matcher.matches() in Java. | |
465 * | |
466 * @param regexp The compiled regular expression. | |
467 * @param startIndex The input string (native) index at which to begin matching, or -1 | |
468 * to match the input Region. | |
469 * @param status Receives errors detected by this function. | |
470 * @return TRUE if there is a match | |
471 * @stable ICU 4.6 | |
472 */ | |
473 U_STABLE UBool U_EXPORT2 | |
474 uregex_matches64(URegularExpression *regexp, | |
475 int64_t startIndex, | |
476 UErrorCode *status); | |
477 | |
478 /** | |
479 * Attempts to match the input string, starting from the specified index, against the pattern. | |
480 * The match may be of any length, and is not required to extend to the end | |
481 * of the input string. Contrast with uregex_matches(). | |
482 * | |
483 * <p>If startIndex is >= 0 any input region that was set for this | |
484 * URegularExpression is reset before the operation begins. | |
485 * | |
486 * <p>If the specified starting index == -1 the match begins at the start of the input | |
487 * region, or at the start of the full string if no region has been specified. | |
488 * This corresponds directly with Matcher.lookingAt() in Java. | |
489 * | |
490 * <p>If the match succeeds then more information can be obtained via the | |
491 * <code>uregex_start()</code>, <code>uregex_end()</code>, | |
492 * and <code>uregex_group()</code> functions.</p> | |
493 * | |
494 * @param regexp The compiled regular expression. | |
495 * @param startIndex The input string (native) index at which to begin matching, or | |
496 * -1 to match the Input Region | |
497 * @param status A reference to a UErrorCode to receive any errors. | |
498 * @return TRUE if there is a match. | |
499 * @stable ICU 3.0 | |
500 */ | |
501 U_STABLE UBool U_EXPORT2 | |
502 uregex_lookingAt(URegularExpression *regexp, | |
503 int32_t startIndex, | |
504 UErrorCode *status); | |
505 | |
506 /** | |
507 * 64bit version of uregex_lookingAt. | |
508 * Attempts to match the input string, starting from the specified index, against the pattern. | |
509 * The match may be of any length, and is not required to extend to the end | |
510 * of the input string. Contrast with uregex_matches(). | |
511 * | |
512 * <p>If startIndex is >= 0 any input region that was set for this | |
513 * URegularExpression is reset before the operation begins. | |
514 * | |
515 * <p>If the specified starting index == -1 the match begins at the start of the input | |
516 * region, or at the start of the full string if no region has been specified. | |
517 * This corresponds directly with Matcher.lookingAt() in Java. | |
518 * | |
519 * <p>If the match succeeds then more information can be obtained via the | |
520 * <code>uregex_start()</code>, <code>uregex_end()</code>, | |
521 * and <code>uregex_group()</code> functions.</p> | |
522 * | |
523 * @param regexp The compiled regular expression. | |
524 * @param startIndex The input string (native) index at which to begin matching, or | |
525 * -1 to match the Input Region | |
526 * @param status A reference to a UErrorCode to receive any errors. | |
527 * @return TRUE if there is a match. | |
528 * @stable ICU 4.6 | |
529 */ | |
530 U_STABLE UBool U_EXPORT2 | |
531 uregex_lookingAt64(URegularExpression *regexp, | |
532 int64_t startIndex, | |
533 UErrorCode *status); | |
534 | |
535 /** | |
536 * Find the first matching substring of the input string that matches the pattern. | |
537 * If startIndex is >= zero the search for a match begins at the specified index, | |
538 * and any match region is reset. This corresponds directly with | |
539 * Matcher.find(startIndex) in Java. | |
540 * | |
541 * If startIndex == -1 the search begins at the start of the input region, | |
542 * or at the start of the full string if no region has been specified. | |
543 * | |
544 * If a match is found, <code>uregex_start(), uregex_end()</code>, and | |
545 * <code>uregex_group()</code> will provide more information regarding the match. | |
546 * | |
547 * @param regexp The compiled regular expression. | |
548 * @param startIndex The position (native) in the input string to begin the search, or | |
549 * -1 to search within the Input Region. | |
550 * @param status A reference to a UErrorCode to receive any errors. | |
551 * @return TRUE if a match is found. | |
552 * @stable ICU 3.0 | |
553 */ | |
554 U_STABLE UBool U_EXPORT2 | |
555 uregex_find(URegularExpression *regexp, | |
556 int32_t startIndex, | |
557 UErrorCode *status); | |
558 | |
559 /** | |
560 * 64bit version of uregex_find. | |
561 * Find the first matching substring of the input string that matches the pattern. | |
562 * If startIndex is >= zero the search for a match begins at the specified index, | |
563 * and any match region is reset. This corresponds directly with | |
564 * Matcher.find(startIndex) in Java. | |
565 * | |
566 * If startIndex == -1 the search begins at the start of the input region, | |
567 * or at the start of the full string if no region has been specified. | |
568 * | |
569 * If a match is found, <code>uregex_start(), uregex_end()</code>, and | |
570 * <code>uregex_group()</code> will provide more information regarding the match. | |
571 * | |
572 * @param regexp The compiled regular expression. | |
573 * @param startIndex The position (native) in the input string to begin the search, or | |
574 * -1 to search within the Input Region. | |
575 * @param status A reference to a UErrorCode to receive any errors. | |
576 * @return TRUE if a match is found. | |
577 * @stable ICU 4.6 | |
578 */ | |
579 U_STABLE UBool U_EXPORT2 | |
580 uregex_find64(URegularExpression *regexp, | |
581 int64_t startIndex, | |
582 UErrorCode *status); | |
583 | |
584 /** | |
585 * Find the next pattern match in the input string. Begin searching | |
586 * the input at the location following the end of the previous match, | |
587 * or at the start of the string (or region) if there is no | |
588 * previous match. If a match is found, <code>uregex_start(), uregex_end()</code>, and | |
589 * <code>uregex_group()</code> will provide more information regarding the match. | |
590 * | |
591 * @param regexp The compiled regular expression. | |
592 * @param status A reference to a UErrorCode to receive any errors. | |
593 * @return TRUE if a match is found. | |
594 * @see uregex_reset | |
595 * @stable ICU 3.0 | |
596 */ | |
597 U_STABLE UBool U_EXPORT2 | |
598 uregex_findNext(URegularExpression *regexp, | |
599 UErrorCode *status); | |
600 | |
601 /** | |
602 * Get the number of capturing groups in this regular expression's pattern. | |
603 * @param regexp The compiled regular expression. | |
604 * @param status A reference to a UErrorCode to receive any errors. | |
605 * @return the number of capture groups | |
606 * @stable ICU 3.0 | |
607 */ | |
608 U_STABLE int32_t U_EXPORT2 | |
609 uregex_groupCount(URegularExpression *regexp, | |
610 UErrorCode *status); | |
611 | |
612 /** | |
613 * Get the group number corresponding to a named capture group. | |
614 * The returned number can be used with any function that accesses | |
615 * capture groups by number. | |
616 * | |
617 * The function returns an error status if the specified name does not | |
618 * appear in the pattern. | |
619 * | |
620 * @param regexp The compiled regular expression. | |
621 * @param groupName The capture group name. | |
622 * @param nameLength The length of the name, or -1 if the name is a | |
623 * nul-terminated string. | |
624 * @param status A pointer to a UErrorCode to receive any errors. | |
625 * | |
626 * @stable ICU 55 | |
627 */ | |
628 U_STABLE int32_t U_EXPORT2 | |
629 uregex_groupNumberFromName(URegularExpression *regexp, | |
630 const UChar *groupName, | |
631 int32_t nameLength, | |
632 UErrorCode *status); | |
633 | |
634 | |
635 /** | |
636 * Get the group number corresponding to a named capture group. | |
637 * The returned number can be used with any function that accesses | |
638 * capture groups by number. | |
639 * | |
640 * The function returns an error status if the specified name does not | |
641 * appear in the pattern. | |
642 * | |
643 * @param regexp The compiled regular expression. | |
644 * @param groupName The capture group name, | |
645 * platform invariant characters only. | |
646 * @param nameLength The length of the name, or -1 if the name is | |
647 * nul-terminated. | |
648 * @param status A pointer to a UErrorCode to receive any errors. | |
649 * | |
650 * @stable ICU 55 | |
651 */ | |
652 U_STABLE int32_t U_EXPORT2 | |
653 uregex_groupNumberFromCName(URegularExpression *regexp, | |
654 const char *groupName, | |
655 int32_t nameLength, | |
656 UErrorCode *status); | |
657 | |
658 /** Extract the string for the specified matching expression or subexpression. | |
659 * Group #0 is the complete string of matched text. | |
660 * Group #1 is the text matched by the first set of capturing parentheses. | |
661 * | |
662 * @param regexp The compiled regular expression. | |
663 * @param groupNum The capture group to extract. Group 0 is the complete | |
664 * match. The value of this parameter must be | |
665 * less than or equal to the number of capture groups in | |
666 * the pattern. | |
667 * @param dest Buffer to receive the matching string data | |
668 * @param destCapacity Capacity of the dest buffer. | |
669 * @param status A reference to a UErrorCode to receive any errors. | |
670 * @return Length of matching data, | |
671 * or -1 if no applicable match. | |
672 * @stable ICU 3.0 | |
673 */ | |
674 U_STABLE int32_t U_EXPORT2 | |
675 uregex_group(URegularExpression *regexp, | |
676 int32_t groupNum, | |
677 UChar *dest, | |
678 int32_t destCapacity, | |
679 UErrorCode *status); | |
680 | |
681 /** Returns a shallow immutable clone of the entire input string with the current index set | |
682 * to the beginning of the requested capture group. The capture group length is also | |
683 * returned via groupLength. | |
684 * Group #0 is the complete string of matched text. | |
685 * Group #1 is the text matched by the first set of capturing parentheses. | |
686 * | |
687 * @param regexp The compiled regular expression. | |
688 * @param groupNum The capture group to extract. Group 0 is the complete | |
689 * match. The value of this parameter must be | |
690 * less than or equal to the number of capture groups in | |
691 * the pattern. | |
692 * @param dest A mutable UText in which to store the current input. | |
693 * If NULL, a new UText will be created as an immutable shallow clone | |
694 * of the entire input string. | |
695 * @param groupLength The group length of the desired capture group. Output parameter. | |
696 * @param status A reference to a UErrorCode to receive any errors. | |
697 * @return The subject text currently associated with this regular expression. | |
698 * If a pre-allocated UText was provided, it will always be used and returned. | |
699 | |
700 * | |
701 * @stable ICU 4.6 | |
702 */ | |
703 U_STABLE UText * U_EXPORT2 | |
704 uregex_groupUText(URegularExpression *regexp, | |
705 int32_t groupNum, | |
706 UText *dest, | |
707 int64_t *groupLength, | |
708 UErrorCode *status); | |
709 | |
710 /** | |
711 * Returns the index in the input string of the start of the text matched by the | |
712 * specified capture group during the previous match operation. Return -1 if | |
713 * the capture group was not part of the last match. | |
714 * Group #0 refers to the complete range of matched text. | |
715 * Group #1 refers to the text matched by the first set of capturing parentheses. | |
716 * | |
717 * @param regexp The compiled regular expression. | |
718 * @param groupNum The capture group number | |
719 * @param status A reference to a UErrorCode to receive any errors. | |
720 * @return the starting (native) position in the input of the text matched | |
721 * by the specified group. | |
722 * @stable ICU 3.0 | |
723 */ | |
724 U_STABLE int32_t U_EXPORT2 | |
725 uregex_start(URegularExpression *regexp, | |
726 int32_t groupNum, | |
727 UErrorCode *status); | |
728 | |
729 /** | |
730 * 64bit version of uregex_start. | |
731 * Returns the index in the input string of the start of the text matched by the | |
732 * specified capture group during the previous match operation. Return -1 if | |
733 * the capture group was not part of the last match. | |
734 * Group #0 refers to the complete range of matched text. | |
735 * Group #1 refers to the text matched by the first set of capturing parentheses. | |
736 * | |
737 * @param regexp The compiled regular expression. | |
738 * @param groupNum The capture group number | |
739 * @param status A reference to a UErrorCode to receive any errors. | |
740 * @return the starting (native) position in the input of the text matched | |
741 * by the specified group. | |
742 * @stable ICU 4.6 | |
743 */ | |
744 U_STABLE int64_t U_EXPORT2 | |
745 uregex_start64(URegularExpression *regexp, | |
746 int32_t groupNum, | |
747 UErrorCode *status); | |
748 | |
749 /** | |
750 * Returns the index in the input string of the position following the end | |
751 * of the text matched by the specified capture group. | |
752 * Return -1 if the capture group was not part of the last match. | |
753 * Group #0 refers to the complete range of matched text. | |
754 * Group #1 refers to the text matched by the first set of capturing parentheses. | |
755 * | |
756 * @param regexp The compiled regular expression. | |
757 * @param groupNum The capture group number | |
758 * @param status A reference to a UErrorCode to receive any errors. | |
759 * @return the (native) index of the position following the last matched character. | |
760 * @stable ICU 3.0 | |
761 */ | |
762 U_STABLE int32_t U_EXPORT2 | |
763 uregex_end(URegularExpression *regexp, | |
764 int32_t groupNum, | |
765 UErrorCode *status); | |
766 | |
767 /** | |
768 * 64bit version of uregex_end. | |
769 * Returns the index in the input string of the position following the end | |
770 * of the text matched by the specified capture group. | |
771 * Return -1 if the capture group was not part of the last match. | |
772 * Group #0 refers to the complete range of matched text. | |
773 * Group #1 refers to the text matched by the first set of capturing parentheses. | |
774 * | |
775 * @param regexp The compiled regular expression. | |
776 * @param groupNum The capture group number | |
777 * @param status A reference to a UErrorCode to receive any errors. | |
778 * @return the (native) index of the position following the last matched character. | |
779 * @stable ICU 4.6 | |
780 */ | |
781 U_STABLE int64_t U_EXPORT2 | |
782 uregex_end64(URegularExpression *regexp, | |
783 int32_t groupNum, | |
784 UErrorCode *status); | |
785 | |
786 /** | |
787 * Reset any saved state from the previous match. Has the effect of | |
788 * causing uregex_findNext to begin at the specified index, and causing | |
789 * uregex_start(), uregex_end() and uregex_group() to return an error | |
790 * indicating that there is no match information available. Clears any | |
791 * match region that may have been set. | |
792 * | |
793 * @param regexp The compiled regular expression. | |
794 * @param index The position (native) in the text at which a | |
795 * uregex_findNext() should begin searching. | |
796 * @param status A reference to a UErrorCode to receive any errors. | |
797 * @stable ICU 3.0 | |
798 */ | |
799 U_STABLE void U_EXPORT2 | |
800 uregex_reset(URegularExpression *regexp, | |
801 int32_t index, | |
802 UErrorCode *status); | |
803 | |
804 /** | |
805 * 64bit version of uregex_reset. | |
806 * Reset any saved state from the previous match. Has the effect of | |
807 * causing uregex_findNext to begin at the specified index, and causing | |
808 * uregex_start(), uregex_end() and uregex_group() to return an error | |
809 * indicating that there is no match information available. Clears any | |
810 * match region that may have been set. | |
811 * | |
812 * @param regexp The compiled regular expression. | |
813 * @param index The position (native) in the text at which a | |
814 * uregex_findNext() should begin searching. | |
815 * @param status A reference to a UErrorCode to receive any errors. | |
816 * @stable ICU 4.6 | |
817 */ | |
818 U_STABLE void U_EXPORT2 | |
819 uregex_reset64(URegularExpression *regexp, | |
820 int64_t index, | |
821 UErrorCode *status); | |
822 | |
823 /** | |
824 * Sets the limits of the matching region for this URegularExpression. | |
825 * The region is the part of the input string that will be considered when matching. | |
826 * Invoking this method resets any saved state from the previous match, | |
827 * then sets the region to start at the index specified by the start parameter | |
828 * and end at the index specified by the end parameter. | |
829 * | |
830 * Depending on the transparency and anchoring being used (see useTransparentBounds | |
831 * and useAnchoringBounds), certain constructs such as anchors may behave differently | |
832 * at or around the boundaries of the region. | |
833 * | |
834 * The function will fail if start is greater than limit, or if either index | |
835 * is less than zero or greater than the length of the string being matched. | |
836 * | |
837 * @param regexp The compiled regular expression. | |
838 * @param regionStart The (native) index to begin searches at. | |
839 * @param regionLimit The (native) index to end searches at (exclusive). | |
840 * @param status A pointer to a UErrorCode to receive any errors. | |
841 * @stable ICU 4.0 | |
842 */ | |
843 U_STABLE void U_EXPORT2 | |
844 uregex_setRegion(URegularExpression *regexp, | |
845 int32_t regionStart, | |
846 int32_t regionLimit, | |
847 UErrorCode *status); | |
848 | |
849 /** | |
850 * 64bit version of uregex_setRegion. | |
851 * Sets the limits of the matching region for this URegularExpression. | |
852 * The region is the part of the input string that will be considered when matching. | |
853 * Invoking this method resets any saved state from the previous match, | |
854 * then sets the region to start at the index specified by the start parameter | |
855 * and end at the index specified by the end parameter. | |
856 * | |
857 * Depending on the transparency and anchoring being used (see useTransparentBounds | |
858 * and useAnchoringBounds), certain constructs such as anchors may behave differently | |
859 * at or around the boundaries of the region. | |
860 * | |
861 * The function will fail if start is greater than limit, or if either index | |
862 * is less than zero or greater than the length of the string being matched. | |
863 * | |
864 * @param regexp The compiled regular expression. | |
865 * @param regionStart The (native) index to begin searches at. | |
866 * @param regionLimit The (native) index to end searches at (exclusive). | |
867 * @param status A pointer to a UErrorCode to receive any errors. | |
868 * @stable ICU 4.6 | |
869 */ | |
870 U_STABLE void U_EXPORT2 | |
871 uregex_setRegion64(URegularExpression *regexp, | |
872 int64_t regionStart, | |
873 int64_t regionLimit, | |
874 UErrorCode *status); | |
875 | |
876 /** | |
877 * Set the matching region and the starting index for subsequent matches | |
878 * in a single operation. | |
879 * This is useful because the usual function for setting the starting | |
880 * index, uregex_reset(), also resets any region limits. | |
881 * | |
882 * @param regexp The compiled regular expression. | |
883 * @param regionStart The (native) index to begin searches at. | |
884 * @param regionLimit The (native) index to end searches at (exclusive). | |
885 * @param startIndex The index in the input text at which the next | |
886 * match operation should begin. | |
887 * @param status A pointer to a UErrorCode to receive any errors. | |
888 * @stable ICU 4.6 | |
889 */ | |
890 U_STABLE void U_EXPORT2 | |
891 uregex_setRegionAndStart(URegularExpression *regexp, | |
892 int64_t regionStart, | |
893 int64_t regionLimit, | |
894 int64_t startIndex, | |
895 UErrorCode *status); | |
896 | |
897 /** | |
898 * Reports the start index of the matching region. Any matches found are limited to | |
899 * the region bounded by regionStart (inclusive) and regionEnd (exclusive). | |
900 * | |
901 * @param regexp The compiled regular expression. | |
902 * @param status A pointer to a UErrorCode to receive any errors. | |
903 * @return The starting (native) index of this matcher's region. | |
904 * @stable ICU 4.0 | |
905 */ | |
906 U_STABLE int32_t U_EXPORT2 | |
907 uregex_regionStart(const URegularExpression *regexp, | |
908 UErrorCode *status); | |
909 | |
910 /** | |
911 * 64bit version of uregex_regionStart. | |
912 * Reports the start index of the matching region. Any matches found are limited to | |
913 * the region bounded by regionStart (inclusive) and regionEnd (exclusive). | |
914 * | |
915 * @param regexp The compiled regular expression. | |
916 * @param status A pointer to a UErrorCode to receive any errors. | |
917 * @return The starting (native) index of this matcher's region. | |
918 * @stable ICU 4.6 | |
919 */ | |
920 U_STABLE int64_t U_EXPORT2 | |
921 uregex_regionStart64(const URegularExpression *regexp, | |
922 UErrorCode *status); | |
923 | |
924 /** | |
925 * Reports the end index (exclusive) of the matching region for this URegularExpression. | |
926 * Any matches found are limited to the region bounded by regionStart (inclusive) | |
927 * and regionEnd (exclusive). | |
928 * | |
929 * @param regexp The compiled regular expression. | |
930 * @param status A pointer to a UErrorCode to receive any errors. | |
931 * @return The ending point (native) of this matcher's region. | |
932 * @stable ICU 4.0 | |
933 */ | |
934 U_STABLE int32_t U_EXPORT2 | |
935 uregex_regionEnd(const URegularExpression *regexp, | |
936 UErrorCode *status); | |
937 | |
938 /** | |
939 * 64bit version of uregex_regionEnd. | |
940 * Reports the end index (exclusive) of the matching region for this URegularExpression. | |
941 * Any matches found are limited to the region bounded by regionStart (inclusive) | |
942 * and regionEnd (exclusive). | |
943 * | |
944 * @param regexp The compiled regular expression. | |
945 * @param status A pointer to a UErrorCode to receive any errors. | |
946 * @return The ending point (native) of this matcher's region. | |
947 * @stable ICU 4.6 | |
948 */ | |
949 U_STABLE int64_t U_EXPORT2 | |
950 uregex_regionEnd64(const URegularExpression *regexp, | |
951 UErrorCode *status); | |
952 | |
953 /** | |
954 * Queries the transparency of region bounds for this URegularExpression. | |
955 * See useTransparentBounds for a description of transparent and opaque bounds. | |
956 * By default, matching boundaries are opaque. | |
957 * | |
958 * @param regexp The compiled regular expression. | |
959 * @param status A pointer to a UErrorCode to receive any errors. | |
960 * @return TRUE if this matcher is using transparent bounds, FALSE if it is not. | |
961 * @stable ICU 4.0 | |
962 */ | |
963 U_STABLE UBool U_EXPORT2 | |
964 uregex_hasTransparentBounds(const URegularExpression *regexp, | |
965 UErrorCode *status); | |
966 | |
967 | |
968 /** | |
969 * Sets the transparency of region bounds for this URegularExpression. | |
970 * Invoking this function with an argument of TRUE will set matches to use transparent bounds. | |
971 * If the boolean argument is FALSE, then opaque bounds will be used. | |
972 * | |
973 * Using transparent bounds, the boundaries of the matching region are transparent | |
974 * to lookahead, lookbehind, and boundary matching constructs. Those constructs can | |
975 * see text beyond the boundaries of the region while checking for a match. | |
976 * | |
977 * With opaque bounds, no text outside of the matching region is visible to lookahead, | |
978 * lookbehind, and boundary matching constructs. | |
979 * | |
980 * By default, opaque bounds are used. | |
981 * | |
982 * @param regexp The compiled regular expression. | |
983 * @param b TRUE for transparent bounds; FALSE for opaque bounds | |
984 * @param status A pointer to a UErrorCode to receive any errors. | |
985 * @stable ICU 4.0 | |
986 **/ | |
987 U_STABLE void U_EXPORT2 | |
988 uregex_useTransparentBounds(URegularExpression *regexp, | |
989 UBool b, | |
990 UErrorCode *status); | |
991 | |
992 | |
993 /** | |
994 * Return true if this URegularExpression is using anchoring bounds. | |
995 * By default, anchoring region bounds are used. | |
996 * | |
997 * @param regexp The compiled regular expression. | |
998 * @param status A pointer to a UErrorCode to receive any errors. | |
999 * @return TRUE if this matcher is using anchoring bounds. | |
1000 * @stable ICU 4.0 | |
1001 */ | |
1002 U_STABLE UBool U_EXPORT2 | |
1003 uregex_hasAnchoringBounds(const URegularExpression *regexp, | |
1004 UErrorCode *status); | |
1005 | |
1006 | |
1007 /** | |
1008 * Set whether this URegularExpression is using Anchoring Bounds for its region. | |
1009 * With anchoring bounds, pattern anchors such as ^ and $ will match at the start | |
1010 * and end of the region. Without Anchoring Bounds, anchors will only match at | |
1011 * the positions they would in the complete text. | |
1012 * | |
1013 * Anchoring Bounds are the default for regions. | |
1014 * | |
1015 * @param regexp The compiled regular expression. | |
1016 * @param b TRUE to enable anchoring bounds; FALSE to disable them. | |
1017 * @param status A pointer to a UErrorCode to receive any errors. | |
1018 * @stable ICU 4.0 | |
1019 */ | |
1020 U_STABLE void U_EXPORT2 | |
1021 uregex_useAnchoringBounds(URegularExpression *regexp, | |
1022 UBool b, | |
1023 UErrorCode *status); | |
1024 | |
1025 /** | |
1026 * Return TRUE if the most recent matching operation touched the | |
1027 * end of the text being processed. In this case, additional input text could | |
1028 * change the results of that match. | |
1029 * | |
1030 * @param regexp The compiled regular expression. | |
1031 * @param status A pointer to a UErrorCode to receive any errors. | |
1032 * @return TRUE if the most recent match hit the end of input | |
1033 * @stable ICU 4.0 | |
1034 */ | |
1035 U_STABLE UBool U_EXPORT2 | |
1036 uregex_hitEnd(const URegularExpression *regexp, | |
1037 UErrorCode *status); | |
1038 | |
1039 /** | |
1040 * Return TRUE if the most recent match succeeded and additional input could cause | |
1041 * it to fail. If this function returns false and a match was found, then more input | |
1042 * might change the match but the match won't be lost. If a match was not found, | |
1043 * then requireEnd has no meaning. | |
1044 * | |
1045 * @param regexp The compiled regular expression. | |
1046 * @param status A pointer to a UErrorCode to receive any errors. | |
1047 * @return TRUE if more input could cause the most recent match to no longer match. | |
1048 * @stable ICU 4.0 | |
1049 */ | |
1050 U_STABLE UBool U_EXPORT2 | |
1051 uregex_requireEnd(const URegularExpression *regexp, | |
1052 UErrorCode *status); | |
1053 | |
1054 | |
1055 | |
1056 | |
1057 | |
1058 /** | |
1059 * Replaces every substring of the input that matches the pattern | |
1060 * with the given replacement string. This is a convenience function that | |
1061 * provides a complete find-and-replace-all operation. | |
1062 * | |
1063 * This method scans the input string looking for matches of the pattern. | |
1064 * Input that is not part of any match is copied unchanged to the | |
1065 * destination buffer. Matched regions are replaced in the output | |
1066 * buffer by the replacement string. The replacement string may contain | |
1067 * references to capture groups; these take the form of $1, $2, etc. | |
1068 * | |
1069 * @param regexp The compiled regular expression. | |
1070 * @param replacementText A string containing the replacement text. | |
1071 * @param replacementLength The length of the replacement string, or | |
1072 * -1 if it is NUL terminated. | |
1073 * @param destBuf A (UChar *) buffer that will receive the result. | |
1074 * @param destCapacity The capacity of the destination buffer. | |
1075 * @param status A reference to a UErrorCode to receive any errors. | |
1076 * @return The length of the string resulting from the find | |
1077 * and replace operation. In the event that the | |
1078 * destination capacity is inadequate, the return value | |
1079 * is still the full length of the untruncated string. | |
1080 * @stable ICU 3.0 | |
1081 */ | |
1082 U_STABLE int32_t U_EXPORT2 | |
1083 uregex_replaceAll(URegularExpression *regexp, | |
1084 const UChar *replacementText, | |
1085 int32_t replacementLength, | |
1086 UChar *destBuf, | |
1087 int32_t destCapacity, | |
1088 UErrorCode *status); | |
1089 | |
1090 /** | |
1091 * Replaces every substring of the input that matches the pattern | |
1092 * with the given replacement string. This is a convenience function that | |
1093 * provides a complete find-and-replace-all operation. | |
1094 * | |
1095 * This method scans the input string looking for matches of the pattern. | |
1096 * Input that is not part of any match is copied unchanged to the | |
1097 * destination buffer. Matched regions are replaced in the output | |
1098 * buffer by the replacement string. The replacement string may contain | |
1099 * references to capture groups; these take the form of $1, $2, etc. | |
1100 * | |
1101 * @param regexp The compiled regular expression. | |
1102 * @param replacement A string containing the replacement text. | |
1103 * @param dest A mutable UText that will receive the result. | |
1104 * If NULL, a new UText will be created (which may not be mutable). | |
1105 * @param status A reference to a UErrorCode to receive any errors. | |
1106 * @return A UText containing the results of the find and replace. | |
1107 * If a pre-allocated UText was provided, it will always be used and returned. | |
1108 * | |
1109 * @stable ICU 4.6 | |
1110 */ | |
1111 U_STABLE UText * U_EXPORT2 | |
1112 uregex_replaceAllUText(URegularExpression *regexp, | |
1113 UText *replacement, | |
1114 UText *dest, | |
1115 UErrorCode *status); | |
1116 | |
1117 /** | |
1118 * Replaces the first substring of the input that matches the pattern | |
1119 * with the given replacement string. This is a convenience function that | |
1120 * provides a complete find-and-replace operation. | |
1121 * | |
1122 * This method scans the input string looking for a match of the pattern. | |
1123 * All input that is not part of the match is copied unchanged to the | |
1124 * destination buffer. The matched region is replaced in the output | |
1125 * buffer by the replacement string. The replacement string may contain | |
1126 * references to capture groups; these take the form of $1, $2, etc. | |
1127 * | |
1128 * @param regexp The compiled regular expression. | |
1129 * @param replacementText A string containing the replacement text. | |
1130 * @param replacementLength The length of the replacement string, or | |
1131 * -1 if it is NUL terminated. | |
1132 * @param destBuf A (UChar *) buffer that will receive the result. | |
1133 * @param destCapacity The capacity of the destination buffer. | |
1134 * @param status a reference to a UErrorCode to receive any errors. | |
1135 * @return The length of the string resulting from the find | |
1136 * and replace operation. In the event that the | |
1137 * destination capacity is inadequate, the return value | |
1138 * is still the full length of the untruncated string. | |
1139 * @stable ICU 3.0 | |
1140 */ | |
1141 U_STABLE int32_t U_EXPORT2 | |
1142 uregex_replaceFirst(URegularExpression *regexp, | |
1143 const UChar *replacementText, | |
1144 int32_t replacementLength, | |
1145 UChar *destBuf, | |
1146 int32_t destCapacity, | |
1147 UErrorCode *status); | |
1148 | |
1149 /** | |
1150 * Replaces the first substring of the input that matches the pattern | |
1151 * with the given replacement string. This is a convenience function that | |
1152 * provides a complete find-and-replace operation. | |
1153 * | |
1154 * This method scans the input string looking for a match of the pattern. | |
1155 * All input that is not part of the match is copied unchanged to the | |
1156 * destination buffer. The matched region is replaced in the output | |
1157 * buffer by the replacement string. The replacement string may contain | |
1158 * references to capture groups; these take the form of $1, $2, etc. | |
1159 * | |
1160 * @param regexp The compiled regular expression. | |
1161 * @param replacement A string containing the replacement text. | |
1162 * @param dest A mutable UText that will receive the result. | |
1163 * If NULL, a new UText will be created (which may not be mutable). | |
1164 * @param status A reference to a UErrorCode to receive any errors. | |
1165 * @return A UText containing the results of the find and replace. | |
1166 * If a pre-allocated UText was provided, it will always be used and returned. | |
1167 * | |
1168 * @stable ICU 4.6 | |
1169 */ | |
1170 U_STABLE UText * U_EXPORT2 | |
1171 uregex_replaceFirstUText(URegularExpression *regexp, | |
1172 UText *replacement, | |
1173 UText *dest, | |
1174 UErrorCode *status); | |
1175 | |
1176 /** | |
1177 * Implements a replace operation intended to be used as part of an | |
1178 * incremental find-and-replace. | |
1179 * | |
1180 * <p>The input string, starting from the end of the previous match and ending at | |
1181 * the start of the current match, is appended to the destination string. Then the | |
1182 * replacement string is appended to the output string, | |
1183 * including handling any substitutions of captured text.</p> | |
1184 * | |
1185 * <p>A note on preflight computation of buffer size and error handling: | |
1186 * Calls to uregex_appendReplacement() and uregex_appendTail() are | |
1187 * designed to be chained, one after another, with the destination | |
1188 * buffer pointer and buffer capacity updated after each in preparation | |
1189 * for the next. If the destination buffer is exhausted partway through such a | |
1190 * sequence, a U_BUFFER_OVERFLOW_ERROR status will be returned. Normal | |
1191 * ICU conventions are for a function to perform no action if it is | |
1192 * called with an error status, but for this one case, uregex_appendReplacement() | |
1193 * will operate normally so that buffer size computations will complete | |
1194 * correctly.</p> | |
1195 * | |
1196 * <p>For simple, prepackaged, non-incremental find-and-replace | |
1197 * operations, see replaceFirst() or replaceAll().</p> | |
1198 * | |
1199 * @param regexp The regular expression object. | |
1200 * @param replacementText The string that will replace the matched portion of the | |
1201 * input string as it is copied to the destination buffer. | |
1202 * The replacement text may contain references ($1, for | |
1203 * example) to capture groups from the match. | |
1204 * @param replacementLength The length of the replacement text string, | |
1205 * or -1 if the string is NUL terminated. | |
1206 * @param destBuf The buffer into which the results of the | |
1207 * find-and-replace are placed. On return, this pointer | |
1208 * will be updated to refer to the beginning of the | |
1209 * unused portion of buffer, leaving it in position for | |
1210 * a subsequent call to this function. | |
1211 * @param destCapacity The size of the output buffer. On return, this | |
1212 * parameter will be updated to reflect the space remaining | |
1213 * unused in the output buffer. | |
1214 * @param status A reference to a UErrorCode to receive any errors. | |
1215 * @return The length of the result string. In the event that | |
1216 * destCapacity is inadequate, the full length of the | |
1217 * untruncated output string is returned. | |
1218 * | |
1219 * @stable ICU 3.0 | |
1220 * | |
1221 */ | |
1222 U_STABLE int32_t U_EXPORT2 | |
1223 uregex_appendReplacement(URegularExpression *regexp, | |
1224 const UChar *replacementText, | |
1225 int32_t replacementLength, | |
1226 UChar **destBuf, | |
1227 int32_t *destCapacity, | |
1228 UErrorCode *status); | |
1229 | |
1230 /** | |
1231 * Implements a replace operation intended to be used as part of an | |
1232 * incremental find-and-replace. | |
1233 * | |
1234 * <p>The input string, starting from the end of the previous match and ending at | |
1235 * the start of the current match, is appended to the destination string. Then the | |
1236 * replacement string is appended to the output string, | |
1237 * including handling any substitutions of captured text.</p> | |
1238 * | |
1239 * <p>For simple, prepackaged, non-incremental find-and-replace | |
1240 * operations, see replaceFirst() or replaceAll().</p> | |
1241 * | |
1242 * @param regexp The regular expression object. | |
1243 * @param replacementText The string that will replace the matched portion of the | |
1244 * input string as it is copied to the destination buffer. | |
1245 * The replacement text may contain references ($1, for | |
1246 * example) to capture groups from the match. | |
1247 * @param dest A mutable UText that will receive the result. Must not be NULL. | |
1248 * @param status A reference to a UErrorCode to receive any errors. | |
1249 * | |
1250 * @stable ICU 4.6 | |
1251 */ | |
1252 U_STABLE void U_EXPORT2 | |
1253 uregex_appendReplacementUText(URegularExpression *regexp, | |
1254 UText *replacementText, | |
1255 UText *dest, | |
1256 UErrorCode *status); | |
1257 | |
1258 /** | |
1259 * As the final step in a find-and-replace operation, append the remainder | |
1260 * of the input string, starting at the position following the last match, | |
1261 * to the destination string. <code>uregex_appendTail()</code> is intended | |
1262 * to be invoked after one or more invocations of the | |
1263 * <code>uregex_appendReplacement()</code> function. | |
1264 * | |
1265 * @param regexp The regular expression object. This is needed to | |
1266 * obtain the input string along with the position | |
1267 * of the last match within it. | |
1268 * @param destBuf The buffer in which the results of the | |
1269 * find-and-replace are placed. On return, the pointer | |
1270 * will be updated to refer to the beginning of the | |
1271 * unused portion of buffer. | |
1272 * @param destCapacity The size of the output buffer. On return, this | |
1273 * value will be updated to reflect the space remaining | |
1274 * unused in the output buffer. | |
1275 * @param status A reference to a UErrorCode to receive any errors. | |
1276 * @return The length of the result string. In the event that | |
1277 * destCapacity is inadequate, the full length of the | |
1278 * untruncated output string is returned. | |
1279 * | |
1280 * @stable ICU 3.0 | |
1281 */ | |
1282 U_STABLE int32_t U_EXPORT2 | |
1283 uregex_appendTail(URegularExpression *regexp, | |
1284 UChar **destBuf, | |
1285 int32_t *destCapacity, | |
1286 UErrorCode *status); | |
1287 | |
1288 /** | |
1289 * As the final step in a find-and-replace operation, append the remainder | |
1290 * of the input string, starting at the position following the last match, | |
1291 * to the destination string. <code>uregex_appendTailUText()</code> is intended | |
1292 * to be invoked after one or more invocations of the | |
1293 * <code>uregex_appendReplacementUText()</code> function. | |
1294 * | |
1295 * @param regexp The regular expression object. This is needed to | |
1296 * obtain the input string along with the position | |
1297 * of the last match within it. | |
1298 * @param dest A mutable UText that will receive the result. Must not be NULL. | |
1299 * | |
1300 * @param status Error code | |
1301 * | |
1302 * @return The destination UText. | |
1303 * | |
1304 * @stable ICU 4.6 | |
1305 */ | |
1306 U_STABLE UText * U_EXPORT2 | |
1307 uregex_appendTailUText(URegularExpression *regexp, | |
1308 UText *dest, | |
1309 UErrorCode *status); | |
1310 | |
1311 /** | |
1312 * Split a string into fields. Somewhat like split() from Perl. | |
1313 * The pattern matches identify delimiters that separate the input | |
1314 * into fields. The input data between the matches becomes the | |
1315 * fields themselves. | |
1316 * | |
1317 * Each of the fields is copied from the input string to the destination | |
1318 * buffer, and NUL terminated. The position of each field within | |
1319 * the destination buffer is returned in the destFields array. | |
1320 * | |
1321 * If the delimiter pattern includes capture groups, the captured text will | |
1322 * also appear in the destination array of output strings, interspersed | |
1323 * with the fields. This is similar to Perl, but differs from Java, | |
1324 * which ignores the presence of capture groups in the pattern. | |
1325 * | |
1326 * Trailing empty fields will always be returned, assuming sufficient | |
1327 * destination capacity. This differs from the default behavior for Java | |
1328 * and Perl where trailing empty fields are not returned. | |
1329 * | |
1330 * The number of strings produced by the split operation is returned. | |
1331 * This count includes the strings from capture groups in the delimiter pattern. | |
1332 * This behavior differs from Java, which ignores capture groups. | |
1333 * | |
1334 * @param regexp The compiled regular expression. | |
1335 * @param destBuf A (UChar *) buffer to receive the fields that | |
1336 * are extracted from the input string. These | |
1337 * field pointers will refer to positions within the | |
1338 * destination buffer supplied by the caller. Any | |
1339 * extra positions within the destFields array will be | |
1340 * set to NULL. | |
1341 * @param destCapacity The capacity of the destBuf. | |
1342 * @param requiredCapacity The actual capacity required of the destBuf. | |
1343 * If destCapacity is too small, requiredCapacity will return | |
1344 * the total capacity required to hold all of the output, and | |
1345 * a U_BUFFER_OVERFLOW_ERROR will be returned. | |
1346 * @param destFields An array to be filled with the position of each | |
1347 * of the extracted fields within destBuf. | |
1348 * @param destFieldsCapacity The number of elements in the destFields array. | |
1349 * If the number of fields found is less than destFieldsCapacity, | |
1350 * the extra destFields elements are set to zero. | |
1351 * If destFieldsCapacity is too small, the trailing part of the | |
1352 * input, including any field delimiters, is treated as if it | |
1353 * were the last field - it is copied to the destBuf, and | |
1354 * its position in the destBuf is stored in the last element | |
1355 * of destFields. This behavior mimics that of Perl. It is not | |
1356 * an error condition, and no error status is returned when all destField | |
1357 * positions are used. | |
1358 * @param status A reference to a UErrorCode to receive any errors. | |
1359 * @return The number of fields into which the input string was split. | |
1360 * @stable ICU 3.0 | |
1361 */ | |
1362 U_STABLE int32_t U_EXPORT2 | |
1363 uregex_split( URegularExpression *regexp, | |
1364 UChar *destBuf, | |
1365 int32_t destCapacity, | |
1366 int32_t *requiredCapacity, | |
1367 UChar *destFields[], | |
1368 int32_t destFieldsCapacity, | |
1369 UErrorCode *status); | |
1370 | |
1371 /** | |
1372 * Split a string into fields. Somewhat like split() from Perl. | |
1373 * The pattern matches identify delimiters that separate the input | |
1374 * into fields. The input data between the matches becomes the | |
1375 * fields themselves. | |
1376 * <p> | |
1377 * The behavior of this function is not very closely aligned with uregex_split(); | |
1378 * instead, it is based on (and implemented directly on top of) the C++ split method. | |
1379 * | |
1380 * @param regexp The compiled regular expression. | |
1381 * @param destFields An array of pointers to mutable UText structs to receive the results of the split. | |
1382 * If a field is NULL, a new UText is allocated to contain the results for | |
1383 * that field. This new UText is not guaranteed to be mutable. | |
1384 * @param destFieldsCapacity The number of elements in the destFields array. | |
1385 * If the number of fields found is less than destFieldsCapacity, the | |
1386 * extra strings in the destination array are not altered. | |
1387 * If the number of destination strings is less than the number | |
1388 * of fields, the trailing part of the input string, including any | |
1389 * field delimiters, is placed in the last destination string. | |
1390 * This behavior mimics that of Perl. It is not an error condition, and no | |
1391 * error status is returned when all destFields positions are used. | |
1392 * @param status A reference to a UErrorCode to receive any errors. | |
1393 * @return The number of fields into which the input string was split. | |
1394 * | |
1395 * @stable ICU 4.6 | |
1396 */ | |
1397 U_STABLE int32_t U_EXPORT2 | |
1398 uregex_splitUText(URegularExpression *regexp, | |
1399 UText *destFields[], | |
1400 int32_t destFieldsCapacity, | |
1401 UErrorCode *status); | |
1402 | |
1403 /** | |
1404 * Set a processing time limit for match operations with this URegularExpression. | |
1405 * | |
1406 * Some patterns, when matching certain strings, can run in exponential time. | |
1407 * For practical purposes, the match operation may appear to be in an | |
1408 * infinite loop. | |
1409 * When a limit is set, a match operation will fail with an error if the | |
1410 * limit is exceeded. | |
1411 * <p> | |
1412 * The units of the limit are steps of the match engine. | |
1413 * Correspondence with actual processor time will depend on the speed | |
1414 * of the processor and the details of the specific pattern, but will | |
1415 * typically be on the order of milliseconds. | |
1416 * <p> | |
1417 * By default, the matching time is not limited. | |
1418 * <p> | |
1419 * | |
1420 * @param regexp The compiled regular expression. | |
1421 * @param limit The limit value, or 0 for no limit. | |
1422 * @param status A reference to a UErrorCode to receive any errors. | |
1423 * @stable ICU 4.0 | |
1424 */ | |
1425 U_STABLE void U_EXPORT2 | |
1426 uregex_setTimeLimit(URegularExpression *regexp, | |
1427 int32_t limit, | |
1428 UErrorCode *status); | |
1429 | |
1430 /** | |
1431 * Get the time limit for matches with this URegularExpression. | |
1432 * A return value of zero indicates that there is no limit. | |
1433 * | |
1434 * @param regexp The compiled regular expression. | |
1435 * @param status A reference to a UErrorCode to receive any errors. | |
1436 * @return the maximum allowed time for a match, in units of processing steps. | |
1437 * @stable ICU 4.0 | |
1438 */ | |
1439 U_STABLE int32_t U_EXPORT2 | |
1440 uregex_getTimeLimit(const URegularExpression *regexp, | |
1441 UErrorCode *status); | |
1442 | |
1443 /** | |
1444 * Set the amount of heap storage available for use by the match backtracking stack. | |
1445 * <p> | |
1446 * ICU uses a backtracking regular expression engine, with the backtrack stack | |
1447 * maintained on the heap. This function sets the limit on the amount of memory | |
1448 * that can be used for this purpose. A backtracking stack overflow will | |
1449 * result in an error from the match operation that caused it. | |
1450 * <p> | |
1451 * A limit is desirable because a malicious or poorly designed pattern can use | |
1452 * excessive memory, potentially crashing the process. A limit is enabled | |
1453 * by default. | |
1454 * <p> | |
1455 * @param regexp The compiled regular expression. | |
1456 * @param limit The maximum size, in bytes, of the matching backtrack stack. | |
1457 * A value of zero means no limit. | |
1458 * The limit must be greater than or equal to zero. | |
1459 * @param status A reference to a UErrorCode to receive any errors. | |
1460 * | |
1461 * @stable ICU 4.0 | |
1462 */ | |
1463 U_STABLE void U_EXPORT2 | |
1464 uregex_setStackLimit(URegularExpression *regexp, | |
1465 int32_t limit, | |
1466 UErrorCode *status); | |
1467 | |
1468 /** | |
1469 * Get the size of the heap storage available for use by the backtracking stack. | |
1470 * @param regexp The compiled regular expression. | |
1471 * @param status A reference to a UErrorCode to receive any errors. | |
1472 * @return the maximum backtracking stack size, in bytes, or zero if the stack size is unlimited. | |
1473 * @stable ICU 4.0 | |
1474 */ | |
1475 U_STABLE int32_t U_EXPORT2 | |
1476 uregex_getStackLimit(const URegularExpression *regexp, | |
1477 UErrorCode *status); | |
1478 | |
1479 | |
1480 /** | |
1481 * Function type for a regular expression matching callback function. | |
1482 * When set, a callback function will be called periodically during matching | |
1483 * operations. If the callback function returns FALSE, the matching | |
1484 * operation will be terminated early. | |
1485 * | |
1486 * Note: the callback function must not call other functions on this | |
1487 * URegularExpression. | |
1488 * | |
1489 * @param context context pointer. The callback function will be invoked | |
1490 * with the context specified at the time that | |
1491 * uregex_setMatchCallback() is called. | |
1492 * @param steps the accumulated processing time, in match steps, | |
1493 * for this matching operation. | |
1494 * @return TRUE to continue the matching operation. | |
1495 * FALSE to terminate the matching operation. | |
1496 * @stable ICU 4.0 | |
1497 */ | |
1498 U_CDECL_BEGIN | |
1499 typedef UBool U_CALLCONV URegexMatchCallback ( | |
1500 const void *context, | |
1501 int32_t steps); | |
1502 U_CDECL_END | |
1503 | |
1504 /** | |
1505 * Set the match callback function for this URegularExpression. | |
1506 * During matching operations the function will be called periodically, | |
1507 * giving the application the opportunity to terminate a long-running | |
1508 * match. | |
1509 * | |
1510 * @param regexp The compiled regular expression. | |
1511 * @param callback A pointer to the user-supplied URegexMatchCallback function. | |
1512 * @param context User context pointer. The value supplied at the | |
1513 * time the callback function is set will be saved | |
1514 * and passed to the callback each time that it is called. | |
1515 * @param status A reference to a UErrorCode to receive any errors. | |
1516 * @stable ICU 4.0 | |
1517 */ | |
1518 U_STABLE void U_EXPORT2 | |
1519 uregex_setMatchCallback(URegularExpression *regexp, | |
1520 URegexMatchCallback *callback, | |
1521 const void *context, | |
1522 UErrorCode *status); | |
1523 | |
1524 | |
1525 /** | |
1526 * Get the match callback function for this URegularExpression. | |
1527 * | |
1528 * @param regexp The compiled regular expression. | |
1529 * @param callback Out parameter, receives a pointer to the user-supplied | |
1530 * URegexMatchCallback function. | |
1531 * @param context Out parameter, receives the user context pointer that | |
1532 * was set when uregex_setMatchCallback() was called. | |
1533 * @param status A reference to a UErrorCode to receive any errors. | |
1534 * @stable ICU 4.0 | |
1535 */ | |
1536 U_STABLE void U_EXPORT2 | |
1537 uregex_getMatchCallback(const URegularExpression *regexp, | |
1538 URegexMatchCallback **callback, | |
1539 const void **context, | |
1540 UErrorCode *status); | |
1541 | |
1542 /** | |
1543 * Function type for a regular expression find callback function. | |
1544 * | |
1545 * When set, a callback function will be called during a find operation | |
1546 * and for operations that depend on find, such as findNext, split and some replace | |
1547 * operations like replaceFirst. | |
1548 * The callback will usually be called after each attempt at a match, but this is not a | |
1549 * guarantee that the callback will be invoked at each character. For finds where the | |
1550 * match engine is invoked at each character, this may be close to true, but less likely | |
1551 * for more optimized loops where the pattern is known to only start, and the match | |
1552 * engine invoked, at certain characters. | |
1553 * When invoked, this callback will specify the index at which a match operation is about | |
1554 * to be attempted, giving the application the opportunity to terminate a long-running | |
1555 * find operation. | |
1556 * | |
1557 * If the callback function returns FALSE, the find operation will be terminated early. | |
1558 * | |
1559 * Note: the callback function must not call other functions on this | |
1560 * URegularExpression. | |
1561 * | |
1562 * @param context context pointer. The callback function will be invoked | |
1563 * with the context specified at the time that | |
1564 * uregex_setFindProgressCallback() is called. | |
1565 * @param matchIndex the next index at which a match will be attempted for this | |
1566 * find operation. If this callback interrupts the search, this is the | |
1567 * index at which a find/findNext operation may be re-initiated. | |
1568 * @return TRUE to continue the matching operation. | |
1569 * FALSE to terminate the matching operation. | |
1570 * @stable ICU 4.6 | |
1571 */ | |
1572 U_CDECL_BEGIN | |
1573 typedef UBool U_CALLCONV URegexFindProgressCallback ( | |
1574 const void *context, | |
1575 int64_t matchIndex); | |
1576 U_CDECL_END | |
1577 | |
1578 | |
1579 /** | |
1580 * Set the find progress callback function for this URegularExpression. | |
1581 * | |
1582 * @param regexp The compiled regular expression. | |
1583 * @param callback A pointer to the user-supplied URegexFindProgressCallback function. | |
1584 * @param context User context pointer. The value supplied at the | |
1585 * time the callback function is set will be saved | |
1586 * and passed to the callback each time that it is called. | |
1587 * @param status A reference to a UErrorCode to receive any errors. | |
1588 * @stable ICU 4.6 | |
1589 */ | |
1590 U_STABLE void U_EXPORT2 | |
1591 uregex_setFindProgressCallback(URegularExpression *regexp, | |
1592 URegexFindProgressCallback *callback, | |
1593 const void *context, | |
1594 UErrorCode *status); | |
1595 | |
1596 /** | |
1597 * Get the find progress callback function for this URegularExpression. | |
1598 * | |
1599 * @param regexp The compiled regular expression. | |
1600 * @param callback Out parameter, receives a pointer to the user-supplied | |
1601 * URegexFindProgressCallback function. | |
1602 * @param context Out parameter, receives the user context pointer that | |
1603 * was set when uregex_setFindProgressCallback() was called. | |
1604 * @param status A reference to a UErrorCode to receive any errors. | |
1605 * @stable ICU 4.6 | |
1606 */ | |
1607 U_STABLE void U_EXPORT2 | |
1608 uregex_getFindProgressCallback(const URegularExpression *regexp, | |
1609 URegexFindProgressCallback **callback, | |
1610 const void **context, | |
1611 UErrorCode *status); | |
1612 | |
1613 #endif /* !UCONFIG_NO_REGULAR_EXPRESSIONS */ | |
1614 #endif /* UREGEX_H */ |