Mercurial > repos > rliterman > csp2
comparison CSP2/CSP2_env/env-d9b9114564458d9d-741b3de822f2aaca6c6caa4325c4afce/include/pcrecpp.h @ 69:33d812a61356
planemo upload commit 2e9511a184a1ca667c7be0c6321a36dc4e3d116d
author | jpayne |
---|---|
date | Tue, 18 Mar 2025 17:55:14 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
67:0e9998148a16 | 69:33d812a61356 |
---|---|
1 // Copyright (c) 2005, Google Inc. | |
2 // All rights reserved. | |
3 // | |
4 // Redistribution and use in source and binary forms, with or without | |
5 // modification, are permitted provided that the following conditions are | |
6 // met: | |
7 // | |
8 // * Redistributions of source code must retain the above copyright | |
9 // notice, this list of conditions and the following disclaimer. | |
10 // * Redistributions in binary form must reproduce the above | |
11 // copyright notice, this list of conditions and the following disclaimer | |
12 // in the documentation and/or other materials provided with the | |
13 // distribution. | |
14 // * Neither the name of Google Inc. nor the names of its | |
15 // contributors may be used to endorse or promote products derived from | |
16 // this software without specific prior written permission. | |
17 // | |
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS | |
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT | |
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR | |
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT | |
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, | |
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT | |
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, | |
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY | |
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT | |
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE | |
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |
29 // | |
30 // Author: Sanjay Ghemawat | |
31 // Support for PCRE_XXX modifiers added by Giuseppe Maxia, July 2005 | |
32 | |
33 #ifndef _PCRECPP_H | |
34 #define _PCRECPP_H | |
35 | |
36 // C++ interface to the pcre regular-expression library. RE supports | |
37 // Perl-style regular expressions (with extensions like \d, \w, \s, | |
38 // ...). | |
39 // | |
40 // ----------------------------------------------------------------------- | |
41 // REGEXP SYNTAX: | |
42 // | |
43 // This module is part of the pcre library and hence supports its syntax | |
44 // for regular expressions. | |
45 // | |
46 // The syntax is pretty similar to Perl's. For those not familiar | |
47 // with Perl's regular expressions, here are some examples of the most | |
48 // commonly used extensions: | |
49 // | |
50 // "hello (\\w+) world" -- \w matches a "word" character | |
51 // "version (\\d+)" -- \d matches a digit | |
52 // "hello\\s+world" -- \s matches any whitespace character | |
53 // "\\b(\\w+)\\b" -- \b matches empty string at a word boundary | |
54 // "(?i)hello" -- (?i) turns on case-insensitive matching | |
55 // "/\\*(.*?)\\*/" -- .*? matches . minimum no. of times possible | |
56 // | |
57 // ----------------------------------------------------------------------- | |
58 // MATCHING INTERFACE: | |
59 // | |
60 // The "FullMatch" operation checks that supplied text matches a | |
61 // supplied pattern exactly. | |
62 // | |
63 // Example: successful match | |
64 // pcrecpp::RE re("h.*o"); | |
65 // re.FullMatch("hello"); | |
66 // | |
67 // Example: unsuccessful match (requires full match): | |
68 // pcrecpp::RE re("e"); | |
69 // !re.FullMatch("hello"); | |
70 // | |
71 // Example: creating a temporary RE object: | |
72 // pcrecpp::RE("h.*o").FullMatch("hello"); | |
73 // | |
74 // You can pass in a "const char*" or a "string" for "text". The | |
75 // examples below tend to use a const char*. | |
76 // | |
77 // You can, as in the different examples above, store the RE object | |
78 // explicitly in a variable or use a temporary RE object. The | |
79 // examples below use one mode or the other arbitrarily. Either | |
80 // could correctly be used for any of these examples. | |
81 // | |
82 // ----------------------------------------------------------------------- | |
83 // MATCHING WITH SUB-STRING EXTRACTION: | |
84 // | |
85 // You can supply extra pointer arguments to extract matched subpieces. | |
86 // | |
87 // Example: extracts "ruby" into "s" and 1234 into "i" | |
88 // int i; | |
89 // string s; | |
90 // pcrecpp::RE re("(\\w+):(\\d+)"); | |
91 // re.FullMatch("ruby:1234", &s, &i); | |
92 // | |
93 // Example: does not try to extract any extra sub-patterns | |
94 // re.FullMatch("ruby:1234", &s); | |
95 // | |
96 // Example: does not try to extract into NULL | |
97 // re.FullMatch("ruby:1234", NULL, &i); | |
98 // | |
99 // Example: integer overflow causes failure | |
100 // !re.FullMatch("ruby:1234567891234", NULL, &i); | |
101 // | |
102 // Example: fails because there aren't enough sub-patterns: | |
103 // !pcrecpp::RE("\\w+:\\d+").FullMatch("ruby:1234", &s); | |
104 // | |
105 // Example: fails because string cannot be stored in integer | |
106 // !pcrecpp::RE("(.*)").FullMatch("ruby", &i); | |
107 // | |
108 // The provided pointer arguments can be pointers to any scalar numeric | |
109 // type, or one of | |
110 // string (matched piece is copied to string) | |
111 // StringPiece (StringPiece is mutated to point to matched piece) | |
112 // T (where "bool T::ParseFrom(const char*, int)" exists) | |
113 // NULL (the corresponding matched sub-pattern is not copied) | |
114 // | |
115 // CAVEAT: An optional sub-pattern that does not exist in the matched | |
116 // string is assigned the empty string. Therefore, the following will | |
117 // return false (because the empty string is not a valid number): | |
118 // int number; | |
119 // pcrecpp::RE("[a-z]+(\\d+)?").FullMatch("abc", &number); | |
120 // | |
121 // ----------------------------------------------------------------------- | |
122 // DO_MATCH | |
123 // | |
124 // The matching interface supports at most 16 arguments per call. | |
125 // If you need more, consider using the more general interface | |
126 // pcrecpp::RE::DoMatch(). See the declaration of DoMatch below for its signature. | |
127 // | |
128 // ----------------------------------------------------------------------- | |
129 // PARTIAL MATCHES | |
130 // | |
131 // You can use the "PartialMatch" operation when you want the pattern | |
132 // to match any substring of the text. | |
133 // | |
134 // Example: simple search for a string: | |
135 // pcrecpp::RE("ell").PartialMatch("hello"); | |
136 // | |
137 // Example: find first number in a string: | |
138 // int number; | |
139 // pcrecpp::RE re("(\\d+)"); | |
140 // re.PartialMatch("x*100 + 20", &number); | |
141 // assert(number == 100); | |
142 // | |
143 // ----------------------------------------------------------------------- | |
144 // UTF-8 AND THE MATCHING INTERFACE: | |
145 // | |
146 // By default, pattern and text are plain text, one byte per character. | |
147 // The UTF8 flag, passed to the constructor, causes both pattern | |
148 // and string to be treated as UTF-8 text, still a byte stream but | |
149 // potentially multiple bytes per character. In practice, the text | |
150 // is likelier to be UTF-8 than the pattern, but the match returned | |
151 // may depend on the UTF8 flag, so always use it when matching | |
152 // UTF8 text. E.g., "." will match one byte normally but with UTF8 | |
153 // set may match up to three bytes of a multi-byte character. | |
154 // | |
155 // Example: | |
156 // pcrecpp::RE_Options options; | |
157 // options.set_utf8(true); | |
158 // pcrecpp::RE re(utf8_pattern, options); | |
159 // re.FullMatch(utf8_string); | |
160 // | |
161 // Example: using the convenience function UTF8(): | |
162 // pcrecpp::RE re(utf8_pattern, pcrecpp::UTF8()); | |
163 // re.FullMatch(utf8_string); | |
164 // | |
165 // NOTE: The UTF8 option is ignored if pcre was not configured with the | |
166 // --enable-utf8 flag. | |
167 // | |
168 // ----------------------------------------------------------------------- | |
169 // PASSING MODIFIERS TO THE REGULAR EXPRESSION ENGINE | |
170 // | |
171 // PCRE defines some modifiers to change the behavior of the regular | |
172 // expression engine. | |
173 // The C++ wrapper defines an auxiliary class, RE_Options, as a vehicle | |
174 // to pass such modifiers to a RE class. | |
175 // | |
176 // Currently, the following modifiers are supported | |
177 // | |
178 // modifier description Perl corresponding | |
179 // | |
180 // PCRE_CASELESS case insensitive match /i | |
181 // PCRE_MULTILINE multiple lines match /m | |
182 // PCRE_DOTALL dot matches newlines /s | |
183 // PCRE_DOLLAR_ENDONLY $ matches only at end N/A | |
184 // PCRE_EXTRA strict escape parsing N/A | |
185 // PCRE_EXTENDED ignore whitespaces /x | |
186 // PCRE_UTF8 handles UTF8 chars built-in | |
187 // PCRE_UNGREEDY reverses * and *? N/A | |
188 // PCRE_NO_AUTO_CAPTURE disables matching parens N/A (*) | |
189 // | |
190 // (For a full account on how each modifier works, please check the | |
191 // PCRE API reference manual). | |
192 // | |
193 // (*) Both Perl and PCRE allow non matching parentheses by means of the | |
194 // "?:" modifier within the pattern itself. e.g. (?:ab|cd) does not | |
195 // capture, while (ab|cd) does. | |
196 // | |
197 // For each modifier, there are two member functions whose name is made | |
198 // out of the modifier in lowercase, without the "PCRE_" prefix. For | |
199 // instance, PCRE_CASELESS is handled by | |
200 // bool caseless(), | |
201 // which returns true if the modifier is set, and | |
202 // RE_Options & set_caseless(bool), | |
203 // which sets or unsets the modifier. | |
204 // | |
205 // Moreover, PCRE_EXTRA_MATCH_LIMIT can be accessed through the | |
206 // set_match_limit() and match_limit() member functions. | |
207 // Setting match_limit to a non-zero value will limit the execution of | |
208 // pcre to keep it from doing bad things like blowing the stack or taking | |
209 // an eternity to return a result. A value of 5000 is good enough to stop | |
210 // stack blowup in a 2MB thread stack. Setting match_limit to zero will | |
211 // disable match limiting. Alternately, you can set match_limit_recursion() | |
212 // which uses PCRE_EXTRA_MATCH_LIMIT_RECURSION to limit how much pcre | |
213 // recurses. match_limit() caps the number of matches pcre does; | |
214 // match_limit_recursion() caps the depth of recursion. | |
215 // | |
216 // Normally, to pass one or more modifiers to a RE class, you declare | |
217 // a RE_Options object, set the appropriate options, and pass this | |
218 // object to a RE constructor. Example: | |
219 // | |
220 // RE_Options opt; | |
221 // opt.set_caseless(true); | |
222 // | |
223 // if (RE("HELLO", opt).PartialMatch("hello world")) ... | |
224 // | |
225 // RE_Options has two constructors. The default constructor takes no | |
226 // arguments and creates a set of flags that are off by default. | |
227 // | |
228 // The optional parameter 'option_flags' is to facilitate transfer | |
229 // of legacy code from C programs. This lets you do | |
230 // RE(pattern, RE_Options(PCRE_CASELESS|PCRE_MULTILINE)).PartialMatch(str); | |
231 // | |
232 // But new code is better off doing | |
233 // RE(pattern, | |
234 // RE_Options().set_caseless(true).set_multiline(true)).PartialMatch(str); | |
235 // (See below) | |
236 // | |
237 // If you are going to pass one of the most used modifiers, there are some | |
238 // convenience functions that return a RE_Options class with the | |
239 // appropriate modifier already set: | |
240 // CASELESS(), UTF8(), MULTILINE(), DOTALL(), EXTENDED() | |
241 // | |
242 // If you need to set several options at once, and you don't want to go | |
243 // through the pains of declaring a RE_Options object and setting several | |
244 // options, there is a parallel method that gives you such ability on the | |
245 // fly. You can concatenate several set_xxxxx member functions, since each | |
246 // of them returns a reference to its class object. e.g.: to pass | |
247 // PCRE_CASELESS, PCRE_EXTENDED, and PCRE_MULTILINE to a RE with one | |
248 // statement, you may write | |
249 // | |
250 // RE(" ^ xyz \\s+ .* blah$", RE_Options() | |
251 // .set_caseless(true) | |
252 // .set_extended(true) | |
253 // .set_multiline(true)).PartialMatch(sometext); | |
254 // | |
255 // ----------------------------------------------------------------------- | |
256 // SCANNING TEXT INCREMENTALLY | |
257 // | |
258 // The "Consume" operation may be useful if you want to repeatedly | |
259 // match regular expressions at the front of a string and skip over | |
260 // them as they match. This requires use of the "StringPiece" type, | |
261 // which represents a sub-range of a real string. Like RE, StringPiece | |
262 // is defined in the pcrecpp namespace. | |
263 // | |
264 // Example: read lines of the form "var = value" from a string. | |
265 // string contents = ...; // Fill string somehow | |
266 // pcrecpp::StringPiece input(contents); // Wrap in a StringPiece | |
267 // | |
268 // string var; | |
269 // int value; | |
270 // pcrecpp::RE re("(\\w+) = (\\d+)\n"); | |
271 // while (re.Consume(&input, &var, &value)) { | |
272 // ...; | |
273 // } | |
274 // | |
275 // Each successful call to "Consume" will set "var/value", and also | |
276 // advance "input" so it points past the matched text. | |
277 // | |
278 // The "FindAndConsume" operation is similar to "Consume" but does not | |
279 // anchor your match at the beginning of the string. For example, you | |
280 // could extract all words from a string by repeatedly calling | |
281 // pcrecpp::RE("(\\w+)").FindAndConsume(&input, &word) | |
282 // | |
283 // ----------------------------------------------------------------------- | |
284 // PARSING HEX/OCTAL/C-RADIX NUMBERS | |
285 // | |
286 // By default, if you pass a pointer to a numeric value, the | |
287 // corresponding text is interpreted as a base-10 number. You can | |
288 // instead wrap the pointer with a call to one of the operators Hex(), | |
289 // Octal(), or CRadix() to interpret the text in another base. The | |
290 // CRadix operator interprets C-style "0" (base-8) and "0x" (base-16) | |
291 // prefixes, but defaults to base-10. | |
292 // | |
293 // Example: | |
294 // int a, b, c, d; | |
295 // pcrecpp::RE re("(.*) (.*) (.*) (.*)"); | |
296 // re.FullMatch("100 40 0100 0x40", | |
297 // pcrecpp::Octal(&a), pcrecpp::Hex(&b), | |
298 // pcrecpp::CRadix(&c), pcrecpp::CRadix(&d)); | |
299 // will leave 64 in a, b, c, and d. | |
300 // | |
301 // ----------------------------------------------------------------------- | |
302 // REPLACING PARTS OF STRINGS | |
303 // | |
304 // You can replace the first match of "pattern" in "str" with | |
305 // "rewrite". Within "rewrite", backslash-escaped digits (\1 to \9) | |
306 // can be used to insert text matching corresponding parenthesized | |
307 // group from the pattern. \0 in "rewrite" refers to the entire | |
308 // matching text. E.g., | |
309 // | |
310 // string s = "yabba dabba doo"; | |
311 // pcrecpp::RE("b+").Replace("d", &s); | |
312 // | |
313 // will leave "s" containing "yada dabba doo". The result is true if | |
314 // the pattern matches and a replacement occurs, or false otherwise. | |
315 // | |
316 // GlobalReplace() is like Replace(), except that it replaces all | |
317 // occurrences of the pattern in the string with the rewrite. | |
318 // Replacements are not subject to re-matching. E.g., | |
319 // | |
320 // string s = "yabba dabba doo"; | |
321 // pcrecpp::RE("b+").GlobalReplace("d", &s); | |
322 // | |
323 // will leave "s" containing "yada dada doo". It returns the number | |
324 // of replacements made. | |
325 // | |
326 // Extract() is like Replace(), except that if the pattern matches, | |
327 // "rewrite" is copied into "out" (an additional argument) with | |
328 // substitutions. The non-matching portions of "text" are ignored. | |
329 // Returns true iff a match occurred and the extraction happened | |
330 // successfully. If no match occurs, the string is left unaffected. | |
331 | |
332 | |
333 #include <string> | |
334 #include <pcre.h> | |
335 #include <pcrecpparg.h> // defines the Arg class | |
336 // This isn't technically needed here, but we include it | |
337 // anyway so folks who include pcrecpp.h don't have to. | |
338 #include <pcre_stringpiece.h> | |
339 | |
340 namespace pcrecpp { | |
341 | |
// Helper for the RE_Options::set_xxx() mutators: turns the option bit(s) "o"
// on in all_options_ when "b" is true and off otherwise, then returns *this.
// The expansion is a sequence of statements ending in "return", so it is only
// meaningful as the complete body of a member function returning RE_Options&.
#define PCRE_SET_OR_CLEAR(b, o) \
    if (b) all_options_ |= (o); else all_options_ &= ~(o); \
    return *this

// Helper for the RE_Options::xxx() accessors: true iff every bit of "o" is
// set in all_options_.  Both the argument and the whole expansion are
// parenthesized so that uses such as "!PCRE_IS_SET(x)" or
// "PCRE_IS_SET(A | B)" are not re-associated by operator precedence.
#define PCRE_IS_SET(o) \
    ((all_options_ & (o)) == (o))
348 | |
349 /***** Compiling regular expressions: the RE class *****/ | |
350 | |
351 // RE_Options allow you to set options to be passed along to pcre, | |
352 // along with other options we put on top of pcre. | |
353 // Only 9 modifiers, plus match_limit and match_limit_recursion, | |
354 // are supported now. | |
355 class PCRECPP_EXP_DEFN RE_Options { | |
356 public: | |
357 // constructor | |
358 RE_Options() : match_limit_(0), match_limit_recursion_(0), all_options_(0) {} | |
359 | |
360 // alternative constructor. | |
361 // To facilitate transfer of legacy code from C programs | |
362 // | |
363 // This lets you do | |
364 // RE(pattern, RE_Options(PCRE_CASELESS|PCRE_MULTILINE)).PartialMatch(str); | |
365 // But new code is better off doing | |
366 // RE(pattern, | |
367 // RE_Options().set_caseless(true).set_multiline(true)).PartialMatch(str); | |
368 RE_Options(int option_flags) : match_limit_(0), match_limit_recursion_(0), | |
369 all_options_(option_flags) {} | |
370 // we're fine with the default destructor, copy constructor, etc. | |
371 | |
372 // accessors and mutators | |
373 int match_limit() const { return match_limit_; }; | |
374 RE_Options &set_match_limit(int limit) { | |
375 match_limit_ = limit; | |
376 return *this; | |
377 } | |
378 | |
379 int match_limit_recursion() const { return match_limit_recursion_; }; | |
380 RE_Options &set_match_limit_recursion(int limit) { | |
381 match_limit_recursion_ = limit; | |
382 return *this; | |
383 } | |
384 | |
385 bool caseless() const { | |
386 return PCRE_IS_SET(PCRE_CASELESS); | |
387 } | |
388 RE_Options &set_caseless(bool x) { | |
389 PCRE_SET_OR_CLEAR(x, PCRE_CASELESS); | |
390 } | |
391 | |
392 bool multiline() const { | |
393 return PCRE_IS_SET(PCRE_MULTILINE); | |
394 } | |
395 RE_Options &set_multiline(bool x) { | |
396 PCRE_SET_OR_CLEAR(x, PCRE_MULTILINE); | |
397 } | |
398 | |
399 bool dotall() const { | |
400 return PCRE_IS_SET(PCRE_DOTALL); | |
401 } | |
402 RE_Options &set_dotall(bool x) { | |
403 PCRE_SET_OR_CLEAR(x, PCRE_DOTALL); | |
404 } | |
405 | |
406 bool extended() const { | |
407 return PCRE_IS_SET(PCRE_EXTENDED); | |
408 } | |
409 RE_Options &set_extended(bool x) { | |
410 PCRE_SET_OR_CLEAR(x, PCRE_EXTENDED); | |
411 } | |
412 | |
413 bool dollar_endonly() const { | |
414 return PCRE_IS_SET(PCRE_DOLLAR_ENDONLY); | |
415 } | |
416 RE_Options &set_dollar_endonly(bool x) { | |
417 PCRE_SET_OR_CLEAR(x, PCRE_DOLLAR_ENDONLY); | |
418 } | |
419 | |
420 bool extra() const { | |
421 return PCRE_IS_SET(PCRE_EXTRA); | |
422 } | |
423 RE_Options &set_extra(bool x) { | |
424 PCRE_SET_OR_CLEAR(x, PCRE_EXTRA); | |
425 } | |
426 | |
427 bool ungreedy() const { | |
428 return PCRE_IS_SET(PCRE_UNGREEDY); | |
429 } | |
430 RE_Options &set_ungreedy(bool x) { | |
431 PCRE_SET_OR_CLEAR(x, PCRE_UNGREEDY); | |
432 } | |
433 | |
434 bool utf8() const { | |
435 return PCRE_IS_SET(PCRE_UTF8); | |
436 } | |
437 RE_Options &set_utf8(bool x) { | |
438 PCRE_SET_OR_CLEAR(x, PCRE_UTF8); | |
439 } | |
440 | |
441 bool no_auto_capture() const { | |
442 return PCRE_IS_SET(PCRE_NO_AUTO_CAPTURE); | |
443 } | |
444 RE_Options &set_no_auto_capture(bool x) { | |
445 PCRE_SET_OR_CLEAR(x, PCRE_NO_AUTO_CAPTURE); | |
446 } | |
447 | |
448 RE_Options &set_all_options(int opt) { | |
449 all_options_ = opt; | |
450 return *this; | |
451 } | |
452 int all_options() const { | |
453 return all_options_ ; | |
454 } | |
455 | |
456 // TODO: add other pcre flags | |
457 | |
458 private: | |
459 int match_limit_; | |
460 int match_limit_recursion_; | |
461 int all_options_; | |
462 }; | |
463 | |
464 // These functions return some common RE_Options | |
465 static inline RE_Options UTF8() { | |
466 return RE_Options().set_utf8(true); | |
467 } | |
468 | |
469 static inline RE_Options CASELESS() { | |
470 return RE_Options().set_caseless(true); | |
471 } | |
472 static inline RE_Options MULTILINE() { | |
473 return RE_Options().set_multiline(true); | |
474 } | |
475 | |
476 static inline RE_Options DOTALL() { | |
477 return RE_Options().set_dotall(true); | |
478 } | |
479 | |
480 static inline RE_Options EXTENDED() { | |
481 return RE_Options().set_extended(true); | |
482 } | |
483 | |
484 // Interface for regular expression matching. Also corresponds to a | |
485 // pre-compiled regular expression. An "RE" object is safe for | |
486 // concurrent use by multiple threads. | |
487 class PCRECPP_EXP_DEFN RE { | |
488 public: | |
489 // We provide implicit conversions from strings so that users can | |
490 // pass in a string or a "const char*" wherever an "RE" is expected. | |
491 RE(const string& pat) { Init(pat, NULL); } | |
492 RE(const string& pat, const RE_Options& option) { Init(pat, &option); } | |
493 RE(const char* pat) { Init(pat, NULL); } | |
494 RE(const char* pat, const RE_Options& option) { Init(pat, &option); } | |
495 RE(const unsigned char* pat) { | |
496 Init(reinterpret_cast<const char*>(pat), NULL); | |
497 } | |
498 RE(const unsigned char* pat, const RE_Options& option) { | |
499 Init(reinterpret_cast<const char*>(pat), &option); | |
500 } | |
501 | |
502 // Copy constructor & assignment - note that these are expensive | |
503 // because they recompile the expression. | |
504 RE(const RE& re) { Init(re.pattern_, &re.options_); } | |
505 const RE& operator=(const RE& re) { | |
506 if (this != &re) { | |
507 Cleanup(); | |
508 | |
509 // This is the code that originally came from Google | |
510 // Init(re.pattern_.c_str(), &re.options_); | |
511 | |
512 // This is the replacement from Ari Pollak | |
513 Init(re.pattern_, &re.options_); | |
514 } | |
515 return *this; | |
516 } | |
517 | |
518 | |
519 ~RE(); | |
520 | |
521 // The string specification for this RE. E.g. | |
522 // RE re("ab*c?d+"); | |
523 // re.pattern(); // "ab*c?d+" | |
524 const string& pattern() const { return pattern_; } | |
525 | |
526 // If RE could not be created properly, returns an error string. | |
527 // Else returns the empty string. | |
528 const string& error() const { return *error_; } | |
529 | |
530 /***** The useful part: the matching interface *****/ | |
531 | |
532 // This is provided so one can do pattern.ReplaceAll() just as | |
533 // easily as ReplaceAll(pattern-text, ....) | |
534 | |
535 bool FullMatch(const StringPiece& text, | |
536 const Arg& ptr1 = no_arg, | |
537 const Arg& ptr2 = no_arg, | |
538 const Arg& ptr3 = no_arg, | |
539 const Arg& ptr4 = no_arg, | |
540 const Arg& ptr5 = no_arg, | |
541 const Arg& ptr6 = no_arg, | |
542 const Arg& ptr7 = no_arg, | |
543 const Arg& ptr8 = no_arg, | |
544 const Arg& ptr9 = no_arg, | |
545 const Arg& ptr10 = no_arg, | |
546 const Arg& ptr11 = no_arg, | |
547 const Arg& ptr12 = no_arg, | |
548 const Arg& ptr13 = no_arg, | |
549 const Arg& ptr14 = no_arg, | |
550 const Arg& ptr15 = no_arg, | |
551 const Arg& ptr16 = no_arg) const; | |
552 | |
553 bool PartialMatch(const StringPiece& text, | |
554 const Arg& ptr1 = no_arg, | |
555 const Arg& ptr2 = no_arg, | |
556 const Arg& ptr3 = no_arg, | |
557 const Arg& ptr4 = no_arg, | |
558 const Arg& ptr5 = no_arg, | |
559 const Arg& ptr6 = no_arg, | |
560 const Arg& ptr7 = no_arg, | |
561 const Arg& ptr8 = no_arg, | |
562 const Arg& ptr9 = no_arg, | |
563 const Arg& ptr10 = no_arg, | |
564 const Arg& ptr11 = no_arg, | |
565 const Arg& ptr12 = no_arg, | |
566 const Arg& ptr13 = no_arg, | |
567 const Arg& ptr14 = no_arg, | |
568 const Arg& ptr15 = no_arg, | |
569 const Arg& ptr16 = no_arg) const; | |
570 | |
571 bool Consume(StringPiece* input, | |
572 const Arg& ptr1 = no_arg, | |
573 const Arg& ptr2 = no_arg, | |
574 const Arg& ptr3 = no_arg, | |
575 const Arg& ptr4 = no_arg, | |
576 const Arg& ptr5 = no_arg, | |
577 const Arg& ptr6 = no_arg, | |
578 const Arg& ptr7 = no_arg, | |
579 const Arg& ptr8 = no_arg, | |
580 const Arg& ptr9 = no_arg, | |
581 const Arg& ptr10 = no_arg, | |
582 const Arg& ptr11 = no_arg, | |
583 const Arg& ptr12 = no_arg, | |
584 const Arg& ptr13 = no_arg, | |
585 const Arg& ptr14 = no_arg, | |
586 const Arg& ptr15 = no_arg, | |
587 const Arg& ptr16 = no_arg) const; | |
588 | |
589 bool FindAndConsume(StringPiece* input, | |
590 const Arg& ptr1 = no_arg, | |
591 const Arg& ptr2 = no_arg, | |
592 const Arg& ptr3 = no_arg, | |
593 const Arg& ptr4 = no_arg, | |
594 const Arg& ptr5 = no_arg, | |
595 const Arg& ptr6 = no_arg, | |
596 const Arg& ptr7 = no_arg, | |
597 const Arg& ptr8 = no_arg, | |
598 const Arg& ptr9 = no_arg, | |
599 const Arg& ptr10 = no_arg, | |
600 const Arg& ptr11 = no_arg, | |
601 const Arg& ptr12 = no_arg, | |
602 const Arg& ptr13 = no_arg, | |
603 const Arg& ptr14 = no_arg, | |
604 const Arg& ptr15 = no_arg, | |
605 const Arg& ptr16 = no_arg) const; | |
606 | |
607 bool Replace(const StringPiece& rewrite, | |
608 string *str) const; | |
609 | |
610 int GlobalReplace(const StringPiece& rewrite, | |
611 string *str) const; | |
612 | |
613 bool Extract(const StringPiece &rewrite, | |
614 const StringPiece &text, | |
615 string *out) const; | |
616 | |
617 // Escapes all potentially meaningful regexp characters in | |
618 // 'unquoted'. The returned string, used as a regular expression, | |
619 // will exactly match the original string. For example, | |
620 // 1.5-2.0? | |
621 // may become: | |
622 // 1\.5\-2\.0\? | |
623 // Note QuoteMeta behaves the same as perl's QuoteMeta function, | |
624 // *except* that it escapes the NUL character (\0) as backslash + 0, | |
625 // rather than backslash + NUL. | |
626 static string QuoteMeta(const StringPiece& unquoted); | |
627 | |
628 | |
629 /***** Generic matching interface *****/ | |
630 | |
631 // Type of match (TODO: Should be restructured as part of RE_Options) | |
632 enum Anchor { | |
633 UNANCHORED, // No anchoring | |
634 ANCHOR_START, // Anchor at start only | |
635 ANCHOR_BOTH // Anchor at start and end | |
636 }; | |
637 | |
638 // General matching routine. Stores the length of the match in | |
639 // "*consumed" if successful. | |
640 bool DoMatch(const StringPiece& text, | |
641 Anchor anchor, | |
642 int* consumed, | |
643 const Arg* const* args, int n) const; | |
644 | |
645 // Return the number of capturing subpatterns, or -1 if the | |
646 // regexp wasn't valid on construction. | |
647 int NumberOfCapturingGroups() const; | |
648 | |
649 // The default value for an argument, to indicate the end of the argument | |
650 // list. This must be used only in optional argument defaults. It should NOT | |
651 // be passed explicitly. Some people have tried to use it like this: | |
652 // | |
653 // FullMatch(x, y, &z, no_arg, &w); | |
654 // | |
655 // This is a mistake, and will not work. | |
656 static Arg no_arg; | |
657 | |
658 private: | |
659 | |
660 void Init(const string& pattern, const RE_Options* options); | |
661 void Cleanup(); | |
662 | |
663 // Match against "text", filling in "vec" (up to "vecsize" * 2/3) with | |
664 // pairs of integers for the beginning and end positions of matched | |
665 // text. The first pair corresponds to the entire matched text; | |
666 // subsequent pairs correspond, in order, to parentheses-captured | |
667 // matches. Returns the number of pairs (one more than the number of | |
668 // the last subpattern with a match) if matching was successful | |
669 // and zero if the match failed. | |
670 // I.e. for RE("(foo)|(bar)|(baz)") it will return 2, 3, and 4 when matching | |
671 // against "foo", "bar", and "baz" respectively. | |
672 // When matching RE("(foo)|hello") against "hello", it will return 1. | |
673 // But the values for all subpattern are filled in into "vec". | |
674 int TryMatch(const StringPiece& text, | |
675 int startpos, | |
676 Anchor anchor, | |
677 bool empty_ok, | |
678 int *vec, | |
679 int vecsize) const; | |
680 | |
681 // Append the "rewrite" string, with backslash subsitutions from "text" | |
682 // and "vec", to string "out". | |
683 bool Rewrite(string *out, | |
684 const StringPiece& rewrite, | |
685 const StringPiece& text, | |
686 int *vec, | |
687 int veclen) const; | |
688 | |
689 // internal implementation for DoMatch | |
690 bool DoMatchImpl(const StringPiece& text, | |
691 Anchor anchor, | |
692 int* consumed, | |
693 const Arg* const args[], | |
694 int n, | |
695 int* vec, | |
696 int vecsize) const; | |
697 | |
698 // Compile the regexp for the specified anchoring mode | |
699 pcre* Compile(Anchor anchor); | |
700 | |
701 string pattern_; | |
702 RE_Options options_; | |
703 pcre* re_full_; // For full matches | |
704 pcre* re_partial_; // For partial matches | |
705 const string* error_; // Error indicator (or points to empty string) | |
706 }; | |
707 | |
708 } // namespace pcrecpp | |
709 | |
710 #endif /* _PCRECPP_H */ |