// Copyright (c) 2005, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Author: Sanjay Ghemawat
//
// Regular-expression based scanner for parsing an input stream.
jpayne@69: // jpayne@69: // Example 1: parse a sequence of "var = number" entries from input: jpayne@69: // jpayne@69: // Scanner scanner(input); jpayne@69: // string var; jpayne@69: // int number; jpayne@69: // scanner.SetSkipExpression("\\s+"); // Skip any white space we encounter jpayne@69: // while (scanner.Consume("(\\w+) = (\\d+)", &var, &number)) { jpayne@69: // ...; jpayne@69: // } jpayne@69: jpayne@69: #ifndef _PCRE_SCANNER_H jpayne@69: #define _PCRE_SCANNER_H jpayne@69: jpayne@69: #include jpayne@69: #include jpayne@69: #include jpayne@69: jpayne@69: #include jpayne@69: #include jpayne@69: jpayne@69: namespace pcrecpp { jpayne@69: jpayne@69: class PCRECPP_EXP_DEFN Scanner { jpayne@69: public: jpayne@69: Scanner(); jpayne@69: explicit Scanner(const std::string& input); jpayne@69: ~Scanner(); jpayne@69: jpayne@69: // Return current line number. The returned line-number is jpayne@69: // one-based. I.e. it returns 1 + the number of consumed newlines. jpayne@69: // jpayne@69: // Note: this method may be slow. It may take time proportional to jpayne@69: // the size of the input. jpayne@69: int LineNumber() const; jpayne@69: jpayne@69: // Return the byte-offset that the scanner is looking in the jpayne@69: // input data; jpayne@69: int Offset() const; jpayne@69: jpayne@69: // Return true iff the start of the remaining input matches "re" jpayne@69: bool LookingAt(const RE& re) const; jpayne@69: jpayne@69: // Return true iff all of the following are true jpayne@69: // a. the start of the remaining input matches "re", jpayne@69: // b. if any arguments are supplied, matched sub-patterns can be jpayne@69: // parsed and stored into the arguments. jpayne@69: // If it returns true, it skips over the matched input and any jpayne@69: // following input that matches the "skip" regular expression. 
jpayne@69: bool Consume(const RE& re, jpayne@69: const Arg& arg0 = RE::no_arg, jpayne@69: const Arg& arg1 = RE::no_arg, jpayne@69: const Arg& arg2 = RE::no_arg jpayne@69: // TODO: Allow more arguments? jpayne@69: ); jpayne@69: jpayne@69: // Set the "skip" regular expression. If after consuming some data, jpayne@69: // a prefix of the input matches this RE, it is automatically jpayne@69: // skipped. For example, a programming language scanner would use jpayne@69: // a skip RE that matches white space and comments. jpayne@69: // jpayne@69: // scanner.SetSkipExpression("\\s+|//.*|/[*](.|\n)*?[*]/"); jpayne@69: // jpayne@69: // Skipping repeats as long as it succeeds. We used to let people do jpayne@69: // this by writing "(...)*" in the regular expression, but that added jpayne@69: // up to lots of recursive calls within the pcre library, so now we jpayne@69: // control repetition explicitly via the function call API. jpayne@69: // jpayne@69: // You can pass NULL for "re" if you do not want any data to be skipped. jpayne@69: void Skip(const char* re); // DEPRECATED; does *not* repeat jpayne@69: void SetSkipExpression(const char* re); jpayne@69: jpayne@69: // Temporarily pause "skip"ing. This jpayne@69: // Skip("Foo"); code ; DisableSkip(); code; EnableSkip() jpayne@69: // is similar to jpayne@69: // Skip("Foo"); code ; Skip(NULL); code ; Skip("Foo"); jpayne@69: // but avoids creating/deleting new RE objects. jpayne@69: void DisableSkip(); jpayne@69: jpayne@69: // Reenable previously paused skipping. Any prefix of the input jpayne@69: // that matches the skip pattern is immediately dropped. jpayne@69: void EnableSkip(); jpayne@69: jpayne@69: /***** Special wrappers around SetSkip() for some common idioms *****/ jpayne@69: jpayne@69: // Arranges to skip whitespace, C comments, C++ comments. jpayne@69: // The overall RE is a disjunction of the following REs: jpayne@69: // \\s whitespace jpayne@69: // //.*\n C++ comment jpayne@69: // /[*](.|\n)*?[*]/ C comment (x*? 
means minimal repetitions of x) jpayne@69: // We get repetition via the semantics of SetSkipExpression, not by using * jpayne@69: void SkipCXXComments() { jpayne@69: SetSkipExpression("\\s|//.*\n|/[*](?:\n|.)*?[*]/"); jpayne@69: } jpayne@69: jpayne@69: void set_save_comments(bool comments) { jpayne@69: save_comments_ = comments; jpayne@69: } jpayne@69: jpayne@69: bool save_comments() { jpayne@69: return save_comments_; jpayne@69: } jpayne@69: jpayne@69: // Append to vector ranges the comments found in the jpayne@69: // byte range [start,end] (inclusive) of the input data. jpayne@69: // Only comments that were extracted entirely within that jpayne@69: // range are returned: no range splitting of atomically-extracted jpayne@69: // comments is performed. jpayne@69: void GetComments(int start, int end, std::vector *ranges); jpayne@69: jpayne@69: // Append to vector ranges the comments added jpayne@69: // since the last time this was called. This jpayne@69: // functionality is provided for efficiency when jpayne@69: // interleaving scanning with parsing. 
jpayne@69: void GetNextComments(std::vector *ranges); jpayne@69: jpayne@69: private: jpayne@69: std::string data_; // All the input data jpayne@69: StringPiece input_; // Unprocessed input jpayne@69: RE* skip_; // If non-NULL, RE for skipping input jpayne@69: bool should_skip_; // If true, use skip_ jpayne@69: bool skip_repeat_; // If true, repeat skip_ as long as it works jpayne@69: bool save_comments_; // If true, aggregate the skip expression jpayne@69: jpayne@69: // the skipped comments jpayne@69: // TODO: later consider requiring that the StringPieces be added jpayne@69: // in order by their start position jpayne@69: std::vector *comments_; jpayne@69: jpayne@69: // the offset into comments_ that has been returned by GetNextComments jpayne@69: int comments_offset_; jpayne@69: jpayne@69: // helper function to consume *skip_ and honour jpayne@69: // save_comments_ jpayne@69: void ConsumeSkip(); jpayne@69: }; jpayne@69: jpayne@69: } // namespace pcrecpp jpayne@69: jpayne@69: #endif /* _PCRE_SCANNER_H */