diff options
author | Dmitri Gribenko <gribozavr@gmail.com> | 2012-06-26 20:39:18 +0000 |
---|---|---|
committer | Dmitri Gribenko <gribozavr@gmail.com> | 2012-06-26 20:39:18 +0000 |
commit | 2d44d77fed3200e2eff289f55493317e90d3398c (patch) | |
tree | d1d93511e3b05ef54497369d2d0ca603499d2862 | |
parent | 5283c99365ec4697a5a6bb2b2505469a9aa474d5 (diff) |
Implement a lexer for structured comments.
git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@159223 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r-- | include/clang-c/Index.h | 6 | ||||
-rw-r--r-- | include/clang/AST/CommentBriefParser.h | 49 | ||||
-rw-r--r-- | include/clang/AST/CommentLexer.h | 352 | ||||
-rw-r--r-- | include/clang/AST/RawCommentList.h | 25 | ||||
-rw-r--r-- | lib/AST/ASTContext.cpp | 4 | ||||
-rw-r--r-- | lib/AST/CMakeLists.txt | 2 | ||||
-rw-r--r-- | lib/AST/CommentBriefParser.cpp | 76 | ||||
-rw-r--r-- | lib/AST/CommentLexer.cpp | 676 | ||||
-rw-r--r-- | lib/AST/RawCommentList.cpp | 21 | ||||
-rw-r--r-- | test/Index/annotate-comments.cpp | 39 | ||||
-rw-r--r-- | tools/c-index-test/c-index-test.c | 50 | ||||
-rw-r--r-- | tools/libclang/CIndex.cpp | 18 | ||||
-rw-r--r-- | tools/libclang/libclang.exports | 1 | ||||
-rw-r--r-- | unittests/AST/CMakeLists.txt | 7 | ||||
-rw-r--r-- | unittests/AST/CommentLexer.cpp | 1010 | ||||
-rw-r--r-- | unittests/AST/Makefile | 15 | ||||
-rw-r--r-- | unittests/Makefile | 2 |
17 files changed, 2326 insertions, 27 deletions
diff --git a/include/clang-c/Index.h b/include/clang-c/Index.h index b7bd8bb738..2397ae1925 100644 --- a/include/clang-c/Index.h +++ b/include/clang-c/Index.h @@ -3201,6 +3201,12 @@ CINDEX_LINKAGE CXSourceRange clang_Cursor_getCommentRange(CXCursor C); CINDEX_LINKAGE CXString clang_Cursor_getRawCommentText(CXCursor C); /** + * \brief Given a cursor that represents a declaration, return the associated + * \\brief paragraph; otherwise return the first paragraph. + */ +CINDEX_LINKAGE CXString clang_Cursor_getBriefCommentText(CXCursor C); + +/** * @} */ diff --git a/include/clang/AST/CommentBriefParser.h b/include/clang/AST/CommentBriefParser.h new file mode 100644 index 0000000000..e343b94643 --- /dev/null +++ b/include/clang/AST/CommentBriefParser.h @@ -0,0 +1,49 @@ +//===--- CommentBriefParser.h - Dumb comment parser -------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines a very simple Doxygen comment parser. +// +//===----------------------------------------------------------------------===// + + +#ifndef LLVM_CLANG_AST_BRIEF_COMMENT_PARSER_H +#define LLVM_CLANG_AST_BRIEF_COMMENT_PARSER_H + +#include "clang/AST/CommentLexer.h" + +namespace clang { +namespace comments { + +/// A very simple comment parser that extracts just the brief description or +/// first paragraph. +class BriefParser { + Lexer &L; + + /// Current lookahead token. + Token Tok; + + SourceLocation ConsumeToken() { + SourceLocation Loc = Tok.getLocation(); + L.lex(Tok); + return Loc; + } + +public: + BriefParser(Lexer &L); + + /// Return \\brief paragraph, if it exists; otherwise return the first + /// paragraph. + std::string Parse(); +}; + +} // end namespace comments +} // end namespace clang + +#endif + diff --git a/include/clang/AST/CommentLexer.h b/include/clang/AST/CommentLexer.h new file mode 100644 index 0000000000..7f7ae62758 --- /dev/null +++ b/include/clang/AST/CommentLexer.h @@ -0,0 +1,352 @@ +//===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines lexer for structured comments and supporting token class. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_AST_COMMENT_LEXER_H +#define LLVM_CLANG_AST_COMMENT_LEXER_H + +#include "clang/Basic/SourceManager.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/raw_ostream.h" + +namespace clang { +namespace comments { + +class Lexer; + +namespace tok { +enum TokenKind { + eof, + newline, + text, + command, + verbatim_block_begin, + verbatim_block_line, + verbatim_block_end, + verbatim_line, + html_tag_open, // <tag + html_ident, // attr + html_equals, // = + html_quoted_string, // "blah\"blah" or 'blah\'blah' + html_greater, // > + html_tag_close, // </tag> + + // Markdown tokens (not supported yet). + ruler, + md_code_line, // Line indented at least by 4 spaces. + md_code_inline, // `code` + md_emph, // _text_ or *text* + md_strong, // __text__ or *text* + md_header // ### level 3 header ### +}; +} // end namespace tok + +class CommentOptions { +public: + bool Markdown; +}; + +/// \brief Comment token. +class Token { + friend class Lexer; + + /// The location of the token. + SourceLocation Loc; + + /// The actual kind of the token. + tok::TokenKind Kind; + + /// Length of the token spelling in comment. Can be 0 for synthenized + /// tokens. + unsigned Length; + + /// Contains text value associated with a token. + const char *TextPtr1; + unsigned TextLen1; + + /// Contains text value associated with a token. + const char *TextPtr2; + unsigned TextLen2; + +public: + SourceLocation getLocation() const LLVM_READONLY { return Loc; } + void setLocation(SourceLocation SL) { Loc = SL; } + + tok::TokenKind getKind() const LLVM_READONLY { return Kind; } + void setKind(tok::TokenKind K) { Kind = K; } + + bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; } + bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; } + + unsigned getLength() const LLVM_READONLY { return Length; } + void setLength(unsigned L) { Length = L; } + + StringRef getText() const LLVM_READONLY { + assert(is(tok::text)); + return StringRef(TextPtr1, TextLen1); + } + + void setText(StringRef Text) { + assert(is(tok::text)); + TextPtr1 = Text.data(); + TextLen1 = Text.size(); + } + + StringRef getCommandName() const LLVM_READONLY { + assert(is(tok::command)); + return StringRef(TextPtr1, TextLen1); + } + + void setCommandName(StringRef Name) { + assert(is(tok::command)); + TextPtr1 = Name.data(); + TextLen1 = Name.size(); + } + + StringRef getVerbatimBlockName() const LLVM_READONLY { + assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end)); + return StringRef(TextPtr1, TextLen1); + } + + void setVerbatimBlockName(StringRef Name) { + assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end)); + TextPtr1 = Name.data(); + TextLen1 = Name.size(); + } + + StringRef getVerbatimBlockText() const LLVM_READONLY { + assert(is(tok::verbatim_block_line)); + return StringRef(TextPtr1, TextLen1); + } + + void setVerbatimBlockText(StringRef Text) { + assert(is(tok::verbatim_block_line)); + TextPtr1 = Text.data(); + TextLen1 = Text.size(); + } + + /// Returns the name of verbatim line command. + StringRef getVerbatimLineName() const LLVM_READONLY { + assert(is(tok::verbatim_line)); + return StringRef(TextPtr1, TextLen1); + } + + void setVerbatimLineName(StringRef Name) { + assert(is(tok::verbatim_line)); + TextPtr1 = Name.data(); + TextLen1 = Name.size(); + } + + StringRef getVerbatimLineText() const LLVM_READONLY { + assert(is(tok::verbatim_line)); + return StringRef(TextPtr2, TextLen2); + } + + void setVerbatimLineText(StringRef Text) { + assert(is(tok::verbatim_line)); + TextPtr2 = Text.data(); + TextLen2 = Text.size(); + } + + StringRef getHTMLTagOpenName() const LLVM_READONLY { + assert(is(tok::html_tag_open)); + return StringRef(TextPtr1, TextLen1); + } + + void setHTMLTagOpenName(StringRef Name) { + assert(is(tok::html_tag_open)); + TextPtr1 = Name.data(); + TextLen1 = Name.size(); + } + + StringRef getHTMLIdent() const LLVM_READONLY { + assert(is(tok::html_ident)); + return StringRef(TextPtr1, TextLen1); + } + + void setHTMLIdent(StringRef Name) { + assert(is(tok::html_ident)); + TextPtr1 = Name.data(); + TextLen1 = Name.size(); + } + + StringRef getHTMLQuotedString() const LLVM_READONLY { + assert(is(tok::html_quoted_string)); + return StringRef(TextPtr1, TextLen1); + } + + void setHTMLQuotedString(StringRef Str) { + assert(is(tok::html_quoted_string)); + TextPtr1 = Str.data(); + TextLen1 = Str.size(); + } + + StringRef getHTMLTagCloseName() const LLVM_READONLY { + assert(is(tok::html_tag_close)); + return StringRef(TextPtr1, TextLen1); + } + + void setHTMLTagCloseName(StringRef Name) { + assert(is(tok::html_tag_close)); + TextPtr1 = Name.data(); + TextLen1 = Name.size(); + } + + void dump(const Lexer &L, const SourceManager &SM) const; +}; + +/// \brief Comment lexer. +class Lexer { +private: + Lexer(const Lexer&); // DO NOT IMPLEMENT + void operator=(const Lexer&); // DO NOT IMPLEMENT + + const char *const BufferStart; + const char *const BufferEnd; + SourceLocation FileLoc; + CommentOptions CommOpts; + + const char *BufferPtr; + + /// One past end pointer for the current comment. For BCPL comments points + /// to newline or BufferEnd, for C comments points to star in '*/'. + const char *CommentEnd; + + enum LexerCommentState { + LCS_BeforeComment, + LCS_InsideBCPLComment, + LCS_InsideCComment, + LCS_BetweenComments + }; + + /// Low-level lexer state, track if we are inside or outside of comment. + LexerCommentState CommentState; + + enum LexerState { + /// Lexing normal comment text + LS_Normal, + + /// Finished lexing verbatim block beginning command, will lex first body + /// line. + LS_VerbatimBlockFirstLine, + + /// Lexing verbatim block body line-by-line, skipping line-starting + /// decorations. + LS_VerbatimBlockBody, + + /// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes. + LS_HTMLOpenTag + }; + + /// Current lexing mode. + LexerState State; + + /// A verbatim-like block command eats every character (except line starting + /// decorations) until matching end command is seen or comment end is hit. + struct VerbatimBlockCommand { + StringRef BeginName; + StringRef EndName; + }; + + typedef SmallVector<VerbatimBlockCommand, 4> VerbatimBlockCommandVector; + + /// Registered verbatim-like block commands. + VerbatimBlockCommandVector VerbatimBlockCommands; + + /// If State is LS_VerbatimBlock, contains the the name of verbatim end + /// command, including command marker. + SmallString<16> VerbatimBlockEndCommandName; + + bool isVerbatimBlockCommand(StringRef BeginName, StringRef &EndName) const; + + /// A verbatim-like line command eats everything until a newline is seen or + /// comment end is hit. + struct VerbatimLineCommand { + StringRef Name; + }; + + typedef SmallVector<VerbatimLineCommand, 4> VerbatimLineCommandVector; + + /// Registered verbatim-like line commands. + VerbatimLineCommandVector VerbatimLineCommands; + + bool isVerbatimLineCommand(StringRef Name) const; + + void formTokenWithChars(Token &Result, const char *TokEnd, + tok::TokenKind Kind) { + const unsigned TokLen = TokEnd - BufferPtr; + Result.setLocation(getSourceLocation(BufferPtr)); + Result.setKind(Kind); + Result.setLength(TokLen); +#ifndef NDEBUG + Result.TextPtr1 = "<UNSET>"; + Result.TextLen1 = 7; + Result.TextPtr2 = "<UNSET>"; + Result.TextLen2 = 7; +#endif + BufferPtr = TokEnd; + } + + SourceLocation getSourceLocation(const char *Loc) const { + assert(Loc >= BufferStart && Loc <= BufferEnd && + "Location out of range for this buffer!"); + + const unsigned CharNo = Loc - BufferStart; + return FileLoc.getLocWithOffset(CharNo); + } + + /// Eat string matching regexp \code \s*\* \endcode. + void skipLineStartingDecorations(); + + /// Lex stuff inside comments. CommentEnd should be set correctly. + void lexCommentText(Token &T); + + void setupAndLexVerbatimBlock(Token &T, + const char *TextBegin, + char Marker, StringRef EndName); + + void lexVerbatimBlockFirstLine(Token &T); + + void lexVerbatimBlockBody(Token &T); + + void lexVerbatimLine(Token &T, const char *TextBegin); + + void setupAndLexHTMLOpenTag(Token &T); + + void lexHTMLOpenTag(Token &T); + + void lexHTMLCloseTag(Token &T); + +public: + Lexer(SourceLocation FileLoc, const CommentOptions &CommOpts, + const char *BufferStart, const char *BufferEnd); + + void lex(Token &T); + + StringRef getSpelling(const Token &Tok, + const SourceManager &SourceMgr, + bool *Invalid = NULL) const; + + /// \brief Register a new verbatim block command. + void addVerbatimBlockCommand(StringRef BeginName, StringRef EndName); + + /// \brief Register a new verbatim line command. + void addVerbatimLineCommand(StringRef Name); +}; + +} // end namespace comments +} // end namespace clang + +#endif + diff --git a/include/clang/AST/RawCommentList.h b/include/clang/AST/RawCommentList.h index 0965cb3a62..d670fd1428 100644 --- a/include/clang/AST/RawCommentList.h +++ b/include/clang/AST/RawCommentList.h @@ -15,6 +15,7 @@ namespace clang { +class ASTContext; class ASTReader; class RawComment { @@ -27,7 +28,7 @@ public: CK_BCPLExcl, ///< \code //! stuff \endcode CK_JavaDoc, ///< \code /** stuff */ \endcode CK_Qt, ///< \code /*! stuff */ \endcode, also used by HeaderDoc - CK_Merged ///< Two or more Doxygen comments merged together + CK_Merged ///< Two or more documentation comments merged together }; RawComment() : Kind(CK_Invalid), IsAlmostTrailingComment(false) { } @@ -53,7 +54,7 @@ public: /// \code /**< stuff */ \endcode /// \code /*!< stuff */ \endcode bool isTrailingComment() const LLVM_READONLY { - assert(isDoxygen()); + assert(isDocumentation()); return IsTrailingComment; } @@ -64,13 +65,13 @@ public: return IsAlmostTrailingComment; } - /// Returns true if this comment is not a Doxygen comment. + /// Returns true if this comment is not a documentation comment. bool isOrdinary() const LLVM_READONLY { return (Kind == CK_OrdinaryBCPL) || (Kind == CK_OrdinaryC); } - /// Returns true if this comment any kind of a Doxygen comment. - bool isDoxygen() const LLVM_READONLY { + /// Returns true if this comment any kind of a documentation comment. + bool isDocumentation() const LLVM_READONLY { return !isInvalid() && !isOrdinary(); } @@ -91,11 +92,21 @@ public: unsigned getBeginLine(const SourceManager &SM) const; unsigned getEndLine(const SourceManager &SM) const; + StringRef getBriefText(const ASTContext &Context) const { + if (BriefTextValid) + return BriefText; + + return extractBriefText(Context); + } + private: SourceRange Range; mutable StringRef RawText; - mutable bool RawTextValid : 1; ///< True if RawText is valid + mutable StringRef BriefText; + + mutable bool RawTextValid : 1; ///< True if RawText is valid + mutable bool BriefTextValid : 1; ///< True if BriefText is valid unsigned Kind : 3; @@ -118,6 +129,8 @@ private: StringRef getRawTextSlow(const SourceManager &SourceMgr) const; + StringRef extractBriefText(const ASTContext &Context) const; + friend class ASTReader; }; diff --git a/lib/AST/ASTContext.cpp b/lib/AST/ASTContext.cpp index 23751a56f2..ca631736d9 100644 --- a/lib/AST/ASTContext.cpp +++ b/lib/AST/ASTContext.cpp @@ -90,7 +90,7 @@ const RawComment *ASTContext::getRawCommentForDeclNoCache(const Decl *D) const { // First check whether we have a trailing comment. if (Comment != RawComments.end() && - Comment->isDoxygen() && Comment->isTrailingComment() && + Comment->isDocumentation() && Comment->isTrailingComment() && !isa<TagDecl>(D) && !isa<NamespaceDecl>(D)) { std::pair<FileID, unsigned> CommentBeginDecomp = SourceMgr.getDecomposedLoc(Comment->getSourceRange().getBegin()); @@ -111,7 +111,7 @@ const RawComment *ASTContext::getRawCommentForDeclNoCache(const Decl *D) const { --Comment; // Check that we actually have a non-member Doxygen comment. - if (!Comment->isDoxygen() || Comment->isTrailingComment()) + if (!Comment->isDocumentation() || Comment->isTrailingComment()) return NULL; // Decompose the end of the comment. diff --git a/lib/AST/CMakeLists.txt b/lib/AST/CMakeLists.txt index d8605367a7..5dad60c490 100644 --- a/lib/AST/CMakeLists.txt +++ b/lib/AST/CMakeLists.txt @@ -8,6 +8,8 @@ add_clang_library(clangAST ASTImporter.cpp AttrImpl.cpp CXXInheritance.cpp + CommentBriefParser.cpp + CommentLexer.cpp Decl.cpp DeclarationName.cpp DeclBase.cpp diff --git a/lib/AST/CommentBriefParser.cpp b/lib/AST/CommentBriefParser.cpp new file mode 100644 index 0000000000..528fd2606f --- /dev/null +++ b/lib/AST/CommentBriefParser.cpp @@ -0,0 +1,76 @@ +//===--- CommentBriefParser.cpp - Dumb comment parser ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "clang/AST/CommentBriefParser.h" + +namespace clang { +namespace comments { + +std::string BriefParser::Parse() { + std::string FirstParagraph; + std::string Brief; + bool InFirstParagraph = true; + bool InBrief = false; + bool BriefDone = false; + + while (Tok.isNot(tok::eof)) { + if (Tok.is(tok::text)) { + if (InFirstParagraph) + FirstParagraph += Tok.getText(); + if (InBrief) + Brief += Tok.getText(); + ConsumeToken(); + continue; + } + + if (!BriefDone && Tok.is(tok::command) && Tok.getCommandName() == "brief") { + InBrief = true; + ConsumeToken(); + continue; + } + + if (Tok.is(tok::newline)) { + if (InFirstParagraph) + FirstParagraph += '\n'; + if (InBrief) + Brief += '\n'; + ConsumeToken(); + + if (Tok.is(tok::newline)) { + ConsumeToken(); + // We found a paragraph end. + InFirstParagraph = false; + if (InBrief) { + InBrief = false; + BriefDone = true; + } + } + continue; + } + + // We didn't handle this token, so just drop it. + ConsumeToken(); + } + + if (Brief.size() > 0) + return Brief; + + return FirstParagraph; +} + +BriefParser::BriefParser(Lexer &L) : L(L) +{ + // Get lookahead token. + ConsumeToken(); +} + +} // end namespace comments +} // end namespace clang + + diff --git a/lib/AST/CommentLexer.cpp b/lib/AST/CommentLexer.cpp new file mode 100644 index 0000000000..e5529dad15 --- /dev/null +++ b/lib/AST/CommentLexer.cpp @@ -0,0 +1,676 @@ +#include "clang/AST/CommentLexer.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/Support/ErrorHandling.h" + +namespace clang { +namespace comments { + +void Token::dump(const Lexer &L, const SourceManager &SM) const { + llvm::errs() << "comments::Token Kind=" << Kind << " "; + Loc.dump(SM); + llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n"; +} + +bool Lexer::isVerbatimBlockCommand(StringRef BeginName, + StringRef &EndName) const { + const char *Result = llvm::StringSwitch<const char *>(BeginName) + .Case("code", "endcode") + .Case("verbatim", "endverbatim") + .Case("htmlonly", "endhtmlonly") + .Case("latexonly", "endlatexonly") + .Case("xmlonly", "endxmlonly") + .Case("manonly", "endmanonly") + .Case("rtfonly", "endrtfonly") + + .Case("dot", "enddot") + .Case("msc", "endmsc") + + .Case("f$", "f$") // Inline LaTeX formula + .Case("f[", "f]") // Displayed LaTeX formula + .Case("f{", "f}") // LaTeX environment + + .Default(NULL); + + if (Result) { + EndName = Result; + return true; + } + + for (VerbatimBlockCommandVector::const_iterator + I = VerbatimBlockCommands.begin(), + E = VerbatimBlockCommands.end(); + I != E; ++I) + if (I->BeginName == BeginName) { + EndName = I->EndName; + return true; + } + + return false; +} + +bool Lexer::isVerbatimLineCommand(StringRef Name) const { + bool Result = llvm::StringSwitch<bool>(Name) + .Case("fn", true) + .Case("var", true) + .Case("property", true) + .Case("typedef", true) + + .Case("overload", true) + + .Case("defgroup", true) + .Case("ingroup", true) + .Case("addtogroup", true) + .Case("weakgroup", true) + .Case("name", true) + + .Case("section", true) + .Case("subsection", true) + .Case("subsubsection", true) + .Case("paragraph", true) + + .Case("mainpage", true) + .Case("subpage", true) + .Case("ref", true) + + .Default(false); + + if (Result) + return true; + + for (VerbatimLineCommandVector::const_iterator + I = VerbatimLineCommands.begin(), + E = VerbatimLineCommands.end(); + I != E; ++I) + if (I->Name == Name) + return true; + + return false; +} + +void Lexer::skipLineStartingDecorations() { + // This function should be called only for C comments + assert(CommentState == LCS_InsideCComment); + + if (BufferPtr == CommentEnd) + return; + + switch (*BufferPtr) { + case ' ': + case '\t': + case '\f': + case '\v': { + const char *NewBufferPtr = BufferPtr; + NewBufferPtr++; + if (NewBufferPtr == CommentEnd) + return; + + char C = *NewBufferPtr; + while (C == ' ' || C == '\t' || C == '\f' || C == '\v') { + NewBufferPtr++; + if (NewBufferPtr == CommentEnd) + return; + C = *NewBufferPtr; + } + if (C == '*') + BufferPtr = NewBufferPtr + 1; + break; + } + case '*': + BufferPtr++; + break; + } +} + +namespace { +const char *findNewline(const char *BufferPtr, const char *BufferEnd) { + for ( ; BufferPtr != BufferEnd; ++BufferPtr) { + const char C = *BufferPtr; + if (C == '\n' || C == '\r') + return BufferPtr; + } + return BufferEnd; +} + +const char *skipNewline(const char *BufferPtr, const char *BufferEnd) { + if (BufferPtr == BufferEnd) + return BufferPtr; + + if (*BufferPtr == '\n') + BufferPtr++; + else { + assert(*BufferPtr == '\r'); + BufferPtr++; + if (BufferPtr != BufferEnd && *BufferPtr == '\n') + BufferPtr++; + } + return BufferPtr; +} + +bool isHTMLIdentifierCharacter(char C) { + return (C >= 'a' && C <= 'z') || + (C >= 'A' && C <= 'Z') || + (C >= '0' && C <= '9'); +} + +const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) { + for ( ; BufferPtr != BufferEnd; ++BufferPtr) { + if (!isHTMLIdentifierCharacter(*BufferPtr)) + return BufferPtr; + } + return BufferEnd; +} + +/// Skip HTML string quoted in single or double quotes. Escaping quotes inside +/// string allowed. +/// +/// Returns pointer to closing quote. +const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd) +{ + const char Quote = *BufferPtr; + assert(Quote == '\"' || Quote == '\''); + + BufferPtr++; + for ( ; BufferPtr != BufferEnd; ++BufferPtr) { + const char C = *BufferPtr; + if (C == Quote && BufferPtr[-1] != '\\') + return BufferPtr; + } + return BufferEnd; +} + +bool isHorizontalWhitespace(char C) { + return C == ' ' || C == '\t' || C == '\f' || C == '\v'; +} + +bool isWhitespace(char C) { + return C == ' ' || C == '\n' || C == '\r' || + C == '\t' || C == '\f' || C == '\v'; +} + +const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) { + for ( ; BufferPtr != BufferEnd; ++BufferPtr) { + if (!isWhitespace(*BufferPtr)) + return BufferPtr; + } + return BufferEnd; +} + +bool isCommandNameCharacter(char C) { + return (C >= 'a' && C <= 'z') || + (C >= 'A' && C <= 'Z') || + (C >= '0' && C <= '9'); +} + +const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) { + for ( ; BufferPtr != BufferEnd; ++BufferPtr) { + if (!isCommandNameCharacter(*BufferPtr)) + return BufferPtr; + } + return BufferEnd; +} + +/// Return the one past end pointer for BCPL comments. +/// Handles newlines escaped with backslash or trigraph for backslahs. +const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) { + const char *CurPtr = BufferPtr; + while (CurPtr != BufferEnd) { + char C = *CurPtr; + while (C != '\n' && C != '\r') { + CurPtr++; + if (CurPtr == BufferEnd) + return BufferEnd; + C = *CurPtr; + } + // We found a newline, check if it is escaped. + const char *EscapePtr = CurPtr - 1; + while(isHorizontalWhitespace(*EscapePtr)) + EscapePtr--; + + if (*EscapePtr == '\\' || + (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' && + EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) { + // We found an escaped newline. + CurPtr = skipNewline(CurPtr, BufferEnd); + } else + return CurPtr; // Not an escaped newline. + } + return BufferEnd; +} + +/// Return the one past end pointer for C comments. +/// Very dumb, does not handle escaped newlines or trigraphs. +const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) { + for ( ; BufferPtr != BufferEnd; ++BufferPtr) { + if (*BufferPtr == '*') { + assert(BufferPtr + 1 != BufferEnd); + if (*(BufferPtr + 1) == '/') + return BufferPtr; + } + } + llvm_unreachable("buffer end hit before '*/' was seen"); +} +} // unnamed namespace + +void Lexer::lexCommentText(Token &T) { + assert(CommentState == LCS_InsideBCPLComment || + CommentState == LCS_InsideCComment); + + switch (State) { + case LS_Normal: + break; + case LS_VerbatimBlockFirstLine: + lexVerbatimBlockFirstLine(T); + return; + case LS_VerbatimBlockBody: + lexVerbatimBlockBody(T); + return; + case LS_HTMLOpenTag: + lexHTMLOpenTag(T); + return; + } + + assert(State == LS_Normal); + + const char *TokenPtr = BufferPtr; + assert(TokenPtr < CommentEnd); + while (TokenPtr != CommentEnd) { + switch(*TokenPtr) { + case '\\': + case '@': { + TokenPtr++; + if (TokenPtr == CommentEnd) { + formTokenWithChars(T, TokenPtr, tok::text); + T.setText(StringRef(BufferPtr - T.getLength(), T.getLength())); + return; + } + char C = *TokenPtr; + switch (C) { + default: + break; + + case '\\': case '@': case '&': case '$': + case '#': case '<': case '>': case '%': + case '\"': case '.': case ':': + // This is one of \\ \@ \& \$ etc escape sequences. + TokenPtr++; + if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') { + // This is the \:: escape sequence. + TokenPtr++; + } + formTokenWithChars(T, TokenPtr, tok::text); + T.setText(StringRef(BufferPtr - (T.getLength() - 1), + T.getLength() - 1)); + return; + } + + // Don't make zero-length commands. + if (!isCommandNameCharacter(*TokenPtr)) { + formTokenWithChars(T, TokenPtr, tok::text); + T.setText(StringRef(BufferPtr - T.getLength(), T.getLength())); + return; + } + + TokenPtr = skipCommandName(TokenPtr, CommentEnd); + unsigned Length = TokenPtr - (BufferPtr + 1); + + // Hardcoded support for lexing LaTeX formula commands + // \f$ \f[ \f] \f{ \f} as a single command. + if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) { + C = *TokenPtr; + if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') { + TokenPtr++; + Length++; + } + } + + const StringRef CommandName(BufferPtr + 1, Length); + StringRef EndName; + + if (isVerbatimBlockCommand(CommandName, EndName)) { + setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, EndName); + return; + } + if (isVerbatimLineCommand(CommandName)) { + lexVerbatimLine(T, TokenPtr); + return; + } + formTokenWithChars(T, TokenPtr, tok::command); + T.setCommandName(CommandName); + return; + } + + case '<': { + TokenPtr++; + if (TokenPtr == CommentEnd) { + formTokenWithChars(T, TokenPtr, tok::text); + T.setText(StringRef(BufferPtr - T.getLength(), T.getLength())); + return; + } + const char C = *TokenPtr; + if (isHTMLIdentifierCharacter(C)) + setupAndLexHTMLOpenTag(T); + else if (C == '/') + lexHTMLCloseTag(T); + return; + } + + case '\n': + case '\r': + TokenPtr = skipNewline(TokenPtr, CommentEnd); + formTokenWithChars(T, TokenPtr, tok::newline); + + if (CommentState == LCS_InsideCComment) + skipLineStartingDecorations(); + return; + + default: { + while (true) { + TokenPtr++; + if (TokenPtr == CommentEnd) + break; + char C = *TokenPtr; + if(C == '\n' || C == '\r' || + C == '\\' || C == '@' || C == '<') + break; + } + formTokenWithChars(T, TokenPtr, tok::text); + T.setText(StringRef(BufferPtr - T.getLength(), T.getLength())); + return; + } + } + } +} + +void Lexer::setupAndLexVerbatimBlock(Token &T, + const char *TextBegin, + char Mark |