aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--include/clang-c/Index.h6
-rw-r--r--include/clang/AST/CommentBriefParser.h49
-rw-r--r--include/clang/AST/CommentLexer.h352
-rw-r--r--include/clang/AST/RawCommentList.h25
-rw-r--r--lib/AST/ASTContext.cpp4
-rw-r--r--lib/AST/CMakeLists.txt2
-rw-r--r--lib/AST/CommentBriefParser.cpp76
-rw-r--r--lib/AST/CommentLexer.cpp676
-rw-r--r--lib/AST/RawCommentList.cpp21
-rw-r--r--test/Index/annotate-comments.cpp39
-rw-r--r--tools/c-index-test/c-index-test.c50
-rw-r--r--tools/libclang/CIndex.cpp18
-rw-r--r--tools/libclang/libclang.exports1
-rw-r--r--unittests/AST/CMakeLists.txt7
-rw-r--r--unittests/AST/CommentLexer.cpp1010
-rw-r--r--unittests/AST/Makefile15
-rw-r--r--unittests/Makefile2
17 files changed, 2326 insertions, 27 deletions
diff --git a/include/clang-c/Index.h b/include/clang-c/Index.h
index b7bd8bb738..2397ae1925 100644
--- a/include/clang-c/Index.h
+++ b/include/clang-c/Index.h
@@ -3201,6 +3201,12 @@ CINDEX_LINKAGE CXSourceRange clang_Cursor_getCommentRange(CXCursor C);
CINDEX_LINKAGE CXString clang_Cursor_getRawCommentText(CXCursor C);
/**
+ * \brief Given a cursor that represents a declaration, return the associated
+ * \\brief paragraph; otherwise return the first paragraph.
+ */
+CINDEX_LINKAGE CXString clang_Cursor_getBriefCommentText(CXCursor C);
+
+/**
* @}
*/
diff --git a/include/clang/AST/CommentBriefParser.h b/include/clang/AST/CommentBriefParser.h
new file mode 100644
index 0000000000..e343b94643
--- /dev/null
+++ b/include/clang/AST/CommentBriefParser.h
@@ -0,0 +1,49 @@
+//===--- CommentBriefParser.h - Dumb comment parser -------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a very simple Doxygen comment parser.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef LLVM_CLANG_AST_BRIEF_COMMENT_PARSER_H
+#define LLVM_CLANG_AST_BRIEF_COMMENT_PARSER_H
+
+#include "clang/AST/CommentLexer.h"
+
+namespace clang {
+namespace comments {
+
+/// A very simple comment parser that extracts just the brief description or
+/// first paragraph.
+class BriefParser {
+ Lexer &L;
+
+ /// Current lookahead token.
+ Token Tok;
+
+ SourceLocation ConsumeToken() {
+ SourceLocation Loc = Tok.getLocation();
+ L.lex(Tok);
+ return Loc;
+ }
+
+public:
+ BriefParser(Lexer &L);
+
+ /// Return \\brief paragraph, if it exists; otherwise return the first
+ /// paragraph.
+ std::string Parse();
+};
+
+} // end namespace comments
+} // end namespace clang
+
+#endif
+
diff --git a/include/clang/AST/CommentLexer.h b/include/clang/AST/CommentLexer.h
new file mode 100644
index 0000000000..7f7ae62758
--- /dev/null
+++ b/include/clang/AST/CommentLexer.h
@@ -0,0 +1,352 @@
+//===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines lexer for structured comments and supporting token class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_AST_COMMENT_LEXER_H
+#define LLVM_CLANG_AST_COMMENT_LEXER_H
+
+#include "clang/Basic/SourceManager.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace clang {
+namespace comments {
+
+class Lexer;
+
+namespace tok {
+enum TokenKind {
+ eof,
+ newline,
+ text,
+ command,
+ verbatim_block_begin,
+ verbatim_block_line,
+ verbatim_block_end,
+ verbatim_line,
+ html_tag_open, // <tag
+ html_ident, // attr
+ html_equals, // =
+ html_quoted_string, // "blah\"blah" or 'blah\'blah'
+ html_greater, // >
+ html_tag_close, // </tag>
+
+ // Markdown tokens (not supported yet).
+ ruler,
+ md_code_line, // Line indented at least by 4 spaces.
+ md_code_inline, // `code`
+ md_emph, // _text_ or *text*
+ md_strong, // __text__ or *text*
+ md_header // ### level 3 header ###
+};
+} // end namespace tok
+
+class CommentOptions {
+public:
+ bool Markdown;
+};
+
+/// \brief Comment token.
+class Token {
+ friend class Lexer;
+
+ /// The location of the token.
+ SourceLocation Loc;
+
+ /// The actual kind of the token.
+ tok::TokenKind Kind;
+
+ /// Length of the token spelling in comment. Can be 0 for synthenized
+ /// tokens.
+ unsigned Length;
+
+ /// Contains text value associated with a token.
+ const char *TextPtr1;
+ unsigned TextLen1;
+
+ /// Contains text value associated with a token.
+ const char *TextPtr2;
+ unsigned TextLen2;
+
+public:
+ SourceLocation getLocation() const LLVM_READONLY { return Loc; }
+ void setLocation(SourceLocation SL) { Loc = SL; }
+
+ tok::TokenKind getKind() const LLVM_READONLY { return Kind; }
+ void setKind(tok::TokenKind K) { Kind = K; }
+
+ bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; }
+ bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; }
+
+ unsigned getLength() const LLVM_READONLY { return Length; }
+ void setLength(unsigned L) { Length = L; }
+
+ StringRef getText() const LLVM_READONLY {
+ assert(is(tok::text));
+ return StringRef(TextPtr1, TextLen1);
+ }
+
+ void setText(StringRef Text) {
+ assert(is(tok::text));
+ TextPtr1 = Text.data();
+ TextLen1 = Text.size();
+ }
+
+ StringRef getCommandName() const LLVM_READONLY {
+ assert(is(tok::command));
+ return StringRef(TextPtr1, TextLen1);
+ }
+
+ void setCommandName(StringRef Name) {
+ assert(is(tok::command));
+ TextPtr1 = Name.data();
+ TextLen1 = Name.size();
+ }
+
+ StringRef getVerbatimBlockName() const LLVM_READONLY {
+ assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
+ return StringRef(TextPtr1, TextLen1);
+ }
+
+ void setVerbatimBlockName(StringRef Name) {
+ assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
+ TextPtr1 = Name.data();
+ TextLen1 = Name.size();
+ }
+
+ StringRef getVerbatimBlockText() const LLVM_READONLY {
+ assert(is(tok::verbatim_block_line));
+ return StringRef(TextPtr1, TextLen1);
+ }
+
+ void setVerbatimBlockText(StringRef Text) {
+ assert(is(tok::verbatim_block_line));
+ TextPtr1 = Text.data();
+ TextLen1 = Text.size();
+ }
+
+ /// Returns the name of verbatim line command.
+ StringRef getVerbatimLineName() const LLVM_READONLY {
+ assert(is(tok::verbatim_line));
+ return StringRef(TextPtr1, TextLen1);
+ }
+
+ void setVerbatimLineName(StringRef Name) {
+ assert(is(tok::verbatim_line));
+ TextPtr1 = Name.data();
+ TextLen1 = Name.size();
+ }
+
+ StringRef getVerbatimLineText() const LLVM_READONLY {
+ assert(is(tok::verbatim_line));
+ return StringRef(TextPtr2, TextLen2);
+ }
+
+ void setVerbatimLineText(StringRef Text) {
+ assert(is(tok::verbatim_line));
+ TextPtr2 = Text.data();
+ TextLen2 = Text.size();
+ }
+
+ StringRef getHTMLTagOpenName() const LLVM_READONLY {
+ assert(is(tok::html_tag_open));
+ return StringRef(TextPtr1, TextLen1);
+ }
+
+ void setHTMLTagOpenName(StringRef Name) {
+ assert(is(tok::html_tag_open));
+ TextPtr1 = Name.data();
+ TextLen1 = Name.size();
+ }
+
+ StringRef getHTMLIdent() const LLVM_READONLY {
+ assert(is(tok::html_ident));
+ return StringRef(TextPtr1, TextLen1);
+ }
+
+ void setHTMLIdent(StringRef Name) {
+ assert(is(tok::html_ident));
+ TextPtr1 = Name.data();
+ TextLen1 = Name.size();
+ }
+
+ StringRef getHTMLQuotedString() const LLVM_READONLY {
+ assert(is(tok::html_quoted_string));
+ return StringRef(TextPtr1, TextLen1);
+ }
+
+ void setHTMLQuotedString(StringRef Str) {
+ assert(is(tok::html_quoted_string));
+ TextPtr1 = Str.data();
+ TextLen1 = Str.size();
+ }
+
+ StringRef getHTMLTagCloseName() const LLVM_READONLY {
+ assert(is(tok::html_tag_close));
+ return StringRef(TextPtr1, TextLen1);
+ }
+
+ void setHTMLTagCloseName(StringRef Name) {
+ assert(is(tok::html_tag_close));
+ TextPtr1 = Name.data();
+ TextLen1 = Name.size();
+ }
+
+ void dump(const Lexer &L, const SourceManager &SM) const;
+};
+
+/// \brief Comment lexer.
+class Lexer {
+private:
+ Lexer(const Lexer&); // DO NOT IMPLEMENT
+ void operator=(const Lexer&); // DO NOT IMPLEMENT
+
+ const char *const BufferStart;
+ const char *const BufferEnd;
+ SourceLocation FileLoc;
+ CommentOptions CommOpts;
+
+ const char *BufferPtr;
+
+ /// One past end pointer for the current comment. For BCPL comments points
+ /// to newline or BufferEnd, for C comments points to star in '*/'.
+ const char *CommentEnd;
+
+ enum LexerCommentState {
+ LCS_BeforeComment,
+ LCS_InsideBCPLComment,
+ LCS_InsideCComment,
+ LCS_BetweenComments
+ };
+
+ /// Low-level lexer state, track if we are inside or outside of comment.
+ LexerCommentState CommentState;
+
+ enum LexerState {
+ /// Lexing normal comment text
+ LS_Normal,
+
+ /// Finished lexing verbatim block beginning command, will lex first body
+ /// line.
+ LS_VerbatimBlockFirstLine,
+
+ /// Lexing verbatim block body line-by-line, skipping line-starting
+ /// decorations.
+ LS_VerbatimBlockBody,
+
+ /// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes.
+ LS_HTMLOpenTag
+ };
+
+ /// Current lexing mode.
+ LexerState State;
+
+ /// A verbatim-like block command eats every character (except line starting
+ /// decorations) until matching end command is seen or comment end is hit.
+ struct VerbatimBlockCommand {
+ StringRef BeginName;
+ StringRef EndName;
+ };
+
+ typedef SmallVector<VerbatimBlockCommand, 4> VerbatimBlockCommandVector;
+
+ /// Registered verbatim-like block commands.
+ VerbatimBlockCommandVector VerbatimBlockCommands;
+
+ /// If State is LS_VerbatimBlock, contains the the name of verbatim end
+ /// command, including command marker.
+ SmallString<16> VerbatimBlockEndCommandName;
+
+ bool isVerbatimBlockCommand(StringRef BeginName, StringRef &EndName) const;
+
+ /// A verbatim-like line command eats everything until a newline is seen or
+ /// comment end is hit.
+ struct VerbatimLineCommand {
+ StringRef Name;
+ };
+
+ typedef SmallVector<VerbatimLineCommand, 4> VerbatimLineCommandVector;
+
+ /// Registered verbatim-like line commands.
+ VerbatimLineCommandVector VerbatimLineCommands;
+
+ bool isVerbatimLineCommand(StringRef Name) const;
+
+ void formTokenWithChars(Token &Result, const char *TokEnd,
+ tok::TokenKind Kind) {
+ const unsigned TokLen = TokEnd - BufferPtr;
+ Result.setLocation(getSourceLocation(BufferPtr));
+ Result.setKind(Kind);
+ Result.setLength(TokLen);
+#ifndef NDEBUG
+ Result.TextPtr1 = "<UNSET>";
+ Result.TextLen1 = 7;
+ Result.TextPtr2 = "<UNSET>";
+ Result.TextLen2 = 7;
+#endif
+ BufferPtr = TokEnd;
+ }
+
+ SourceLocation getSourceLocation(const char *Loc) const {
+ assert(Loc >= BufferStart && Loc <= BufferEnd &&
+ "Location out of range for this buffer!");
+
+ const unsigned CharNo = Loc - BufferStart;
+ return FileLoc.getLocWithOffset(CharNo);
+ }
+
+ /// Eat string matching regexp \code \s*\* \endcode.
+ void skipLineStartingDecorations();
+
+ /// Lex stuff inside comments. CommentEnd should be set correctly.
+ void lexCommentText(Token &T);
+
+ void setupAndLexVerbatimBlock(Token &T,
+ const char *TextBegin,
+ char Marker, StringRef EndName);
+
+ void lexVerbatimBlockFirstLine(Token &T);
+
+ void lexVerbatimBlockBody(Token &T);
+
+ void lexVerbatimLine(Token &T, const char *TextBegin);
+
+ void setupAndLexHTMLOpenTag(Token &T);
+
+ void lexHTMLOpenTag(Token &T);
+
+ void lexHTMLCloseTag(Token &T);
+
+public:
+ Lexer(SourceLocation FileLoc, const CommentOptions &CommOpts,
+ const char *BufferStart, const char *BufferEnd);
+
+ void lex(Token &T);
+
+ StringRef getSpelling(const Token &Tok,
+ const SourceManager &SourceMgr,
+ bool *Invalid = NULL) const;
+
+ /// \brief Register a new verbatim block command.
+ void addVerbatimBlockCommand(StringRef BeginName, StringRef EndName);
+
+ /// \brief Register a new verbatim line command.
+ void addVerbatimLineCommand(StringRef Name);
+};
+
+} // end namespace comments
+} // end namespace clang
+
+#endif
+
diff --git a/include/clang/AST/RawCommentList.h b/include/clang/AST/RawCommentList.h
index 0965cb3a62..d670fd1428 100644
--- a/include/clang/AST/RawCommentList.h
+++ b/include/clang/AST/RawCommentList.h
@@ -15,6 +15,7 @@
namespace clang {
+class ASTContext;
class ASTReader;
class RawComment {
@@ -27,7 +28,7 @@ public:
CK_BCPLExcl, ///< \code //! stuff \endcode
CK_JavaDoc, ///< \code /** stuff */ \endcode
CK_Qt, ///< \code /*! stuff */ \endcode, also used by HeaderDoc
- CK_Merged ///< Two or more Doxygen comments merged together
+ CK_Merged ///< Two or more documentation comments merged together
};
RawComment() : Kind(CK_Invalid), IsAlmostTrailingComment(false) { }
@@ -53,7 +54,7 @@ public:
/// \code /**< stuff */ \endcode
/// \code /*!< stuff */ \endcode
bool isTrailingComment() const LLVM_READONLY {
- assert(isDoxygen());
+ assert(isDocumentation());
return IsTrailingComment;
}
@@ -64,13 +65,13 @@ public:
return IsAlmostTrailingComment;
}
- /// Returns true if this comment is not a Doxygen comment.
+ /// Returns true if this comment is not a documentation comment.
bool isOrdinary() const LLVM_READONLY {
return (Kind == CK_OrdinaryBCPL) || (Kind == CK_OrdinaryC);
}
- /// Returns true if this comment any kind of a Doxygen comment.
- bool isDoxygen() const LLVM_READONLY {
+ /// Returns true if this comment any kind of a documentation comment.
+ bool isDocumentation() const LLVM_READONLY {
return !isInvalid() && !isOrdinary();
}
@@ -91,11 +92,21 @@ public:
unsigned getBeginLine(const SourceManager &SM) const;
unsigned getEndLine(const SourceManager &SM) const;
+ StringRef getBriefText(const ASTContext &Context) const {
+ if (BriefTextValid)
+ return BriefText;
+
+ return extractBriefText(Context);
+ }
+
private:
SourceRange Range;
mutable StringRef RawText;
- mutable bool RawTextValid : 1; ///< True if RawText is valid
+ mutable StringRef BriefText;
+
+ mutable bool RawTextValid : 1; ///< True if RawText is valid
+ mutable bool BriefTextValid : 1; ///< True if BriefText is valid
unsigned Kind : 3;
@@ -118,6 +129,8 @@ private:
StringRef getRawTextSlow(const SourceManager &SourceMgr) const;
+ StringRef extractBriefText(const ASTContext &Context) const;
+
friend class ASTReader;
};
diff --git a/lib/AST/ASTContext.cpp b/lib/AST/ASTContext.cpp
index 23751a56f2..ca631736d9 100644
--- a/lib/AST/ASTContext.cpp
+++ b/lib/AST/ASTContext.cpp
@@ -90,7 +90,7 @@ const RawComment *ASTContext::getRawCommentForDeclNoCache(const Decl *D) const {
// First check whether we have a trailing comment.
if (Comment != RawComments.end() &&
- Comment->isDoxygen() && Comment->isTrailingComment() &&
+ Comment->isDocumentation() && Comment->isTrailingComment() &&
!isa<TagDecl>(D) && !isa<NamespaceDecl>(D)) {
std::pair<FileID, unsigned> CommentBeginDecomp
= SourceMgr.getDecomposedLoc(Comment->getSourceRange().getBegin());
@@ -111,7 +111,7 @@ const RawComment *ASTContext::getRawCommentForDeclNoCache(const Decl *D) const {
--Comment;
// Check that we actually have a non-member Doxygen comment.
- if (!Comment->isDoxygen() || Comment->isTrailingComment())
+ if (!Comment->isDocumentation() || Comment->isTrailingComment())
return NULL;
// Decompose the end of the comment.
diff --git a/lib/AST/CMakeLists.txt b/lib/AST/CMakeLists.txt
index d8605367a7..5dad60c490 100644
--- a/lib/AST/CMakeLists.txt
+++ b/lib/AST/CMakeLists.txt
@@ -8,6 +8,8 @@ add_clang_library(clangAST
ASTImporter.cpp
AttrImpl.cpp
CXXInheritance.cpp
+ CommentBriefParser.cpp
+ CommentLexer.cpp
Decl.cpp
DeclarationName.cpp
DeclBase.cpp
diff --git a/lib/AST/CommentBriefParser.cpp b/lib/AST/CommentBriefParser.cpp
new file mode 100644
index 0000000000..528fd2606f
--- /dev/null
+++ b/lib/AST/CommentBriefParser.cpp
@@ -0,0 +1,76 @@
+//===--- CommentBriefParser.cpp - Dumb comment parser ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/AST/CommentBriefParser.h"
+
+namespace clang {
+namespace comments {
+
+std::string BriefParser::Parse() {
+ std::string FirstParagraph;
+ std::string Brief;
+ bool InFirstParagraph = true;
+ bool InBrief = false;
+ bool BriefDone = false;
+
+ while (Tok.isNot(tok::eof)) {
+ if (Tok.is(tok::text)) {
+ if (InFirstParagraph)
+ FirstParagraph += Tok.getText();
+ if (InBrief)
+ Brief += Tok.getText();
+ ConsumeToken();
+ continue;
+ }
+
+ if (!BriefDone && Tok.is(tok::command) && Tok.getCommandName() == "brief") {
+ InBrief = true;
+ ConsumeToken();
+ continue;
+ }
+
+ if (Tok.is(tok::newline)) {
+ if (InFirstParagraph)
+ FirstParagraph += '\n';
+ if (InBrief)
+ Brief += '\n';
+ ConsumeToken();
+
+ if (Tok.is(tok::newline)) {
+ ConsumeToken();
+ // We found a paragraph end.
+ InFirstParagraph = false;
+ if (InBrief) {
+ InBrief = false;
+ BriefDone = true;
+ }
+ }
+ continue;
+ }
+
+ // We didn't handle this token, so just drop it.
+ ConsumeToken();
+ }
+
+ if (Brief.size() > 0)
+ return Brief;
+
+ return FirstParagraph;
+}
+
+BriefParser::BriefParser(Lexer &L) : L(L)
+{
+ // Get lookahead token.
+ ConsumeToken();
+}
+
+} // end namespace comments
+} // end namespace clang
+
+
diff --git a/lib/AST/CommentLexer.cpp b/lib/AST/CommentLexer.cpp
new file mode 100644
index 0000000000..e5529dad15
--- /dev/null
+++ b/lib/AST/CommentLexer.cpp
@@ -0,0 +1,676 @@
+#include "clang/AST/CommentLexer.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Support/ErrorHandling.h"
+
+namespace clang {
+namespace comments {
+
+void Token::dump(const Lexer &L, const SourceManager &SM) const {
+ llvm::errs() << "comments::Token Kind=" << Kind << " ";
+ Loc.dump(SM);
+ llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
+}
+
+bool Lexer::isVerbatimBlockCommand(StringRef BeginName,
+ StringRef &EndName) const {
+ const char *Result = llvm::StringSwitch<const char *>(BeginName)
+ .Case("code", "endcode")
+ .Case("verbatim", "endverbatim")
+ .Case("htmlonly", "endhtmlonly")
+ .Case("latexonly", "endlatexonly")
+ .Case("xmlonly", "endxmlonly")
+ .Case("manonly", "endmanonly")
+ .Case("rtfonly", "endrtfonly")
+
+ .Case("dot", "enddot")
+ .Case("msc", "endmsc")
+
+ .Case("f$", "f$") // Inline LaTeX formula
+ .Case("f[", "f]") // Displayed LaTeX formula
+ .Case("f{", "f}") // LaTeX environment
+
+ .Default(NULL);
+
+ if (Result) {
+ EndName = Result;
+ return true;
+ }
+
+ for (VerbatimBlockCommandVector::const_iterator
+ I = VerbatimBlockCommands.begin(),
+ E = VerbatimBlockCommands.end();
+ I != E; ++I)
+ if (I->BeginName == BeginName) {
+ EndName = I->EndName;
+ return true;
+ }
+
+ return false;
+}
+
+bool Lexer::isVerbatimLineCommand(StringRef Name) const {
+ bool Result = llvm::StringSwitch<bool>(Name)
+ .Case("fn", true)
+ .Case("var", true)
+ .Case("property", true)
+ .Case("typedef", true)
+
+ .Case("overload", true)
+
+ .Case("defgroup", true)
+ .Case("ingroup", true)
+ .Case("addtogroup", true)
+ .Case("weakgroup", true)
+ .Case("name", true)
+
+ .Case("section", true)
+ .Case("subsection", true)
+ .Case("subsubsection", true)
+ .Case("paragraph", true)
+
+ .Case("mainpage", true)
+ .Case("subpage", true)
+ .Case("ref", true)
+
+ .Default(false);
+
+ if (Result)
+ return true;
+
+ for (VerbatimLineCommandVector::const_iterator
+ I = VerbatimLineCommands.begin(),
+ E = VerbatimLineCommands.end();
+ I != E; ++I)
+ if (I->Name == Name)
+ return true;
+
+ return false;
+}
+
+void Lexer::skipLineStartingDecorations() {
+ // This function should be called only for C comments
+ assert(CommentState == LCS_InsideCComment);
+
+ if (BufferPtr == CommentEnd)
+ return;
+
+ switch (*BufferPtr) {
+ case ' ':
+ case '\t':
+ case '\f':
+ case '\v': {
+ const char *NewBufferPtr = BufferPtr;
+ NewBufferPtr++;
+ if (NewBufferPtr == CommentEnd)
+ return;
+
+ char C = *NewBufferPtr;
+ while (C == ' ' || C == '\t' || C == '\f' || C == '\v') {
+ NewBufferPtr++;
+ if (NewBufferPtr == CommentEnd)
+ return;
+ C = *NewBufferPtr;
+ }
+ if (C == '*')
+ BufferPtr = NewBufferPtr + 1;
+ break;
+ }
+ case '*':
+ BufferPtr++;
+ break;
+ }
+}
+
+namespace {
+const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
+ for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
+ const char C = *BufferPtr;
+ if (C == '\n' || C == '\r')
+ return BufferPtr;
+ }
+ return BufferEnd;
+}
+
+const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
+ if (BufferPtr == BufferEnd)
+ return BufferPtr;
+
+ if (*BufferPtr == '\n')
+ BufferPtr++;
+ else {
+ assert(*BufferPtr == '\r');
+ BufferPtr++;
+ if (BufferPtr != BufferEnd && *BufferPtr == '\n')
+ BufferPtr++;
+ }
+ return BufferPtr;
+}
+
+bool isHTMLIdentifierCharacter(char C) {
+ return (C >= 'a' && C <= 'z') ||
+ (C >= 'A' && C <= 'Z') ||
+ (C >= '0' && C <= '9');
+}
+
+const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
+ for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
+ if (!isHTMLIdentifierCharacter(*BufferPtr))
+ return BufferPtr;
+ }
+ return BufferEnd;
+}
+
+/// Skip HTML string quoted in single or double quotes. Escaping quotes inside
+/// string allowed.
+///
+/// Returns pointer to closing quote.
+const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
+{
+ const char Quote = *BufferPtr;
+ assert(Quote == '\"' || Quote == '\'');
+
+ BufferPtr++;
+ for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
+ const char C = *BufferPtr;
+ if (C == Quote && BufferPtr[-1] != '\\')
+ return BufferPtr;
+ }
+ return BufferEnd;
+}
+
+bool isHorizontalWhitespace(char C) {
+ return C == ' ' || C == '\t' || C == '\f' || C == '\v';
+}
+
+bool isWhitespace(char C) {
+ return C == ' ' || C == '\n' || C == '\r' ||
+ C == '\t' || C == '\f' || C == '\v';
+}
+
+const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
+ for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
+ if (!isWhitespace(*BufferPtr))
+ return BufferPtr;
+ }
+ return BufferEnd;
+}
+
+bool isCommandNameCharacter(char C) {
+ return (C >= 'a' && C <= 'z') ||
+ (C >= 'A' && C <= 'Z') ||
+ (C >= '0' && C <= '9');
+}
+
+const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
+ for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
+ if (!isCommandNameCharacter(*BufferPtr))
+ return BufferPtr;
+ }
+ return BufferEnd;
+}
+
+/// Return the one past end pointer for BCPL comments.
+/// Handles newlines escaped with backslash or trigraph for backslahs.
+const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
+ const char *CurPtr = BufferPtr;
+ while (CurPtr != BufferEnd) {
+ char C = *CurPtr;
+ while (C != '\n' && C != '\r') {
+ CurPtr++;
+ if (CurPtr == BufferEnd)
+ return BufferEnd;
+ C = *CurPtr;
+ }
+ // We found a newline, check if it is escaped.
+ const char *EscapePtr = CurPtr - 1;
+ while(isHorizontalWhitespace(*EscapePtr))
+ EscapePtr--;
+
+ if (*EscapePtr == '\\' ||
+ (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
+ EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
+ // We found an escaped newline.
+ CurPtr = skipNewline(CurPtr, BufferEnd);
+ } else
+ return CurPtr; // Not an escaped newline.
+ }
+ return BufferEnd;
+}
+
+/// Return the one past end pointer for C comments.
+/// Very dumb, does not handle escaped newlines or trigraphs.
+const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
+ for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
+ if (*BufferPtr == '*') {
+ assert(BufferPtr + 1 != BufferEnd);
+ if (*(BufferPtr + 1) == '/')
+ return BufferPtr;
+ }
+ }
+ llvm_unreachable("buffer end hit before '*/' was seen");
+}
+} // unnamed namespace
+
+void Lexer::lexCommentText(Token &T) {
+ assert(CommentState == LCS_InsideBCPLComment ||
+ CommentState == LCS_InsideCComment);
+
+ switch (State) {
+ case LS_Normal:
+ break;
+ case LS_VerbatimBlockFirstLine:
+ lexVerbatimBlockFirstLine(T);
+ return;
+ case LS_VerbatimBlockBody:
+ lexVerbatimBlockBody(T);
+ return;
+ case LS_HTMLOpenTag:
+ lexHTMLOpenTag(T);
+ return;
+ }
+
+ assert(State == LS_Normal);
+
+ const char *TokenPtr = BufferPtr;
+ assert(TokenPtr < CommentEnd);
+ while (TokenPtr != CommentEnd) {
+ switch(*TokenPtr) {
+ case '\\':
+ case '@': {
+ TokenPtr++;
+ if (TokenPtr == CommentEnd) {
+ formTokenWithChars(T, TokenPtr, tok::text);
+ T.setText(StringRef(BufferPtr - T.getLength(), T.getLength()));
+ return;
+ }
+ char C = *TokenPtr;
+ switch (C) {
+ default:
+ break;
+
+ case '\\': case '@': case '&': case '$':
+ case '#': case '<': case '>': case '%':
+ case '\"': case '.': case ':':
+ // This is one of \\ \@ \& \$ etc escape sequences.
+ TokenPtr++;
+ if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
+ // This is the \:: escape sequence.
+ TokenPtr++;
+ }
+ formTokenWithChars(T, TokenPtr, tok::text);
+ T.setText(StringRef(BufferPtr - (T.getLength() - 1),
+ T.getLength() - 1));
+ return;
+ }
+
+ // Don't make zero-length commands.
+ if (!isCommandNameCharacter(*TokenPtr)) {
+ formTokenWithChars(T, TokenPtr, tok::text);
+ T.setText(StringRef(BufferPtr - T.getLength(), T.getLength()));
+ return;
+ }
+
+ TokenPtr = skipCommandName(TokenPtr, CommentEnd);
+ unsigned Length = TokenPtr - (BufferPtr + 1);
+
+ // Hardcoded support for lexing LaTeX formula commands
+ // \f$ \f[ \f] \f{ \f} as a single command.
+ if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
+ C = *TokenPtr;
+ if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
+ TokenPtr++;
+ Length++;
+ }
+ }
+
+ const StringRef CommandName(BufferPtr + 1, Length);
+ StringRef EndName;
+
+ if (isVerbatimBlockCommand(CommandName, EndName)) {
+ setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, EndName);
+ return;
+ }
+ if (isVerbatimLineCommand(CommandName)) {
+ lexVerbatimLine(T, TokenPtr);
+ return;
+ }
+ formTokenWithChars(T, TokenPtr, tok::command);
+ T.setCommandName(CommandName);
+ return;
+ }
+
+ case '<': {
+ TokenPtr++;
+ if (TokenPtr == CommentEnd) {
+ formTokenWithChars(T, TokenPtr, tok::text);
+ T.setText(StringRef(BufferPtr - T.getLength(), T.getLength()));
+ return;
+ }
+ const char C = *TokenPtr;
+ if (isHTMLIdentifierCharacter(C))
+ setupAndLexHTMLOpenTag(T);
+ else if (C == '/')
+ lexHTMLCloseTag(T);
+ return;
+ }
+
+ case '\n':
+ case '\r':
+ TokenPtr = skipNewline(TokenPtr, CommentEnd);
+ formTokenWithChars(T, TokenPtr, tok::newline);
+
+ if (CommentState == LCS_InsideCComment)
+ skipLineStartingDecorations();
+ return;
+
+ default: {
+ while (true) {
+ TokenPtr++;
+ if (TokenPtr == CommentEnd)
+ break;
+ char C = *TokenPtr;
+ if(C == '\n' || C == '\r' ||
+ C == '\\' || C == '@' || C == '<')
+ break;
+ }
+ formTokenWithChars(T, TokenPtr, tok::text);
+ T.setText(StringRef(BufferPtr - T.getLength(), T.getLength()));
+ return;
+ }
+ }
+ }
+}
+
+void Lexer::setupAndLexVerbatimBlock(Token &T,
+ const char *TextBegin,
+ char Marker, StringRef EndName) {
+ VerbatimBlockEndCommandName.clear();
+ VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
+ VerbatimBlockEndCommandName.append(EndName);
+
+ formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
+ T.setVerbatimBlockName(StringRef(TextBegin - (T.getLength() - 1),
+ T.getLength() - 1));
+
+ State = LS_VerbatimBlockFirstLine;
+}
+
+void Lexer::lexVerbatimBlockFirstLine(Token &T) {
+ assert(BufferPtr < CommentEnd);
+
+ // FIXME: It would be better to scan the text once, finding either the block
+ // end command or newline.
+ //
+ // Extract current line.
+ const char *Newline = findNewline(BufferPtr, CommentEnd);
+ StringRef Line(BufferPtr, Newline - BufferPtr);
+
+ // Look for end command in current line.
+ size_t Pos