1 files changed, 352 insertions, 0 deletions
diff --git a/include/clang/AST/CommentLexer.h b/include/clang/AST/CommentLexer.h
new file mode 100644
index 0000000000..7f7ae62758
--- /dev/null
+++ b/include/clang/AST/CommentLexer.h
@@ -0,0 +1,352 @@
+//===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//  This file defines the lexer for structured comments and its token class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_AST_COMMENT_LEXER_H
+#define LLVM_CLANG_AST_COMMENT_LEXER_H
+
+#include "clang/Basic/SourceManager.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace clang {
+namespace comments {
+
+class Lexer;
+
+namespace tok {
+enum TokenKind { // Kinds of comment tokens; stored in Token::Kind.
+  eof,
+  newline,
+  text,                 // payload: text
+  command,              // payload: command name
+  verbatim_block_begin, // payload: verbatim block name
+  verbatim_block_line,  // payload: verbatim block text
+  verbatim_block_end,   // payload: verbatim block name
+  verbatim_line,        // payload: verbatim line name and verbatim line text
+  html_tag_open,        // <tag       (payload: tag name)
+  html_ident,           // attr       (payload: identifier)
+  html_equals,          // =
+  html_quoted_string,   // "blah\"blah" or 'blah\'blah'
+  html_greater,         // >
+  html_tag_close,       // </tag>     (payload: tag name)
+
+  // Markdown tokens (not supported yet).
+  ruler,
+  md_code_line,   // Line indented at least by 4 spaces.
+  md_code_inline, // `code`
+  md_emph,        // _text_ or *text*
+  md_strong,      // __text__ or **text**
+  md_header       // ### level 3 header ###
+};
+} // end namespace tok
+
+class CommentOptions { // Options passed to the comment Lexer constructor.
+public:
+  bool Markdown; // Not initialized by this class; callers must set it.
+};
+
+/// \brief Comment token.
+class Token {
+  friend class Lexer;
+
+  /// The location of the token.
+  SourceLocation Loc;
+
+  /// The actual kind of the token.
+  tok::TokenKind Kind;
+
+  /// Length of the token spelling in comment.  Can be 0 for synthesized
+  /// tokens.
+  unsigned Length;
+
+  /// Primary text payload; which accessor pair applies depends on Kind.
+  const char *TextPtr1;
+  unsigned TextLen1;
+
+  /// Secondary text payload; holds the text of a verbatim line token.
+  const char *TextPtr2;
+  unsigned TextLen2;
+
+  /// Views the primary payload as a string.
+  StringRef primaryText() const { return StringRef(TextPtr1, TextLen1); }
+
+  /// Points the primary payload at \p Str; no characters are copied.
+  void setPrimaryText(StringRef Str) {
+    TextPtr1 = Str.data();
+    TextLen1 = Str.size();
+  }
+
+public:
+  SourceLocation getLocation() const LLVM_READONLY { return Loc; }
+  void setLocation(SourceLocation SL) { Loc = SL; }
+
+  tok::TokenKind getKind() const LLVM_READONLY { return Kind; }
+  void setKind(tok::TokenKind K) { Kind = K; }
+
+  bool is(tok::TokenKind K) const LLVM_READONLY { return K == Kind; }
+  bool isNot(tok::TokenKind K) const LLVM_READONLY { return !is(K); }
+
+  unsigned getLength() const LLVM_READONLY { return Length; }
+  void setLength(unsigned L) { Length = L; }
+
+  StringRef getText() const LLVM_READONLY {
+    assert(is(tok::text));
+    return primaryText();
+  }
+
+  void setText(StringRef Text) {
+    assert(is(tok::text));
+    setPrimaryText(Text);
+  }
+
+  StringRef getCommandName() const LLVM_READONLY {
+    assert(is(tok::command));
+    return primaryText();
+  }
+
+  void setCommandName(StringRef Name) {
+    assert(is(tok::command));
+    setPrimaryText(Name);
+  }
+
+  StringRef getVerbatimBlockName() const LLVM_READONLY {
+    assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
+    return primaryText();
+  }
+
+  void setVerbatimBlockName(StringRef Name) {
+    assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
+    setPrimaryText(Name);
+  }
+
+  StringRef getVerbatimBlockText() const LLVM_READONLY {
+    assert(is(tok::verbatim_block_line));
+    return primaryText();
+  }
+
+  void setVerbatimBlockText(StringRef Text) {
+    assert(is(tok::verbatim_block_line));
+    setPrimaryText(Text);
+  }
+
+  /// Returns the name of the verbatim line command.
+  StringRef getVerbatimLineName() const LLVM_READONLY {
+    assert(is(tok::verbatim_line));
+    return primaryText();
+  }
+
+  void setVerbatimLineName(StringRef Name) {
+    assert(is(tok::verbatim_line));
+    setPrimaryText(Name);
+  }
+
+  StringRef getVerbatimLineText() const LLVM_READONLY {
+    assert(is(tok::verbatim_line));
+    return StringRef(TextPtr2, TextLen2);
+  }
+
+  void setVerbatimLineText(StringRef Text) {
+    assert(is(tok::verbatim_line));
+    TextPtr2 = Text.data();
+    TextLen2 = Text.size();
+  }
+
+  StringRef getHTMLTagOpenName() const LLVM_READONLY {
+    assert(is(tok::html_tag_open));
+    return primaryText();
+  }
+
+  void setHTMLTagOpenName(StringRef Name) {
+    assert(is(tok::html_tag_open));
+    setPrimaryText(Name);
+  }
+
+  StringRef getHTMLIdent() const LLVM_READONLY {
+    assert(is(tok::html_ident));
+    return primaryText();
+  }
+
+  void setHTMLIdent(StringRef Name) {
+    assert(is(tok::html_ident));
+    setPrimaryText(Name);
+  }
+
+  StringRef getHTMLQuotedString() const LLVM_READONLY {
+    assert(is(tok::html_quoted_string));
+    return primaryText();
+  }
+
+  void setHTMLQuotedString(StringRef Str) {
+    assert(is(tok::html_quoted_string));
+    setPrimaryText(Str);
+  }
+
+  StringRef getHTMLTagCloseName() const LLVM_READONLY {
+    assert(is(tok::html_tag_close));
+    return primaryText();
+  }
+
+  void setHTMLTagCloseName(StringRef Name) {
+    assert(is(tok::html_tag_close));
+    setPrimaryText(Name);
+  }
+
+  void dump(const Lexer &L, const SourceManager &SM) const;
+};
+
+/// \brief Comment lexer.
+class Lexer {
+private:
+  Lexer(const Lexer&);          // Copying is disabled: never defined.
+  void operator=(const Lexer&); // Assignment is disabled: never defined.
+
+  const char *const BufferStart;
+  const char *const BufferEnd;
+  SourceLocation FileLoc; // Source location corresponding to BufferStart.
+  CommentOptions CommOpts;
+
+  const char *BufferPtr; // Where the next token starts.
+
+  /// End of the current comment (one past its last character): the newline or
+  /// BufferEnd for BCPL comments, the star of the closing '*/' for C comments.
+  const char *CommentEnd;
+
+  enum LexerCommentState {
+    LCS_BeforeComment,
+    LCS_InsideBCPLComment,
+    LCS_InsideCComment,
+    LCS_BetweenComments
+  };
+
+  /// Low-level state: tracks whether the lexer is inside or outside a comment.
+  LexerCommentState CommentState;
+
+  enum LexerState {
+    /// Ordinary comment text is being lexed.
+    LS_Normal,
+
+    /// The command opening a verbatim block was just lexed; the first body
+    /// line comes next.
+    LS_VerbatimBlockFirstLine,
+
+    /// The body of a verbatim block is being lexed one line at a time, with
+    /// line-starting decorations skipped.
+    LS_VerbatimBlockBody,
+
+    /// The \verbatim <TAG \endverbatim part is lexed; tag attributes follow.
+    LS_HTMLOpenTag
+  };
+
+  /// The mode the lexer is currently in.
+  LexerState State;
+
+  /// A verbatim-like block command: it consumes every character
+  /// (line-starting decorations aside) up to its end command or comment end.
+  struct VerbatimBlockCommand {
+    StringRef BeginName;
+    StringRef EndName;
+  };
+
+  typedef SmallVector<VerbatimBlockCommand, 4> VerbatimBlockCommandVector;
+
+  /// Verbatim-like block commands registered so far.
+  VerbatimBlockCommandVector VerbatimBlockCommands;
+
+  /// While State is one of the LS_VerbatimBlock* modes, holds the name of the
+  /// verbatim end command, command marker included.
+  SmallString<16> VerbatimBlockEndCommandName;
+
+  /// Looks up \p BeginName; presumably sets \p EndName on a match (confirm).
+  bool isVerbatimBlockCommand(StringRef BeginName, StringRef &EndName) const;
+
+  /// A verbatim-like line command: it consumes everything up to the next
+  /// newline or the comment end, whichever comes first.
+  struct VerbatimLineCommand {
+    StringRef Name;
+  };
+
+  typedef SmallVector<VerbatimLineCommand, 4> VerbatimLineCommandVector;
+
+  /// Verbatim-like line commands registered so far.
+  VerbatimLineCommandVector VerbatimLineCommands;
+
+  bool isVerbatimLineCommand(StringRef Name) const;
+
+  /// Fills \p Result with a token of kind \p Kind spanning [BufferPtr, TokEnd)
+  /// and moves BufferPtr to \p TokEnd.
+  void formTokenWithChars(Token &Result, const char *TokEnd,
+                          tok::TokenKind Kind) {
+    Result.setLocation(getSourceLocation(BufferPtr));
+    Result.setLength(TokEnd - BufferPtr);
+    Result.setKind(Kind);
+#ifndef NDEBUG
+    // Debug builds plant a marker so an unset text payload is recognizable.
+    Result.TextPtr1 = Result.TextPtr2 = "<UNSET>";
+    Result.TextLen1 = Result.TextLen2 = 7;
+#endif
+    BufferPtr = TokEnd;
+  }
+
+  /// Maps a pointer into the buffer to its location relative to FileLoc.
+  SourceLocation getSourceLocation(const char *Loc) const {
+    assert(Loc >= BufferStart && Loc <= BufferEnd &&
+           "Location out of range for this buffer!");
+    const unsigned Offset = Loc - BufferStart;
+    return FileLoc.getLocWithOffset(Offset);
+  }
+
+  /// Consumes text matching the regexp \code \s*\* \endcode.
+  void skipLineStartingDecorations();
+
+  /// Lexes text inside a comment; CommentEnd must already be set correctly.
+  void lexCommentText(Token &T);
+
+  void setupAndLexVerbatimBlock(Token &T, const char *TextBegin, char Marker,
+                                StringRef EndName);
+
+  void lexVerbatimBlockFirstLine(Token &T);
+
+  void lexVerbatimBlockBody(Token &T);
+
+  void lexVerbatimLine(Token &T, const char *TextBegin);
+
+  void setupAndLexHTMLOpenTag(Token &T);
+
+  void lexHTMLOpenTag(Token &T);
+
+  void lexHTMLCloseTag(Token &T);
+
+public:
+  Lexer(SourceLocation FileLoc, const CommentOptions &CommOpts,
+        const char *BufferStart, const char *BufferEnd);
+
+  void lex(Token &T);
+
+  StringRef getSpelling(const Token &Tok,
+                        const SourceManager &SourceMgr,
+                        bool *Invalid = NULL) const;
+
+  /// \brief Register a new verbatim block command.
+  void addVerbatimBlockCommand(StringRef BeginName, StringRef EndName);
+
+  /// \brief Register a new verbatim line command.
+  void addVerbatimLineCommand(StringRef Name);
+};
+
+} // end namespace comments
+} // end namespace clang
+
+#endif
+