diff options
author | Dmitri Gribenko <gribozavr@gmail.com> | 2012-08-22 22:56:08 +0000 |
---|---|---|
committer | Dmitri Gribenko <gribozavr@gmail.com> | 2012-08-22 22:56:08 +0000 |
commit | 834a5bd311b4a32f89937ca5b6dd2b4111891859 (patch) | |
tree | cafae99356d565aa85a005eb86cd930532135caa /lib/AST/CommentLexer.cpp | |
parent | 769bc07f4199b5889a88cf092ab4713d5520ff33 (diff) |
Comment parsing: parse "<blah" as an HTML tag only if "blah" is a known tag
name. This should reduce the number of false-positive warnings about bad HTML
in comments when the comment author intended to put a reference to a template.
This change will also enable us to parse the comment as intended in these cases.
Fixes part 1 of PR13374.
git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@162407 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib/AST/CommentLexer.cpp')
-rw-r--r-- | lib/AST/CommentLexer.cpp | 40 |
1 files changed, 38 insertions, 2 deletions
diff --git a/lib/AST/CommentLexer.cpp b/lib/AST/CommentLexer.cpp index b6516ec126..870db2be5f 100644 --- a/lib/AST/CommentLexer.cpp +++ b/lib/AST/CommentLexer.cpp @@ -28,6 +28,33 @@ bool isHTMLHexCharacterReferenceCharacter(char C) { (C >= 'a' && C <= 'f') || (C >= 'A' && C <= 'F'); } + +bool isHTMLTagName(StringRef Name) { + return llvm::StringSwitch<bool>(Name) + .Cases("em", "strong", true) + .Cases("tt", "i", "b", "big", "small", true) + .Cases("strike", "s", "u", "font", true) + .Case("a", true) + .Case("hr", true) + .Cases("div", "span", true) + .Cases("h1", "h2", "h3", true) + .Cases("h4", "h5", "h6", true) + .Case("code", true) + .Case("blockquote", true) + .Cases("sub", "sup", true) + .Case("img", true) + .Case("p", true) + .Case("br", true) + .Case("pre", true) + .Cases("ins", "del", true) + .Cases("ul", "ol", "li", true) + .Cases("dl", "dt", "dd", true) + .Cases("table", "caption", true) + .Cases("thead", "tfoot", "tbody", true) + .Cases("colgroup", "col", true) + .Cases("tr", "th", "td", true) + .Default(false); +} } // unnamed namespace StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const { @@ -585,8 +612,12 @@ void Lexer::setupAndLexHTMLStartTag(Token &T) { assert(BufferPtr[0] == '<' && isHTMLIdentifierStartingCharacter(BufferPtr[1])); const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd); - StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1)); + if (!isHTMLTagName(Name)) { + formTextToken(T, TagNameEnd); + return; + } + formTokenWithChars(T, TagNameEnd, tok::html_start_tag); T.setHTMLTagStartName(Name); @@ -665,11 +696,16 @@ void Lexer::setupAndLexHTMLEndTag(Token &T) { const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd); const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd); + StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin); + if (!isHTMLTagName(Name)) { + formTextToken(T, TagNameEnd); + return; + } const char *End = skipWhitespace(TagNameEnd, CommentEnd); 
formTokenWithChars(T, End, tok::html_end_tag); - T.setHTMLTagEndName(StringRef(TagNameBegin, TagNameEnd - TagNameBegin)); + T.setHTMLTagEndName(Name); if (BufferPtr != CommentEnd && *BufferPtr == '>') State = LS_HTMLEndTag; |