diff options
author | Dmitri Gribenko <gribozavr@gmail.com> | 2012-08-22 22:56:08 +0000 |
---|---|---|
committer | Dmitri Gribenko <gribozavr@gmail.com> | 2012-08-22 22:56:08 +0000 |
commit | 834a5bd311b4a32f89937ca5b6dd2b4111891859 (patch) | |
tree | cafae99356d565aa85a005eb86cd930532135caa | |
parent | 769bc07f4199b5889a88cf092ab4713d5520ff33 (diff) |
Comment parsing: parse "<blah" as an HTML tag only if "blah" is a known tag
name. This should reduce the number of false-positive warnings about bad HTML
in comments when the comment author intended to put a reference to a template.
This change will also enable us to parse the comment as intended in these cases.
Fixes part 1 of PR13374.
git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@162407 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r-- | lib/AST/CommentLexer.cpp | 40 | ||||
-rw-r--r-- | test/Sema/warn-documentation.cpp | 4 | ||||
-rw-r--r-- | unittests/AST/CommentLexer.cpp | 172 |
3 files changed, 134 insertions, 82 deletions
diff --git a/lib/AST/CommentLexer.cpp b/lib/AST/CommentLexer.cpp index b6516ec126..870db2be5f 100644 --- a/lib/AST/CommentLexer.cpp +++ b/lib/AST/CommentLexer.cpp @@ -28,6 +28,33 @@ bool isHTMLHexCharacterReferenceCharacter(char C) { (C >= 'a' && C <= 'f') || (C >= 'A' && C <= 'F'); } + +bool isHTMLTagName(StringRef Name) { + return llvm::StringSwitch<bool>(Name) + .Cases("em", "strong", true) + .Cases("tt", "i", "b", "big", "small", true) + .Cases("strike", "s", "u", "font", true) + .Case("a", true) + .Case("hr", true) + .Cases("div", "span", true) + .Cases("h1", "h2", "h3", true) + .Cases("h4", "h5", "h6", true) + .Case("code", true) + .Case("blockquote", true) + .Cases("sub", "sup", true) + .Case("img", true) + .Case("p", true) + .Case("br", true) + .Case("pre", true) + .Cases("ins", "del", true) + .Cases("ul", "ol", "li", true) + .Cases("dl", "dt", "dd", true) + .Cases("table", "caption", true) + .Cases("thead", "tfoot", "tbody", true) + .Cases("colgroup", "col", true) + .Cases("tr", "th", "td", true) + .Default(false); +} } // unnamed namespace StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const { @@ -585,8 +612,12 @@ void Lexer::setupAndLexHTMLStartTag(Token &T) { assert(BufferPtr[0] == '<' && isHTMLIdentifierStartingCharacter(BufferPtr[1])); const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd); - StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1)); + if (!isHTMLTagName(Name)) { + formTextToken(T, TagNameEnd); + return; + } + formTokenWithChars(T, TagNameEnd, tok::html_start_tag); T.setHTMLTagStartName(Name); @@ -665,11 +696,16 @@ void Lexer::setupAndLexHTMLEndTag(Token &T) { const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd); const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd); + StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin); + if (!isHTMLTagName(Name)) { + formTextToken(T, TagNameEnd); + return; + } const char *End = skipWhitespace(TagNameEnd, CommentEnd); 
formTokenWithChars(T, End, tok::html_end_tag); - T.setHTMLTagEndName(StringRef(TagNameBegin, TagNameEnd - TagNameBegin)); + T.setHTMLTagEndName(Name); if (BufferPtr != CommentEnd && *BufferPtr == '>') State = LS_HTMLEndTag; diff --git a/test/Sema/warn-documentation.cpp b/test/Sema/warn-documentation.cpp index d99520b673..1cd1358521 100644 --- a/test/Sema/warn-documentation.cpp +++ b/test/Sema/warn-documentation.cpp @@ -38,13 +38,13 @@ int test_html7(int); int test_html8(int); // expected-warning@+2 {{HTML start tag prematurely ended, expected attribute name or '>'}} expected-note@+1 {{HTML tag started here}} -/** Aaa bbb<ccc ddd eee +/** Aaa bbb<img ddd eee * fff ggg. */ int test_html9(int); // expected-warning@+1 {{HTML start tag prematurely ended, expected attribute name or '>'}} -/** Aaa bbb<ccc ddd eee 42% +/** Aaa bbb<img ddd eee 42% * fff ggg. */ int test_html10(int); diff --git a/unittests/AST/CommentLexer.cpp b/unittests/AST/CommentLexer.cpp index cab0fdddbc..1168d1d143 100644 --- a/unittests/AST/CommentLexer.cpp +++ b/unittests/AST/CommentLexer.cpp @@ -822,7 +822,7 @@ TEST_F(CommentLexerTest, HTML2) { TEST_F(CommentLexerTest, HTML3) { const char *Source = - "// < tag"; + "// < img"; std::vector<Token> Toks; @@ -837,15 +837,15 @@ TEST_F(CommentLexerTest, HTML3) { ASSERT_EQ(StringRef("<"), Toks[1].getText()); ASSERT_EQ(tok::text, Toks[2].getKind()); - ASSERT_EQ(StringRef(" tag"), Toks[2].getText()); + ASSERT_EQ(StringRef(" img"), Toks[2].getText()); ASSERT_EQ(tok::newline, Toks[3].getKind()); } TEST_F(CommentLexerTest, HTML4) { const char *Sources[] = { - "// <tag", - "// <tag " + "// <img", + "// <img " }; for (size_t i = 0, e = array_lengthof(Sources); i != e; i++) { @@ -859,7 +859,7 @@ TEST_F(CommentLexerTest, HTML4) { ASSERT_EQ(StringRef(" "), Toks[0].getText()); ASSERT_EQ(tok::html_start_tag, Toks[1].getKind()); - ASSERT_EQ(StringRef("tag"), Toks[1].getHTMLTagStartName()); + ASSERT_EQ(StringRef("img"), Toks[1].getHTMLTagStartName()); 
ASSERT_EQ(tok::newline, Toks[2].getKind()); } @@ -867,7 +867,7 @@ TEST_F(CommentLexerTest, HTML4) { TEST_F(CommentLexerTest, HTML5) { const char *Source = - "// <tag 42"; + "// <img 42"; std::vector<Token> Toks; @@ -879,7 +879,7 @@ TEST_F(CommentLexerTest, HTML5) { ASSERT_EQ(StringRef(" "), Toks[0].getText()); ASSERT_EQ(tok::html_start_tag, Toks[1].getKind()); - ASSERT_EQ(StringRef("tag"), Toks[1].getHTMLTagStartName()); + ASSERT_EQ(StringRef("img"), Toks[1].getHTMLTagStartName()); ASSERT_EQ(tok::text, Toks[2].getKind()); ASSERT_EQ(StringRef("42"), Toks[2].getText()); @@ -888,7 +888,7 @@ TEST_F(CommentLexerTest, HTML5) { } TEST_F(CommentLexerTest, HTML6) { - const char *Source = "// <tag> Meow"; + const char *Source = "// <img> Meow"; std::vector<Token> Toks; @@ -900,7 +900,7 @@ TEST_F(CommentLexerTest, HTML6) { ASSERT_EQ(StringRef(" "), Toks[0].getText()); ASSERT_EQ(tok::html_start_tag, Toks[1].getKind()); - ASSERT_EQ(StringRef("tag"), Toks[1].getHTMLTagStartName()); + ASSERT_EQ(StringRef("img"), Toks[1].getHTMLTagStartName()); ASSERT_EQ(tok::html_greater, Toks[2].getKind()); @@ -911,7 +911,7 @@ TEST_F(CommentLexerTest, HTML6) { } TEST_F(CommentLexerTest, HTML7) { - const char *Source = "// <tag="; + const char *Source = "// <img="; std::vector<Token> Toks; @@ -923,7 +923,7 @@ TEST_F(CommentLexerTest, HTML7) { ASSERT_EQ(StringRef(" "), Toks[0].getText()); ASSERT_EQ(tok::html_start_tag, Toks[1].getKind()); - ASSERT_EQ(StringRef("tag"), Toks[1].getHTMLTagStartName()); + ASSERT_EQ(StringRef("img"), Toks[1].getHTMLTagStartName()); ASSERT_EQ(tok::text, Toks[2].getKind()); ASSERT_EQ(StringRef("="), Toks[2].getText()); @@ -932,7 +932,7 @@ TEST_F(CommentLexerTest, HTML7) { } TEST_F(CommentLexerTest, HTML8) { - const char *Source = "// <tag attr=> Meow"; + const char *Source = "// <img src=> Meow"; std::vector<Token> Toks; @@ -944,10 +944,10 @@ TEST_F(CommentLexerTest, HTML8) { ASSERT_EQ(StringRef(" "), Toks[0].getText()); ASSERT_EQ(tok::html_start_tag, Toks[1].getKind()); 
- ASSERT_EQ(StringRef("tag"), Toks[1].getHTMLTagStartName()); + ASSERT_EQ(StringRef("img"), Toks[1].getHTMLTagStartName()); ASSERT_EQ(tok::html_ident, Toks[2].getKind()); - ASSERT_EQ(StringRef("attr"), Toks[2].getHTMLIdent()); + ASSERT_EQ(StringRef("src"), Toks[2].getHTMLIdent()); ASSERT_EQ(tok::html_equals, Toks[3].getKind()); @@ -961,8 +961,8 @@ TEST_F(CommentLexerTest, HTML8) { TEST_F(CommentLexerTest, HTML9) { const char *Sources[] = { - "// <tag attr", - "// <tag attr " + "// <img src", + "// <img src " }; for (size_t i = 0, e = array_lengthof(Sources); i != e; i++) { @@ -976,10 +976,10 @@ TEST_F(CommentLexerTest, HTML9) { ASSERT_EQ(StringRef(" "), Toks[0].getText()); ASSERT_EQ(tok::html_start_tag, Toks[1].getKind()); - ASSERT_EQ(StringRef("tag"), Toks[1].getHTMLTagStartName()); + ASSERT_EQ(StringRef("img"), Toks[1].getHTMLTagStartName()); ASSERT_EQ(tok::html_ident, Toks[2].getKind()); - ASSERT_EQ(StringRef("attr"), Toks[2].getHTMLIdent()); + ASSERT_EQ(StringRef("src"), Toks[2].getHTMLIdent()); ASSERT_EQ(tok::newline, Toks[3].getKind()); } @@ -987,8 +987,8 @@ TEST_F(CommentLexerTest, HTML9) { TEST_F(CommentLexerTest, HTML10) { const char *Sources[] = { - "// <tag attr=", - "// <tag attr =" + "// <img src=", + "// <img src =" }; for (size_t i = 0, e = array_lengthof(Sources); i != e; i++) { @@ -1002,10 +1002,10 @@ TEST_F(CommentLexerTest, HTML10) { ASSERT_EQ(StringRef(" "), Toks[0].getText()); ASSERT_EQ(tok::html_start_tag, Toks[1].getKind()); - ASSERT_EQ(StringRef("tag"), Toks[1].getHTMLTagStartName()); + ASSERT_EQ(StringRef("img"), Toks[1].getHTMLTagStartName()); ASSERT_EQ(tok::html_ident, Toks[2].getKind()); - ASSERT_EQ(StringRef("attr"), Toks[2].getHTMLIdent()); + ASSERT_EQ(StringRef("src"), Toks[2].getHTMLIdent()); ASSERT_EQ(tok::html_equals, Toks[3].getKind()); @@ -1015,10 +1015,10 @@ TEST_F(CommentLexerTest, HTML10) { TEST_F(CommentLexerTest, HTML11) { const char *Sources[] = { - "// <tag attr=\"", - "// <tag attr = \"", - "// <tag attr=\'", - "// <tag 
attr = \'" + "// <img src=\"", + "// <img src = \"", + "// <img src=\'", + "// <img src = \'" }; for (size_t i = 0, e = array_lengthof(Sources); i != e; i++) { @@ -1032,10 +1032,10 @@ TEST_F(CommentLexerTest, HTML11) { ASSERT_EQ(StringRef(" "), Toks[0].getText()); ASSERT_EQ(tok::html_start_tag, Toks[1].getKind()); - ASSERT_EQ(StringRef("tag"), Toks[1].getHTMLTagStartName()); + ASSERT_EQ(StringRef("img"), Toks[1].getHTMLTagStartName()); ASSERT_EQ(tok::html_ident, Toks[2].getKind()); - ASSERT_EQ(StringRef("attr"), Toks[2].getHTMLIdent()); + ASSERT_EQ(StringRef("src"), Toks[2].getHTMLIdent()); ASSERT_EQ(tok::html_equals, Toks[3].getKind()); @@ -1047,7 +1047,7 @@ TEST_F(CommentLexerTest, HTML11) { } TEST_F(CommentLexerTest, HTML12) { - const char *Source = "// <tag attr=@"; + const char *Source = "// <img src=@"; std::vector<Token> Toks; @@ -1059,10 +1059,10 @@ TEST_F(CommentLexerTest, HTML12) { ASSERT_EQ(StringRef(" "), Toks[0].getText()); ASSERT_EQ(tok::html_start_tag, Toks[1].getKind()); - ASSERT_EQ(StringRef("tag"), Toks[1].getHTMLTagStartName()); + ASSERT_EQ(StringRef("img"), Toks[1].getHTMLTagStartName()); ASSERT_EQ(tok::html_ident, Toks[2].getKind()); - ASSERT_EQ(StringRef("attr"), Toks[2].getHTMLIdent()); + ASSERT_EQ(StringRef("src"), Toks[2].getHTMLIdent()); ASSERT_EQ(tok::html_equals, Toks[3].getKind()); @@ -1074,10 +1074,10 @@ TEST_F(CommentLexerTest, HTML12) { TEST_F(CommentLexerTest, HTML13) { const char *Sources[] = { - "// <tag attr=\"val\\\"\\'val", - "// <tag attr=\"val\\\"\\'val\"", - "// <tag attr=\'val\\\"\\'val", - "// <tag attr=\'val\\\"\\'val\'" + "// <img src=\"val\\\"\\'val", + "// <img src=\"val\\\"\\'val\"", + "// <img src=\'val\\\"\\'val", + "// <img src=\'val\\\"\\'val\'" }; for (size_t i = 0, e = array_lengthof(Sources); i != e; i++) { @@ -1091,10 +1091,10 @@ TEST_F(CommentLexerTest, HTML13) { ASSERT_EQ(StringRef(" "), Toks[0].getText()); ASSERT_EQ(tok::html_start_tag, Toks[1].getKind()); - ASSERT_EQ(StringRef("tag"), 
Toks[1].getHTMLTagStartName()); + ASSERT_EQ(StringRef("img"), Toks[1].getHTMLTagStartName()); ASSERT_EQ(tok::html_ident, Toks[2].getKind()); - ASSERT_EQ(StringRef("attr"), Toks[2].getHTMLIdent()); + ASSERT_EQ(StringRef("src"), Toks[2].getHTMLIdent()); ASSERT_EQ(tok::html_equals, Toks[3].getKind()); @@ -1107,8 +1107,8 @@ TEST_F(CommentLexerTest, HTML13) { TEST_F(CommentLexerTest, HTML14) { const char *Sources[] = { - "// <tag attr=\"val\\\"\\'val\">", - "// <tag attr=\'val\\\"\\'val\'>" + "// <img src=\"val\\\"\\'val\">", + "// <img src=\'val\\\"\\'val\'>" }; for (size_t i = 0, e = array_lengthof(Sources); i != e; i++) { @@ -1122,10 +1122,10 @@ TEST_F(CommentLexerTest, HTML14) { ASSERT_EQ(StringRef(" "), Toks[0].getText()); ASSERT_EQ(tok::html_start_tag, Toks[1].getKind()); - ASSERT_EQ(StringRef("tag"), Toks[1].getHTMLTagStartName()); + ASSERT_EQ(StringRef("img"), Toks[1].getHTMLTagStartName()); ASSERT_EQ(tok::html_ident, Toks[2].getKind()); - ASSERT_EQ(StringRef("attr"), Toks[2].getHTMLIdent()); + ASSERT_EQ(StringRef("src"), Toks[2].getHTMLIdent()); ASSERT_EQ(tok::html_equals, Toks[3].getKind()); @@ -1140,8 +1140,8 @@ TEST_F(CommentLexerTest, HTML14) { TEST_F(CommentLexerTest, HTML15) { const char *Sources[] = { - "// <tag/>", - "// <tag />" + "// <img/>", + "// <img />" }; for (size_t i = 0, e = array_lengthof(Sources); i != e; i++) { @@ -1155,7 +1155,7 @@ TEST_F(CommentLexerTest, HTML15) { ASSERT_EQ(StringRef(" "), Toks[0].getText()); ASSERT_EQ(tok::html_start_tag, Toks[1].getKind()); - ASSERT_EQ(StringRef("tag"), Toks[1].getHTMLTagStartName()); + ASSERT_EQ(StringRef("img"), Toks[1].getHTMLTagStartName()); ASSERT_EQ(tok::html_slash_greater, Toks[2].getKind()); @@ -1165,8 +1165,8 @@ TEST_F(CommentLexerTest, HTML15) { TEST_F(CommentLexerTest, HTML16) { const char *Sources[] = { - "// <tag/ Aaa", - "// <tag / Aaa" + "// <img/ Aaa", + "// <img / Aaa" }; for (size_t i = 0, e = array_lengthof(Sources); i != e; i++) { @@ -1180,7 +1180,7 @@ TEST_F(CommentLexerTest, 
HTML16) { ASSERT_EQ(StringRef(" "), Toks[0].getText()); ASSERT_EQ(tok::html_start_tag, Toks[1].getKind()); - ASSERT_EQ(StringRef("tag"), Toks[1].getHTMLTagStartName()); + ASSERT_EQ(StringRef("img"), Toks[1].getHTMLTagStartName()); ASSERT_EQ(tok::text, Toks[2].getKind()); ASSERT_EQ(StringRef("/"), Toks[2].getText()); @@ -1201,13 +1201,13 @@ TEST_F(CommentLexerTest, HTML17) { ASSERT_EQ(3U, Toks.size()); - ASSERT_EQ(tok::text, Toks[0].getKind()); - ASSERT_EQ(StringRef(" "), Toks[0].getText()); + ASSERT_EQ(tok::text, Toks[0].getKind()); + ASSERT_EQ(StringRef(" "), Toks[0].getText()); - ASSERT_EQ(tok::html_end_tag, Toks[1].getKind()); - ASSERT_EQ(StringRef(""), Toks[1].getHTMLTagEndName()); + ASSERT_EQ(tok::text, Toks[1].getKind()); + ASSERT_EQ(StringRef("</"), Toks[1].getText()); - ASSERT_EQ(tok::newline, Toks[2].getKind()); + ASSERT_EQ(tok::newline, Toks[2].getKind()); } TEST_F(CommentLexerTest, HTML18) { @@ -1219,20 +1219,20 @@ TEST_F(CommentLexerTest, HTML18) { ASSERT_EQ(4U, Toks.size()); - ASSERT_EQ(tok::text, Toks[0].getKind()); - ASSERT_EQ(StringRef(" "), Toks[0].getText()); + ASSERT_EQ(tok::text, Toks[0].getKind()); + ASSERT_EQ(StringRef(" "), Toks[0].getText()); - ASSERT_EQ(tok::html_end_tag, Toks[1].getKind()); - ASSERT_EQ(StringRef(""), Toks[1].getHTMLTagEndName()); + ASSERT_EQ(tok::text, Toks[1].getKind()); + ASSERT_EQ(StringRef("</"), Toks[1].getText()); - ASSERT_EQ(tok::text, Toks[2].getKind()); - ASSERT_EQ(StringRef("@"), Toks[2].getText()); + ASSERT_EQ(tok::text, Toks[2].getKind()); + ASSERT_EQ(StringRef("@"), Toks[2].getText()); - ASSERT_EQ(tok::newline, Toks[3].getKind()); + ASSERT_EQ(tok::newline, Toks[3].getKind()); } TEST_F(CommentLexerTest, HTML19) { - const char *Source = "// </tag"; + const char *Source = "// </img"; std::vector<Token> Toks; @@ -1244,35 +1244,51 @@ TEST_F(CommentLexerTest, HTML19) { ASSERT_EQ(StringRef(" "), Toks[0].getText()); ASSERT_EQ(tok::html_end_tag, Toks[1].getKind()); - ASSERT_EQ(StringRef("tag"), 
Toks[1].getHTMLTagEndName()); + ASSERT_EQ(StringRef("img"), Toks[1].getHTMLTagEndName()); ASSERT_EQ(tok::newline, Toks[2].getKind()); } -TEST_F(CommentLexerTest, HTML20) { - const char *Sources[] = { - "// </tag>", - "// </ tag>", - "// </ tag >" - }; +TEST_F(CommentLexerTest, NotAKnownHTMLTag1) { + const char *Source = "// <tag>"; - for (size_t i = 0, e = array_lengthof(Sources); i != e; i++) { - std::vector<Token> Toks; + std::vector<Token> Toks; - lexString(Sources[i], Toks); + lexString(Source, Toks); - ASSERT_EQ(4U, Toks.size()); + ASSERT_EQ(4U, Toks.size()); - ASSERT_EQ(tok::text, Toks[0].getKind()); - ASSERT_EQ(StringRef(" "), Toks[0].getText()); + ASSERT_EQ(tok::text, Toks[0].getKind()); + ASSERT_EQ(StringRef(" "), Toks[0].getText()); - ASSERT_EQ(tok::html_end_tag, Toks[1].getKind()); - ASSERT_EQ(StringRef("tag"), Toks[1].getHTMLTagEndName()); + ASSERT_EQ(tok::text, Toks[1].getKind()); + ASSERT_EQ(StringRef("<tag"), Toks[1].getText()); - ASSERT_EQ(tok::html_greater, Toks[2].getKind()); + ASSERT_EQ(tok::text, Toks[2].getKind()); + ASSERT_EQ(StringRef(">"), Toks[2].getText()); - ASSERT_EQ(tok::newline, Toks[3].getKind()); - } + ASSERT_EQ(tok::newline, Toks[3].getKind()); +} + +TEST_F(CommentLexerTest, NotAKnownHTMLTag2) { + const char *Source = "// </tag>"; + + std::vector<Token> Toks; + + lexString(Source, Toks); + + ASSERT_EQ(4U, Toks.size()); + + ASSERT_EQ(tok::text, Toks[0].getKind()); + ASSERT_EQ(StringRef(" "), Toks[0].getText()); + + ASSERT_EQ(tok::text, Toks[1].getKind()); + ASSERT_EQ(StringRef("</tag"), Toks[1].getText()); + + ASSERT_EQ(tok::text, Toks[2].getKind()); + ASSERT_EQ(StringRef(">"), Toks[2].getText()); + + ASSERT_EQ(tok::newline, Toks[3].getKind()); } TEST_F(CommentLexerTest, HTMLCharacterReferences1) { |