diff options
author | Dmitri Gribenko <gribozavr@gmail.com> | 2012-07-09 21:32:40 +0000 |
---|---|---|
committer | Dmitri Gribenko <gribozavr@gmail.com> | 2012-07-09 21:32:40 +0000 |
commit | a99ec107ba6b5abaf27c6cc9318e65689163f2a1 (patch) | |
tree | aa25e23366c546a85e3bb12e2c006f6805e44140 | |
parent | 34f60a4a7fb87e9f4dfd08f8751ce76db9981215 (diff) |
Comment lexing: fix lexing to actually work in non-error cases.
git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@159963 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r-- | lib/AST/CommentLexer.cpp | 31 |
-rw-r--r-- | unittests/AST/CommentLexer.cpp | 119 |
2 files changed, 125 insertions, 25 deletions
diff --git a/lib/AST/CommentLexer.cpp b/lib/AST/CommentLexer.cpp index 77d2a9b72d..55cd409a9c 100644 --- a/lib/AST/CommentLexer.cpp +++ b/lib/AST/CommentLexer.cpp @@ -147,6 +147,11 @@ const char *skipNewline(const char *BufferPtr, const char *BufferEnd) { return BufferPtr; } +bool isHTMLIdentifierStartingCharacter(char C) { + return (C >= 'a' && C <= 'z') || + (C >= 'A' && C <= 'Z'); +} + bool isHTMLIdentifierCharacter(char C) { return (C >= 'a' && C <= 'z') || (C >= 'A' && C <= 'Z') || @@ -357,7 +362,7 @@ void Lexer::lexCommentText(Token &T) { return; } const char C = *TokenPtr; - if (isHTMLIdentifierCharacter(C)) + if (isHTMLIdentifierStartingCharacter(C)) setupAndLexHTMLOpenTag(T); else if (C == '/') setupAndLexHTMLCloseTag(T); @@ -383,7 +388,7 @@ void Lexer::lexCommentText(Token &T) { TokenPtr++; if (TokenPtr == CommentEnd) break; - char C = *TokenPtr; + const char C = *TokenPtr; if(C == '\n' || C == '\r' || C == '\\' || C == '@' || C == '<') break; @@ -492,7 +497,8 @@ void Lexer::lexVerbatimLineText(Token &T) { } void Lexer::setupAndLexHTMLOpenTag(Token &T) { - assert(BufferPtr[0] == '<' && isHTMLIdentifierCharacter(BufferPtr[1])); + assert(BufferPtr[0] == '<' && + isHTMLIdentifierStartingCharacter(BufferPtr[1])); const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd); StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1)); @@ -501,12 +507,9 @@ void Lexer::setupAndLexHTMLOpenTag(Token &T) { BufferPtr = skipWhitespace(BufferPtr, CommentEnd); - if (BufferPtr != CommentEnd && *BufferPtr == '>') { - BufferPtr++; - return; - } - - if (BufferPtr != CommentEnd && isHTMLIdentifierCharacter(*BufferPtr)) + const char C = *BufferPtr; + if (BufferPtr != CommentEnd && + (C == '>' || isHTMLIdentifierStartingCharacter(C))) State = LS_HTMLOpenTag; } @@ -541,7 +544,8 @@ void Lexer::lexHTMLOpenTag(Token &T) { case '>': TokenPtr++; formTokenWithChars(T, TokenPtr, tok::html_greater); - break; + State = LS_Normal; + return; } } @@ -554,7 +558,7 @@ void 
Lexer::lexHTMLOpenTag(Token &T) { } C = *BufferPtr; - if (!isHTMLIdentifierCharacter(C) && + if (!isHTMLIdentifierStartingCharacter(C) && C != '=' && C != '\"' && C != '\'' && C != '>') { State = LS_Normal; return; @@ -656,8 +660,9 @@ again: EndWhitespace++; // Turn any whitespace between comments (and there is only whitespace - // between them) into a newline. We have two newlines between C comments - // in total (first one was synthesized after a comment). + // between them -- guaranteed by comment extraction) into a newline. We + // have two newlines between C comments in total (first one was synthesized + // after a comment). formTokenWithChars(T, EndWhitespace, tok::newline); CommentState = LCS_BeforeComment; diff --git a/unittests/AST/CommentLexer.cpp b/unittests/AST/CommentLexer.cpp index 0a52364987..e1089cc5dc 100644 --- a/unittests/AST/CommentLexer.cpp +++ b/unittests/AST/CommentLexer.cpp @@ -803,6 +803,28 @@ TEST_F(CommentLexerTest, HTML1) { TEST_F(CommentLexerTest, HTML2) { const char *Source = + "// a<2"; + + std::vector<Token> Toks; + + lexString(Source, Toks); + + ASSERT_EQ(4U, Toks.size()); + + ASSERT_EQ(tok::text, Toks[0].getKind()); + ASSERT_EQ(StringRef(" a"), Toks[0].getText()); + + ASSERT_EQ(tok::text, Toks[1].getKind()); + ASSERT_EQ(StringRef("<"), Toks[1].getText()); + + ASSERT_EQ(tok::text, Toks[2].getKind()); + ASSERT_EQ(StringRef("2"), Toks[2].getText()); + + ASSERT_EQ(tok::newline, Toks[3].getKind()); +} + +TEST_F(CommentLexerTest, HTML3) { + const char *Source = "// < tag"; std::vector<Token> Toks; @@ -823,7 +845,7 @@ TEST_F(CommentLexerTest, HTML2) { ASSERT_EQ(tok::newline, Toks[3].getKind()); } -TEST_F(CommentLexerTest, HTML3) { +TEST_F(CommentLexerTest, HTML4) { const char *Sources[] = { "// <tag", "// <tag " @@ -846,7 +868,52 @@ TEST_F(CommentLexerTest, HTML3) { } } -TEST_F(CommentLexerTest, HTML4) { +TEST_F(CommentLexerTest, HTML5) { + const char *Source = + "// <tag 42"; + + std::vector<Token> Toks; + + lexString(Source, Toks); + + 
ASSERT_EQ(4U, Toks.size()); + + ASSERT_EQ(tok::text, Toks[0].getKind()); + ASSERT_EQ(StringRef(" "), Toks[0].getText()); + + ASSERT_EQ(tok::html_tag_open, Toks[1].getKind()); + ASSERT_EQ(StringRef("tag"), Toks[1].getHTMLTagOpenName()); + + ASSERT_EQ(tok::text, Toks[2].getKind()); + ASSERT_EQ(StringRef("42"), Toks[2].getText()); + + ASSERT_EQ(tok::newline, Toks[3].getKind()); +} + +TEST_F(CommentLexerTest, HTML6) { + const char *Source = "// <tag> Meow"; + + std::vector<Token> Toks; + + lexString(Source, Toks); + + ASSERT_EQ(5U, Toks.size()); + + ASSERT_EQ(tok::text, Toks[0].getKind()); + ASSERT_EQ(StringRef(" "), Toks[0].getText()); + + ASSERT_EQ(tok::html_tag_open, Toks[1].getKind()); + ASSERT_EQ(StringRef("tag"), Toks[1].getHTMLTagOpenName()); + + ASSERT_EQ(tok::html_greater, Toks[2].getKind()); + + ASSERT_EQ(tok::text, Toks[3].getKind()); + ASSERT_EQ(StringRef(" Meow"), Toks[3].getText()); + + ASSERT_EQ(tok::newline, Toks[4].getKind()); +} + +TEST_F(CommentLexerTest, HTML7) { const char *Source = "// <tag="; std::vector<Token> Toks; @@ -867,7 +934,35 @@ TEST_F(CommentLexerTest, HTML4) { ASSERT_EQ(tok::newline, Toks[3].getKind()); } -TEST_F(CommentLexerTest, HTML5) { +TEST_F(CommentLexerTest, HTML8) { + const char *Source = "// <tag attr=> Meow"; + + std::vector<Token> Toks; + + lexString(Source, Toks); + + ASSERT_EQ(7U, Toks.size()); + + ASSERT_EQ(tok::text, Toks[0].getKind()); + ASSERT_EQ(StringRef(" "), Toks[0].getText()); + + ASSERT_EQ(tok::html_tag_open, Toks[1].getKind()); + ASSERT_EQ(StringRef("tag"), Toks[1].getHTMLTagOpenName()); + + ASSERT_EQ(tok::html_ident, Toks[2].getKind()); + ASSERT_EQ(StringRef("attr"), Toks[2].getHTMLIdent()); + + ASSERT_EQ(tok::html_equals, Toks[3].getKind()); + + ASSERT_EQ(tok::html_greater, Toks[4].getKind()); + + ASSERT_EQ(tok::text, Toks[5].getKind()); + ASSERT_EQ(StringRef(" Meow"), Toks[5].getText()); + + ASSERT_EQ(tok::newline, Toks[6].getKind()); +} + +TEST_F(CommentLexerTest, HTML9) { const char *Sources[] = { "// <tag 
attr", "// <tag attr " @@ -893,7 +988,7 @@ TEST_F(CommentLexerTest, HTML5) { } } -TEST_F(CommentLexerTest, HTML6) { +TEST_F(CommentLexerTest, HTML10) { const char *Sources[] = { "// <tag attr=", "// <tag attr =" @@ -921,7 +1016,7 @@ TEST_F(CommentLexerTest, HTML6) { } } -TEST_F(CommentLexerTest, HTML7) { +TEST_F(CommentLexerTest, HTML11) { const char *Sources[] = { "// <tag attr=\"", "// <tag attr = \"", @@ -954,7 +1049,7 @@ TEST_F(CommentLexerTest, HTML7) { } } -TEST_F(CommentLexerTest, HTML8) { +TEST_F(CommentLexerTest, HTML12) { const char *Source = "// <tag attr=@"; std::vector<Token> Toks; @@ -980,7 +1075,7 @@ TEST_F(CommentLexerTest, HTML8) { ASSERT_EQ(tok::newline, Toks[5].getKind()); } -TEST_F(CommentLexerTest, HTML9) { +TEST_F(CommentLexerTest, HTML13) { const char *Sources[] = { "// <tag attr=\"val\\\"\\'val", "// <tag attr=\"val\\\"\\'val\"", @@ -1013,7 +1108,7 @@ TEST_F(CommentLexerTest, HTML9) { } } -TEST_F(CommentLexerTest, HTML10) { +TEST_F(CommentLexerTest, HTML14) { const char *Sources[] = { "// <tag attr=\"val\\\"\\'val\">", "// <tag attr=\'val\\\"\\'val\'>" @@ -1046,7 +1141,7 @@ TEST_F(CommentLexerTest, HTML10) { } } -TEST_F(CommentLexerTest, HTML11) { +TEST_F(CommentLexerTest, HTML15) { const char *Source = "// </"; std::vector<Token> Toks; @@ -1065,7 +1160,7 @@ TEST_F(CommentLexerTest, HTML11) { } -TEST_F(CommentLexerTest, HTML12) { +TEST_F(CommentLexerTest, HTML16) { const char *Source = "// </@"; std::vector<Token> Toks; @@ -1086,7 +1181,7 @@ TEST_F(CommentLexerTest, HTML12) { ASSERT_EQ(tok::newline, Toks[3].getKind()); } -TEST_F(CommentLexerTest, HTML13) { +TEST_F(CommentLexerTest, HTML17) { const char *Source = "// </tag"; std::vector<Token> Toks; @@ -1104,7 +1199,7 @@ TEST_F(CommentLexerTest, HTML13) { ASSERT_EQ(tok::newline, Toks[2].getKind()); } -TEST_F(CommentLexerTest, HTML14) { +TEST_F(CommentLexerTest, HTML18) { const char *Sources[] = { "// </tag>", "// </ tag>", |