From 477a9f58c1b197f315befd03b42a8a0b3a2f0ff9 Mon Sep 17 00:00:00 2001 From: Dmitri Gribenko Date: Fri, 27 Jul 2012 20:37:06 +0000 Subject: Implement resolving of HTML character references (named: &, decimal: *, hex: ) during comment parsing. Now internal representation of plain text in comment AST does not contain character references, but the characters themselves. git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@160891 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/AST/ASTContext.cpp | 3 +- lib/AST/CommentLexer.cpp | 198 +++++++++++++++++++++++++++++++++++++++------ lib/AST/RawCommentList.cpp | 8 +- 3 files changed, 183 insertions(+), 26 deletions(-) (limited to 'lib/AST') diff --git a/lib/AST/ASTContext.cpp b/lib/AST/ASTContext.cpp index 5b57ce4557..46a4d87f9a 100644 --- a/lib/AST/ASTContext.cpp +++ b/lib/AST/ASTContext.cpp @@ -216,7 +216,8 @@ comments::FullComment *ASTContext::getCommentForDecl(const Decl *D) const { return NULL; const StringRef RawText = RC->getRawText(SourceMgr); - comments::Lexer L(RC->getSourceRange().getBegin(), comments::CommentOptions(), + comments::Lexer L(getAllocator(), + RC->getSourceRange().getBegin(), comments::CommentOptions(), RawText.begin(), RawText.end()); comments::Sema S(getAllocator(), getSourceManager(), getDiagnostics()); diff --git a/lib/AST/CommentLexer.cpp b/lib/AST/CommentLexer.cpp index 31468321cf..dde484510f 100644 --- a/lib/AST/CommentLexer.cpp +++ b/lib/AST/CommentLexer.cpp @@ -1,4 +1,5 @@ #include "clang/AST/CommentLexer.h" +#include "clang/Basic/ConvertUTF.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Support/ErrorHandling.h" @@ -87,6 +88,71 @@ bool Lexer::isVerbatimLineCommand(StringRef Name) const { return false; } +namespace { +bool isHTMLNamedCharacterReferenceCharacter(char C) { + return (C >= 'a' && C <= 'z') || + (C >= 'A' && C <= 'Z'); +} + +bool isHTMLDecimalCharacterReferenceCharacter(char C) { + return C >= '0' && C <= '9'; +} + +bool isHTMLHexCharacterReferenceCharacter(char C) { + return (C >= '0' && C <= '9') || + (C >= 'a' && C <= 'f') || + (C >= 'A' && C <= 'F'); +} +} // unnamed namespace + +StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const { + return llvm::StringSwitch(Name) + .Case("amp", "&") + .Case("lt", "<") + .Case("gt", ">") + .Case("quot", "\"") + .Case("apos", "\'") + .Default(""); +} + +StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const { + unsigned CodePoint = 0; + for (unsigned i = 0, e = Name.size(); i != e; ++i) { + assert(isHTMLDecimalCharacterReferenceCharacter(Name[i])); + CodePoint *= 10; + CodePoint += Name[i] - '0'; + } + + char *Resolved = Allocator.Allocate(UNI_MAX_UTF8_BYTES_PER_CODE_POINT); + char *ResolvedPtr = Resolved; + if (ConvertCodePointToUTF8(CodePoint, ResolvedPtr)) + return StringRef(Resolved, ResolvedPtr - Resolved); + else + return StringRef(); +} + +StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const { + unsigned CodePoint = 0; + for (unsigned i = 0, e = Name.size(); i != e; ++i) { + CodePoint *= 16; + const char C = Name[i]; + assert(isHTMLHexCharacterReferenceCharacter(C)); + if (C >= '0' && C <= '9') + CodePoint += Name[i] - '0'; + else if (C >= 'a' && C <= 'f') + CodePoint += Name[i] - 'a' + 10; + else + CodePoint += Name[i] - 'A' + 10; + } + + char *Resolved = Allocator.Allocate(UNI_MAX_UTF8_BYTES_PER_CODE_POINT); + char *ResolvedPtr = Resolved; + if (ConvertCodePointToUTF8(CodePoint, ResolvedPtr)) + return StringRef(Resolved, ResolvedPtr - Resolved); + else + return StringRef(); +} + void Lexer::skipLineStartingDecorations() { // This function should be called only for C comments assert(CommentState == LCS_InsideCComment); @@ -147,6 +213,33 @@ const char *skipNewline(const char *BufferPtr, const char *BufferEnd) { return BufferPtr; } +const char *skipNamedCharacterReference(const char *BufferPtr, + const char *BufferEnd) { + for ( ; BufferPtr != BufferEnd; ++BufferPtr) { + if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr)) + return BufferPtr; + } + return BufferEnd; +} + +const char *skipDecimalCharacterReference(const char *BufferPtr, + const char *BufferEnd) { + for ( ; BufferPtr != BufferEnd; ++BufferPtr) { + if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr)) + return BufferPtr; + } + return BufferEnd; +} + +const char *skipHexCharacterReference(const char *BufferPtr, + const char *BufferEnd) { + for ( ; BufferPtr != BufferEnd; ++BufferPtr) { + if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr)) + return BufferPtr; + } + return BufferEnd; +} + bool isHTMLIdentifierStartingCharacter(char C) { return (C >= 'a' && C <= 'z') || (C >= 'A' && C <= 'Z'); @@ -295,9 +388,7 @@ void Lexer::lexCommentText(Token &T) { case '@': { TokenPtr++; if (TokenPtr == CommentEnd) { - StringRef Text(BufferPtr, TokenPtr - BufferPtr); - formTokenWithChars(T, TokenPtr, tok::text); - T.setText(Text); + formTextToken(T, TokenPtr); return; } char C = *TokenPtr; @@ -322,9 +413,7 @@ void Lexer::lexCommentText(Token &T) { // Don't make zero-length commands. if (!isCommandNameCharacter(*TokenPtr)) { - StringRef Text(BufferPtr, TokenPtr - BufferPtr); - formTokenWithChars(T, TokenPtr, tok::text); - T.setText(Text); + formTextToken(T, TokenPtr); return; } @@ -357,12 +446,14 @@ void Lexer::lexCommentText(Token &T) { return; } + case '&': + lexHTMLCharacterReference(T); + return; + case '<': { TokenPtr++; if (TokenPtr == CommentEnd) { - StringRef Text(BufferPtr, TokenPtr - BufferPtr); - formTokenWithChars(T, TokenPtr, tok::text); - T.setText(Text); + formTextToken(T, TokenPtr); return; } const char C = *TokenPtr; @@ -370,11 +461,9 @@ void Lexer::lexCommentText(Token &T) { setupAndLexHTMLStartTag(T); else if (C == '/') setupAndLexHTMLEndTag(T); - else { - StringRef Text(BufferPtr, TokenPtr - BufferPtr); - formTokenWithChars(T, TokenPtr, tok::text); - T.setText(Text); - } + else + formTextToken(T, TokenPtr); + return; } @@ -394,12 +483,10 @@ void Lexer::lexCommentText(Token &T) { break; const char C = *TokenPtr; if(C == '\n' || C == '\r' || - C == '\\' || C == '@' || C == '<') + C == '\\' || C == '@' || C == '&' || C == '<') break; } - StringRef Text(BufferPtr, TokenPtr - BufferPtr); - formTokenWithChars(T, TokenPtr, tok::text); - T.setText(Text); + formTextToken(T, TokenPtr); return; } } @@ -506,6 +593,69 @@ void Lexer::lexVerbatimLineText(Token &T) { State = LS_Normal; } +void Lexer::lexHTMLCharacterReference(Token &T) { + const char *TokenPtr = BufferPtr; + assert(*TokenPtr == '&'); + TokenPtr++; + if (TokenPtr == CommentEnd) { + formTextToken(T, TokenPtr); + return; + } + const char *NamePtr; + bool isNamed = false; + bool isDecimal = false; + char C = *TokenPtr; + if (isHTMLNamedCharacterReferenceCharacter(C)) { + NamePtr = TokenPtr; + TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd); + isNamed = true; + } else if (C == '#') { + TokenPtr++; + if (TokenPtr == CommentEnd) { + formTextToken(T, TokenPtr); + return; + } + C = *TokenPtr; + if (isHTMLDecimalCharacterReferenceCharacter(C)) { + NamePtr = TokenPtr; + TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd); + isDecimal = true; + } else if (C == 'x' || C == 'X') { + TokenPtr++; + NamePtr = TokenPtr; + TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd); + } else { + formTextToken(T, TokenPtr); + return; + } + } else { + formTextToken(T, TokenPtr); + return; + } + if (NamePtr == TokenPtr || TokenPtr == CommentEnd || + *TokenPtr != ';') { + formTextToken(T, TokenPtr); + return; + } + StringRef Name(NamePtr, TokenPtr - NamePtr); + TokenPtr++; // Skip semicolon. + StringRef Resolved; + if (isNamed) + Resolved = resolveHTMLNamedCharacterReference(Name); + else if (isDecimal) + Resolved = resolveHTMLDecimalCharacterReference(Name); + else + Resolved = resolveHTMLHexCharacterReference(Name); + + if (Resolved.empty()) { + formTextToken(T, TokenPtr); + return; + } + formTokenWithChars(T, TokenPtr, tok::text); + T.setText(Resolved); + return; +} + void Lexer::setupAndLexHTMLStartTag(Token &T) { assert(BufferPtr[0] == '<' && isHTMLIdentifierStartingCharacter(BufferPtr[1])); @@ -561,11 +711,9 @@ void Lexer::lexHTMLStartTag(Token &T) { if (TokenPtr != CommentEnd && *TokenPtr == '>') { TokenPtr++; formTokenWithChars(T, TokenPtr, tok::html_slash_greater); - } else { - StringRef Text(BufferPtr, TokenPtr - BufferPtr); - formTokenWithChars(T, TokenPtr, tok::text); - T.setText(Text); - } + } else + formTextToken(T, TokenPtr); + State = LS_Normal; return; } @@ -609,8 +757,10 @@ void Lexer::lexHTMLEndTag(Token &T) { State = LS_Normal; } -Lexer::Lexer(SourceLocation FileLoc, const CommentOptions &CommOpts, +Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, + SourceLocation FileLoc, const CommentOptions &CommOpts, const char *BufferStart, const char *BufferEnd): + Allocator(Allocator), BufferStart(BufferStart), BufferEnd(BufferEnd), FileLoc(FileLoc), CommOpts(CommOpts), BufferPtr(BufferStart), CommentState(LCS_BeforeComment), State(LS_Normal) { diff --git a/lib/AST/RawCommentList.cpp b/lib/AST/RawCommentList.cpp index 7e183e2f2d..41866cf03f 100644 --- a/lib/AST/RawCommentList.cpp +++ b/lib/AST/RawCommentList.cpp @@ -134,7 +134,13 @@ const char *RawComment::extractBriefText(const ASTContext &Context) const { // Make sure that RawText is valid. getRawText(Context.getSourceManager()); - comments::Lexer L(Range.getBegin(), comments::CommentOptions(), + // Since we will be copying the resulting text, all allocations made during + // parsing are garbage after resulting string is formed. Thus we can use + // a separate allocator for all temporary stuff. + llvm::BumpPtrAllocator Allocator; + + comments::Lexer L(Allocator, + Range.getBegin(), comments::CommentOptions(), RawText.begin(), RawText.end()); comments::BriefParser P(L); -- cgit v1.2.3-18-g5258