diff options
Diffstat (limited to 'lib/AST/CommentLexer.cpp')
-rw-r--r-- | lib/AST/CommentLexer.cpp | 198 |
1 files changed, 174 insertions, 24 deletions
diff --git a/lib/AST/CommentLexer.cpp b/lib/AST/CommentLexer.cpp index 31468321cf..dde484510f 100644 --- a/lib/AST/CommentLexer.cpp +++ b/lib/AST/CommentLexer.cpp @@ -1,4 +1,5 @@ #include "clang/AST/CommentLexer.h" +#include "clang/Basic/ConvertUTF.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Support/ErrorHandling.h" @@ -87,6 +88,71 @@ bool Lexer::isVerbatimLineCommand(StringRef Name) const { return false; } +namespace { +bool isHTMLNamedCharacterReferenceCharacter(char C) { + return (C >= 'a' && C <= 'z') || + (C >= 'A' && C <= 'Z'); +} + +bool isHTMLDecimalCharacterReferenceCharacter(char C) { + return C >= '0' && C <= '9'; +} + +bool isHTMLHexCharacterReferenceCharacter(char C) { + return (C >= '0' && C <= '9') || + (C >= 'a' && C <= 'f') || + (C >= 'A' && C <= 'F'); +} +} // unnamed namespace + +StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const { + return llvm::StringSwitch<StringRef>(Name) + .Case("amp", "&") + .Case("lt", "<") + .Case("gt", ">") + .Case("quot", "\"") + .Case("apos", "\'") + .Default(""); +} + +StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const { + unsigned CodePoint = 0; + for (unsigned i = 0, e = Name.size(); i != e; ++i) { + assert(isHTMLDecimalCharacterReferenceCharacter(Name[i])); + CodePoint *= 10; + CodePoint += Name[i] - '0'; + } + + char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT); + char *ResolvedPtr = Resolved; + if (ConvertCodePointToUTF8(CodePoint, ResolvedPtr)) + return StringRef(Resolved, ResolvedPtr - Resolved); + else + return StringRef(); +} + +StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const { + unsigned CodePoint = 0; + for (unsigned i = 0, e = Name.size(); i != e; ++i) { + CodePoint *= 16; + const char C = Name[i]; + assert(isHTMLHexCharacterReferenceCharacter(C)); + if (C >= '0' && C <= '9') + CodePoint += Name[i] - '0'; + else if (C >= 'a' && C <= 'f') + CodePoint += Name[i] - 'a' + 10; + else + CodePoint += Name[i] - 'A' + 10; + } + + char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT); + char *ResolvedPtr = Resolved; + if (ConvertCodePointToUTF8(CodePoint, ResolvedPtr)) + return StringRef(Resolved, ResolvedPtr - Resolved); + else + return StringRef(); +} + void Lexer::skipLineStartingDecorations() { // This function should be called only for C comments assert(CommentState == LCS_InsideCComment); @@ -147,6 +213,33 @@ const char *skipNewline(const char *BufferPtr, const char *BufferEnd) { return BufferPtr; } +const char *skipNamedCharacterReference(const char *BufferPtr, + const char *BufferEnd) { + for ( ; BufferPtr != BufferEnd; ++BufferPtr) { + if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr)) + return BufferPtr; + } + return BufferEnd; +} + +const char *skipDecimalCharacterReference(const char *BufferPtr, + const char *BufferEnd) { + for ( ; BufferPtr != BufferEnd; ++BufferPtr) { + if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr)) + return BufferPtr; + } + return BufferEnd; +} + +const char *skipHexCharacterReference(const char *BufferPtr, + const char *BufferEnd) { + for ( ; BufferPtr != BufferEnd; ++BufferPtr) { + if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr)) + return BufferPtr; + } + return BufferEnd; +} + bool isHTMLIdentifierStartingCharacter(char C) { return (C >= 'a' && C <= 'z') || (C >= 'A' && C <= 'Z'); @@ -295,9 +388,7 @@ void Lexer::lexCommentText(Token &T) { case '@': { TokenPtr++; if (TokenPtr == CommentEnd) { - StringRef Text(BufferPtr, TokenPtr - BufferPtr); - formTokenWithChars(T, TokenPtr, tok::text); - T.setText(Text); + formTextToken(T, TokenPtr); return; } char C = *TokenPtr; @@ -322,9 +413,7 @@ void Lexer::lexCommentText(Token &T) { // Don't make zero-length commands. if (!isCommandNameCharacter(*TokenPtr)) { - StringRef Text(BufferPtr, TokenPtr - BufferPtr); - formTokenWithChars(T, TokenPtr, tok::text); - T.setText(Text); + formTextToken(T, TokenPtr); return; } @@ -357,12 +446,14 @@ void Lexer::lexCommentText(Token &T) { return; } + case '&': + lexHTMLCharacterReference(T); + return; + case '<': { TokenPtr++; if (TokenPtr == CommentEnd) { - StringRef Text(BufferPtr, TokenPtr - BufferPtr); - formTokenWithChars(T, TokenPtr, tok::text); - T.setText(Text); + formTextToken(T, TokenPtr); return; } const char C = *TokenPtr; @@ -370,11 +461,9 @@ void Lexer::lexCommentText(Token &T) { setupAndLexHTMLStartTag(T); else if (C == '/') setupAndLexHTMLEndTag(T); - else { - StringRef Text(BufferPtr, TokenPtr - BufferPtr); - formTokenWithChars(T, TokenPtr, tok::text); - T.setText(Text); - } + else + formTextToken(T, TokenPtr); + return; } @@ -394,12 +483,10 @@ void Lexer::lexCommentText(Token &T) { break; const char C = *TokenPtr; if(C == '\n' || C == '\r' || - C == '\\' || C == '@' || C == '<') + C == '\\' || C == '@' || C == '&' || C == '<') break; } - StringRef Text(BufferPtr, TokenPtr - BufferPtr); - formTokenWithChars(T, TokenPtr, tok::text); - T.setText(Text); + formTextToken(T, TokenPtr); return; } } @@ -506,6 +593,69 @@ void Lexer::lexVerbatimLineText(Token &T) { State = LS_Normal; } +void Lexer::lexHTMLCharacterReference(Token &T) { + const char *TokenPtr = BufferPtr; + assert(*TokenPtr == '&'); + TokenPtr++; + if (TokenPtr == CommentEnd) { + formTextToken(T, TokenPtr); + return; + } + const char *NamePtr; + bool isNamed = false; + bool isDecimal = false; + char C = *TokenPtr; + if (isHTMLNamedCharacterReferenceCharacter(C)) { + NamePtr = TokenPtr; + TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd); + isNamed = true; + } else if (C == '#') { + TokenPtr++; + if (TokenPtr == CommentEnd) { + formTextToken(T, TokenPtr); + return; + } + C = *TokenPtr; + if (isHTMLDecimalCharacterReferenceCharacter(C)) { + NamePtr = TokenPtr; + TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd); + isDecimal = true; + } else if (C == 'x' || C == 'X') { + TokenPtr++; + NamePtr = TokenPtr; + TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd); + } else { + formTextToken(T, TokenPtr); + return; + } + } else { + formTextToken(T, TokenPtr); + return; + } + if (NamePtr == TokenPtr || TokenPtr == CommentEnd || + *TokenPtr != ';') { + formTextToken(T, TokenPtr); + return; + } + StringRef Name(NamePtr, TokenPtr - NamePtr); + TokenPtr++; // Skip semicolon. + StringRef Resolved; + if (isNamed) + Resolved = resolveHTMLNamedCharacterReference(Name); + else if (isDecimal) + Resolved = resolveHTMLDecimalCharacterReference(Name); + else + Resolved = resolveHTMLHexCharacterReference(Name); + + if (Resolved.empty()) { + formTextToken(T, TokenPtr); + return; + } + formTokenWithChars(T, TokenPtr, tok::text); + T.setText(Resolved); + return; +} + void Lexer::setupAndLexHTMLStartTag(Token &T) { assert(BufferPtr[0] == '<' && isHTMLIdentifierStartingCharacter(BufferPtr[1])); @@ -561,11 +711,9 @@ void Lexer::lexHTMLStartTag(Token &T) { if (TokenPtr != CommentEnd && *TokenPtr == '>') { TokenPtr++; formTokenWithChars(T, TokenPtr, tok::html_slash_greater); - } else { - StringRef Text(BufferPtr, TokenPtr - BufferPtr); - formTokenWithChars(T, TokenPtr, tok::text); - T.setText(Text); - } + } else + formTextToken(T, TokenPtr); + State = LS_Normal; return; } @@ -609,8 +757,10 @@ void Lexer::lexHTMLEndTag(Token &T) { State = LS_Normal; } -Lexer::Lexer(SourceLocation FileLoc, const CommentOptions &CommOpts, +Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, + SourceLocation FileLoc, const CommentOptions &CommOpts, const char *BufferStart, const char *BufferEnd): + Allocator(Allocator), BufferStart(BufferStart), BufferEnd(BufferEnd), FileLoc(FileLoc), CommOpts(CommOpts), BufferPtr(BufferStart), CommentState(LCS_BeforeComment), State(LS_Normal) { |