diff options
| | | |
|---|---|---|
| author | Douglas Gregor <dgregor@apple.com> | 2011-07-27 05:40:30 +0000 |
| committer | Douglas Gregor <dgregor@apple.com> | 2011-07-27 05:40:30 +0000 |
| commit | 5cee1195584fa8672253139c86e922daeda69b9e (patch) | |
| tree | e1b36e0f628359bb42d22d78c74e931057b962de | |
| parent | 6fa8f86b8188c6d3c4d6616122a71ccd72a0c78a (diff) | |
Add support for C++0x unicode string and character literals, from Craig Topper!
git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@136210 91177308-0d34-0410-b5e6-96231b3b80d8
46 files changed, 608 insertions, 246 deletions
diff --git a/include/clang/AST/Expr.h b/include/clang/AST/Expr.h index f623fd1d52..9e4c0f0915 100644 --- a/include/clang/AST/Expr.h +++ b/include/clang/AST/Expr.h @@ -1112,29 +1112,39 @@ public: }; class CharacterLiteral : public Expr { +public: + enum CharacterKind { + Ascii, + Wide, + UTF16, + UTF32 + }; + +private: unsigned Value; SourceLocation Loc; - bool IsWide; + unsigned Kind : 2; public: // type should be IntTy - CharacterLiteral(unsigned value, bool iswide, QualType type, SourceLocation l) + CharacterLiteral(unsigned value, CharacterKind kind, QualType type, + SourceLocation l) : Expr(CharacterLiteralClass, type, VK_RValue, OK_Ordinary, false, false, false, false), - Value(value), Loc(l), IsWide(iswide) { + Value(value), Loc(l), Kind(kind) { } /// \brief Construct an empty character literal. CharacterLiteral(EmptyShell Empty) : Expr(CharacterLiteralClass, Empty) { } SourceLocation getLocation() const { return Loc; } - bool isWide() const { return IsWide; } + CharacterKind getKind() const { return static_cast<CharacterKind>(Kind); } SourceRange getSourceRange() const { return SourceRange(Loc); } unsigned getValue() const { return Value; } void setLocation(SourceLocation Location) { Loc = Location; } - void setWide(bool W) { IsWide = W; } + void setKind(CharacterKind kind) { Kind = kind; } void setValue(unsigned Val) { Value = Val; } static bool classof(const Stmt *T) { @@ -1243,13 +1253,23 @@ public: /// In this case, getByteLength() will return 6, but the string literal will /// have type "char[2]". 
class StringLiteral : public Expr { +public: + enum StringKind { + Ascii, + Wide, + UTF8, + UTF16, + UTF32 + }; + +private: friend class ASTStmtReader; const char *StrData; unsigned ByteLength; - bool IsWide; - bool IsPascal; unsigned NumConcatenated; + unsigned Kind : 3; + bool IsPascal : 1; SourceLocation TokLocs[1]; StringLiteral(QualType Ty) : @@ -1259,14 +1279,15 @@ class StringLiteral : public Expr { public: /// This is the "fully general" constructor that allows representation of /// strings formed from multiple concatenated tokens. - static StringLiteral *Create(ASTContext &C, StringRef Str, bool Wide, + static StringLiteral *Create(ASTContext &C, StringRef Str, StringKind Kind, bool Pascal, QualType Ty, const SourceLocation *Loc, unsigned NumStrs); /// Simple constructor for string literals made from one token. - static StringLiteral *Create(ASTContext &C, StringRef Str, bool Wide, - bool Pascal, QualType Ty, SourceLocation Loc) { - return Create(C, Str, Wide, Pascal, Ty, &Loc, 1); + static StringLiteral *Create(ASTContext &C, StringRef Str, StringKind Kind, + bool Pascal, QualType Ty, + SourceLocation Loc) { + return Create(C, Str, Kind, Pascal, Ty, &Loc, 1); } /// \brief Construct an empty string literal. @@ -1281,9 +1302,14 @@ public: /// \brief Sets the string data to the given string data. 
void setString(ASTContext &C, StringRef Str); - bool isWide() const { return IsWide; } + StringKind getKind() const { return static_cast<StringKind>(Kind); } + bool isAscii() const { return Kind == Ascii; } + bool isWide() const { return Kind == Wide; } + bool isUTF8() const { return Kind == UTF8; } + bool isUTF16() const { return Kind == UTF16; } + bool isUTF32() const { return Kind == UTF32; } bool isPascal() const { return IsPascal; } - + bool containsNonAsciiOrNull() const { StringRef Str = getString(); for (unsigned i = 0, e = Str.size(); i != e; ++i) diff --git a/include/clang/AST/Type.h b/include/clang/AST/Type.h index 8a842da440..2b72610226 100644 --- a/include/clang/AST/Type.h +++ b/include/clang/AST/Type.h @@ -1368,6 +1368,8 @@ public: bool isBooleanType() const; bool isCharType() const; bool isWideCharType() const; + bool isChar16Type() const; + bool isChar32Type() const; bool isAnyCharacterType() const; bool isIntegralType(ASTContext &Ctx) const; diff --git a/include/clang/Basic/DiagnosticLexKinds.td b/include/clang/Basic/DiagnosticLexKinds.td index 9e431a2d21..e23921be0b 100644 --- a/include/clang/Basic/DiagnosticLexKinds.td +++ b/include/clang/Basic/DiagnosticLexKinds.td @@ -77,8 +77,8 @@ def err_invalid_suffix_integer_constant : Error< "invalid suffix '%0' on integer constant">; def err_invalid_suffix_float_constant : Error< "invalid suffix '%0' on floating constant">; -def warn_extraneous_wide_char_constant : Warning< - "extraneous characters in wide character constant ignored">; +def warn_extraneous_char_constant : Warning< + "extraneous characters in character constant ignored">; def warn_char_constant_too_large : Warning< "character constant too long for its type">; def err_exponent_has_no_digits : Error<"exponent has no digits">; @@ -102,6 +102,8 @@ def warn_ucn_escape_too_large : ExtWarn< "character unicode escape sequence too long for its type">; def warn_ucn_not_valid_in_c89 : ExtWarn< "unicode escape sequences are only valid in C99 or C++">; 
+def err_unsupported_string_concat : Error< + "unsupported non-standard concatenation of string literals">; //===----------------------------------------------------------------------===// // PTH Diagnostics diff --git a/include/clang/Basic/IdentifierTable.h b/include/clang/Basic/IdentifierTable.h index be1fa196c0..3390f7809d 100644 --- a/include/clang/Basic/IdentifierTable.h +++ b/include/clang/Basic/IdentifierTable.h @@ -50,8 +50,8 @@ namespace clang { /// set, and all tok::identifier tokens have a pointer to one of these. class IdentifierInfo { // Note: DON'T make TokenID a 'tok::TokenKind'; MSVC will treat it as a - // signed char and TokenKinds > 127 won't be handled correctly. - unsigned TokenID : 8; // Front-end token ID or tok::identifier. + // signed char and TokenKinds > 255 won't be handled correctly. + unsigned TokenID : 9; // Front-end token ID or tok::identifier. // Objective-C keyword ('protocol' in '@protocol') or builtin (__builtin_inf). // First NUM_OBJC_KEYWORDS values are for Objective-C, the remaining values // are for builtins. @@ -65,7 +65,7 @@ class IdentifierInfo { // file and wasn't modified since. bool RevertedTokenID : 1; // True if RevertTokenIDToIdentifier was // called. - // 6 bits left in 32-bit word. + // 5 bits left in 32-bit word. void *FETokenInfo; // Managed by the language front-end. llvm::StringMapEntry<IdentifierInfo*> *Entry; @@ -409,6 +409,7 @@ public: IdentifierInfo &get(StringRef Name, tok::TokenKind TokenCode) { IdentifierInfo &II = get(Name); II.TokenID = TokenCode; + assert(II.TokenID == TokenCode && "TokenCode too large"); return II; } diff --git a/include/clang/Basic/TokenKinds.def b/include/clang/Basic/TokenKinds.def index 86172b83ff..d057559889 100644 --- a/include/clang/Basic/TokenKinds.def +++ b/include/clang/Basic/TokenKinds.def @@ -114,13 +114,23 @@ TOK(raw_identifier) // Used only in raw lexing mode. 
TOK(numeric_constant) // 0x123 // C99 6.4.4: Character Constants -TOK(char_constant) // 'a' L'b' +TOK(char_constant) // 'a' +TOK(wide_char_constant) // L'b' + +// C++0x Character Constants +TOK(utf16_char_constant) // u'a' +TOK(utf32_char_constant) // U'a' // C99 6.4.5: String Literals. TOK(string_literal) // "foo" TOK(wide_string_literal) // L"foo" TOK(angle_string_literal)// <foo> +// C++0x String Literals. +TOK(utf8_string_literal) // u8"foo" +TOK(utf16_string_literal)// u"foo" +TOK(utf32_string_literal)// U"foo" + // C99 6.4.6: Punctuators. PUNCTUATOR(l_square, "[") PUNCTUATOR(r_square, "]") diff --git a/include/clang/Lex/Lexer.h b/include/clang/Lex/Lexer.h index 990c1eedbb..2c25597433 100644 --- a/include/clang/Lex/Lexer.h +++ b/include/clang/Lex/Lexer.h @@ -471,9 +471,11 @@ private: // Helper functions to lex the remainder of a token of the specific type. void LexIdentifier (Token &Result, const char *CurPtr); void LexNumericConstant (Token &Result, const char *CurPtr); - void LexStringLiteral (Token &Result, const char *CurPtr,bool Wide); + void LexStringLiteral (Token &Result, const char *CurPtr, + tok::TokenKind Kind); void LexAngledStringLiteral(Token &Result, const char *CurPtr); - void LexCharConstant (Token &Result, const char *CurPtr); + void LexCharConstant (Token &Result, const char *CurPtr, + tok::TokenKind Kind); bool LexEndOfFile (Token &Result, const char *CurPtr); bool SkipWhitespace (Token &Result, const char *CurPtr); diff --git a/include/clang/Lex/LiteralSupport.h b/include/clang/Lex/LiteralSupport.h index 6486c38a40..15057299b2 100644 --- a/include/clang/Lex/LiteralSupport.h +++ b/include/clang/Lex/LiteralSupport.h @@ -19,6 +19,7 @@ #include "llvm/ADT/APFloat.h" #include "llvm/ADT/SmallString.h" #include "llvm/Support/DataTypes.h" +#include "clang/Basic/TokenKinds.h" #include <cctype> namespace clang { @@ -124,15 +125,19 @@ private: /// character literal. 
class CharLiteralParser { uint64_t Value; - bool IsWide; + tok::TokenKind Kind; bool IsMultiChar; bool HadError; public: CharLiteralParser(const char *begin, const char *end, - SourceLocation Loc, Preprocessor &PP); + SourceLocation Loc, Preprocessor &PP, + tok::TokenKind kind); bool hadError() const { return HadError; } - bool isWide() const { return IsWide; } + bool isAscii() const { return Kind == tok::char_constant; } + bool isWide() const { return Kind == tok::wide_char_constant; } + bool isUTF16() const { return Kind == tok::utf16_char_constant; } + bool isUTF32() const { return Kind == tok::utf32_char_constant; } bool isMultiChar() const { return IsMultiChar; } uint64_t getValue() const { return Value; } }; @@ -148,7 +153,8 @@ class StringLiteralParser { unsigned MaxTokenLength; unsigned SizeBound; - unsigned wchar_tByteWidth; + unsigned CharByteWidth; + tok::TokenKind Kind; llvm::SmallString<512> ResultBuf; char *ResultPtr; // cursor public: @@ -158,14 +164,13 @@ public: const SourceManager &sm, const LangOptions &features, const TargetInfo &target, Diagnostic *diags = 0) : SM(sm), Features(features), Target(target), Diags(diags), - MaxTokenLength(0), SizeBound(0), wchar_tByteWidth(0), - ResultPtr(ResultBuf.data()), hadError(false), AnyWide(false), Pascal(false) { + MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown), + ResultPtr(ResultBuf.data()), hadError(false), Pascal(false) { init(StringToks, NumStringToks); } bool hadError; - bool AnyWide; bool Pascal; StringRef GetString() const { @@ -174,9 +179,7 @@ public: unsigned GetStringLength() const { return ResultPtr-ResultBuf.data(); } unsigned GetNumStringChars() const { - if (AnyWide) - return GetStringLength() / wchar_tByteWidth; - return GetStringLength(); + return GetStringLength() / CharByteWidth; } /// getOffsetOfStringByte - This function returns the offset of the /// specified byte of the string data represented by Token. 
This handles @@ -185,7 +188,13 @@ public: /// If the Diagnostics pointer is non-null, then this will do semantic /// checking of the string literal and emit errors and warnings. unsigned getOffsetOfStringByte(const Token &TheTok, unsigned ByteNo) const; - + + bool isAscii() { return Kind == tok::string_literal; } + bool isWide() { return Kind == tok::wide_string_literal; } + bool isUTF8() { return Kind == tok::utf8_string_literal; } + bool isUTF16() { return Kind == tok::utf16_string_literal; } + bool isUTF32() { return Kind == tok::utf32_string_literal; } + private: void init(const Token *StringToks, unsigned NumStringToks); }; diff --git a/include/clang/Lex/Token.h b/include/clang/Lex/Token.h index 9cf11d9a64..e6dd1607e8 100644 --- a/include/clang/Lex/Token.h +++ b/include/clang/Lex/Token.h @@ -96,7 +96,10 @@ public: /// constant, string, etc. bool isLiteral() const { return is(tok::numeric_constant) || is(tok::char_constant) || - is(tok::string_literal) || is(tok::wide_string_literal) || + is(tok::wide_char_constant) || is(tok::utf16_char_constant) || + is(tok::utf32_char_constant) || is(tok::string_literal) || + is(tok::wide_string_literal) || is(tok::utf8_string_literal) || + is(tok::utf16_string_literal) || is(tok::utf32_string_literal) || is(tok::angle_string_literal); } diff --git a/include/clang/Lex/TokenConcatenation.h b/include/clang/Lex/TokenConcatenation.h index 094990a6e3..551300f402 100644 --- a/include/clang/Lex/TokenConcatenation.h +++ b/include/clang/Lex/TokenConcatenation.h @@ -63,12 +63,9 @@ namespace clang { const Token &Tok) const; private: - /// StartsWithL - Return true if the spelling of this token starts with 'L'. - bool StartsWithL(const Token &Tok) const; - - /// IsIdentifierL - Return true if the spelling of this token is literally - /// 'L'. - bool IsIdentifierL(const Token &Tok) const; + /// IsIdentifierStringPrefix - Return true if the spelling of the token + /// is literally 'L', 'u', 'U', or 'u8'. 
+ bool IsIdentifierStringPrefix(const Token &Tok) const; }; } // end clang namespace diff --git a/include/clang/Parse/Parser.h b/include/clang/Parse/Parser.h index 5d9376c1f7..83b0cd455e 100644 --- a/include/clang/Parse/Parser.h +++ b/include/clang/Parse/Parser.h @@ -265,7 +265,10 @@ private: /// bool isTokenStringLiteral() const { return Tok.getKind() == tok::string_literal || - Tok.getKind() == tok::wide_string_literal; + Tok.getKind() == tok::wide_string_literal || + Tok.getKind() == tok::utf8_string_literal || + Tok.getKind() == tok::utf16_string_literal || + Tok.getKind() == tok::utf32_string_literal; } /// \brief Returns true if the current token is a '=' or '==' and diff --git a/lib/AST/ASTImporter.cpp b/lib/AST/ASTImporter.cpp index 2ea79912d1..d6e7d77d0f 100644 --- a/lib/AST/ASTImporter.cpp +++ b/lib/AST/ASTImporter.cpp @@ -3814,8 +3814,8 @@ Expr *ASTNodeImporter::VisitCharacterLiteral(CharacterLiteral *E) { if (T.isNull()) return 0; - return new (Importer.getToContext()) CharacterLiteral(E->getValue(), - E->isWide(), T, + return new (Importer.getToContext()) CharacterLiteral(E->getValue(), + E->getKind(), T, Importer.Import(E->getLocation())); } diff --git a/lib/AST/Expr.cpp b/lib/AST/Expr.cpp index 58fb32d278..5e795be56d 100644 --- a/lib/AST/Expr.cpp +++ b/lib/AST/Expr.cpp @@ -533,8 +533,7 @@ double FloatingLiteral::getValueAsApproximateDouble() const { } StringLiteral *StringLiteral::Create(ASTContext &C, StringRef Str, - bool Wide, - bool Pascal, QualType Ty, + StringKind Kind, bool Pascal, QualType Ty, const SourceLocation *Loc, unsigned NumStrs) { // Allocate enough space for the StringLiteral plus an array of locations for @@ -549,7 +548,7 @@ StringLiteral *StringLiteral::Create(ASTContext &C, StringRef Str, memcpy(AStrData, Str.data(), Str.size()); SL->StrData = AStrData; SL->ByteLength = Str.size(); - SL->IsWide = Wide; + SL->Kind = Kind; SL->IsPascal = Pascal; SL->TokLocs[0] = Loc[0]; SL->NumConcatenated = NumStrs; @@ -587,8 +586,8 @@ void 
StringLiteral::setString(ASTContext &C, StringRef Str) { SourceLocation StringLiteral:: getLocationOfByte(unsigned ByteNo, const SourceManager &SM, const LangOptions &Features, const TargetInfo &Target) const { - assert(!isWide() && "This doesn't work for wide strings yet"); - + assert(Kind == StringLiteral::Ascii && "This only works for ASCII strings"); + // Loop over all of the tokens in this string until we find the one that // contains the byte we're looking for. unsigned TokNo = 0; diff --git a/lib/AST/StmtDumper.cpp b/lib/AST/StmtDumper.cpp index 7218af570f..ce4ae8e773 100644 --- a/lib/AST/StmtDumper.cpp +++ b/lib/AST/StmtDumper.cpp @@ -443,8 +443,13 @@ void StmtDumper::VisitStringLiteral(StringLiteral *Str) { DumpExpr(Str); // FIXME: this doesn't print wstrings right. OS << " "; - if (Str->isWide()) - OS << "L"; + switch (Str->getKind()) { + case StringLiteral::Ascii: break; // No prefix + case StringLiteral::Wide: OS << 'L'; break; + case StringLiteral::UTF8: OS << "u8"; break; + case StringLiteral::UTF16: OS << 'u'; break; + case StringLiteral::UTF32: OS << 'U'; break; + } OS << '"'; OS.write_escaped(Str->getString()); OS << '"'; diff --git a/lib/AST/StmtPrinter.cpp b/lib/AST/StmtPrinter.cpp index 8fcad14ec2..79f14bc658 100644 --- a/lib/AST/StmtPrinter.cpp +++ b/lib/AST/StmtPrinter.cpp @@ -599,8 +599,14 @@ void StmtPrinter::VisitPredefinedExpr(PredefinedExpr *Node) { void StmtPrinter::VisitCharacterLiteral(CharacterLiteral *Node) { unsigned value = Node->getValue(); - if (Node->isWide()) - OS << "L"; + + switch (Node->getKind()) { + case CharacterLiteral::Ascii: break; // no prefix. 
+ case CharacterLiteral::Wide: OS << 'L'; break; + case CharacterLiteral::UTF16: OS << 'u'; break; + case CharacterLiteral::UTF32: OS << 'U'; break; + } + switch (value) { case '\\': OS << "'\\\\'"; @@ -672,7 +678,13 @@ void StmtPrinter::VisitImaginaryLiteral(ImaginaryLiteral *Node) { } void StmtPrinter::VisitStringLiteral(StringLiteral *Str) { - if (Str->isWide()) OS << 'L'; + switch (Str->getKind()) { + case StringLiteral::Ascii: break; // no prefix. + case StringLiteral::Wide: OS << 'L'; break; + case StringLiteral::UTF8: OS << "u8"; break; + case StringLiteral::UTF16: OS << 'u'; break; + case StringLiteral::UTF32: OS << 'U'; break; + } OS << '"'; // FIXME: this doesn't print wstrings right. diff --git a/lib/AST/StmtProfile.cpp b/lib/AST/StmtProfile.cpp index 120c9e50a9..12321ef0d6 100644 --- a/lib/AST/StmtProfile.cpp +++ b/lib/AST/StmtProfile.cpp @@ -252,7 +252,7 @@ void StmtProfiler::VisitIntegerLiteral(const IntegerLiteral *S) { void StmtProfiler::VisitCharacterLiteral(const CharacterLiteral *S) { VisitExpr(S); - ID.AddBoolean(S->isWide()); + ID.AddInteger(S->getKind()); ID.AddInteger(S->getValue()); } @@ -269,7 +269,7 @@ void StmtProfiler::VisitImaginaryLiteral(const ImaginaryLiteral *S) { void StmtProfiler::VisitStringLiteral(const StringLiteral *S) { VisitExpr(S); ID.AddString(S->getString()); - ID.AddBoolean(S->isWide()); + ID.AddInteger(S->getKind()); } void StmtProfiler::VisitParenExpr(const ParenExpr *S) { diff --git a/lib/AST/Type.cpp b/lib/AST/Type.cpp index 7cd3be2fb4..2555ab31fb 100644 --- a/lib/AST/Type.cpp +++ b/lib/AST/Type.cpp @@ -635,6 +635,18 @@ bool Type::isWideCharType() const { return false; } +bool Type::isChar16Type() const { + if (const BuiltinType *BT = dyn_cast<BuiltinType>(CanonicalType)) + return BT->getKind() == BuiltinType::Char16; + return false; +} + +bool Type::isChar32Type() const { + if (const BuiltinType *BT = dyn_cast<BuiltinType>(CanonicalType)) + return BT->getKind() == BuiltinType::Char32; + return false; +} + /// \brief 
Determine whether this type is any of the built-in character /// types. bool Type::isAnyCharacterType() const { diff --git a/lib/CodeGen/CodeGenModule.cpp b/lib/CodeGen/CodeGenModule.cpp index 290fe242c9..ce32325aca 100644 --- a/lib/CodeGen/CodeGenModule.cpp +++ b/lib/CodeGen/CodeGenModule.cpp @@ -1877,8 +1877,20 @@ std::string CodeGenModule::GetStringForStringLiteral(const StringLiteral *E) { // Resize the string to the right size. uint64_t RealLen = CAT->getSize().getZExtValue(); - if (E->isWide()) + switch (E->getKind()) { + case StringLiteral::Ascii: + case StringLiteral::UTF8: + break; + case StringLiteral::Wide: RealLen *= Context.Target.getWCharWidth() / Context.getCharWidth(); + break; + case StringLiteral::UTF16: + RealLen *= Context.Target.getChar16Width() / Context.getCharWidth(); + break; + case StringLiteral::UTF32: + RealLen *= Context.Target.getChar32Width() / Context.getCharWidth(); + break; + } std::string Str = E->getString().str(); Str.resize(RealLen, '\0'); @@ -1893,7 +1905,7 @@ CodeGenModule::GetAddrOfConstantStringFromLiteral(const StringLiteral *S) { // FIXME: This can be more efficient. // FIXME: We shouldn't need to bitcast the constant in the wide string case. llvm::Constant *C = GetAddrOfConstantString(GetStringForStringLiteral(S)); - if (S->isWide()) { + if (S->isWide() || S->isUTF16() || S->isUTF32()) { llvm::Type *DestTy = llvm::PointerType::getUnqual(getTypes().ConvertType(S->getType())); C = llvm::ConstantExpr::getBitCast(C, DestTy); diff --git a/lib/Lex/Lexer.cpp b/lib/Lex/Lexer.cpp index 6c7169f89b..44674a93d7 100644 --- a/lib/Lex/Lexer.cpp +++ b/lib/Lex/Lexer.cpp @@ -1267,8 +1267,9 @@ void Lexer::LexNumericConstant(Token &Result, const char *CurPtr) { } /// LexStringLiteral - Lex the remainder of a string literal, after having lexed -/// either " or L". -void Lexer::LexStringLiteral(Token &Result, const char *CurPtr, bool Wide) { +/// either " or L" or u8" or u" or U". 
+void Lexer::LexStringLiteral(Token &Result, const char *CurPtr, + tok::TokenKind Kind) { const char *NulCharacter = 0; // Does this string contain the \0 character? char C = getAndAdvanceChar(CurPtr, Result); @@ -1299,8 +1300,7 @@ void Lexer::LexStringLiteral(Token &Result, const char *CurPtr, bool Wide) { // Update the location of the token as well as the BufferPtr instance var. const char *TokStart = Buff |