about summary refs log tree commit diff
path: root/lib
diff options
context:
space:
mode:
Diffstat (limited to 'lib')
-rw-r--r-- lib/AST/ASTImporter.cpp 4
-rw-r--r-- lib/AST/Expr.cpp 9
-rw-r--r-- lib/AST/StmtDumper.cpp 9
-rw-r--r-- lib/AST/StmtPrinter.cpp 18
-rw-r--r-- lib/AST/StmtProfile.cpp 4
-rw-r--r-- lib/AST/Type.cpp 12
-rw-r--r-- lib/CodeGen/CodeGenModule.cpp 16
-rw-r--r-- lib/Lex/Lexer.cpp 77
-rw-r--r-- lib/Lex/LiteralSupport.cpp 157
-rw-r--r-- lib/Lex/MacroArgs.cpp 8
-rw-r--r-- lib/Lex/PPDirectives.cpp 4
-rw-r--r-- lib/Lex/PPExpressions.cpp 16
-rw-r--r-- lib/Lex/Pragma.cpp 6
-rw-r--r-- lib/Lex/TokenConcatenation.cpp 64
-rw-r--r-- lib/Parse/ParseCXXInlineMethods.cpp 3
-rw-r--r-- lib/Parse/ParseExpr.cpp 6
-rw-r--r-- lib/Parse/ParseTentative.cpp 6
-rw-r--r-- lib/Parse/Parser.cpp 3
-rw-r--r-- lib/Rewrite/HTMLRewrite.cpp 9
-rw-r--r-- lib/Rewrite/RewriteObjC.cpp 19
-rw-r--r-- lib/Sema/SemaChecking.cpp 4
-rw-r--r-- lib/Sema/SemaDeclAttr.cpp 12
-rw-r--r-- lib/Sema/SemaExpr.cpp 39
-rw-r--r-- lib/Sema/SemaExprCXX.cpp 20
-rw-r--r-- lib/Sema/SemaExprObjC.cpp 6
-rw-r--r-- lib/Sema/SemaInit.cpp 32
-rw-r--r-- lib/Sema/SemaStmt.cpp 8
-rw-r--r-- lib/Sema/SemaTemplate.cpp 18
-rw-r--r-- lib/Serialization/ASTReaderStmt.cpp 4
-rw-r--r-- lib/Serialization/ASTWriterStmt.cpp 4
30 files changed, 403 insertions, 194 deletions
diff --git a/lib/AST/ASTImporter.cpp b/lib/AST/ASTImporter.cpp
index 2ea79912d1..d6e7d77d0f 100644
--- a/lib/AST/ASTImporter.cpp
+++ b/lib/AST/ASTImporter.cpp
@@ -3814,8 +3814,8 @@ Expr *ASTNodeImporter::VisitCharacterLiteral(CharacterLiteral *E) {
if (T.isNull())
return 0;
- return new (Importer.getToContext()) CharacterLiteral(E->getValue(),
- E->isWide(), T,
+ return new (Importer.getToContext()) CharacterLiteral(E->getValue(),
+ E->getKind(), T,
Importer.Import(E->getLocation()));
}
diff --git a/lib/AST/Expr.cpp b/lib/AST/Expr.cpp
index 58fb32d278..5e795be56d 100644
--- a/lib/AST/Expr.cpp
+++ b/lib/AST/Expr.cpp
@@ -533,8 +533,7 @@ double FloatingLiteral::getValueAsApproximateDouble() const {
}
StringLiteral *StringLiteral::Create(ASTContext &C, StringRef Str,
- bool Wide,
- bool Pascal, QualType Ty,
+ StringKind Kind, bool Pascal, QualType Ty,
const SourceLocation *Loc,
unsigned NumStrs) {
// Allocate enough space for the StringLiteral plus an array of locations for
@@ -549,7 +548,7 @@ StringLiteral *StringLiteral::Create(ASTContext &C, StringRef Str,
memcpy(AStrData, Str.data(), Str.size());
SL->StrData = AStrData;
SL->ByteLength = Str.size();
- SL->IsWide = Wide;
+ SL->Kind = Kind;
SL->IsPascal = Pascal;
SL->TokLocs[0] = Loc[0];
SL->NumConcatenated = NumStrs;
@@ -587,8 +586,8 @@ void StringLiteral::setString(ASTContext &C, StringRef Str) {
SourceLocation StringLiteral::
getLocationOfByte(unsigned ByteNo, const SourceManager &SM,
const LangOptions &Features, const TargetInfo &Target) const {
- assert(!isWide() && "This doesn't work for wide strings yet");
-
+ assert(Kind == StringLiteral::Ascii && "This only works for ASCII strings");
+
// Loop over all of the tokens in this string until we find the one that
// contains the byte we're looking for.
unsigned TokNo = 0;
diff --git a/lib/AST/StmtDumper.cpp b/lib/AST/StmtDumper.cpp
index 7218af570f..ce4ae8e773 100644
--- a/lib/AST/StmtDumper.cpp
+++ b/lib/AST/StmtDumper.cpp
@@ -443,8 +443,13 @@ void StmtDumper::VisitStringLiteral(StringLiteral *Str) {
DumpExpr(Str);
// FIXME: this doesn't print wstrings right.
OS << " ";
- if (Str->isWide())
- OS << "L";
+ switch (Str->getKind()) {
+ case StringLiteral::Ascii: break; // No prefix
+ case StringLiteral::Wide: OS << 'L'; break;
+ case StringLiteral::UTF8: OS << "u8"; break;
+ case StringLiteral::UTF16: OS << 'u'; break;
+ case StringLiteral::UTF32: OS << 'U'; break;
+ }
OS << '"';
OS.write_escaped(Str->getString());
OS << '"';
diff --git a/lib/AST/StmtPrinter.cpp b/lib/AST/StmtPrinter.cpp
index 8fcad14ec2..79f14bc658 100644
--- a/lib/AST/StmtPrinter.cpp
+++ b/lib/AST/StmtPrinter.cpp
@@ -599,8 +599,14 @@ void StmtPrinter::VisitPredefinedExpr(PredefinedExpr *Node) {
void StmtPrinter::VisitCharacterLiteral(CharacterLiteral *Node) {
unsigned value = Node->getValue();
- if (Node->isWide())
- OS << "L";
+
+ switch (Node->getKind()) {
+ case CharacterLiteral::Ascii: break; // no prefix.
+ case CharacterLiteral::Wide: OS << 'L'; break;
+ case CharacterLiteral::UTF16: OS << 'u'; break;
+ case CharacterLiteral::UTF32: OS << 'U'; break;
+ }
+
switch (value) {
case '\\':
OS << "'\\\\'";
@@ -672,7 +678,13 @@ void StmtPrinter::VisitImaginaryLiteral(ImaginaryLiteral *Node) {
}
void StmtPrinter::VisitStringLiteral(StringLiteral *Str) {
- if (Str->isWide()) OS << 'L';
+ switch (Str->getKind()) {
+ case StringLiteral::Ascii: break; // no prefix.
+ case StringLiteral::Wide: OS << 'L'; break;
+ case StringLiteral::UTF8: OS << "u8"; break;
+ case StringLiteral::UTF16: OS << 'u'; break;
+ case StringLiteral::UTF32: OS << 'U'; break;
+ }
OS << '"';
// FIXME: this doesn't print wstrings right.
diff --git a/lib/AST/StmtProfile.cpp b/lib/AST/StmtProfile.cpp
index 120c9e50a9..12321ef0d6 100644
--- a/lib/AST/StmtProfile.cpp
+++ b/lib/AST/StmtProfile.cpp
@@ -252,7 +252,7 @@ void StmtProfiler::VisitIntegerLiteral(const IntegerLiteral *S) {
void StmtProfiler::VisitCharacterLiteral(const CharacterLiteral *S) {
VisitExpr(S);
- ID.AddBoolean(S->isWide());
+ ID.AddInteger(S->getKind());
ID.AddInteger(S->getValue());
}
@@ -269,7 +269,7 @@ void StmtProfiler::VisitImaginaryLiteral(const ImaginaryLiteral *S) {
void StmtProfiler::VisitStringLiteral(const StringLiteral *S) {
VisitExpr(S);
ID.AddString(S->getString());
- ID.AddBoolean(S->isWide());
+ ID.AddInteger(S->getKind());
}
void StmtProfiler::VisitParenExpr(const ParenExpr *S) {
diff --git a/lib/AST/Type.cpp b/lib/AST/Type.cpp
index 7cd3be2fb4..2555ab31fb 100644
--- a/lib/AST/Type.cpp
+++ b/lib/AST/Type.cpp
@@ -635,6 +635,18 @@ bool Type::isWideCharType() const {
return false;
}
+bool Type::isChar16Type() const {
+  // True only when the canonical type is the Char16 builtin kind.
+  const BuiltinType *BTy = dyn_cast<BuiltinType>(CanonicalType);
+  return BTy && BTy->getKind() == BuiltinType::Char16;
+}
+
+bool Type::isChar32Type() const {
+  // True only when the canonical type is the Char32 builtin kind.
+  const BuiltinType *BTy = dyn_cast<BuiltinType>(CanonicalType);
+  return BTy && BTy->getKind() == BuiltinType::Char32;
+}
+
/// \brief Determine whether this type is any of the built-in character
/// types.
bool Type::isAnyCharacterType() const {
diff --git a/lib/CodeGen/CodeGenModule.cpp b/lib/CodeGen/CodeGenModule.cpp
index 290fe242c9..ce32325aca 100644
--- a/lib/CodeGen/CodeGenModule.cpp
+++ b/lib/CodeGen/CodeGenModule.cpp
@@ -1877,8 +1877,20 @@ std::string CodeGenModule::GetStringForStringLiteral(const StringLiteral *E) {
// Resize the string to the right size.
uint64_t RealLen = CAT->getSize().getZExtValue();
- if (E->isWide())
+ switch (E->getKind()) {
+ case StringLiteral::Ascii:
+ case StringLiteral::UTF8:
+ break;
+ case StringLiteral::Wide:
RealLen *= Context.Target.getWCharWidth() / Context.getCharWidth();
+ break;
+ case StringLiteral::UTF16:
+ RealLen *= Context.Target.getChar16Width() / Context.getCharWidth();
+ break;
+ case StringLiteral::UTF32:
+ RealLen *= Context.Target.getChar32Width() / Context.getCharWidth();
+ break;
+ }
std::string Str = E->getString().str();
Str.resize(RealLen, '\0');
@@ -1893,7 +1905,7 @@ CodeGenModule::GetAddrOfConstantStringFromLiteral(const StringLiteral *S) {
// FIXME: This can be more efficient.
// FIXME: We shouldn't need to bitcast the constant in the wide string case.
llvm::Constant *C = GetAddrOfConstantString(GetStringForStringLiteral(S));
- if (S->isWide()) {
+ if (S->isWide() || S->isUTF16() || S->isUTF32()) {
llvm::Type *DestTy =
llvm::PointerType::getUnqual(getTypes().ConvertType(S->getType()));
C = llvm::ConstantExpr::getBitCast(C, DestTy);
diff --git a/lib/Lex/Lexer.cpp b/lib/Lex/Lexer.cpp
index 6c7169f89b..44674a93d7 100644
--- a/lib/Lex/Lexer.cpp
+++ b/lib/Lex/Lexer.cpp
@@ -1267,8 +1267,9 @@ void Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
}
/// LexStringLiteral - Lex the remainder of a string literal, after having lexed
-/// either " or L".
-void Lexer::LexStringLiteral(Token &Result, const char *CurPtr, bool Wide) {
+/// either " or L" or u8" or u" or U".
+void Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
+ tok::TokenKind Kind) {
const char *NulCharacter = 0; // Does this string contain the \0 character?
char C = getAndAdvanceChar(CurPtr, Result);
@@ -1299,8 +1300,7 @@ void Lexer::LexStringLiteral(Token &Result, const char *CurPtr, bool Wide) {
// Update the location of the token as well as the BufferPtr instance var.
const char *TokStart = BufferPtr;
- FormTokenWithChars(Result, CurPtr,
- Wide ? tok::wide_string_literal : tok::string_literal);
+ FormTokenWithChars(Result, CurPtr, Kind);
Result.setLiteralData(TokStart);
}
@@ -1339,8 +1339,9 @@ void Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
/// LexCharConstant - Lex the remainder of a character constant, after having
-/// lexed either ' or L'.
-void Lexer::LexCharConstant(Token &Result, const char *CurPtr) {
+/// lexed either ' or L' or u' or U'.
+void Lexer::LexCharConstant(Token &Result, const char *CurPtr,
+ tok::TokenKind Kind) {
const char *NulCharacter = 0; // Does this character contain the \0 character?
char C = getAndAdvanceChar(CurPtr, Result);
@@ -1377,7 +1378,7 @@ void Lexer::LexCharConstant(Token &Result, const char *CurPtr) {
// Update the location of token as well as BufferPtr.
const char *TokStart = BufferPtr;
- FormTokenWithChars(Result, CurPtr, tok::char_constant);
+ FormTokenWithChars(Result, CurPtr, Kind);
Result.setLiteralData(TokStart);
}
@@ -2185,6 +2186,55 @@ LexNextToken:
MIOpt.ReadToken();
return LexNumericConstant(Result, CurPtr);
+ case 'u': // Identifier (uber) or C++0x UTF-8 or UTF-16 string literal
+ // Notify MIOpt that we read a non-whitespace/non-comment token.
+ MIOpt.ReadToken();
+
+ if (Features.CPlusPlus0x) {
+ Char = getCharAndSize(CurPtr, SizeTmp);
+
+ // UTF-16 string literal
+ if (Char == '"')
+ return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
+ tok::utf16_string_literal);
+
+ // UTF-16 character constant
+ if (Char == '\'')
+ return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
+ tok::utf16_char_constant);
+
+ // UTF-8 string literal
+ if (Char == '8' && getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
+ return LexStringLiteral(Result,
+ ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
+ SizeTmp2, Result),
+ tok::utf8_string_literal);
+ }
+
+ // treat u like the start of an identifier.
+ return LexIdentifier(Result, CurPtr);
+
+ case 'U': // Identifier (Uber) or C++0x UTF-32 string literal
+ // Notify MIOpt that we read a non-whitespace/non-comment token.
+ MIOpt.ReadToken();
+
+ if (Features.CPlusPlus0x) {
+ Char = getCharAndSize(CurPtr, SizeTmp);
+
+ // UTF-32 string literal
+ if (Char == '"')
+ return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
+ tok::utf32_string_literal);
+
+ // UTF-32 character constant
+ if (Char == '\'')
+ return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
+ tok::utf32_char_constant);
+ }
+
+ // treat U like the start of an identifier.
+ return LexIdentifier(Result, CurPtr);
+
case 'L': // Identifier (Loony) or wide literal (L'x' or L"xyz").
// Notify MIOpt that we read a non-whitespace/non-comment token.
MIOpt.ReadToken();
@@ -2193,21 +2243,22 @@ LexNextToken:
// Wide string literal.
if (Char == '"')
return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
- true);
+ tok::wide_string_literal);
// Wide character constant.
if (Char == '\'')
- return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
+ return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
+ tok::wide_char_constant);
// FALL THROUGH, treating L like the start of an identifier.
// C99 6.4.2: Identifiers.
case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
case 'H': case 'I': case 'J': case 'K': /*'L'*/case 'M': case 'N':
- case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': case 'U':
+ case 'O': case 'P': case 'Q': case 'R': case 'S': case 'T': /*'U'*/
case 'V': case 'W': case 'X': case 'Y': case 'Z':
case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
- case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
+ case 'o': case 'p': case 'q': case 'r': case 's': case 't': /*'u'*/
case 'v': case 'w': case 'x': case 'y': case 'z':
case '_':
// Notify MIOpt that we read a non-whitespace/non-comment token.
@@ -2230,13 +2281,13 @@ LexNextToken:
case '\'':
// Notify MIOpt that we read a non-whitespace/non-comment token.
MIOpt.ReadToken();
- return LexCharConstant(Result, CurPtr);
+ return LexCharConstant(Result, CurPtr, tok::char_constant);
// C99 6.4.5: String Literals.
case '"':
// Notify MIOpt that we read a non-whitespace/non-comment token.
MIOpt.ReadToken();
- return LexStringLiteral(Result, CurPtr, false);
+ return LexStringLiteral(Result, CurPtr, tok::string_literal);
// C99 6.4.6: Punctuators.
case '?':
diff --git a/lib/Lex/LiteralSupport.cpp b/lib/Lex/LiteralSupport.cpp
index f8a2a55117..82493408e6 100644
--- a/lib/Lex/LiteralSupport.cpp
+++ b/lib/Lex/LiteralSupport.cpp
@@ -28,12 +28,31 @@ static int HexDigitValue(char C) {
return -1;
}
+static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) {
+ switch (kind) { // Map a literal token kind to the target width of its character type; callers treat the result as bits.
+ default: assert(0 && "Unknown token type!"); // No return here: if the assert is compiled out, control falls into the char cases.
+ case tok::char_constant:
+ case tok::string_literal:
+ case tok::utf8_string_literal: // u8"..." uses the plain char width.
+ return Target.getCharWidth();
+ case tok::wide_char_constant:
+ case tok::wide_string_literal:
+ return Target.getWCharWidth(); // L'x' and L"..."
+ case tok::utf16_char_constant:
+ case tok::utf16_string_literal:
+ return Target.getChar16Width(); // u'x' and u"..."
+ case tok::utf32_char_constant:
+ case tok::utf32_string_literal:
+ return Target.getChar32Width(); // U'x' and U"..."
+ }
+}
+
/// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
/// either a character or a string literal.
static unsigned ProcessCharEscape(const char *&ThisTokBuf,
const char *ThisTokEnd, bool &HadError,
- FullSourceLoc Loc, bool IsWide,
- Diagnostic *Diags, const TargetInfo &Target) {
+ FullSourceLoc Loc, unsigned CharWidth,
+ Diagnostic *Diags) {
// Skip the '\' char.
++ThisTokBuf;
@@ -98,9 +117,6 @@ static unsigned ProcessCharEscape(const char *&ThisTokBuf,
}
// See if any bits will be truncated when evaluated as a character.
- unsigned CharWidth =
- IsWide ? Target.getWCharWidth() : Target.getCharWidth();
-
if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
Overflow = true;
ResultChar &= ~0U >> (32-CharWidth);
@@ -128,9 +144,6 @@ static unsigned ProcessCharEscape(const char *&ThisTokBuf,
ThisTokBuf[0] >= '0' && ThisTokBuf[0] <= '7');
// Check for overflow. Reject '\777', but not L'\777'.
- unsigned CharWidth =
- IsWide ? Target.getWCharWidth() : Target.getCharWidth();
-
if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
if (Diags)
Diags->Report(Loc, diag::warn_octal_escape_too_large);
@@ -219,8 +232,8 @@ static bool ProcessUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
/// we will likely rework our support for UCN's.
static void EncodeUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
char *&ResultBuf, bool &HadError,
- FullSourceLoc Loc, bool wide, Diagnostic *Diags,
- const LangOptions &Features) {
+ FullSourceLoc Loc, unsigned CharByteWidth,
+ Diagnostic *Diags, const LangOptions &Features) {
typedef uint32_t UTF32;
UTF32 UcnVal = 0;
unsigned short UcnLen = 0;
@@ -230,19 +243,22 @@ static void EncodeUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
return;
}
- if (wide) {
- (void)UcnLen;
- assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported");
+  assert((CharByteWidth == 1 || CharByteWidth == 2 || CharByteWidth == 4) &&
+         "only character widths of 1, 2, or 4 bytes supported");
- if (!Features.ShortWChar) {
- // Note: our internal rep of wide char tokens is always little-endian.
- *ResultBuf++ = (UcnVal & 0x000000FF);
- *ResultBuf++ = (UcnVal & 0x0000FF00) >> 8;
- *ResultBuf++ = (UcnVal & 0x00FF0000) >> 16;
- *ResultBuf++ = (UcnVal & 0xFF000000) >> 24;
- return;
- }
+ (void)UcnLen;
+ assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported");
+
+ if (CharByteWidth == 4) {
+ // Note: our internal rep of wide char tokens is always little-endian.
+ *ResultBuf++ = (UcnVal & 0x000000FF);
+ *ResultBuf++ = (UcnVal & 0x0000FF00) >> 8;
+ *ResultBuf++ = (UcnVal & 0x00FF0000) >> 16;
+ *ResultBuf++ = (UcnVal & 0xFF000000) >> 24;
+ return;
+ }
+ if (CharByteWidth == 2) {
// Convert to UTF16.
if (UcnVal < (UTF32)0xFFFF) {
*ResultBuf++ = (UcnVal & 0x000000FF);
@@ -261,6 +277,9 @@ static void EncodeUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
*ResultBuf++ = (surrogate2 & 0x0000FF00) >> 8;
return;
}
+
+ assert(CharByteWidth == 1 && "UTF-8 encoding is only for 1 byte characters");
+
// Now that we've parsed/checked the UCN, we convert from UTF32->UTF8.
// The conversion below was inspired by:
// http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
@@ -695,13 +714,18 @@ NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) {
CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
- SourceLocation Loc, Preprocessor &PP) {
+ SourceLocation Loc, Preprocessor &PP,
+ tok::TokenKind kind) {
// At this point we know that the character matches the regex "L?'.*'".
HadError = false;
- // Determine if this is a wide character.
- IsWide = begin[0] == 'L';
- if (IsWide) ++begin;
+ Kind = kind;
+
+ // Determine if this is a wide or UTF character.
+ if (Kind == tok::wide_char_constant || Kind == tok::utf16_char_constant ||
+ Kind == tok::utf32_char_constant) {
+ ++begin;
+ }
// Skip over the entry quote.
assert(begin[0] == '\'' && "Invalid token lexed");
@@ -742,17 +766,17 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
ResultChar = utf32;
} else {
// Otherwise, this is a non-UCN escape character. Process it.
+ unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());
ResultChar = ProcessCharEscape(begin, end, HadError,
FullSourceLoc(Loc,PP.getSourceManager()),
- IsWide,
- &PP.getDiagnostics(), PP.getTargetInfo());
+ CharWidth, &PP.getDiagnostics());
}
}
// If this is a multi-character constant (e.g. 'abc'), handle it. These are
// implementation defined (C99 6.4.4.4p10).
if (NumCharsSoFar) {
- if (IsWide) {
+ if (!isAscii()) {
// Emulate GCC's (unintentional?) behavior: L'ab' -> L'b'.
LitVal = 0;
} else {
@@ -774,8 +798,8 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
if (NumCharsSoFar > 1) {
// Warn about discarding the top bits for multi-char wide-character
// constants (L'abcd').
- if (IsWide)
- PP.Diag(Loc, diag::warn_extraneous_wide_char_constant);
+ if (!isAscii())
+ PP.Diag(Loc, diag::warn_extraneous_char_constant);
else if (NumCharsSoFar != 4)
PP.Diag(Loc, diag::ext_multichar_character_literal);
else
@@ -787,14 +811,15 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
// Transfer the value from APInt to uint64_t
Value = LitVal.getZExtValue();
- if (IsWide && PP.getLangOptions().ShortWChar && Value > 0xFFFF)
+ if (((isWide() && PP.getLangOptions().ShortWChar) || isUTF16()) &&
+ Value > 0xFFFF)
PP.Diag(Loc, diag::warn_ucn_escape_too_large);
// If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")
// if 'char' is signed for this target (C99 6.4.4.4p10). Note that multiple
// character constants are not sign extended in the this implementation:
// '\xFF\xFF' = 65536 and '\x0\xFF' = 255, which matches GCC.
- if (!IsWide && NumCharsSoFar == 1 && (Value & 128) &&
+ if (isAscii() && NumCharsSoFar == 1 && (Value & 128) &&
PP.getLangOptions().CharIsSigned)
Value = (signed char)Value;
}
@@ -839,8 +864,8 @@ StringLiteralParser(const Token *StringToks, unsigned NumStringToks,
Preprocessor &PP, bool Complain)
: SM(PP.getSourceManager()), Features(PP.getLangOptions()),
Target(PP.getTargetInfo()), Diags(Complain ? &PP.getDiagnostics() : 0),
- MaxTokenLength(0), SizeBound(0), wchar_tByteWidth(0),
- ResultPtr(ResultBuf.data()), hadError(false), AnyWide(false), Pascal(false) {
+ MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
+ ResultPtr(ResultBuf.data()), hadError(false), Pascal(false) {
init(StringToks, NumStringToks);
}
@@ -860,7 +885,7 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
MaxTokenLength = StringToks[0].getLength();
assert(StringToks[0].getLength() >= 2 && "literal token is invalid!");
SizeBound = StringToks[0].getLength()-2; // -2 for "".
- AnyWide = StringToks[0].is(tok::wide_string_literal);
+ Kind = StringToks[0].getKind();
hadError = false;
@@ -881,8 +906,18 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
if (StringToks[i].getLength() > MaxTokenLength)
MaxTokenLength = StringToks[i].getLength();
- // Remember if we see any wide strings.
- AnyWide |= StringToks[i].is(tok::wide_string_literal);
+ // Remember if we see any wide or utf-8/16/32 strings.
+ // Also check for illegal concatenations.
+ if (StringToks[i].isNot(Kind) && StringToks[i].isNot(tok::string_literal)) {
+ if (isAscii()) {
+ Kind = StringToks[i].getKind();
+ } else {
+ if (Diags)
+ Diags->Report(FullSourceLoc(StringToks[i].getLocation(), SM),
+ diag::err_unsupported_string_concat);
+ hadError = true;
+ }
+ }
}
// Include space for the null terminator.
@@ -890,19 +925,14 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
// TODO: K&R warning: "traditional C rejects string constant concatenation"
- // Get the width in bytes of wchar_t. If no wchar_t strings are used, do not
- // query the target. As such, wchar_tByteWidth is only valid if AnyWide=true.
- wchar_tByteWidth = ~0U;
- if (AnyWide) {
- wchar_tByteWidth = Target.getWCharWidth();
- assert((wchar_tByteWidth & 7) == 0 && "Assumes wchar_t is byte multiple!");
- wchar_tByteWidth /= 8;
- }
+ // Get the width in bytes of char/wchar_t/char16_t/char32_t
+ CharByteWidth = getCharWidth(Kind, Target);
+ assert((CharByteWidth & 7) == 0 && "Assumes character size is byte multiple");
+ CharByteWidth /= 8;
// The output buffer size needs to be large enough to hold wide characters.
// This is a worst-case assumption which basically corresponds to L"" "long".
- if (AnyWide)
- SizeBound *= wchar_tByteWidth;
+ SizeBound *= CharByteWidth;
// Size the temporary buffer to hold the result string data.
ResultBuf.resize(SizeBound);
@@ -927,18 +957,19 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
Lexer::getSpelling(StringToks[i], ThisTokBuf, SM, Features,
&StringInvalid);
if (StringInvalid) {
- hadError = 1;
+ hadError = true;
continue;
}
const char *ThisTokEnd = ThisTokBuf+ThisTokLen-1; // Skip end quote.
- bool wide = false;
// TODO: Input character set mapping support.
// Skip L marker for wide strings.
- if (ThisTokBuf[0] == 'L') {
- wide = true;
+ if (ThisTokBuf[0] == 'L' || ThisTokBuf[0] == 'u' || ThisTokBuf[0] == 'U') {
++ThisTokBuf;
+ // Skip 8 of u8 marker for utf8 strings.
+ if (ThisTokBuf[0] == '8')
+ ++ThisTokBuf;
}
assert(ThisTokBuf[0] == '"' && "Expected quote, lexer broken?");
@@ -967,7 +998,7 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
// Copy the character span over.
unsigned Len = ThisTokBuf-InStart;
- if (!AnyWide) {
+ if (CharByteWidth == 1) {
memcpy(ResultPtr, InStart, Len);
ResultPtr += Len;
} else {
@@ -975,7 +1006,7 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
for (; Len; --Len, ++InStart) {
*ResultPtr++ = InStart[0];
// Add zeros at the end.
- for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
+ for (unsigned i = 1, e = CharByteWidth; i != e; ++i)
*ResultPtr++ = 0;
}
}
@@ -985,29 +1016,26 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U') {
EncodeUCNEscape(ThisTokBuf, ThisTokEnd, ResultPtr,
hadError, FullSourceLoc(StringToks[i].getLocation(),SM),
- wide, Diags, Features);
+ CharByteWidth, Diags, Features);
continue;
}
// Otherwise, this is a non-UCN escape character. Process it.
unsigned ResultChar =
ProcessCharEscape(ThisTokBuf, ThisTokEnd, hadError,
FullSourceLoc(StringToks[i].getLocation(), SM),
- AnyWide, Diags, Target);
+ CharByteWidth*8, Diags);
// Note: our internal rep of wide char tokens is always little-endian.
*ResultPtr++ = ResultChar & 0xFF;
- if (AnyWide) {
- for (unsigned i = 1, e = wchar_tByteWidth; i != e; ++i)
- *ResultPtr++ = ResultChar >> i*8;
- }
+ for (unsigned i = 1, e = CharByteWidth; i != e; ++i)
+ *ResultPtr++ = ResultChar >> i*8;
}
}
if (Pascal) {
ResultBuf[0] = ResultPtr-&ResultBuf[0]-1;
- if (AnyWide)
- ResultBuf[0] /= wchar_tByteWidth;
+ ResultBuf[0] /= CharByteWidth;
// Verify that pascal strings aren't too large.
if (GetStringLength() > 256) {
@@ -1016,7 +1044,7 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
diag::err_pascal_string_too_long)
<< SourceRange(StringToks[0].getLocation(),
StringToks[NumStringToks-1].getLocation());
- hadError = 1;
+ hadError = true;
return;
}
} else if (Diags) {
@@ -1050,7 +1078,8 @@ unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
if (StringInvalid)
return 0;
- assert(SpellingPtr[0] != 'L' && "Doesn't handle wide strings yet");
+ assert(SpellingPtr[0] != 'L' && SpellingPtr[0] != 'u' &&
+ SpellingPtr[0] != 'U' && "Doesn't handle wide or utf strings yet");
const char *SpellingStart = SpellingPtr;
@@ -1075,7 +1104,7 @@ unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
bool HadError = false;
ProcessCharEscape(SpellingPtr, SpellingEnd, HadError,
FullSourceLoc(Tok.getLocation(), SM),
- false, Diags, Target);
+ CharByteWidth*8, Diags);
assert(!HadError && "This method isn't valid on erroneous strings");
--ByteNo;
}
diff --git a/lib/Lex/MacroArgs.cpp b/lib/Lex/MacroArgs.cpp
index 968c15e3c2..ccd0b705c8 100644
--- a/lib/Lex/MacroArgs.cpp
+++ b/lib/Lex/MacroArgs.cpp
@@ -208,7 +208,13 @@ Token MacroArgs::StringifyArgument(const Token *ArgToks,
// by 6.10.3.2p2.
if (Tok.is(tok::string_literal) || // "foo"
Tok.is(tok::wide_string_literal) || // L"foo"
- Tok.is(tok::char_constant)) { // 'x' and L'x'.
+ Tok.is(tok::utf8_string_literal) || // u8"foo"
+ Tok.is(tok::utf16_string_literal) || // u"foo"
+ Tok.is(tok::utf32_string_literal) || // U"foo"
+ Tok.is(tok::char_constant) || // 'x'
+ Tok.is(tok::wide_char_constant) || // L'x'.
+ Tok.is(tok::utf16_char_constant) || // u'x'.
+ Tok.is(tok::utf32_char_constant)) { // U'x'.
bool Invalid = false;
std::string TokStr = PP.getSpelling(Tok, &Invalid);
if (!Invalid) {
diff --git a/lib/Lex/PPDirectives.cpp b/lib/Lex/PPDirectives.cpp
index 212ffeef1b..383c6f5aa1 100644
--- a/lib/Lex/PPDirectives.cpp
+++ b/