aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSeth Cantrell <seth.cantrell@gmail.com>2012-01-18 12:27:04 +0000
committerSeth Cantrell <seth.cantrell@gmail.com>2012-01-18 12:27:04 +0000
commitbe773526230b5a7121a8b321b05f2e53fa473f5c (patch)
treea755e66cb063593de8749f6f94c0aba1647d8eaf
parent7d6a7c004361c45a463874a58da28d9f3c35f2e6 (diff)
Improves support for Unicode in character literals
Updates ProcessUCNExcape() for C++. C++11 allows UCNs in character and string literals that represent control characters and basic source characters. Also C++03 allows UCNs that refer to surrogate codepoints. UTF-8 sequences in character literals are now handled as single c-chars. Added error for multiple characters in Unicode character literals. Added errors for when a the execution charset encoding of a c-char cannot be represented as a single code unit in the associated character type. Note that for the purposes of this error the asso- ciated character type for a narrow character literal is char, not int, even though in C narrow character literals have type int. git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@148389 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r--include/clang/Basic/DiagnosticLexKinds.td8
-rw-r--r--lib/Lex/LiteralSupport.cpp179
2 files changed, 121 insertions, 66 deletions
diff --git a/include/clang/Basic/DiagnosticLexKinds.td b/include/clang/Basic/DiagnosticLexKinds.td
index f4d867d4fe..547f2722c0 100644
--- a/include/clang/Basic/DiagnosticLexKinds.td
+++ b/include/clang/Basic/DiagnosticLexKinds.td
@@ -107,6 +107,8 @@ def warn_extraneous_char_constant : Warning<
"extraneous characters in character constant ignored">;
def warn_char_constant_too_large : Warning<
"character constant too long for its type">;
+def err_multichar_utf_character_literal : Error<
+ "Unicode character literals may not contain multiple characters">;
def err_exponent_has_no_digits : Error<"exponent has no digits">;
def ext_imaginary_constant : Extension<"imaginary constants are an extension">;
def err_hexconstant_requires_exponent : Error<
@@ -121,8 +123,8 @@ def warn_hex_escape_too_large : ExtWarn<"hex escape sequence out of range">;
def ext_string_too_long : Extension<"string literal of length %0 exceeds "
"maximum length %1 that %select{C90|ISO C99|C++}2 compilers are required to "
"support">, InGroup<OverlengthStrings>;
-def warn_ucn_escape_too_large : ExtWarn<
- "character unicode escape sequence too long for its type">, InGroup<Unicode>;
+def err_character_too_large : Error<
+ "character too large for enclosing character literal type">;
def warn_ucn_not_valid_in_c89 : ExtWarn<
"unicode escape sequences are only valid in C99 or C++">, InGroup<Unicode>;
def warn_cxx98_compat_unicode_literal : Warning<
@@ -132,6 +134,8 @@ def err_unsupported_string_concat : Error<
"unsupported non-standard concatenation of string literals">;
def err_bad_string_encoding : Error<
"illegal sequence in string literal">;
+def err_bad_character_encoding : Error<
+ "illegal sequence in character literal">;
//===----------------------------------------------------------------------===//
// PTH Diagnostics
diff --git a/lib/Lex/LiteralSupport.cpp b/lib/Lex/LiteralSupport.cpp
index 296a89461f..8265c6fde4 100644
--- a/lib/Lex/LiteralSupport.cpp
+++ b/lib/Lex/LiteralSupport.cpp
@@ -182,7 +182,8 @@ static unsigned ProcessCharEscape(const char *&ThisTokBuf,
static bool ProcessUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
uint32_t &UcnVal, unsigned short &UcnLen,
FullSourceLoc Loc, DiagnosticsEngine *Diags,
- const LangOptions &Features) {
+ const LangOptions &Features,
+ bool in_char_string_literal = false) {
if (!Features.CPlusPlus && !Features.C99 && Diags)
Diags->Report(Loc, diag::warn_ucn_not_valid_in_c89);
@@ -216,11 +217,20 @@ static bool ProcessUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
}
return false;
}
- // Check UCN constraints (C99 6.4.3p2).
- if ((UcnVal < 0xa0 &&
- (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60 )) // $, @, `
- || (UcnVal >= 0xD800 && UcnVal <= 0xDFFF)
- || (UcnVal > 0x10FFFF)) /* the maximum legal UTF32 value */ {
+ // Check UCN constraints (C99 6.4.3p2) [C++11 lex.charset p2]
+ bool invalid_ucn = (0xD800<=UcnVal && UcnVal<=0xDFFF) // surrogate codepoints
+ || 0x10FFFF < UcnVal; // maximum legal UTF32 value
+
+ // C++11 allows UCNs that refer to control characters and basic source
+ // characters inside character and string literals
+ if (!Features.CPlusPlus0x || !in_char_string_literal) {
+ if ((UcnVal < 0xa0 &&
+ (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60 ))) { // $, @, `
+ invalid_ucn = true;
+ }
+ }
+
+ if (invalid_ucn) {
if (Diags)
Diags->Report(Loc, diag::err_ucn_escape_invalid);
return false;
@@ -747,14 +757,13 @@ NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) {
CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
SourceLocation Loc, Preprocessor &PP,
tok::TokenKind kind) {
- // At this point we know that the character matches the regex "L?'.*'".
+ // At this point we know that the character matches the regex "(L|u|U)?'.*'".
HadError = false;
Kind = kind;
- // Determine if this is a wide or UTF character.
- if (Kind == tok::wide_char_constant || Kind == tok::utf16_char_constant ||
- Kind == tok::utf32_char_constant) {
+ // Skip over wide character determinant.
+ if (Kind != tok::char_constant) {
++begin;
}
@@ -762,6 +771,10 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
assert(begin[0] == '\'' && "Invalid token lexed");
++begin;
+ // Trim the ending quote.
+ assert(end[-1] == '\'' && "Invalid token lexed");
+ --end;
+
// FIXME: The "Value" is an uint64_t so we can handle char literals of
// up to 64-bits.
// FIXME: This extensively assumes that 'char' is 8-bits.
@@ -773,76 +786,114 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
assert(PP.getTargetInfo().getWCharWidth() <= 64 &&
"Assumes sizeof(wchar) on target is <= 64");
- // This is what we will use for overflow detection
- llvm::APInt LitVal(PP.getTargetInfo().getIntWidth(), 0);
-
- unsigned NumCharsSoFar = 0;
- bool Warned = false;
- while (begin[0] != '\'') {
- uint64_t ResultChar;
-
- // Is this a Universal Character Name escape?
- if (begin[0] != '\\') // If this is a normal character, consume it.
- ResultChar = (unsigned char)*begin++;
- else { // Otherwise, this is an escape character.
- unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());
- // Check for UCN.
- if (begin[1] == 'u' || begin[1] == 'U') {
- uint32_t utf32 = 0;
- unsigned short UcnLen = 0;
- if (!ProcessUCNEscape(begin, end, utf32, UcnLen,
- FullSourceLoc(Loc, PP.getSourceManager()),
- &PP.getDiagnostics(), PP.getLangOptions())) {
- HadError = 1;
- }
- ResultChar = utf32;
- if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
- PP.Diag(Loc, diag::warn_ucn_escape_too_large);
- ResultChar &= ~0U >> (32-CharWidth);
- }
- } else {
- // Otherwise, this is a non-UCN escape character. Process it.
- ResultChar = ProcessCharEscape(begin, end, HadError,
- FullSourceLoc(Loc,PP.getSourceManager()),
- CharWidth, &PP.getDiagnostics());
- }
- }
+ SmallVector<uint32_t,4> codepoint_buffer;
+ codepoint_buffer.resize(end-begin);
+ uint32_t *buffer_begin = &codepoint_buffer.front();
+ uint32_t *buffer_end = buffer_begin + codepoint_buffer.size();
+
+ // Unicode escapes representing characters that cannot be correctly
+ // represented in a single code unit are disallowed in character literals
+ // by this implementation.
+ uint32_t largest_character_for_kind;
+ if (tok::wide_char_constant == Kind) {
+ largest_character_for_kind = 0xFFFFFFFFu >> (32-PP.getTargetInfo().getWCharWidth());
+ } else if (tok::utf16_char_constant == Kind) {
+ largest_character_for_kind = 0xFFFF;
+ } else if (tok::utf32_char_constant == Kind) {
+ largest_character_for_kind = 0x10FFFF;
+ } else {
+ largest_character_for_kind = 0x7Fu;
+ }
- // If this is a multi-character constant (e.g. 'abc'), handle it. These are
- // implementation defined (C99 6.4.4.4p10).
- if (NumCharsSoFar) {
- if (!isAscii()) {
- // Emulate GCC's (unintentional?) behavior: L'ab' -> L'b'.
- LitVal = 0;
+ while (begin!=end) {
+ // Is this a span of non-escape characters?
+ if (begin[0] != '\\') {
+ char const *start = begin;
+ do {
+ ++begin;
+ } while (begin != end && *begin != '\\');
+
+ uint32_t *tmp_begin = buffer_begin;
+ ConversionResult res =
+ ConvertUTF8toUTF32(reinterpret_cast<UTF8 const **>(&start),
+ reinterpret_cast<UTF8 const *>(begin),
+ &buffer_begin,buffer_end,strictConversion);
+ if (res!=conversionOK) {
+ PP.Diag(Loc, diag::err_bad_character_encoding);
+ HadError = true;
} else {
- // Narrow character literals act as though their value is concatenated
- // in this implementation, but warn on overflow.
- if (LitVal.countLeadingZeros() < 8 && !Warned) {
- PP.Diag(Loc, diag::warn_char_constant_too_large);
- Warned = true;
+ for (; tmp_begin<buffer_begin; ++tmp_begin) {
+ if (*tmp_begin > largest_character_for_kind) {
+ HadError = true;
+ PP.Diag(Loc, diag::err_character_too_large);
+ }
}
- LitVal <<= 8;
}
+
+ continue;
}
+ // Is this a Universal Character Name excape?
+ if (begin[1] == 'u' || begin[1] == 'U') {
+ unsigned short UcnLen = 0;
+ if (!ProcessUCNEscape(begin, end, *buffer_begin, UcnLen,
+ FullSourceLoc(Loc, PP.getSourceManager()),
+ &PP.getDiagnostics(), PP.getLangOptions(),
+ true))
+ {
+ HadError = true;
+ } else if (*buffer_begin > largest_character_for_kind) {
+ HadError = true;
+ PP.Diag(Loc,diag::err_character_too_large);
+ }
- LitVal = LitVal + ResultChar;
- ++NumCharsSoFar;
+ ++buffer_begin;
+ continue;
+ }
+ unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());
+ uint64_t result =
+ ProcessCharEscape(begin, end, HadError,
+ FullSourceLoc(Loc,PP.getSourceManager()),
+ CharWidth, &PP.getDiagnostics());
+ *buffer_begin++ = result;
}
- // If this is the second character being processed, do special handling.
+ unsigned NumCharsSoFar = buffer_begin-&codepoint_buffer.front();
+
if (NumCharsSoFar > 1) {
- // Warn about discarding the top bits for multi-char wide-character
- // constants (L'abcd').
- if (!isAscii())
+ if (isWide())
PP.Diag(Loc, diag::warn_extraneous_char_constant);
- else if (NumCharsSoFar != 4)
+ else if (isAscii() && NumCharsSoFar == 4)
+ PP.Diag(Loc, diag::ext_four_char_character_literal);
+ else if (isAscii())
PP.Diag(Loc, diag::ext_multichar_character_literal);
else
- PP.Diag(Loc, diag::ext_four_char_character_literal);
+ PP.Diag(Loc, diag::err_multichar_utf_character_literal);
IsMultiChar = true;
} else
IsMultiChar = false;
+ llvm::APInt LitVal(PP.getTargetInfo().getIntWidth(), 0);
+
+ // Narrow character literals act as though their value is concatenated
+ // in this implementation, but warn on overflow.
+ bool multi_char_too_long = false;
+ if (isAscii() && isMultiChar()) {
+ LitVal = 0;
+ for (size_t i=0;i<NumCharsSoFar;++i) {
+ // check for enough leading zeros to shift into
+ multi_char_too_long |= (LitVal.countLeadingZeros() < 8);
+ LitVal <<= 8;
+ LitVal = LitVal + (codepoint_buffer[i] & 0xFF);
+ }
+ } else if (NumCharsSoFar > 0) {
+ // otherwise just take the last character
+ LitVal = buffer_begin[-1];
+ }
+
+ if (!HadError && multi_char_too_long) {
+ PP.Diag(Loc,diag::warn_char_constant_too_large);
+ }
+
// Transfer the value from APInt to uint64_t
Value = LitVal.getZExtValue();