Improves support for Unicode in character literals

Updates ProcessUCNExcape() for C++. C++11 allows UCNs in character and string literals that represent control characters and basic source characters. Also C++03 allows UCNs that refer to surrogate codepoints. UTF-8 sequences in character literals are now handled as single c-chars. Added error for multiple characters in Unicode character literals. Added errors for when a the execution charset encoding of a c-char cannot be represented as a single code unit in the associated character type. Note that for the purposes of this error the asso- ciated character type for a narrow character literal is char, not int, even though in C narrow character literals have type int. git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@148389 91177308-0d34-0410-b5e6-96231b3b80d8
author: Seth Cantrell <seth.cantrell@gmail.com> 2012-01-18 12:27:04 +0000
committer: Seth Cantrell <seth.cantrell@gmail.com> 2012-01-18 12:27:04 +0000
commit: be773526230b5a7121a8b321b05f2e53fa473f5c (patch)
tree: a755e66cb063593de8749f6f94c0aba1647d8eaf
parent: 7d6a7c004361c45a463874a58da28d9f3c35f2e6 (diff)
2 files changed, 121 insertions, 66 deletions
diff --git a/include/clang/Basic/DiagnosticLexKinds.td b/include/clang/Basic/DiagnosticLexKinds.td
index f4d867d4fe..547f2722c0 100644
--- a/include/clang/Basic/DiagnosticLexKinds.td
+++ b/include/clang/Basic/DiagnosticLexKinds.td
@@ -107,6 +107,8 @@ def warn_extraneous_char_constant : Warning<
   "extraneous characters in character constant ignored">;
 def warn_char_constant_too_large : Warning<
   "character constant too long for its type">;
+def err_multichar_utf_character_literal : Error<
+  "Unicode character literals may not contain multiple characters">;
 def err_exponent_has_no_digits : Error<"exponent has no digits">;
 def ext_imaginary_constant : Extension<"imaginary constants are an extension">;
 def err_hexconstant_requires_exponent : Error<
@@ -121,8 +123,8 @@ def warn_hex_escape_too_large : ExtWarn<"hex escape sequence out of range">;
 def ext_string_too_long : Extension<"string literal of length %0 exceeds "
   "maximum length %1 that %select{C90|ISO C99|C++}2 compilers are required to "
   "support">, InGroup<OverlengthStrings>;
-def warn_ucn_escape_too_large : ExtWarn<
-  "character unicode escape sequence too long for its type">, InGroup<Unicode>;
+def err_character_too_large : Error<
+  "character too large for enclosing character literal type">;
 def warn_ucn_not_valid_in_c89 : ExtWarn<
   "unicode escape sequences are only valid in C99 or C++">, InGroup<Unicode>;
 def warn_cxx98_compat_unicode_literal : Warning<
@@ -132,6 +134,8 @@ def err_unsupported_string_concat : Error<
   "unsupported non-standard concatenation of string literals">;
 def err_bad_string_encoding : Error<
   "illegal sequence in string literal">;
+def err_bad_character_encoding : Error<
+  "illegal sequence in character literal">;
   
 //===----------------------------------------------------------------------===//
 // PTH Diagnostics
diff --git a/lib/Lex/LiteralSupport.cpp b/lib/Lex/LiteralSupport.cpp
index 296a89461f..8265c6fde4 100644
--- a/lib/Lex/LiteralSupport.cpp
+++ b/lib/Lex/LiteralSupport.cpp
@@ -182,7 +182,8 @@ static unsigned ProcessCharEscape(const char *&ThisTokBuf,
 static bool ProcessUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
                              uint32_t &UcnVal, unsigned short &UcnLen,
                              FullSourceLoc Loc, DiagnosticsEngine *Diags, 
-                             const LangOptions &Features) {
+                             const LangOptions &Features,
+                             bool in_char_string_literal = false) {
   if (!Features.CPlusPlus && !Features.C99 && Diags)
     Diags->Report(Loc, diag::warn_ucn_not_valid_in_c89);
 
@@ -216,11 +217,20 @@ static bool ProcessUCNEscape(const char *&ThisTokBuf, const char *ThisTokEnd,
     }
     return false;
   }
-  // Check UCN constraints (C99 6.4.3p2).
-  if ((UcnVal < 0xa0 &&
-      (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60 )) // $, @, `
-      || (UcnVal >= 0xD800 && UcnVal <= 0xDFFF)
-      || (UcnVal > 0x10FFFF)) /* the maximum legal UTF32 value */ {
+  // Check UCN constraints (C99 6.4.3p2) [C++11 lex.charset p2]
+  bool invalid_ucn = (0xD800<=UcnVal && UcnVal<=0xDFFF) // surrogate codepoints
+                       || 0x10FFFF < UcnVal; // maximum legal UTF32 value
+
+  // C++11 allows UCNs that refer to control characters and basic source
+  // characters inside character and string literals
+  if (!Features.CPlusPlus0x || !in_char_string_literal) {
+    if ((UcnVal < 0xa0 &&
+         (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60 ))) {  // $, @, `
+      invalid_ucn = true;
+    }
+  }
+
+  if (invalid_ucn) {
     if (Diags)
       Diags->Report(Loc, diag::err_ucn_escape_invalid);
     return false;
@@ -747,14 +757,13 @@ NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) {
 CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
                                      SourceLocation Loc, Preprocessor &PP,
                                      tok::TokenKind kind) {
-  // At this point we know that the character matches the regex "L?'.*'".
+  // At this point we know that the character matches the regex "(L|u|U)?'.*'".
   HadError = false;
 
   Kind = kind;
 
-  // Determine if this is a wide or UTF character.
-  if (Kind == tok::wide_char_constant || Kind == tok::utf16_char_constant ||
-      Kind == tok::utf32_char_constant) {
+  // Skip over wide character determinant.
+  if (Kind != tok::char_constant) {
     ++begin;
   }
 
@@ -762,6 +771,10 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
   assert(begin[0] == '\'' && "Invalid token lexed");
   ++begin;
 
+  // Trim the ending quote.
+  assert(end[-1] == '\'' && "Invalid token lexed");
+  --end;
+
   // FIXME: The "Value" is an uint64_t so we can handle char literals of
   // up to 64-bits.
   // FIXME: This extensively assumes that 'char' is 8-bits.
@@ -773,76 +786,114 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
   assert(PP.getTargetInfo().getWCharWidth() <= 64 &&
          "Assumes sizeof(wchar) on target is <= 64");
 
-  // This is what we will use for overflow detection
-  llvm::APInt LitVal(PP.getTargetInfo().getIntWidth(), 0);
-
-  unsigned NumCharsSoFar = 0;
-  bool Warned = false;
-  while (begin[0] != '\'') {
-    uint64_t ResultChar;
-
-      // Is this a Universal Character Name escape?
-    if (begin[0] != '\\')     // If this is a normal character, consume it.
-      ResultChar = (unsigned char)*begin++;
-    else {                    // Otherwise, this is an escape character.
-      unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());
-      // Check for UCN.
-      if (begin[1] == 'u' || begin[1] == 'U') {
-        uint32_t utf32 = 0;
-        unsigned short UcnLen = 0;
-        if (!ProcessUCNEscape(begin, end, utf32, UcnLen,
-                              FullSourceLoc(Loc, PP.getSourceManager()),
-                              &PP.getDiagnostics(), PP.getLangOptions())) {
-          HadError = 1;
-        }
-        ResultChar = utf32;
-        if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
-          PP.Diag(Loc, diag::warn_ucn_escape_too_large);
-          ResultChar &= ~0U >> (32-CharWidth);
-        }
-      } else {
-        // Otherwise, this is a non-UCN escape character.  Process it.
-        ResultChar = ProcessCharEscape(begin, end, HadError,
-                                       FullSourceLoc(Loc,PP.getSourceManager()),
-                                       CharWidth, &PP.getDiagnostics());
-      }
-    }
+  SmallVector<uint32_t,4> codepoint_buffer;
+  codepoint_buffer.resize(end-begin);
+  uint32_t *buffer_begin = &codepoint_buffer.front();
+  uint32_t *buffer_end = buffer_begin + codepoint_buffer.size();
+
+  // Unicode escapes representing characters that cannot be correctly
+  // represented in a single code unit are disallowed in character literals
+  // by this implementation.
+  uint32_t largest_character_for_kind;
+  if (tok::wide_char_constant == Kind) {
+    largest_character_for_kind = 0xFFFFFFFFu >> (32-PP.getTargetInfo().getWCharWidth());
+  } else if (tok::utf16_char_constant == Kind) {
+    largest_character_for_kind = 0xFFFF;
+  } else if (tok::utf32_char_constant == Kind) {
+    largest_character_for_kind = 0x10FFFF;
+  } else {
+    largest_character_for_kind = 0x7Fu;
+  }
 
-    // If this is a multi-character constant (e.g. 'abc'), handle it.  These are
-    // implementation defined (C99 6.4.4.4p10).
-    if (NumCharsSoFar) {
-      if (!isAscii()) {
-        // Emulate GCC's (unintentional?) behavior: L'ab' -> L'b'.
-        LitVal = 0;
+  while (begin!=end) {
+    // Is this a span of non-escape characters?
+    if (begin[0] != '\\') {
+      char const *start = begin;
+      do {
+        ++begin;
+      } while (begin != end && *begin != '\\');
+
+      uint32_t *tmp_begin = buffer_begin;
+      ConversionResult res =
+      ConvertUTF8toUTF32(reinterpret_cast<UTF8 const **>(&start),
+                         reinterpret_cast<UTF8 const *>(begin),
+                         &buffer_begin,buffer_end,strictConversion);
+      if (res!=conversionOK) {
+        PP.Diag(Loc, diag::err_bad_character_encoding);
+        HadError = true;
       } else {
-        // Narrow character literals act as though their value is concatenated
-        // in this implementation, but warn on overflow.
-        if (LitVal.countLeadingZeros() < 8 && !Warned) {
-          PP.Diag(Loc, diag::warn_char_constant_too_large);
-          Warned = true;
+        for (; tmp_begin<buffer_begin; ++tmp_begin) {
+          if (*tmp_begin > largest_character_for_kind) {
+            HadError = true;
+            PP.Diag(Loc, diag::err_character_too_large);
+          }
         }
-        LitVal <<= 8;
       }
+
+      continue;
     }
+    // Is this a Universal Character Name excape?
+    if (begin[1] == 'u' || begin[1] == 'U') {
+      unsigned short UcnLen = 0;
+      if (!ProcessUCNEscape(begin, end, *buffer_begin, UcnLen,
+                            FullSourceLoc(Loc, PP.getSourceManager()),
+                            &PP.getDiagnostics(), PP.getLangOptions(),
+                            true))
+      {
+        HadError = true;
+      } else if (*buffer_begin > largest_character_for_kind) {
+        HadError = true;
+        PP.Diag(Loc,diag::err_character_too_large);
+      }
 
-    LitVal = LitVal + ResultChar;
-    ++NumCharsSoFar;
+      ++buffer_begin;
+      continue;
+    }
+    unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());
+    uint64_t result =
+    ProcessCharEscape(begin, end, HadError,
+                      FullSourceLoc(Loc,PP.getSourceManager()),
+                      CharWidth, &PP.getDiagnostics());
+    *buffer_begin++ = result;
   }
 
-  // If this is the second character being processed, do special handling.
+  unsigned NumCharsSoFar = buffer_begin-&codepoint_buffer.front();
+
   if (NumCharsSoFar > 1) {
-    // Warn about discarding the top bits for multi-char wide-character
-    // constants (L'abcd').
-    if (!isAscii())
+    if (isWide())
       PP.Diag(Loc, diag::warn_extraneous_char_constant);
-    else if (NumCharsSoFar != 4)
+    else if (isAscii() && NumCharsSoFar == 4)
+      PP.Diag(Loc, diag::ext_four_char_character_literal);
+    else if (isAscii())
       PP.Diag(Loc, diag::ext_multichar_character_literal);
     else
-      PP.Diag(Loc, diag::ext_four_char_character_literal);
+      PP.Diag(Loc, diag::err_multichar_utf_character_literal);
     IsMultiChar = true;
   } else
     IsMultiChar = false;
 
+  llvm::APInt LitVal(PP.getTargetInfo().getIntWidth(), 0);
+
+  // Narrow character literals act as though their value is concatenated
+  // in this implementation, but warn on overflow.
+  bool multi_char_too_long = false;
+  if (isAscii() && isMultiChar()) {
+    LitVal = 0;
+    for (size_t i=0;i<NumCharsSoFar;++i) {
+      // check for enough leading zeros to shift into
+      multi_char_too_long |= (LitVal.countLeadingZeros() < 8);
+      LitVal <<= 8;
+      LitVal = LitVal + (codepoint_buffer[i] & 0xFF);
+    }
+  } else if (NumCharsSoFar > 0) {
+    // otherwise just take the last character
+    LitVal = buffer_begin[-1];
+  }
+
+  if (!HadError && multi_char_too_long) {
+    PP.Diag(Loc,diag::warn_char_constant_too_large);
+  }
+
   // Transfer the value from APInt to uint64_t
   Value = LitVal.getZExtValue();
author	Seth Cantrell <seth.cantrell@gmail.com>	2012-01-18 12:27:04 +0000
committer	Seth Cantrell <seth.cantrell@gmail.com>	2012-01-18 12:27:04 +0000
commit	be773526230b5a7121a8b321b05f2e53fa473f5c (patch)
tree	a755e66cb063593de8749f6f94c0aba1647d8eaf
parent	7d6a7c004361c45a463874a58da28d9f3c35f2e6 (diff)