When a bad UTF-8 encoding or bogus escape sequence is encountered in a

string literal, produce a diagnostic pointing at the erroneous character range, not at the start of the literal. git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@163459 91177308-0d34-0410-b5e6-96231b3b80d8
author: Richard Smith <richard-llvm@metafoo.co.uk> 2012-09-08 07:16:20 +0000
committer: Richard Smith <richard-llvm@metafoo.co.uk> 2012-09-08 07:16:20 +0000
commit: e5f0588840b20897631cc8110344fd2745ef4caa (patch)
tree: 8fb7656a4564a1c23cb52532ca9622efe9f9ba67 /lib/Lex
parent: 49f9434a4a69d56779aa37feb2d85e06e7289fce (diff)
1 files changed, 101 insertions, 60 deletions
diff --git a/lib/Lex/LiteralSupport.cpp b/lib/Lex/LiteralSupport.cpp
index 9e3c7786a7..be53429e6f 100644
--- a/lib/Lex/LiteralSupport.cpp
+++ b/lib/Lex/LiteralSupport.cpp
@@ -49,12 +49,35 @@ static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) {
   }
 }
 
+/// \brief Produce a diagnostic highlighting some portion of a literal.
+///
+/// Emits the diagnostic \p DiagID, highlighting the range of characters from
+/// \p TokRangeBegin (inclusive) to \p TokRangeEnd (exclusive), which must be
+/// a substring of a spelling buffer for the token beginning at \p TokBegin.
+static DiagnosticBuilder Diag(DiagnosticsEngine *Diags,
+                              const LangOptions &Features, FullSourceLoc TokLoc,
+                              const char *TokBegin, const char *TokRangeBegin,
+                              const char *TokRangeEnd, unsigned DiagID) {
+  SourceLocation Begin =
+    Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin,
+                                   TokLoc.getManager(), Features);
+  SourceLocation End =
+    Lexer::AdvanceToTokenCharacter(Begin, TokRangeEnd - TokRangeBegin,
+                                   TokLoc.getManager(), Features);
+  return Diags->Report(Begin, DiagID)
+      << CharSourceRange::getCharRange(Begin, End);
+}
+
 /// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
 /// either a character or a string literal.
-static unsigned ProcessCharEscape(const char *&ThisTokBuf,
+static unsigned ProcessCharEscape(const char *ThisTokBegin,
+                                  const char *&ThisTokBuf,
                                   const char *ThisTokEnd, bool &HadError,
                                   FullSourceLoc Loc, unsigned CharWidth,
-                                  DiagnosticsEngine *Diags) {
+                                  DiagnosticsEngine *Diags,
+                                  const LangOptions &Features) {
+  const char *EscapeBegin = ThisTokBuf;
+
   // Skip the '\' char.
   ++ThisTokBuf;
 
@@ -75,12 +98,14 @@ static unsigned ProcessCharEscape(const char *&ThisTokBuf,
     break;
   case 'e':
     if (Diags)
-      Diags->Report(Loc, diag::ext_nonstandard_escape) << "e";
+      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
+           diag::ext_nonstandard_escape) << "e";
     ResultChar = 27;
     break;
   case 'E':
     if (Diags)
-      Diags->Report(Loc, diag::ext_nonstandard_escape) << "E";
+      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
+           diag::ext_nonstandard_escape) << "E";
     ResultChar = 27;
     break;
   case 'f':
@@ -102,7 +127,8 @@ static unsigned ProcessCharEscape(const char *&ThisTokBuf,
     ResultChar = 0;
     if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) {
       if (Diags)
-        Diags->Report(Loc, diag::err_hex_escape_no_digits);
+        Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
+             diag::err_hex_escape_no_digits);
       HadError = 1;
       break;
     }
@@ -126,7 +152,8 @@ static unsigned ProcessCharEscape(const char *&ThisTokBuf,
 
     // Check for overflow.
     if (Overflow && Diags)   // Too many digits to fit in
-      Diags->Report(Loc, diag::warn_hex_escape_too_large);
+      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
+           diag::warn_hex_escape_too_large);
     break;
   }
   case '0': case '1': case '2': case '3':
@@ -148,7 +175,8 @@ static unsigned ProcessCharEscape(const char *&ThisTokBuf,
     // Check for overflow.  Reject '\777', but not L'\777'.
     if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
       if (Diags)
-        Diags->Report(Loc, diag::warn_octal_escape_too_large);
+        Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
+             diag::warn_octal_escape_too_large);
       ResultChar &= ~0U >> (32-CharWidth);
     }
     break;
@@ -158,19 +186,22 @@ static unsigned ProcessCharEscape(const char *&ThisTokBuf,
   case '(': case '{': case '[': case '%':
     // GCC accepts these as extensions.  We warn about them as such though.
     if (Diags)
-      Diags->Report(Loc, diag::ext_nonstandard_escape)
-        << std::string()+(char)ResultChar;
+      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
+           diag::ext_nonstandard_escape)
+        << std::string(1, ResultChar);
     break;
   default:
     if (Diags == 0)
       break;
-      
+
     if (isgraph(ResultChar))
-      Diags->Report(Loc, diag::ext_unknown_escape)
-        << std::string()+(char)ResultChar;
+      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
+           diag::ext_unknown_escape)
+        << std::string(1, ResultChar);
     else
-      Diags->Report(Loc, diag::ext_unknown_escape)
-        << "x"+llvm::utohexstr(ResultChar);
+      Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
+           diag::ext_unknown_escape)
+        << "x" + llvm::utohexstr(ResultChar);
     break;
   }
 
@@ -185,9 +216,6 @@ static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
                              FullSourceLoc Loc, DiagnosticsEngine *Diags, 
                              const LangOptions &Features,
                              bool in_char_string_literal = false) {
-  if (!Features.CPlusPlus && !Features.C99 && Diags)
-    Diags->Report(Loc, diag::warn_ucn_not_valid_in_c89);
-
   const char *UcnBegin = ThisTokBuf;
 
   // Skip the '\u' char's.
@@ -195,7 +223,8 @@ static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
 
   if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) {
     if (Diags)
-      Diags->Report(Loc, diag::err_ucn_escape_no_digits);
+      Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
+           diag::err_ucn_escape_no_digits);
     return false;
   }
   UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
@@ -208,12 +237,9 @@ static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
   }
   // If we didn't consume the proper number of digits, there is a problem.
   if (UcnLenSave) {
-    if (Diags) {
-      SourceLocation L =
-        Lexer::AdvanceToTokenCharacter(Loc, UcnBegin - ThisTokBegin,
-                                       Loc.getManager(), Features);
-      Diags->Report(L, diag::err_ucn_escape_incomplete);
-    }
+    if (Diags)
+      Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
+           diag::err_ucn_escape_incomplete);
     return false;
   }
 
@@ -221,7 +247,8 @@ static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
   if ((0xD800 <= UcnVal && UcnVal <= 0xDFFF) || // surrogate codepoints
       UcnVal > 0x10FFFF) {                      // maximum legal UTF32 value
     if (Diags)
-      Diags->Report(Loc, diag::err_ucn_escape_invalid);
+      Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
+           diag::err_ucn_escape_invalid);
     return false;
   }
 
@@ -231,22 +258,25 @@ static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
       (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60)) {  // $, @, `
     bool IsError = (!Features.CPlusPlus0x || !in_char_string_literal);
     if (Diags) {
-      SourceLocation UcnBeginLoc =
-        Lexer::AdvanceToTokenCharacter(Loc, UcnBegin - ThisTokBegin,
-                                       Loc.getManager(), Features);
       char BasicSCSChar = UcnVal;
       if (UcnVal >= 0x20 && UcnVal < 0x7f)
-        Diags->Report(UcnBeginLoc, IsError ? diag::err_ucn_escape_basic_scs :
-                      diag::warn_cxx98_compat_literal_ucn_escape_basic_scs)
-          << StringRef(&BasicSCSChar, 1);
+        Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
+             IsError ? diag::err_ucn_escape_basic_scs :
+                       diag::warn_cxx98_compat_literal_ucn_escape_basic_scs)
+            << StringRef(&BasicSCSChar, 1);
       else
-        Diags->Report(UcnBeginLoc, IsError ? diag::err_ucn_control_character :
-                      diag::warn_cxx98_compat_literal_ucn_control_character);
+        Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
+             IsError ? diag::err_ucn_control_character :
+                       diag::warn_cxx98_compat_literal_ucn_control_character);
     }
     if (IsError)
       return false;
   }
 
+  if (!Features.CPlusPlus && !Features.C99 && Diags)
+    Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
+         diag::warn_ucn_not_valid_in_c89);
+
   return true;
 }
 
@@ -943,7 +973,7 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
         HadError = true;
       } else if (*buffer_begin > largest_character_for_kind) {
         HadError = true;
-        PP.Diag(Loc,diag::err_character_too_large);
+        PP.Diag(Loc, diag::err_character_too_large);
       }
 
       ++buffer_begin;
@@ -951,9 +981,9 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
     }
     unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());
     uint64_t result =
-    ProcessCharEscape(begin, end, HadError,
-                      FullSourceLoc(Loc,PP.getSourceManager()),
-                      CharWidth, &PP.getDiagnostics());
+      ProcessCharEscape(TokBegin, begin, end, HadError,
+                        FullSourceLoc(Loc,PP.getSourceManager()),
+                        CharWidth, &PP.getDiagnostics(), PP.getLangOpts());
     *buffer_begin++ = result;
   }
 
@@ -1110,7 +1140,7 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
         Kind = StringToks[i].getKind();
       } else {
         if (Diags)
-          Diags->Report(FullSourceLoc(StringToks[i].getLocation(), SM),
+          Diags->Report(StringToks[i].getLocation(),
                         diag::err_unsupported_string_concat);
         hadError = true;
       }
@@ -1218,9 +1248,9 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
       assert(ThisTokEnd >= ThisTokBuf && "malformed raw string literal");
 
       // Copy the string over
-      if (CopyStringFragment(StringRef(ThisTokBuf, ThisTokEnd - ThisTokBuf)))
-        if (DiagnoseBadString(StringToks[i]))
-          hadError = true;
+      if (CopyStringFragment(StringToks[i], ThisTokBegin,
+                             StringRef(ThisTokBuf, ThisTokEnd - ThisTokBuf)))
+        hadError = true;
     } else {
       if (ThisTokBuf[0] != '"') {
         // The file may have come from PCH and then changed after loading the
@@ -1251,9 +1281,9 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
           } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
 
           // Copy the character span over.
-          if (CopyStringFragment(StringRef(InStart, ThisTokBuf - InStart)))
-            if (DiagnoseBadString(StringToks[i]))
-              hadError = true;
+          if (CopyStringFragment(StringToks[i], ThisTokBegin,
+                                 StringRef(InStart, ThisTokBuf - InStart)))
+            hadError = true;
           continue;
         }
         // Is this a Universal Character Name escape?
@@ -1266,9 +1296,9 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
         }
         // Otherwise, this is a non-UCN escape character.  Process it.
         unsigned ResultChar =
-          ProcessCharEscape(ThisTokBuf, ThisTokEnd, hadError,
+          ProcessCharEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, hadError,
                             FullSourceLoc(StringToks[i].getLocation(), SM),
-                            CharByteWidth*8, Diags);
+                            CharByteWidth*8, Diags, Features);
 
         if (CharByteWidth == 4) {
           // FIXME: Make the type of the result buffer correct instead of
@@ -1308,8 +1338,8 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
 
     // Verify that pascal strings aren't too large.
     if (GetStringLength() > 256) {
-      if (Diags) 
-        Diags->Report(FullSourceLoc(StringToks[0].getLocation(), SM),
+      if (Diags)
+        Diags->Report(StringToks[0].getLocation(),
                       diag::err_pascal_string_too_long)
           << SourceRange(StringToks[0].getLocation(),
                          StringToks[NumStringToks-1].getLocation());
@@ -1321,7 +1351,7 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
     unsigned MaxChars = Features.CPlusPlus? 65536 : Features.C99 ? 4095 : 509;
     
     if (GetNumStringChars() > MaxChars)
-      Diags->Report(FullSourceLoc(StringToks[0].getLocation(), SM),
+      Diags->Report(StringToks[0].getLocation(),
                     diag::ext_string_too_long)
         << GetNumStringChars() << MaxChars
         << (Features.CPlusPlus ? 2 : Features.C99 ? 1 : 0)
@@ -1330,21 +1360,32 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){
   }
 }
 
-/// copyStringFragment - This function copies from Start to End into ResultPtr.
+/// \brief This function copies from Fragment, which is a sequence of bytes
+/// within Tok's contents (which begin at TokBegin) into ResultPtr.
 /// Performs widening for multi-byte characters.
-bool StringLiteralParser::CopyStringFragment(StringRef Fragment) {
-  return !ConvertUTF8toWide(CharByteWidth, Fragment, ResultPtr);
-}
+bool StringLiteralParser::CopyStringFragment(const Token &Tok,
+                                             const char *TokBegin,
+                                             StringRef Fragment) {
+  const UTF8 *ErrorPtrTmp;
+  if (ConvertUTF8toWide(CharByteWidth, Fragment, ResultPtr, ErrorPtrTmp))
+    return false;
+  const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
 
-bool StringLiteralParser::DiagnoseBadString(const Token &Tok) {
   // If we see bad encoding for unprefixed string literals, warn and
   // simply copy the byte values, for compatibility with gcc and older
   // versions of clang.
   bool NoErrorOnBadEncoding = isAscii();
-  unsigned Msg = NoErrorOnBadEncoding ? diag::warn_bad_string_encoding :
-                                        diag::err_bad_string_encoding;
-  if (Diags)
-    Diags->Report(FullSourceLoc(Tok.getLocation(), SM), Msg);
+  if (NoErrorOnBadEncoding) {
+    memcpy(ResultPtr, Fragment.data(), Fragment.size());
+    ResultPtr += Fragment.size();
+  }
+  if (Diags) {
+    Diag(Diags, Features, FullSourceLoc(Tok.getLocation(), SM), TokBegin,
+         ErrorPtr, ErrorPtr + std::min<unsigned>(getNumBytesForUTF8(*ErrorPtr),
+                                                 Fragment.end() - ErrorPtr),
+         NoErrorOnBadEncoding ? diag::warn_bad_string_encoding
+                              : diag::err_bad_string_encoding);
+  }
   return !NoErrorOnBadEncoding;
 }
 
@@ -1422,9 +1463,9 @@ unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
       }
       ByteNo -= Len;
     } else {
-      ProcessCharEscape(SpellingPtr, SpellingEnd, HadError,
+      ProcessCharEscape(SpellingStart, SpellingPtr, SpellingEnd, HadError,
                         FullSourceLoc(Tok.getLocation(), SM),
-                        CharByteWidth*8, Diags);
+                        CharByteWidth*8, Diags, Features);
       --ByteNo;
     }
     assert(!HadError && "This method isn't valid on erroneous strings");
author	Richard Smith <richard-llvm@metafoo.co.uk>	2012-09-08 07:16:20 +0000
committer	Richard Smith <richard-llvm@metafoo.co.uk>	2012-09-08 07:16:20 +0000
commit	e5f0588840b20897631cc8110344fd2745ef4caa (patch)
tree	8fb7656a4564a1c23cb52532ca9622efe9f9ba67 /lib/Lex
parent	49f9434a4a69d56779aa37feb2d85e06e7289fce (diff)