diff options
author | Richard Smith <richard-llvm@metafoo.co.uk> | 2012-09-08 07:16:20 +0000 |
---|---|---|
committer | Richard Smith <richard-llvm@metafoo.co.uk> | 2012-09-08 07:16:20 +0000 |
commit | e5f0588840b20897631cc8110344fd2745ef4caa (patch) | |
tree | 8fb7656a4564a1c23cb52532ca9622efe9f9ba67 /lib/Lex | |
parent | 49f9434a4a69d56779aa37feb2d85e06e7289fce (diff) |
When a bad UTF-8 encoding or bogus escape sequence is encountered in a
string literal, produce a diagnostic pointing at the erroneous character
range, not at the start of the literal.
git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@163459 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib/Lex')
-rw-r--r-- | lib/Lex/LiteralSupport.cpp | 161 |
1 files changed, 101 insertions, 60 deletions
diff --git a/lib/Lex/LiteralSupport.cpp b/lib/Lex/LiteralSupport.cpp index 9e3c7786a7..be53429e6f 100644 --- a/lib/Lex/LiteralSupport.cpp +++ b/lib/Lex/LiteralSupport.cpp @@ -49,12 +49,35 @@ static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) { } } +/// \brief Produce a diagnostic highlighting some portion of a literal. +/// +/// Emits the diagnostic \p DiagID, highlighting the range of characters from +/// \p TokRangeBegin (inclusive) to \p TokRangeEnd (exclusive), which must be +/// a substring of a spelling buffer for the token beginning at \p TokBegin. +static DiagnosticBuilder Diag(DiagnosticsEngine *Diags, + const LangOptions &Features, FullSourceLoc TokLoc, + const char *TokBegin, const char *TokRangeBegin, + const char *TokRangeEnd, unsigned DiagID) { + SourceLocation Begin = + Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin, + TokLoc.getManager(), Features); + SourceLocation End = + Lexer::AdvanceToTokenCharacter(Begin, TokRangeEnd - TokRangeBegin, + TokLoc.getManager(), Features); + return Diags->Report(Begin, DiagID) + << CharSourceRange::getCharRange(Begin, End); +} + /// ProcessCharEscape - Parse a standard C escape sequence, which can occur in /// either a character or a string literal. -static unsigned ProcessCharEscape(const char *&ThisTokBuf, +static unsigned ProcessCharEscape(const char *ThisTokBegin, + const char *&ThisTokBuf, const char *ThisTokEnd, bool &HadError, FullSourceLoc Loc, unsigned CharWidth, - DiagnosticsEngine *Diags) { + DiagnosticsEngine *Diags, + const LangOptions &Features) { + const char *EscapeBegin = ThisTokBuf; + // Skip the '\' char. ++ThisTokBuf; @@ -75,12 +98,14 @@ static unsigned ProcessCharEscape(const char *&ThisTokBuf, break; case 'e': if (Diags) - Diags->Report(Loc, diag::ext_nonstandard_escape) << "e"; + Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, + diag::ext_nonstandard_escape) << "e"; ResultChar = 27; break; case 'E': if (Diags) - Diags->Report(Loc, diag::ext_nonstandard_escape) << "E"; + Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, + diag::ext_nonstandard_escape) << "E"; ResultChar = 27; break; case 'f': @@ -102,7 +127,8 @@ static unsigned ProcessCharEscape(const char *&ThisTokBuf, ResultChar = 0; if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) { if (Diags) - Diags->Report(Loc, diag::err_hex_escape_no_digits); + Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, + diag::err_hex_escape_no_digits); HadError = 1; break; } @@ -126,7 +152,8 @@ static unsigned ProcessCharEscape(const char *&ThisTokBuf, // Check for overflow. if (Overflow && Diags) // Too many digits to fit in - Diags->Report(Loc, diag::warn_hex_escape_too_large); + Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, + diag::warn_hex_escape_too_large); break; } case '0': case '1': case '2': case '3': @@ -148,7 +175,8 @@ static unsigned ProcessCharEscape(const char *&ThisTokBuf, // Check for overflow. Reject '\777', but not L'\777'. if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) { if (Diags) - Diags->Report(Loc, diag::warn_octal_escape_too_large); + Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, + diag::warn_octal_escape_too_large); ResultChar &= ~0U >> (32-CharWidth); } break; @@ -158,19 +186,22 @@ static unsigned ProcessCharEscape(const char *&ThisTokBuf, case '(': case '{': case '[': case '%': // GCC accepts these as extensions. We warn about them as such though. if (Diags) - Diags->Report(Loc, diag::ext_nonstandard_escape) - << std::string()+(char)ResultChar; + Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, + diag::ext_nonstandard_escape) + << std::string(1, ResultChar); break; default: if (Diags == 0) break; - + if (isgraph(ResultChar)) - Diags->Report(Loc, diag::ext_unknown_escape) - << std::string()+(char)ResultChar; + Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, + diag::ext_unknown_escape) + << std::string(1, ResultChar); else - Diags->Report(Loc, diag::ext_unknown_escape) - << "x"+llvm::utohexstr(ResultChar); + Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf, + diag::ext_unknown_escape) + << "x" + llvm::utohexstr(ResultChar); break; } @@ -185,9 +216,6 @@ static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf, FullSourceLoc Loc, DiagnosticsEngine *Diags, const LangOptions &Features, bool in_char_string_literal = false) { - if (!Features.CPlusPlus && !Features.C99 && Diags) - Diags->Report(Loc, diag::warn_ucn_not_valid_in_c89); - const char *UcnBegin = ThisTokBuf; // Skip the '\u' char's. @@ -195,7 +223,8 @@ static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf, if (ThisTokBuf == ThisTokEnd || !isxdigit(*ThisTokBuf)) { if (Diags) - Diags->Report(Loc, diag::err_ucn_escape_no_digits); + Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, + diag::err_ucn_escape_no_digits); return false; } UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8); @@ -208,12 +237,9 @@ static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf, } // If we didn't consume the proper number of digits, there is a problem. if (UcnLenSave) { - if (Diags) { - SourceLocation L = - Lexer::AdvanceToTokenCharacter(Loc, UcnBegin - ThisTokBegin, - Loc.getManager(), Features); - Diags->Report(L, diag::err_ucn_escape_incomplete); - } + if (Diags) + Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, + diag::err_ucn_escape_incomplete); return false; } @@ -221,7 +247,8 @@ static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf, if ((0xD800 <= UcnVal && UcnVal <= 0xDFFF) || // surrogate codepoints UcnVal > 0x10FFFF) { // maximum legal UTF32 value if (Diags) - Diags->Report(Loc, diag::err_ucn_escape_invalid); + Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, + diag::err_ucn_escape_invalid); return false; } @@ -231,22 +258,25 @@ static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf, (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60)) { // $, @, ` bool IsError = (!Features.CPlusPlus0x || !in_char_string_literal); if (Diags) { - SourceLocation UcnBeginLoc = - Lexer::AdvanceToTokenCharacter(Loc, UcnBegin - ThisTokBegin, - Loc.getManager(), Features); char BasicSCSChar = UcnVal; if (UcnVal >= 0x20 && UcnVal < 0x7f) - Diags->Report(UcnBeginLoc, IsError ? diag::err_ucn_escape_basic_scs : - diag::warn_cxx98_compat_literal_ucn_escape_basic_scs) - << StringRef(&BasicSCSChar, 1); + Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, + IsError ? diag::err_ucn_escape_basic_scs : + diag::warn_cxx98_compat_literal_ucn_escape_basic_scs) + << StringRef(&BasicSCSChar, 1); else - Diags->Report(UcnBeginLoc, IsError ? diag::err_ucn_control_character : - diag::warn_cxx98_compat_literal_ucn_control_character); + Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, + IsError ? diag::err_ucn_control_character : + diag::warn_cxx98_compat_literal_ucn_control_character); } if (IsError) return false; } + if (!Features.CPlusPlus && !Features.C99 && Diags) + Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf, + diag::warn_ucn_not_valid_in_c89); + return true; } @@ -943,7 +973,7 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end, HadError = true; } else if (*buffer_begin > largest_character_for_kind) { HadError = true; - PP.Diag(Loc,diag::err_character_too_large); + PP.Diag(Loc, diag::err_character_too_large); } ++buffer_begin; @@ -951,9 +981,9 @@ CharLiteralParser::CharLiteralParser(const char *begin, const char *end, } unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo()); uint64_t result = - ProcessCharEscape(begin, end, HadError, - FullSourceLoc(Loc,PP.getSourceManager()), - CharWidth, &PP.getDiagnostics()); + ProcessCharEscape(TokBegin, begin, end, HadError, + FullSourceLoc(Loc,PP.getSourceManager()), + CharWidth, &PP.getDiagnostics(), PP.getLangOpts()); *buffer_begin++ = result; } @@ -1110,7 +1140,7 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){ Kind = StringToks[i].getKind(); } else { if (Diags) - Diags->Report(FullSourceLoc(StringToks[i].getLocation(), SM), + Diags->Report(StringToks[i].getLocation(), diag::err_unsupported_string_concat); hadError = true; } @@ -1218,9 +1248,9 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){ assert(ThisTokEnd >= ThisTokBuf && "malformed raw string literal"); // Copy the string over - if (CopyStringFragment(StringRef(ThisTokBuf, ThisTokEnd - ThisTokBuf))) - if (DiagnoseBadString(StringToks[i])) - hadError = true; + if (CopyStringFragment(StringToks[i], ThisTokBegin, + StringRef(ThisTokBuf, ThisTokEnd - ThisTokBuf))) + hadError = true; } else { if (ThisTokBuf[0] != '"') { // The file may have come from PCH and then changed after loading the @@ -1251,9 +1281,9 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){ } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\'); // Copy the character span over. - if (CopyStringFragment(StringRef(InStart, ThisTokBuf - InStart))) - if (DiagnoseBadString(StringToks[i])) - hadError = true; + if (CopyStringFragment(StringToks[i], ThisTokBegin, + StringRef(InStart, ThisTokBuf - InStart))) + hadError = true; continue; } // Is this a Universal Character Name escape? @@ -1266,9 +1296,9 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){ } // Otherwise, this is a non-UCN escape character. Process it. unsigned ResultChar = - ProcessCharEscape(ThisTokBuf, ThisTokEnd, hadError, + ProcessCharEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, hadError, FullSourceLoc(StringToks[i].getLocation(), SM), - CharByteWidth*8, Diags); + CharByteWidth*8, Diags, Features); if (CharByteWidth == 4) { // FIXME: Make the type of the result buffer correct instead of @@ -1308,8 +1338,8 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){ // Verify that pascal strings aren't too large. if (GetStringLength() > 256) { - if (Diags) - Diags->Report(FullSourceLoc(StringToks[0].getLocation(), SM), + if (Diags) + Diags->Report(StringToks[0].getLocation(), diag::err_pascal_string_too_long) << SourceRange(StringToks[0].getLocation(), StringToks[NumStringToks-1].getLocation()); @@ -1321,7 +1351,7 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){ unsigned MaxChars = Features.CPlusPlus? 65536 : Features.C99 ? 4095 : 509; if (GetNumStringChars() > MaxChars) - Diags->Report(FullSourceLoc(StringToks[0].getLocation(), SM), + Diags->Report(StringToks[0].getLocation(), diag::ext_string_too_long) << GetNumStringChars() << MaxChars << (Features.CPlusPlus ? 2 : Features.C99 ? 1 : 0) @@ -1330,21 +1360,32 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){ } } -/// copyStringFragment - This function copies from Start to End into ResultPtr. +/// \brief This function copies from Fragment, which is a sequence of bytes +/// within Tok's contents (which begin at TokBegin) into ResultPtr. /// Performs widening for multi-byte characters. -bool StringLiteralParser::CopyStringFragment(StringRef Fragment) { - return !ConvertUTF8toWide(CharByteWidth, Fragment, ResultPtr); -} +bool StringLiteralParser::CopyStringFragment(const Token &Tok, + const char *TokBegin, + StringRef Fragment) { + const UTF8 *ErrorPtrTmp; + if (ConvertUTF8toWide(CharByteWidth, Fragment, ResultPtr, ErrorPtrTmp)) + return false; + const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp); -bool StringLiteralParser::DiagnoseBadString(const Token &Tok) { // If we see bad encoding for unprefixed string literals, warn and // simply copy the byte values, for compatibility with gcc and older // versions of clang. bool NoErrorOnBadEncoding = isAscii(); - unsigned Msg = NoErrorOnBadEncoding ? diag::warn_bad_string_encoding : - diag::err_bad_string_encoding; - if (Diags) - Diags->Report(FullSourceLoc(Tok.getLocation(), SM), Msg); + if (NoErrorOnBadEncoding) { + memcpy(ResultPtr, Fragment.data(), Fragment.size()); + ResultPtr += Fragment.size(); + } + if (Diags) { + Diag(Diags, Features, FullSourceLoc(Tok.getLocation(), SM), TokBegin, + ErrorPtr, ErrorPtr + std::min<unsigned>(getNumBytesForUTF8(*ErrorPtr), + Fragment.end() - ErrorPtr), + NoErrorOnBadEncoding ? diag::warn_bad_string_encoding + : diag::err_bad_string_encoding); + } return !NoErrorOnBadEncoding; } @@ -1422,9 +1463,9 @@ unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok, } ByteNo -= Len; } else { - ProcessCharEscape(SpellingPtr, SpellingEnd, HadError, + ProcessCharEscape(SpellingStart, SpellingPtr, SpellingEnd, HadError, FullSourceLoc(Tok.getLocation(), SM), - CharByteWidth*8, Diags); + CharByteWidth*8, Diags, Features); --ByteNo; } assert(!HadError && "This method isn't valid on erroneous strings"); |