diff options
author | Eli Friedman <eli.friedman@gmail.com> | 2011-11-01 02:14:50 +0000 |
---|---|---|
committer | Eli Friedman <eli.friedman@gmail.com> | 2011-11-01 02:14:50 +0000 |
commit | f74a4587629615ffd13bd0724868f86ba8c8f27b (patch) | |
tree | 70ded3f03e87f859c06714282fe0f3bcba55e142 /lib/Lex/LiteralSupport.cpp | |
parent | 436ecd959954db0e11c8daf64b3d6b6b6d0eba55 (diff) |
Perform proper conversion for strings encoded in the source file as UTF-8. (For now, we are assuming the source character set is always UTF-8; this can be easily extended if necessary.)
Tests will be coming up in a subsequent commit.
Patch by Seth Cantrell.
git-svn-id: https://llvm.org/svn/llvm-project/cfe/trunk@143416 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib/Lex/LiteralSupport.cpp')
-rw-r--r-- | lib/Lex/LiteralSupport.cpp | 55 |
1 files changed, 44 insertions, 11 deletions
diff --git a/lib/Lex/LiteralSupport.cpp b/lib/Lex/LiteralSupport.cpp index 70183fd1a0..b107531e14 100644 --- a/lib/Lex/LiteralSupport.cpp +++ b/lib/Lex/LiteralSupport.cpp @@ -16,6 +16,7 @@ #include "clang/Lex/Preprocessor.h" #include "clang/Lex/LexDiagnostic.h" #include "clang/Basic/TargetInfo.h" +#include "clang/Basic/ConvertUTF.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Support/ErrorHandling.h" using namespace clang; @@ -1033,7 +1034,14 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){ ThisTokEnd -= (ThisTokBuf - Prefix); // Copy the string over - CopyStringFragment(StringRef(ThisTokBuf, ThisTokEnd - ThisTokBuf)); + if (CopyStringFragment(StringRef(ThisTokBuf,ThisTokEnd-ThisTokBuf))) + { + if (Diags) + Diags->Report(FullSourceLoc(StringToks[i].getLocation(), SM), + diag::err_bad_string_encoding); + hadError = true; + } + } else { assert(ThisTokBuf[0] == '"' && "Expected quote, lexer broken?"); ++ThisTokBuf; // skip " @@ -1060,7 +1068,13 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){ } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\'); // Copy the character span over. - CopyStringFragment(StringRef(InStart, ThisTokBuf - InStart)); + if (CopyStringFragment(StringRef(InStart,ThisTokBuf-InStart))) + { + if (Diags) + Diags->Report(FullSourceLoc(StringToks[i].getLocation(), SM), + diag::err_bad_string_encoding); + hadError = true; + } continue; } // Is this a Universal Character Name escape? @@ -1116,20 +1130,39 @@ void StringLiteralParser::init(const Token *StringToks, unsigned NumStringToks){ /// copyStringFragment - This function copies from Start to End into ResultPtr. /// Performs widening for multi-byte characters. -void StringLiteralParser::CopyStringFragment(StringRef Fragment) { +bool StringLiteralParser::CopyStringFragment(StringRef Fragment) { + assert(CharByteWidth==1 || CharByteWidth==2 || CharByteWidth==4); + ConversionResult result = conversionOK; // Copy the character span over. if (CharByteWidth == 1) { memcpy(ResultPtr, Fragment.data(), Fragment.size()); ResultPtr += Fragment.size(); - } else { - // Note: our internal rep of wide char tokens is always little-endian. - for (StringRef::iterator I=Fragment.begin(), E=Fragment.end(); I!=E; ++I) { - *ResultPtr++ = *I; - // Add zeros at the end. - for (unsigned i = 1, e = CharByteWidth; i != e; ++i) - *ResultPtr++ = 0; - } + } else if (CharByteWidth == 2) { + UTF8 const *sourceStart = (UTF8 const *)Fragment.data(); + // FIXME: Make the type of the result buffer correct instead of + // using reinterpret_cast. + UTF16 *targetStart = reinterpret_cast<UTF16*>(ResultPtr); + ConversionFlags flags = lenientConversion; + result = ConvertUTF8toUTF16( + &sourceStart,sourceStart + Fragment.size(), + &targetStart,targetStart + 2*Fragment.size(),flags); + if (result==conversionOK) + ResultPtr = reinterpret_cast<char*>(targetStart); + } else if (CharByteWidth == 4) { + UTF8 const *sourceStart = (UTF8 const *)Fragment.data(); + // FIXME: Make the type of the result buffer correct instead of + // using reinterpret_cast. + UTF32 *targetStart = reinterpret_cast<UTF32*>(ResultPtr); + ConversionFlags flags = lenientConversion; + result = ConvertUTF8toUTF32( + &sourceStart,sourceStart + Fragment.size(), + &targetStart,targetStart + 4*Fragment.size(),flags); + if (result==conversionOK) + ResultPtr = reinterpret_cast<char*>(targetStart); } + assert((result != targetExhausted) + && "ConvertUTF8toUTFXX exhausted target buffer"); + return result != conversionOK; } |