diff options
Diffstat (limited to 'lib/Support')
-rw-r--r-- | lib/Support/APFloat.cpp | 75 | ||||
-rw-r--r-- | lib/Support/APInt.cpp | 40 | ||||
-rw-r--r-- | lib/Support/Allocator.cpp | 8 | ||||
-rw-r--r-- | lib/Support/CMakeLists.txt | 2 | ||||
-rw-r--r-- | lib/Support/ConvertUTF.c | 571 | ||||
-rw-r--r-- | lib/Support/ConvertUTFWrapper.cpp | 76 | ||||
-rw-r--r-- | lib/Support/Debug.cpp | 2 | ||||
-rw-r--r-- | lib/Support/Dwarf.cpp | 1 | ||||
-rw-r--r-- | lib/Support/FileUtilities.cpp | 4 | ||||
-rw-r--r-- | lib/Support/GraphWriter.cpp | 11 | ||||
-rw-r--r-- | lib/Support/Host.cpp | 17 | ||||
-rw-r--r-- | lib/Support/LockFileManager.cpp | 4 | ||||
-rw-r--r-- | lib/Support/MemoryBuffer.cpp | 14 | ||||
-rw-r--r-- | lib/Support/PathV2.cpp | 3 | ||||
-rw-r--r-- | lib/Support/SourceMgr.cpp | 4 | ||||
-rw-r--r-- | lib/Support/TimeValue.cpp | 9 | ||||
-rw-r--r-- | lib/Support/Triple.cpp | 10 | ||||
-rw-r--r-- | lib/Support/Unix/Memory.inc | 11 | ||||
-rw-r--r-- | lib/Support/Unix/PathV2.inc | 8 | ||||
-rw-r--r-- | lib/Support/Unix/Process.inc | 2 | ||||
-rw-r--r-- | lib/Support/Unix/Program.inc | 5 | ||||
-rw-r--r-- | lib/Support/Unix/Signals.inc | 18 | ||||
-rw-r--r-- | lib/Support/Unix/TimeValue.inc | 3 | ||||
-rw-r--r-- | lib/Support/Windows/Path.inc | 2 | ||||
-rw-r--r-- | lib/Support/raw_ostream.cpp | 3 |
25 files changed, 842 insertions, 61 deletions
diff --git a/lib/Support/APFloat.cpp b/lib/Support/APFloat.cpp index 4a7a5d1a05..5b68fbb270 100644 --- a/lib/Support/APFloat.cpp +++ b/lib/Support/APFloat.cpp @@ -16,6 +16,7 @@ #include "llvm/ADT/APSInt.h" #include "llvm/ADT/FoldingSet.h" #include "llvm/ADT/Hashing.h" +#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" @@ -101,26 +102,6 @@ decDigitValue(unsigned int c) return c - '0'; } -static unsigned int -hexDigitValue(unsigned int c) -{ - unsigned int r; - - r = c - '0'; - if (r <= 9) - return r; - - r = c - 'A'; - if (r <= 5) - return r + 10; - - r = c - 'a'; - if (r <= 5) - return r + 10; - - return -1U; -} - /* Return the value of a decimal exponent of the form [+-]ddddddd. @@ -1932,6 +1913,12 @@ APFloat::convert(const fltSemantics &toSemantics, *losesInfo = (fs != opOK); } else if (category == fcNaN) { *losesInfo = lostFraction != lfExactlyZero || X86SpecialNan; + + // For x87 extended precision, we want to make a NaN, not a special NaN if + // the input wasn't special either. + if (!X86SpecialNan && semantics == &APFloat::x87DoubleExtended) + APInt::tcSetBit(significandParts(), semantics->precision - 1); + // gcc forces the Quiet bit on, which means (float)(double)(float_sNan) // does not give you back the same bits. This is dubious, and we // don't currently do it. You're really supposed to get @@ -3032,7 +3019,7 @@ APFloat::initFromPPCDoubleDoubleAPInt(const APInt &api) // Unless we have a special case, add in second double. if (category == fcNormal) { - APFloat v(APInt(64, i2)); + APFloat v(IEEEdouble, APInt(64, i2)); fs = v.convert(PPCDoubleDouble, rmNearestTiesToEven, &losesInfo); assert(fs == opOK && !losesInfo); (void)fs; @@ -3185,27 +3172,43 @@ APFloat::initFromHalfAPInt(const APInt & api) /// isIEEE argument distinguishes between PPC128 and IEEE128 (not meaningful /// when the size is anything else). void -APFloat::initFromAPInt(const APInt& api, bool isIEEE) +APFloat::initFromAPInt(const fltSemantics* Sem, const APInt& api) { - if (api.getBitWidth() == 16) + if (Sem == &IEEEhalf) return initFromHalfAPInt(api); - else if (api.getBitWidth() == 32) + if (Sem == &IEEEsingle) return initFromFloatAPInt(api); - else if (api.getBitWidth()==64) + if (Sem == &IEEEdouble) return initFromDoubleAPInt(api); - else if (api.getBitWidth()==80) + if (Sem == &x87DoubleExtended) return initFromF80LongDoubleAPInt(api); - else if (api.getBitWidth()==128) - return (isIEEE ? - initFromQuadrupleAPInt(api) : initFromPPCDoubleDoubleAPInt(api)); - else - llvm_unreachable(0); + if (Sem == &IEEEquad) + return initFromQuadrupleAPInt(api); + if (Sem == &PPCDoubleDouble) + return initFromPPCDoubleDoubleAPInt(api); + + llvm_unreachable(0); } APFloat APFloat::getAllOnesValue(unsigned BitWidth, bool isIEEE) { - return APFloat(APInt::getAllOnesValue(BitWidth), isIEEE); + switch (BitWidth) { + case 16: + return APFloat(IEEEhalf, APInt::getAllOnesValue(BitWidth)); + case 32: + return APFloat(IEEEsingle, APInt::getAllOnesValue(BitWidth)); + case 64: + return APFloat(IEEEdouble, APInt::getAllOnesValue(BitWidth)); + case 80: + return APFloat(x87DoubleExtended, APInt::getAllOnesValue(BitWidth)); + case 128: + if (isIEEE) + return APFloat(IEEEquad, APInt::getAllOnesValue(BitWidth)); + return APFloat(PPCDoubleDouble, APInt::getAllOnesValue(BitWidth)); + default: + llvm_unreachable("Unknown floating bit width"); + } } APFloat APFloat::getLargest(const fltSemantics &Sem, bool Negative) { @@ -3263,16 +3266,16 @@ APFloat APFloat::getSmallestNormalized(const fltSemantics &Sem, bool Negative) { return Val; } -APFloat::APFloat(const APInt& api, bool isIEEE) { - initFromAPInt(api, isIEEE); +APFloat::APFloat(const fltSemantics &Sem, const APInt &API) { + initFromAPInt(&Sem, API); } APFloat::APFloat(float f) { - initFromAPInt(APInt::floatToBits(f)); + initFromAPInt(&IEEEsingle, APInt::floatToBits(f)); } APFloat::APFloat(double d) { - initFromAPInt(APInt::doubleToBits(d)); + initFromAPInt(&IEEEdouble, APInt::doubleToBits(d)); } namespace { diff --git a/lib/Support/APInt.cpp b/lib/Support/APInt.cpp index 61e503bc3a..07cb057b48 100644 --- a/lib/Support/APInt.cpp +++ b/lib/Support/APInt.cpp @@ -1876,6 +1876,17 @@ APInt APInt::udiv(const APInt& RHS) const { return Quotient; } +APInt APInt::sdiv(const APInt &RHS) const { + if (isNegative()) { + if (RHS.isNegative()) + return (-(*this)).udiv(-RHS); + return -((-(*this)).udiv(RHS)); + } + if (RHS.isNegative()) + return -(this->udiv(-RHS)); + return this->udiv(RHS); +} + APInt APInt::urem(const APInt& RHS) const { assert(BitWidth == RHS.BitWidth && "Bit widths must be the same"); if (isSingleWord()) { @@ -1913,6 +1924,17 @@ APInt APInt::urem(const APInt& RHS) const { return Remainder; } +APInt APInt::srem(const APInt &RHS) const { + if (isNegative()) { + if (RHS.isNegative()) + return -((-(*this)).urem(-RHS)); + return -((-(*this)).urem(RHS)); + } + if (RHS.isNegative()) + return this->urem(-RHS); + return this->urem(RHS); +} + void APInt::udivrem(const APInt &LHS, const APInt &RHS, APInt &Quotient, APInt &Remainder) { // Get some size facts about the dividend and divisor @@ -1953,6 +1975,24 @@ void APInt::udivrem(const APInt &LHS, const APInt &RHS, divide(LHS, lhsWords, RHS, rhsWords, &Quotient, &Remainder); } +void APInt::sdivrem(const APInt &LHS, const APInt &RHS, + APInt &Quotient, APInt &Remainder) { + if (LHS.isNegative()) { + if (RHS.isNegative()) + APInt::udivrem(-LHS, -RHS, Quotient, Remainder); + else { + APInt::udivrem(-LHS, RHS, Quotient, Remainder); + Quotient = -Quotient; + } + Remainder = -Remainder; + } else if (RHS.isNegative()) { + APInt::udivrem(LHS, -RHS, Quotient, Remainder); + Quotient = -Quotient; + } else { + APInt::udivrem(LHS, RHS, Quotient, Remainder); + } +} + APInt APInt::sadd_ov(const APInt &RHS, bool &Overflow) const { APInt Res = *this+RHS; Overflow = isNonNegative() == RHS.isNonNegative() && diff --git a/lib/Support/Allocator.cpp b/lib/Support/Allocator.cpp index 28f4e64ac7..3c4191b805 100644 --- a/lib/Support/Allocator.cpp +++ b/lib/Support/Allocator.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/Allocator.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/DataTypes.h" #include "llvm/Support/Memory.h" #include "llvm/Support/Recycler.h" @@ -82,6 +83,7 @@ void BumpPtrAllocator::Reset() { CurSlab->NextPtr = 0; CurPtr = (char*)(CurSlab + 1); End = ((char*)CurSlab) + CurSlab->Size; + BytesAllocated = 0; } /// Allocate - Allocate space at the specified alignment. @@ -102,6 +104,10 @@ void *BumpPtrAllocator::Allocate(size_t Size, size_t Alignment) { // Check if we can hold it. if (Ptr + Size <= End) { CurPtr = Ptr + Size; + // Update the allocation point of this memory block in MemorySanitizer. + // Without this, MemorySanitizer messages for values originated from here + // will point to the allocation of the entire slab. + __msan_allocated_memory(Ptr, Size); return Ptr; } @@ -117,6 +123,7 @@ void *BumpPtrAllocator::Allocate(size_t Size, size_t Alignment) { Ptr = AlignPtr((char*)(NewSlab + 1), Alignment); assert((uintptr_t)Ptr + Size <= (uintptr_t)NewSlab + NewSlab->Size); + __msan_allocated_memory(Ptr, Size); return Ptr; } @@ -125,6 +132,7 @@ void *BumpPtrAllocator::Allocate(size_t Size, size_t Alignment) { Ptr = AlignPtr(CurPtr, Alignment); CurPtr = Ptr + Size; assert(CurPtr <= End && "Unable to allocate memory!"); + __msan_allocated_memory(Ptr, Size); return Ptr; } diff --git a/lib/Support/CMakeLists.txt b/lib/Support/CMakeLists.txt index f294a175e7..5ba69fc3c8 100644 --- a/lib/Support/CMakeLists.txt +++ b/lib/Support/CMakeLists.txt @@ -8,6 +8,8 @@ add_llvm_library(LLVMSupport circular_raw_ostream.cpp CommandLine.cpp ConstantRange.cpp + ConvertUTF.c + ConvertUTFWrapper.cpp CrashRecoveryContext.cpp DataExtractor.cpp DataStream.cpp diff --git a/lib/Support/ConvertUTF.c b/lib/Support/ConvertUTF.c new file mode 100644 index 0000000000..23f17ca25a --- /dev/null +++ b/lib/Support/ConvertUTF.c @@ -0,0 +1,571 @@ +/*===--- ConvertUTF.c - Universal Character Names conversions ---------------=== + * + * The LLVM Compiler Infrastructure + * + * This file is distributed under the University of Illinois Open Source + * License. See LICENSE.TXT for details. + * + *===------------------------------------------------------------------------=*/ +/* + * Copyright 2001-2004 Unicode, Inc. + * + * Disclaimer + * + * This source code is provided as is by Unicode, Inc. No claims are + * made as to fitness for any particular purpose. No warranties of any + * kind are expressed or implied. The recipient agrees to determine + * applicability of information provided. If this file has been + * purchased on magnetic or optical media from Unicode, Inc., the + * sole remedy for any claim will be exchange of defective media + * within 90 days of receipt. + * + * Limitations on Rights to Redistribute This Code + * + * Unicode, Inc. hereby grants the right to freely use the information + * supplied in this file in the creation of products supporting the + * Unicode Standard, and to make copies of this file in any form + * for internal or external distribution as long as this notice + * remains attached. + */ + +/* --------------------------------------------------------------------- + + Conversions between UTF32, UTF-16, and UTF-8. Source code file. + Author: Mark E. Davis, 1994. + Rev History: Rick McGowan, fixes & updates May 2001. + Sept 2001: fixed const & error conditions per + mods suggested by S. Parent & A. Lillich. + June 2002: Tim Dodd added detection and handling of incomplete + source sequences, enhanced error detection, added casts + to eliminate compiler warnings. + July 2003: slight mods to back out aggressive FFFE detection. + Jan 2004: updated switches in from-UTF8 conversions. + Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions. + + See the header file "ConvertUTF.h" for complete documentation. + +------------------------------------------------------------------------ */ + + +#include "llvm/Support/ConvertUTF.h" +#ifdef CVTUTF_DEBUG +#include <stdio.h> +#endif + +static const int halfShift = 10; /* used for shifting by 10 bits */ + +static const UTF32 halfBase = 0x0010000UL; +static const UTF32 halfMask = 0x3FFUL; + +#define UNI_SUR_HIGH_START (UTF32)0xD800 +#define UNI_SUR_HIGH_END (UTF32)0xDBFF +#define UNI_SUR_LOW_START (UTF32)0xDC00 +#define UNI_SUR_LOW_END (UTF32)0xDFFF +#define false 0 +#define true 1 + +/* --------------------------------------------------------------------- */ + +/* + * Index into the table below with the first byte of a UTF-8 sequence to + * get the number of trailing bytes that are supposed to follow it. + * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is + * left as-is for anyone who may want to do such conversion, which was + * allowed in earlier algorithms. + */ +static const char trailingBytesForUTF8[256] = { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, + 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 +}; + +/* + * Magic values subtracted from a buffer value during UTF8 conversion. + * This table contains as many values as there might be trailing bytes + * in a UTF-8 sequence. + */ +static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL, + 0x03C82080UL, 0xFA082080UL, 0x82082080UL }; + +/* + * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed + * into the first byte, depending on how many bytes follow. There are + * as many entries in this table as there are UTF-8 sequence types. + * (I.e., one byte sequence, two byte... etc.). Remember that sequencs + * for *legal* UTF-8 will be 4 or fewer bytes total. + */ +static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC }; + +/* --------------------------------------------------------------------- */ + +/* The interface converts a whole buffer to avoid function-call overhead. + * Constants have been gathered. Loops & conditionals have been removed as + * much as possible for efficiency, in favor of drop-through switches. + * (See "Note A" at the bottom of the file for equivalent code.) + * If your compiler supports it, the "isLegalUTF8" call can be turned + * into an inline function. + */ + + +/* --------------------------------------------------------------------- */ + +ConversionResult ConvertUTF32toUTF16 ( + const UTF32** sourceStart, const UTF32* sourceEnd, + UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { + ConversionResult result = conversionOK; + const UTF32* source = *sourceStart; + UTF16* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch; + if (target >= targetEnd) { + result = targetExhausted; break; + } + ch = *source++; + if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ + /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { + if (flags == strictConversion) { + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } else { + *target++ = UNI_REPLACEMENT_CHAR; + } + } else { + *target++ = (UTF16)ch; /* normal case */ + } + } else if (ch > UNI_MAX_LEGAL_UTF32) { + if (flags == strictConversion) { + result = sourceIllegal; + } else { + *target++ = UNI_REPLACEMENT_CHAR; + } + } else { + /* target is a character in range 0xFFFF - 0x10FFFF. */ + if (target + 1 >= targetEnd) { + --source; /* Back up source pointer! */ + result = targetExhausted; break; + } + ch -= halfBase; + *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); + *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); + } + } + *sourceStart = source; + *targetStart = target; + return result; +} + +/* --------------------------------------------------------------------- */ + +ConversionResult ConvertUTF16toUTF32 ( + const UTF16** sourceStart, const UTF16* sourceEnd, + UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { + ConversionResult result = conversionOK; + const UTF16* source = *sourceStart; + UTF32* target = *targetStart; + UTF32 ch, ch2; + while (source < sourceEnd) { + const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ + ch = *source++; + /* If we have a surrogate pair, convert to UTF32 first. */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { + /* If the 16 bits following the high surrogate are in the source buffer... */ + if (source < sourceEnd) { + ch2 = *source; + /* If it's a low surrogate, convert to UTF32. */ + if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { + ch = ((ch - UNI_SUR_HIGH_START) << halfShift) + + (ch2 - UNI_SUR_LOW_START) + halfBase; + ++source; + } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } else { /* We don't have the 16 bits following the high surrogate. */ + --source; /* return to the high surrogate */ + result = sourceExhausted; + break; + } + } else if (flags == strictConversion) { + /* UTF-16 surrogate values are illegal in UTF-32 */ + if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } + if (target >= targetEnd) { + source = oldSource; /* Back up source pointer! */ + result = targetExhausted; break; + } + *target++ = ch; + } + *sourceStart = source; + *targetStart = target; +#ifdef CVTUTF_DEBUG +if (result == sourceIllegal) { + fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2); + fflush(stderr); +} +#endif + return result; +} +ConversionResult ConvertUTF16toUTF8 ( + const UTF16** sourceStart, const UTF16* sourceEnd, + UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { + ConversionResult result = conversionOK; + const UTF16* source = *sourceStart; + UTF8* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch; + unsigned short bytesToWrite = 0; + const UTF32 byteMask = 0xBF; + const UTF32 byteMark = 0x80; + const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */ + ch = *source++; + /* If we have a surrogate pair, convert to UTF32 first. */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) { + /* If the 16 bits following the high surrogate are in the source buffer... */ + if (source < sourceEnd) { + UTF32 ch2 = *source; + /* If it's a low surrogate, convert to UTF32. */ + if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) { + ch = ((ch - UNI_SUR_HIGH_START) << halfShift) + + (ch2 - UNI_SUR_LOW_START) + halfBase; + ++source; + } else if (flags == strictConversion) { /* it's an unpaired high surrogate */ + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } else { /* We don't have the 16 bits following the high surrogate. */ + --source; /* return to the high surrogate */ + result = sourceExhausted; + break; + } + } else if (flags == strictConversion) { + /* UTF-16 surrogate values are illegal in UTF-32 */ + if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) { + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } + /* Figure out how many bytes the result will require */ + if (ch < (UTF32)0x80) { bytesToWrite = 1; + } else if (ch < (UTF32)0x800) { bytesToWrite = 2; + } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; + } else if (ch < (UTF32)0x110000) { bytesToWrite = 4; + } else { bytesToWrite = 3; + ch = UNI_REPLACEMENT_CHAR; + } + + target += bytesToWrite; + if (target > targetEnd) { + source = oldSource; /* Back up source pointer! */ + target -= bytesToWrite; result = targetExhausted; break; + } + switch (bytesToWrite) { /* note: everything falls through. */ + case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]); + } + target += bytesToWrite; + } + *sourceStart = source; + *targetStart = target; + return result; +} + +/* --------------------------------------------------------------------- */ + +ConversionResult ConvertUTF32toUTF8 ( + const UTF32** sourceStart, const UTF32* sourceEnd, + UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) { + ConversionResult result = conversionOK; + const UTF32* source = *sourceStart; + UTF8* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch; + unsigned short bytesToWrite = 0; + const UTF32 byteMask = 0xBF; + const UTF32 byteMark = 0x80; + ch = *source++; + if (flags == strictConversion ) { + /* UTF-16 surrogate values are illegal in UTF-32 */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { + --source; /* return to the illegal value itself */ + result = sourceIllegal; + break; + } + } + /* + * Figure out how many bytes the result will require. Turn any + * illegally large UTF32 things (> Plane 17) into replacement chars. + */ + if (ch < (UTF32)0x80) { bytesToWrite = 1; + } else if (ch < (UTF32)0x800) { bytesToWrite = 2; + } else if (ch < (UTF32)0x10000) { bytesToWrite = 3; + } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4; + } else { bytesToWrite = 3; + ch = UNI_REPLACEMENT_CHAR; + result = sourceIllegal; + } + + target += bytesToWrite; + if (target > targetEnd) { + --source; /* Back up source pointer! */ + target -= bytesToWrite; result = targetExhausted; break; + } + switch (bytesToWrite) { /* note: everything falls through. */ + case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6; + case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]); + } + target += bytesToWrite; + } + *sourceStart = source; + *targetStart = target; + return result; +} + +/* --------------------------------------------------------------------- */ + +/* + * Utility routine to tell whether a sequence of bytes is legal UTF-8. + * This must be called with the length pre-determined by the first byte. + * If not calling this from ConvertUTF8to*, then the length can be set by: + * length = trailingBytesForUTF8[*source]+1; + * and the sequence is illegal right away if there aren't that many bytes + * available. + * If presented with a length > 4, this returns false. The Unicode + * definition of UTF-8 goes up to 4-byte sequences. + */ + +static Boolean isLegalUTF8(const UTF8 *source, int length) { + UTF8 a; + const UTF8 *srcptr = source+length; + switch (length) { + default: return false; + /* Everything else falls through when "true"... */ + case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; + case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; + case 2: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false; + + switch (*source) { + /* no fall-through in this inner switch */ + case 0xE0: if (a < 0xA0) return false; break; + case 0xED: if (a > 0x9F) return false; break; + case 0xF0: if (a < 0x90) return false; break; + case 0xF4: if (a > 0x8F) return false; break; + default: if (a < 0x80) return false; + } + + case 1: if (*source >= 0x80 && *source < 0xC2) return false; + } + if (*source > 0xF4) return false; + return true; +} + +/* --------------------------------------------------------------------- */ + +/* + * Exported function to return whether a UTF-8 sequence is legal or not. + * This is not used here; it's just exported. + */ +Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) { + int length = trailingBytesForUTF8[*source]+1; + if (length > sourceEnd - source) { + return false; + } + return isLegalUTF8(source, length); +} + +/* --------------------------------------------------------------------- */ + +/* + * Exported function to return the total number of bytes in a codepoint + * represented in UTF-8, given the value of the first byte. + */ +unsigned getNumBytesForUTF8(UTF8 first) { + return trailingBytesForUTF8[first] + 1; +} + +/* --------------------------------------------------------------------- */ + +/* + * Exported function to return whether a UTF-8 string is legal or not. + * This is not used here; it's just exported. + */ +Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd) { + while (*source != sourceEnd) { + int length = trailingBytesForUTF8[**source] + 1; + if (length > sourceEnd - *source || !isLegalUTF8(*source, length)) + return false; + *source += length; + } + return true; +} + +/* --------------------------------------------------------------------- */ + +ConversionResult ConvertUTF8toUTF16 ( + const UTF8** sourceStart, const UTF8* sourceEnd, + UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) { + ConversionResult result = conversionOK; + const UTF8* source = *sourceStart; + UTF16* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch = 0; + unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; + if (extraBytesToRead >= sourceEnd - source) { + result = sourceExhausted; break; + } + /* Do this check whether lenient or strict */ + if (!isLegalUTF8(source, extraBytesToRead+1)) { + result = sourceIllegal; + break; + } + /* + * The cases all fall through. See "Note A" below. + */ + switch (extraBytesToRead) { + case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ + case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */ + case 3: ch += *source++; ch <<= 6; + case 2: ch += *source++; ch <<= 6; + case 1: ch += *source++; ch <<= 6; + case 0: ch += *source++; + } + ch -= offsetsFromUTF8[extraBytesToRead]; + + if (target >= targetEnd) { + source -= (extraBytesToRead+1); /* Back up source pointer! */ + result = targetExhausted; break; + } + if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */ + /* UTF-16 surrogate values are illegal in UTF-32 */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { + if (flags == strictConversion) { + source -= (extraBytesToRead+1); /* return to the illegal value itself */ + result = sourceIllegal; + break; + } else { + *target++ = UNI_REPLACEMENT_CHAR; + } + } else { + *target++ = (UTF16)ch; /* normal case */ + } + } else if (ch > UNI_MAX_UTF16) { + if (flags == strictConversion) { + result = sourceIllegal; + source -= (extraBytesToRead+1); /* return to the start */ + break; /* Bail out; shouldn't continue */ + } else { + *target++ = UNI_REPLACEMENT_CHAR; + } + } else { + /* target is a character in range 0xFFFF - 0x10FFFF. */ + if (target + 1 >= targetEnd) { + source -= (extraBytesToRead+1); /* Back up source pointer! */ + result = targetExhausted; break; + } + ch -= halfBase; + *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START); + *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START); + } + } + *sourceStart = source; + *targetStart = target; + return result; +} + +/* --------------------------------------------------------------------- */ + +ConversionResult ConvertUTF8toUTF32 ( + const UTF8** sourceStart, const UTF8* sourceEnd, + UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) { + ConversionResult result = conversionOK; + const UTF8* source = *sourceStart; + UTF32* target = *targetStart; + while (source < sourceEnd) { + UTF32 ch = 0; + unsigned short extraBytesToRead = trailingBytesForUTF8[*source]; + if (extraBytesToRead >= sourceEnd - source) { + result = sourceExhausted; break; + } + /* Do this check whether lenient or strict */ + if (!isLegalUTF8(source, extraBytesToRead+1)) { + result = sourceIllegal; + break; + } + /* + * The cases all fall through. See "Note A" below. + */ + switch (extraBytesToRead) { + case 5: ch += *source++; ch <<= 6; + case 4: ch += *source++; ch <<= 6; + case 3: ch += *source++; ch <<= 6; + case 2: ch += *source++; ch <<= 6; + case 1: ch += *source++; ch <<= 6; + case 0: ch += *source++; + } + ch -= offsetsFromUTF8[extraBytesToRead]; + + if (target >= targetEnd) { + source -= (extraBytesToRead+1); /* Back up the source pointer! */ + result = targetExhausted; break; + } + if (ch <= UNI_MAX_LEGAL_UTF32) { + /* + * UTF-16 surrogate values are illegal in UTF-32, and anything + * over Plane 17 (> 0x10FFFF) is illegal. + */ + if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) { + if (flags == strictConversion) { + source -= (extraBytesToRead+1); /* return to the illegal value itself */ + result = sourceIllegal; + break; + } else { + *target++ = UNI_REPLACEMENT_CHAR; + } + } else { + *target++ = ch; + } |