aboutsummaryrefslogtreecommitdiff
path: root/lib/Support
diff options
context:
space:
mode:
Diffstat (limited to 'lib/Support')
-rw-r--r--lib/Support/APFloat.cpp75
-rw-r--r--lib/Support/APInt.cpp40
-rw-r--r--lib/Support/Allocator.cpp8
-rw-r--r--lib/Support/CMakeLists.txt2
-rw-r--r--lib/Support/ConvertUTF.c571
-rw-r--r--lib/Support/ConvertUTFWrapper.cpp76
-rw-r--r--lib/Support/Debug.cpp2
-rw-r--r--lib/Support/Dwarf.cpp1
-rw-r--r--lib/Support/FileUtilities.cpp4
-rw-r--r--lib/Support/GraphWriter.cpp11
-rw-r--r--lib/Support/Host.cpp17
-rw-r--r--lib/Support/LockFileManager.cpp4
-rw-r--r--lib/Support/MemoryBuffer.cpp14
-rw-r--r--lib/Support/PathV2.cpp3
-rw-r--r--lib/Support/SourceMgr.cpp4
-rw-r--r--lib/Support/TimeValue.cpp9
-rw-r--r--lib/Support/Triple.cpp10
-rw-r--r--lib/Support/Unix/Memory.inc11
-rw-r--r--lib/Support/Unix/PathV2.inc8
-rw-r--r--lib/Support/Unix/Process.inc2
-rw-r--r--lib/Support/Unix/Program.inc5
-rw-r--r--lib/Support/Unix/Signals.inc18
-rw-r--r--lib/Support/Unix/TimeValue.inc3
-rw-r--r--lib/Support/Windows/Path.inc2
-rw-r--r--lib/Support/raw_ostream.cpp3
25 files changed, 842 insertions, 61 deletions
diff --git a/lib/Support/APFloat.cpp b/lib/Support/APFloat.cpp
index 4a7a5d1a05..5b68fbb270 100644
--- a/lib/Support/APFloat.cpp
+++ b/lib/Support/APFloat.cpp
@@ -16,6 +16,7 @@
#include "llvm/ADT/APSInt.h"
#include "llvm/ADT/FoldingSet.h"
#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
@@ -101,26 +102,6 @@ decDigitValue(unsigned int c)
return c - '0';
}
-static unsigned int
-hexDigitValue(unsigned int c)
-{
- unsigned int r;
-
- r = c - '0';
- if (r <= 9)
- return r;
-
- r = c - 'A';
- if (r <= 5)
- return r + 10;
-
- r = c - 'a';
- if (r <= 5)
- return r + 10;
-
- return -1U;
-}
-
/* Return the value of a decimal exponent of the form
[+-]ddddddd.
@@ -1932,6 +1913,12 @@ APFloat::convert(const fltSemantics &toSemantics,
*losesInfo = (fs != opOK);
} else if (category == fcNaN) {
*losesInfo = lostFraction != lfExactlyZero || X86SpecialNan;
+
+ // For x87 extended precision, we want to make a NaN, not a special NaN if
+ // the input wasn't special either.
+ if (!X86SpecialNan && semantics == &APFloat::x87DoubleExtended)
+ APInt::tcSetBit(significandParts(), semantics->precision - 1);
+
// gcc forces the Quiet bit on, which means (float)(double)(float_sNan)
// does not give you back the same bits. This is dubious, and we
// don't currently do it. You're really supposed to get
@@ -3032,7 +3019,7 @@ APFloat::initFromPPCDoubleDoubleAPInt(const APInt &api)
// Unless we have a special case, add in second double.
if (category == fcNormal) {
- APFloat v(APInt(64, i2));
+ APFloat v(IEEEdouble, APInt(64, i2));
fs = v.convert(PPCDoubleDouble, rmNearestTiesToEven, &losesInfo);
assert(fs == opOK && !losesInfo);
(void)fs;
@@ -3185,27 +3172,43 @@ APFloat::initFromHalfAPInt(const APInt & api)
/// isIEEE argument distinguishes between PPC128 and IEEE128 (not meaningful
/// when the size is anything else).
void
-APFloat::initFromAPInt(const APInt& api, bool isIEEE)
+APFloat::initFromAPInt(const fltSemantics* Sem, const APInt& api)
{
- if (api.getBitWidth() == 16)
+ if (Sem == &IEEEhalf)
return initFromHalfAPInt(api);
- else if (api.getBitWidth() == 32)
+ if (Sem == &IEEEsingle)
return initFromFloatAPInt(api);
- else if (api.getBitWidth()==64)
+ if (Sem == &IEEEdouble)
return initFromDoubleAPInt(api);
- else if (api.getBitWidth()==80)
+ if (Sem == &x87DoubleExtended)
return initFromF80LongDoubleAPInt(api);
- else if (api.getBitWidth()==128)
- return (isIEEE ?
- initFromQuadrupleAPInt(api) : initFromPPCDoubleDoubleAPInt(api));
- else
- llvm_unreachable(0);
+ if (Sem == &IEEEquad)
+ return initFromQuadrupleAPInt(api);
+ if (Sem == &PPCDoubleDouble)
+ return initFromPPCDoubleDoubleAPInt(api);
+
+ llvm_unreachable(0);
}
APFloat
APFloat::getAllOnesValue(unsigned BitWidth, bool isIEEE)
{
- return APFloat(APInt::getAllOnesValue(BitWidth), isIEEE);
+ switch (BitWidth) {
+ case 16:
+ return APFloat(IEEEhalf, APInt::getAllOnesValue(BitWidth));
+ case 32:
+ return APFloat(IEEEsingle, APInt::getAllOnesValue(BitWidth));
+ case 64:
+ return APFloat(IEEEdouble, APInt::getAllOnesValue(BitWidth));
+ case 80:
+ return APFloat(x87DoubleExtended, APInt::getAllOnesValue(BitWidth));
+ case 128:
+ if (isIEEE)
+ return APFloat(IEEEquad, APInt::getAllOnesValue(BitWidth));
+ return APFloat(PPCDoubleDouble, APInt::getAllOnesValue(BitWidth));
+ default:
+ llvm_unreachable("Unknown floating bit width");
+ }
}
APFloat APFloat::getLargest(const fltSemantics &Sem, bool Negative) {
@@ -3263,16 +3266,16 @@ APFloat APFloat::getSmallestNormalized(const fltSemantics &Sem, bool Negative) {
return Val;
}
-APFloat::APFloat(const APInt& api, bool isIEEE) {
- initFromAPInt(api, isIEEE);
+APFloat::APFloat(const fltSemantics &Sem, const APInt &API) {
+ initFromAPInt(&Sem, API);
}
APFloat::APFloat(float f) {
- initFromAPInt(APInt::floatToBits(f));
+ initFromAPInt(&IEEEsingle, APInt::floatToBits(f));
}
APFloat::APFloat(double d) {
- initFromAPInt(APInt::doubleToBits(d));
+ initFromAPInt(&IEEEdouble, APInt::doubleToBits(d));
}
namespace {
diff --git a/lib/Support/APInt.cpp b/lib/Support/APInt.cpp
index 61e503bc3a..07cb057b48 100644
--- a/lib/Support/APInt.cpp
+++ b/lib/Support/APInt.cpp
@@ -1876,6 +1876,17 @@ APInt APInt::udiv(const APInt& RHS) const {
return Quotient;
}
+APInt APInt::sdiv(const APInt &RHS) const {
+ if (isNegative()) {
+ if (RHS.isNegative())
+ return (-(*this)).udiv(-RHS);
+ return -((-(*this)).udiv(RHS));
+ }
+ if (RHS.isNegative())
+ return -(this->udiv(-RHS));
+ return this->udiv(RHS);
+}
+
APInt APInt::urem(const APInt& RHS) const {
assert(BitWidth == RHS.BitWidth && "Bit widths must be the same");
if (isSingleWord()) {
@@ -1913,6 +1924,17 @@ APInt APInt::urem(const APInt& RHS) const {
return Remainder;
}
+APInt APInt::srem(const APInt &RHS) const {
+ if (isNegative()) {
+ if (RHS.isNegative())
+ return -((-(*this)).urem(-RHS));
+ return -((-(*this)).urem(RHS));
+ }
+ if (RHS.isNegative())
+ return this->urem(-RHS);
+ return this->urem(RHS);
+}
+
void APInt::udivrem(const APInt &LHS, const APInt &RHS,
APInt &Quotient, APInt &Remainder) {
// Get some size facts about the dividend and divisor
@@ -1953,6 +1975,24 @@ void APInt::udivrem(const APInt &LHS, const APInt &RHS,
divide(LHS, lhsWords, RHS, rhsWords, &Quotient, &Remainder);
}
+void APInt::sdivrem(const APInt &LHS, const APInt &RHS,
+ APInt &Quotient, APInt &Remainder) {
+ if (LHS.isNegative()) {
+ if (RHS.isNegative())
+ APInt::udivrem(-LHS, -RHS, Quotient, Remainder);
+ else {
+ APInt::udivrem(-LHS, RHS, Quotient, Remainder);
+ Quotient = -Quotient;
+ }
+ Remainder = -Remainder;
+ } else if (RHS.isNegative()) {
+ APInt::udivrem(LHS, -RHS, Quotient, Remainder);
+ Quotient = -Quotient;
+ } else {
+ APInt::udivrem(LHS, RHS, Quotient, Remainder);
+ }
+}
+
APInt APInt::sadd_ov(const APInt &RHS, bool &Overflow) const {
APInt Res = *this+RHS;
Overflow = isNonNegative() == RHS.isNonNegative() &&
diff --git a/lib/Support/Allocator.cpp b/lib/Support/Allocator.cpp
index 28f4e64ac7..3c4191b805 100644
--- a/lib/Support/Allocator.cpp
+++ b/lib/Support/Allocator.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/DataTypes.h"
#include "llvm/Support/Memory.h"
#include "llvm/Support/Recycler.h"
@@ -82,6 +83,7 @@ void BumpPtrAllocator::Reset() {
CurSlab->NextPtr = 0;
CurPtr = (char*)(CurSlab + 1);
End = ((char*)CurSlab) + CurSlab->Size;
+ BytesAllocated = 0;
}
/// Allocate - Allocate space at the specified alignment.
@@ -102,6 +104,10 @@ void *BumpPtrAllocator::Allocate(size_t Size, size_t Alignment) {
// Check if we can hold it.
if (Ptr + Size <= End) {
CurPtr = Ptr + Size;
+ // Update the allocation point of this memory block in MemorySanitizer.
+ // Without this, MemorySanitizer messages for values originated from here
+ // will point to the allocation of the entire slab.
+ __msan_allocated_memory(Ptr, Size);
return Ptr;
}
@@ -117,6 +123,7 @@ void *BumpPtrAllocator::Allocate(size_t Size, size_t Alignment) {
Ptr = AlignPtr((char*)(NewSlab + 1), Alignment);
assert((uintptr_t)Ptr + Size <= (uintptr_t)NewSlab + NewSlab->Size);
+ __msan_allocated_memory(Ptr, Size);
return Ptr;
}
@@ -125,6 +132,7 @@ void *BumpPtrAllocator::Allocate(size_t Size, size_t Alignment) {
Ptr = AlignPtr(CurPtr, Alignment);
CurPtr = Ptr + Size;
assert(CurPtr <= End && "Unable to allocate memory!");
+ __msan_allocated_memory(Ptr, Size);
return Ptr;
}
diff --git a/lib/Support/CMakeLists.txt b/lib/Support/CMakeLists.txt
index f294a175e7..5ba69fc3c8 100644
--- a/lib/Support/CMakeLists.txt
+++ b/lib/Support/CMakeLists.txt
@@ -8,6 +8,8 @@ add_llvm_library(LLVMSupport
circular_raw_ostream.cpp
CommandLine.cpp
ConstantRange.cpp
+ ConvertUTF.c
+ ConvertUTFWrapper.cpp
CrashRecoveryContext.cpp
DataExtractor.cpp
DataStream.cpp
diff --git a/lib/Support/ConvertUTF.c b/lib/Support/ConvertUTF.c
new file mode 100644
index 0000000000..23f17ca25a
--- /dev/null
+++ b/lib/Support/ConvertUTF.c
@@ -0,0 +1,571 @@
+/*===--- ConvertUTF.c - Universal Character Names conversions ---------------===
+ *
+ * The LLVM Compiler Infrastructure
+ *
+ * This file is distributed under the University of Illinois Open Source
+ * License. See LICENSE.TXT for details.
+ *
+ *===------------------------------------------------------------------------=*/
+/*
+ * Copyright 2001-2004 Unicode, Inc.
+ *
+ * Disclaimer
+ *
+ * This source code is provided as is by Unicode, Inc. No claims are
+ * made as to fitness for any particular purpose. No warranties of any
+ * kind are expressed or implied. The recipient agrees to determine
+ * applicability of information provided. If this file has been
+ * purchased on magnetic or optical media from Unicode, Inc., the
+ * sole remedy for any claim will be exchange of defective media
+ * within 90 days of receipt.
+ *
+ * Limitations on Rights to Redistribute This Code
+ *
+ * Unicode, Inc. hereby grants the right to freely use the information
+ * supplied in this file in the creation of products supporting the
+ * Unicode Standard, and to make copies of this file in any form
+ * for internal or external distribution as long as this notice
+ * remains attached.
+ */
+
+/* ---------------------------------------------------------------------
+
+ Conversions between UTF32, UTF-16, and UTF-8. Source code file.
+ Author: Mark E. Davis, 1994.
+ Rev History: Rick McGowan, fixes & updates May 2001.
+ Sept 2001: fixed const & error conditions per
+ mods suggested by S. Parent & A. Lillich.
+ June 2002: Tim Dodd added detection and handling of incomplete
+ source sequences, enhanced error detection, added casts
+ to eliminate compiler warnings.
+ July 2003: slight mods to back out aggressive FFFE detection.
+ Jan 2004: updated switches in from-UTF8 conversions.
+ Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
+
+ See the header file "ConvertUTF.h" for complete documentation.
+
+------------------------------------------------------------------------ */
+
+
+#include "llvm/Support/ConvertUTF.h"
+#ifdef CVTUTF_DEBUG
+#include <stdio.h>
+#endif
+
+static const int halfShift = 10; /* used for shifting by 10 bits */
+
+static const UTF32 halfBase = 0x0010000UL;
+static const UTF32 halfMask = 0x3FFUL;
+
+#define UNI_SUR_HIGH_START (UTF32)0xD800
+#define UNI_SUR_HIGH_END (UTF32)0xDBFF
+#define UNI_SUR_LOW_START (UTF32)0xDC00
+#define UNI_SUR_LOW_END (UTF32)0xDFFF
+#define false 0
+#define true 1
+
+/* --------------------------------------------------------------------- */
+
+/*
+ * Index into the table below with the first byte of a UTF-8 sequence to
+ * get the number of trailing bytes that are supposed to follow it.
+ * Note that *legal* UTF-8 values can't have 4 or 5-bytes. The table is
+ * left as-is for anyone who may want to do such conversion, which was
+ * allowed in earlier algorithms.
+ */
+static const char trailingBytesForUTF8[256] = {
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
+};
+
+/*
+ * Magic values subtracted from a buffer value during UTF8 conversion.
+ * This table contains as many values as there might be trailing bytes
+ * in a UTF-8 sequence.
+ */
+static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
+ 0x03C82080UL, 0xFA082080UL, 0x82082080UL };
+
+/*
+ * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
+ * into the first byte, depending on how many bytes follow. There are
+ * as many entries in this table as there are UTF-8 sequence types.
+ * (I.e., one byte sequence, two byte... etc.). Remember that sequencs
+ * for *legal* UTF-8 will be 4 or fewer bytes total.
+ */
+static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
+
+/* --------------------------------------------------------------------- */
+
+/* The interface converts a whole buffer to avoid function-call overhead.
+ * Constants have been gathered. Loops & conditionals have been removed as
+ * much as possible for efficiency, in favor of drop-through switches.
+ * (See "Note A" at the bottom of the file for equivalent code.)
+ * If your compiler supports it, the "isLegalUTF8" call can be turned
+ * into an inline function.
+ */
+
+
+/* --------------------------------------------------------------------- */
+
+ConversionResult ConvertUTF32toUTF16 (
+ const UTF32** sourceStart, const UTF32* sourceEnd,
+ UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
+ ConversionResult result = conversionOK;
+ const UTF32* source = *sourceStart;
+ UTF16* target = *targetStart;
+ while (source < sourceEnd) {
+ UTF32 ch;
+ if (target >= targetEnd) {
+ result = targetExhausted; break;
+ }
+ ch = *source++;
+ if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
+ /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
+ if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
+ if (flags == strictConversion) {
+ --source; /* return to the illegal value itself */
+ result = sourceIllegal;
+ break;
+ } else {
+ *target++ = UNI_REPLACEMENT_CHAR;
+ }
+ } else {
+ *target++ = (UTF16)ch; /* normal case */
+ }
+ } else if (ch > UNI_MAX_LEGAL_UTF32) {
+ if (flags == strictConversion) {
+ result = sourceIllegal;
+ } else {
+ *target++ = UNI_REPLACEMENT_CHAR;
+ }
+ } else {
+ /* target is a character in range 0xFFFF - 0x10FFFF. */
+ if (target + 1 >= targetEnd) {
+ --source; /* Back up source pointer! */
+ result = targetExhausted; break;
+ }
+ ch -= halfBase;
+ *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
+ *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
+ }
+ }
+ *sourceStart = source;
+ *targetStart = target;
+ return result;
+}
+
+/* --------------------------------------------------------------------- */
+
+ConversionResult ConvertUTF16toUTF32 (
+ const UTF16** sourceStart, const UTF16* sourceEnd,
+ UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
+ ConversionResult result = conversionOK;
+ const UTF16* source = *sourceStart;
+ UTF32* target = *targetStart;
+ UTF32 ch, ch2;
+ while (source < sourceEnd) {
+ const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
+ ch = *source++;
+ /* If we have a surrogate pair, convert to UTF32 first. */
+ if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
+ /* If the 16 bits following the high surrogate are in the source buffer... */
+ if (source < sourceEnd) {
+ ch2 = *source;
+ /* If it's a low surrogate, convert to UTF32. */
+ if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
+ ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
+ + (ch2 - UNI_SUR_LOW_START) + halfBase;
+ ++source;
+ } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
+ --source; /* return to the illegal value itself */
+ result = sourceIllegal;
+ break;
+ }
+ } else { /* We don't have the 16 bits following the high surrogate. */
+ --source; /* return to the high surrogate */
+ result = sourceExhausted;
+ break;
+ }
+ } else if (flags == strictConversion) {
+ /* UTF-16 surrogate values are illegal in UTF-32 */
+ if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
+ --source; /* return to the illegal value itself */
+ result = sourceIllegal;
+ break;
+ }
+ }
+ if (target >= targetEnd) {
+ source = oldSource; /* Back up source pointer! */
+ result = targetExhausted; break;
+ }
+ *target++ = ch;
+ }
+ *sourceStart = source;
+ *targetStart = target;
+#ifdef CVTUTF_DEBUG
+if (result == sourceIllegal) {
+ fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
+ fflush(stderr);
+}
+#endif
+ return result;
+}
+ConversionResult ConvertUTF16toUTF8 (
+ const UTF16** sourceStart, const UTF16* sourceEnd,
+ UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
+ ConversionResult result = conversionOK;
+ const UTF16* source = *sourceStart;
+ UTF8* target = *targetStart;
+ while (source < sourceEnd) {
+ UTF32 ch;
+ unsigned short bytesToWrite = 0;
+ const UTF32 byteMask = 0xBF;
+ const UTF32 byteMark = 0x80;
+ const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
+ ch = *source++;
+ /* If we have a surrogate pair, convert to UTF32 first. */
+ if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
+ /* If the 16 bits following the high surrogate are in the source buffer... */
+ if (source < sourceEnd) {
+ UTF32 ch2 = *source;
+ /* If it's a low surrogate, convert to UTF32. */
+ if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
+ ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
+ + (ch2 - UNI_SUR_LOW_START) + halfBase;
+ ++source;
+ } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
+ --source; /* return to the illegal value itself */
+ result = sourceIllegal;
+ break;
+ }
+ } else { /* We don't have the 16 bits following the high surrogate. */
+ --source; /* return to the high surrogate */
+ result = sourceExhausted;
+ break;
+ }
+ } else if (flags == strictConversion) {
+ /* UTF-16 surrogate values are illegal in UTF-32 */
+ if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
+ --source; /* return to the illegal value itself */
+ result = sourceIllegal;
+ break;
+ }
+ }
+ /* Figure out how many bytes the result will require */
+ if (ch < (UTF32)0x80) { bytesToWrite = 1;
+ } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
+ } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
+ } else if (ch < (UTF32)0x110000) { bytesToWrite = 4;
+ } else { bytesToWrite = 3;
+ ch = UNI_REPLACEMENT_CHAR;
+ }
+
+ target += bytesToWrite;
+ if (target > targetEnd) {
+ source = oldSource; /* Back up source pointer! */
+ target -= bytesToWrite; result = targetExhausted; break;
+ }
+ switch (bytesToWrite) { /* note: everything falls through. */
+ case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
+ case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
+ case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
+ case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]);
+ }
+ target += bytesToWrite;
+ }
+ *sourceStart = source;
+ *targetStart = target;
+ return result;
+}
+
+/* --------------------------------------------------------------------- */
+
+ConversionResult ConvertUTF32toUTF8 (
+ const UTF32** sourceStart, const UTF32* sourceEnd,
+ UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
+ ConversionResult result = conversionOK;
+ const UTF32* source = *sourceStart;
+ UTF8* target = *targetStart;
+ while (source < sourceEnd) {
+ UTF32 ch;
+ unsigned short bytesToWrite = 0;
+ const UTF32 byteMask = 0xBF;
+ const UTF32 byteMark = 0x80;
+ ch = *source++;
+ if (flags == strictConversion ) {
+ /* UTF-16 surrogate values are illegal in UTF-32 */
+ if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
+ --source; /* return to the illegal value itself */
+ result = sourceIllegal;
+ break;
+ }
+ }
+ /*
+ * Figure out how many bytes the result will require. Turn any
+ * illegally large UTF32 things (> Plane 17) into replacement chars.
+ */
+ if (ch < (UTF32)0x80) { bytesToWrite = 1;
+ } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
+ } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
+ } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4;
+ } else { bytesToWrite = 3;
+ ch = UNI_REPLACEMENT_CHAR;
+ result = sourceIllegal;
+ }
+
+ target += bytesToWrite;
+ if (target > targetEnd) {
+ --source; /* Back up source pointer! */
+ target -= bytesToWrite; result = targetExhausted; break;
+ }
+ switch (bytesToWrite) { /* note: everything falls through. */
+ case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
+ case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
+ case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
+ case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
+ }
+ target += bytesToWrite;
+ }
+ *sourceStart = source;
+ *targetStart = target;
+ return result;
+}
+
+/* --------------------------------------------------------------------- */
+
+/*
+ * Utility routine to tell whether a sequence of bytes is legal UTF-8.
+ * This must be called with the length pre-determined by the first byte.
+ * If not calling this from ConvertUTF8to*, then the length can be set by:
+ * length = trailingBytesForUTF8[*source]+1;
+ * and the sequence is illegal right away if there aren't that many bytes
+ * available.
+ * If presented with a length > 4, this returns false. The Unicode
+ * definition of UTF-8 goes up to 4-byte sequences.
+ */
+
+static Boolean isLegalUTF8(const UTF8 *source, int length) {
+ UTF8 a;
+ const UTF8 *srcptr = source+length;
+ switch (length) {
+ default: return false;
+ /* Everything else falls through when "true"... */
+ case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
+ case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
+ case 2: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
+
+ switch (*source) {
+ /* no fall-through in this inner switch */
+ case 0xE0: if (a < 0xA0) return false; break;
+ case 0xED: if (a > 0x9F) return false; break;
+ case 0xF0: if (a < 0x90) return false; break;
+ case 0xF4: if (a > 0x8F) return false; break;
+ default: if (a < 0x80) return false;
+ }
+
+ case 1: if (*source >= 0x80 && *source < 0xC2) return false;
+ }
+ if (*source > 0xF4) return false;
+ return true;
+}
+
+/* --------------------------------------------------------------------- */
+
+/*
+ * Exported function to return whether a UTF-8 sequence is legal or not.
+ * This is not used here; it's just exported.
+ */
+Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
+ int length = trailingBytesForUTF8[*source]+1;
+ if (length > sourceEnd - source) {
+ return false;
+ }
+ return isLegalUTF8(source, length);
+}
+
+/* --------------------------------------------------------------------- */
+
+/*
+ * Exported function to return the total number of bytes in a codepoint
+ * represented in UTF-8, given the value of the first byte.
+ */
+unsigned getNumBytesForUTF8(UTF8 first) {
+ return trailingBytesForUTF8[first] + 1;
+}
+
+/* --------------------------------------------------------------------- */
+
+/*
+ * Exported function to return whether a UTF-8 string is legal or not.
+ * This is not used here; it's just exported.
+ */
+Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd) {
+ while (*source != sourceEnd) {
+ int length = trailingBytesForUTF8[**source] + 1;
+ if (length > sourceEnd - *source || !isLegalUTF8(*source, length))
+ return false;
+ *source += length;
+ }
+ return true;
+}
+
+/* --------------------------------------------------------------------- */
+
+ConversionResult ConvertUTF8toUTF16 (
+ const UTF8** sourceStart, const UTF8* sourceEnd,
+ UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
+ ConversionResult result = conversionOK;
+ const UTF8* source = *sourceStart;
+ UTF16* target = *targetStart;
+ while (source < sourceEnd) {
+ UTF32 ch = 0;
+ unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
+ if (extraBytesToRead >= sourceEnd - source) {
+ result = sourceExhausted; break;
+ }
+ /* Do this check whether lenient or strict */
+ if (!isLegalUTF8(source, extraBytesToRead+1)) {
+ result = sourceIllegal;
+ break;
+ }
+ /*
+ * The cases all fall through. See "Note A" below.
+ */
+ switch (extraBytesToRead) {
+ case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
+ case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
+ case 3: ch += *source++; ch <<= 6;
+ case 2: ch += *source++; ch <<= 6;
+ case 1: ch += *source++; ch <<= 6;
+ case 0: ch += *source++;
+ }
+ ch -= offsetsFromUTF8[extraBytesToRead];
+
+ if (target >= targetEnd) {
+ source -= (extraBytesToRead+1); /* Back up source pointer! */
+ result = targetExhausted; break;
+ }
+ if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
+ /* UTF-16 surrogate values are illegal in UTF-32 */
+ if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
+ if (flags == strictConversion) {
+ source -= (extraBytesToRead+1); /* return to the illegal value itself */
+ result = sourceIllegal;
+ break;
+ } else {
+ *target++ = UNI_REPLACEMENT_CHAR;
+ }
+ } else {
+ *target++ = (UTF16)ch; /* normal case */
+ }
+ } else if (ch > UNI_MAX_UTF16) {
+ if (flags == strictConversion) {
+ result = sourceIllegal;
+ source -= (extraBytesToRead+1); /* return to the start */
+ break; /* Bail out; shouldn't continue */
+ } else {
+ *target++ = UNI_REPLACEMENT_CHAR;
+ }
+ } else {
+ /* target is a character in range 0xFFFF - 0x10FFFF. */
+ if (target + 1 >= targetEnd) {
+ source -= (extraBytesToRead+1); /* Back up source pointer! */
+ result = targetExhausted; break;
+ }
+ ch -= halfBase;
+ *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
+ *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
+ }
+ }
+ *sourceStart = source;
+ *targetStart = target;
+ return result;
+}
+
+/* --------------------------------------------------------------------- */
+
+ConversionResult ConvertUTF8toUTF32 (
+ const UTF8** sourceStart, const UTF8* sourceEnd,
+ UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
+ ConversionResult result = conversionOK;
+ const UTF8* source = *sourceStart;
+ UTF32* target = *targetStart;
+ while (source < sourceEnd) {
+ UTF32 ch = 0;
+ unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
+ if (extraBytesToRead >= sourceEnd - source) {
+ result = sourceExhausted; break;
+ }
+ /* Do this check whether lenient or strict */
+ if (!isLegalUTF8(source, extraBytesToRead+1)) {
+ result = sourceIllegal;
+ break;
+ }
+ /*
+ * The cases all fall through. See "Note A" below.
+ */
+ switch (extraBytesToRead) {
+ case 5: ch += *source++; ch <<= 6;
+ case 4: ch += *source++; ch <<= 6;
+ case 3: ch += *source++; ch <<= 6;
+ case 2: ch += *source++; ch <<= 6;
+ case 1: ch += *source++; ch <<= 6;
+ case 0: ch += *source++;
+ }
+ ch -= offsetsFromUTF8[extraBytesToRead];
+
+ if (target >= targetEnd) {
+ source -= (extraBytesToRead+1); /* Back up the source pointer! */
+ result = targetExhausted; break;
+ }
+ if (ch <= UNI_MAX_LEGAL_UTF32) {
+ /*
+ * UTF-16 surrogate values are illegal in UTF-32, and anything
+ * over Plane 17 (> 0x10FFFF) is illegal.
+ */
+ if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
+ if (flags == strictConversion) {
+ source -= (extraBytesToRead+1); /* return to the illegal value itself */
+ result = sourceIllegal;
+ break;
+ } else {
+ *target++ = UNI_REPLACEMENT_CHAR;
+ }
+ } else {
+ *target++ = ch;
+ }