diff options
-rw-r--r-- | lib/Transforms/Scalar/LoopStrengthReduce.cpp | 194 | ||||
-rw-r--r-- | test/CodeGen/ARM/lsr-on-unrolled-loops.ll | 386 |
2 files changed, 577 insertions, 3 deletions
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index bd7cc45901..43c1d06b65 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -113,6 +113,7 @@ class RegUseTracker { public: void CountRegister(const SCEV *Reg, size_t LUIdx); void DropRegister(const SCEV *Reg, size_t LUIdx); + void DropUse(size_t LUIdx); bool isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const; @@ -150,6 +151,14 @@ RegUseTracker::DropRegister(const SCEV *Reg, size_t LUIdx) { RSD.UsedByIndices.reset(LUIdx); } +void +RegUseTracker::DropUse(size_t LUIdx) { + // Remove the use index from every register's use list. + for (RegUsesTy::iterator I = RegUsesMap.begin(), E = RegUsesMap.end(); + I != E; ++I) + I->second.UsedByIndices.reset(LUIdx); +} + bool RegUseTracker::isRegUsedByUsesOtherThan(const SCEV *Reg, size_t LUIdx) const { if (!RegUsesMap.count(Reg)) return false; @@ -951,6 +960,7 @@ public: MaxOffset(INT64_MIN), AllFixupsOutsideLoop(true) {} + bool HasFormulaWithSameRegs(const Formula &F) const; bool InsertFormula(const Formula &F); void DeleteFormula(Formula &F); void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses); @@ -961,6 +971,16 @@ public: void dump() const; }; +/// HasFormulaWithSameRegs - Test whether this use has a formula which has the same +/// registers as the given formula. +bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const { + SmallVector<const SCEV *, 2> Key = F.BaseRegs; + if (F.ScaledReg) Key.push_back(F.ScaledReg); + // Unstable sort by host order ok, because this is only used for uniquifying. + std::sort(Key.begin(), Key.end()); + return Uniquifier.count(Key); +} + /// InsertFormula - If the given formula has not yet been inserted, add it to /// the list, and return true. Return false otherwise. 
bool LSRUse::InsertFormula(const Formula &F) { @@ -995,6 +1015,7 @@ bool LSRUse::InsertFormula(const Formula &F) { void LSRUse::DeleteFormula(Formula &F) { std::swap(F, Formulae.back()); Formulae.pop_back(); + assert(!Formulae.empty() && "LSRUse has no formulae left!"); } /// RecomputeRegs - Recompute the Regs field, and update RegUses. @@ -1134,6 +1155,13 @@ static bool isAlwaysFoldable(int64_t BaseOffs, AM.HasBaseReg = HasBaseReg; AM.Scale = Kind == LSRUse::ICmpZero ? -1 : 1; + // Canonicalize a scale of 1 to a base register if the formula doesn't + // already have a base register. + if (!AM.HasBaseReg && AM.Scale == 1) { + AM.Scale = 0; + AM.HasBaseReg = true; + } + return isLegalUse(AM, Kind, AccessTy, TLI); } @@ -1244,12 +1272,15 @@ class LSRInstance { UseMapTy UseMap; bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, + bool HasBaseReg, LSRUse::KindType Kind, const Type *AccessTy); std::pair<size_t, int64_t> getUse(const SCEV *&Expr, LSRUse::KindType Kind, const Type *AccessTy); + LSRUse *FindUseWithSimilarFormula(const Formula &F, const LSRUse &OrigLU); + public: void InsertInitialFormula(const SCEV *S, LSRUse &LU, size_t LUIdx); void InsertSupplementalFormula(const SCEV *S, LSRUse &LU, size_t LUIdx); @@ -1742,6 +1773,7 @@ LSRInstance::OptimizeLoopTermCond() { bool LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, + bool HasBaseReg, LSRUse::KindType Kind, const Type *AccessTy) { int64_t NewMinOffset = LU.MinOffset; int64_t NewMaxOffset = LU.MaxOffset; @@ -1754,12 +1786,12 @@ LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, return false; // Conservatively assume HasBaseReg is true for now. 
if (NewOffset < LU.MinOffset) { - if (!isAlwaysFoldable(LU.MaxOffset - NewOffset, 0, /*HasBaseReg=*/true, + if (!isAlwaysFoldable(LU.MaxOffset - NewOffset, 0, HasBaseReg, Kind, AccessTy, TLI)) return false; NewMinOffset = NewOffset; } else if (NewOffset > LU.MaxOffset) { - if (!isAlwaysFoldable(NewOffset - LU.MinOffset, 0, /*HasBaseReg=*/true, + if (!isAlwaysFoldable(NewOffset - LU.MinOffset, 0, HasBaseReg, Kind, AccessTy, TLI)) return false; NewMaxOffset = NewOffset; @@ -1798,7 +1830,7 @@ LSRInstance::getUse(const SCEV *&Expr, // A use already existed with this base. size_t LUIdx = P.first->second; LSRUse &LU = Uses[LUIdx]; - if (reconcileNewOffset(LU, Offset, Kind, AccessTy)) + if (reconcileNewOffset(LU, Offset, /*HasBaseReg=*/true, Kind, AccessTy)) // Reuse this use. return std::make_pair(LUIdx, Offset); } @@ -1819,6 +1851,40 @@ LSRInstance::getUse(const SCEV *&Expr, return std::make_pair(LUIdx, Offset); } +/// FindUseWithSimilarFormula - Look for a use distinct from OrigLU which has +/// a formula that has the same registers as the given formula. +LSRUse * +LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF, + const LSRUse &OrigLU) { + // Search all uses for the formula. This could be more clever. Ignore + // ICmpZero uses because they may contain formulae generated by + // GenerateICmpZeroScales, in which case adding fixup offsets may + // be invalid. 
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { + LSRUse &LU = Uses[LUIdx]; + if (&LU != &OrigLU && + LU.Kind != LSRUse::ICmpZero && + LU.Kind == OrigLU.Kind && OrigLU.AccessTy == LU.AccessTy && + LU.HasFormulaWithSameRegs(OrigF)) { + for (size_t FIdx = 0, NumForms = LU.Formulae.size(); + FIdx != NumForms; ++FIdx) { + Formula &F = LU.Formulae[FIdx]; + if (F.BaseRegs == OrigF.BaseRegs && + F.ScaledReg == OrigF.ScaledReg && + F.AM.BaseGV == OrigF.AM.BaseGV && + F.AM.Scale == OrigF.AM.Scale && + LU.Kind) { + if (F.AM.BaseOffs == 0) + return &LU; + break; + } + } + } + } + + return 0; +} + void LSRInstance::CollectInterestingTypesAndFactors() { SmallSetVector<const SCEV *, 4> Strides; @@ -2722,6 +2788,128 @@ size_t LSRInstance::EstimateSearchSpaceComplexity() const { /// of formulae. This keeps the main solver from taking an extraordinary amount /// of time in some worst-case scenarios. void LSRInstance::NarrowSearchSpaceUsingHeuristics() { + if (EstimateSearchSpaceComplexity() >= ComplexityLimit) { + DEBUG(dbgs() << "The search space is too complex.\n"); + + DEBUG(dbgs() << "Narrowing the search space by eliminating formulae " + "which use a superset of registers used by other " + "formulae.\n"); + + for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { + LSRUse &LU = Uses[LUIdx]; + bool Any = false; + for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) { + Formula &F = LU.Formulae[i]; + for (SmallVectorImpl<const SCEV *>::const_iterator + I = F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) { + if (const SCEVConstant *C = dyn_cast<SCEVConstant>(*I)) { + Formula NewF = F; + NewF.AM.BaseOffs += C->getValue()->getSExtValue(); + NewF.BaseRegs.erase(NewF.BaseRegs.begin() + + (I - F.BaseRegs.begin())); + if (LU.HasFormulaWithSameRegs(NewF)) { + DEBUG(dbgs() << " Deleting "; F.print(dbgs()); dbgs() << '\n'); + LU.DeleteFormula(F); + --i; + --e; + Any = true; + break; + } + } else if (const SCEVUnknown *U = 
dyn_cast<SCEVUnknown>(*I)) { + if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) + if (!F.AM.BaseGV) { + Formula NewF = F; + NewF.AM.BaseGV = GV; + NewF.BaseRegs.erase(NewF.BaseRegs.begin() + + (I - F.BaseRegs.begin())); + if (LU.HasFormulaWithSameRegs(NewF)) { + DEBUG(dbgs() << " Deleting "; F.print(dbgs()); + dbgs() << '\n'); + LU.DeleteFormula(F); + --i; + --e; + Any = true; + break; + } + } + } + } + } + if (Any) + LU.RecomputeRegs(LUIdx, RegUses); + } + + DEBUG(dbgs() << "After pre-selection:\n"; + print_uses(dbgs())); + } + + if (EstimateSearchSpaceComplexity() >= ComplexityLimit) { + DEBUG(dbgs() << "The search space is too complex.\n"); + + DEBUG(dbgs() << "Narrowing the search space by assuming that uses " + "separated by a constant offset will use the same " + "registers.\n"); + + for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { + LSRUse &LU = Uses[LUIdx]; + for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) { + Formula &F = LU.Formulae[i]; + if (F.AM.BaseOffs != 0 && F.AM.Scale == 0) { + if (LSRUse *LUThatHas = FindUseWithSimilarFormula(F, LU)) { + if (reconcileNewOffset(*LUThatHas, F.AM.BaseOffs, + /*HasBaseReg=*/false, + LU.Kind, LU.AccessTy)) { + DEBUG(dbgs() << " Deleting use "; LU.print(dbgs()); + dbgs() << '\n'); + + LUThatHas->AllFixupsOutsideLoop &= LU.AllFixupsOutsideLoop; + + // Delete formulae from the new use which are no longer legal. + bool Any = false; + for (size_t i = 0, e = LUThatHas->Formulae.size(); i != e; ++i) { + Formula &F = LUThatHas->Formulae[i]; + if (!isLegalUse(F.AM, + LUThatHas->MinOffset, LUThatHas->MaxOffset, + LUThatHas->Kind, LUThatHas->AccessTy, TLI)) { + DEBUG(dbgs() << " Deleting "; F.print(dbgs()); + dbgs() << '\n'); + LUThatHas->DeleteFormula(F); + --i; + --e; + Any = true; + } + } + if (Any) + LUThatHas->RecomputeRegs(LUThatHas - &Uses.front(), RegUses); + + // Update the relocs to reference the new use. 
+ for (size_t i = 0, e = Fixups.size(); i != e; ++i) { + if (Fixups[i].LUIdx == LUIdx) { + Fixups[i].LUIdx = LUThatHas - &Uses.front(); + Fixups[i].Offset += F.AM.BaseOffs; + DEBUG(errs() << "New fixup has offset " + << Fixups[i].Offset << "\n"); + } + if (Fixups[i].LUIdx == NumUses-1) + Fixups[i].LUIdx = LUIdx; + } + + // Delete the old use. + std::swap(LU, Uses.back()); + Uses.pop_back(); + --LUIdx; + --NumUses; + break; + } + } + } + } + } + + DEBUG(dbgs() << "After pre-selection:\n"; + print_uses(dbgs())); + } + SmallPtrSet<const SCEV *, 4> Taken; while (EstimateSearchSpaceComplexity() >= ComplexityLimit) { // Ok, we have too many of formulae on our hands to conveniently handle. diff --git a/test/CodeGen/ARM/lsr-on-unrolled-loops.ll b/test/CodeGen/ARM/lsr-on-unrolled-loops.ll new file mode 100644 index 0000000000..a206c3e0ad --- /dev/null +++ b/test/CodeGen/ARM/lsr-on-unrolled-loops.ll @@ -0,0 +1,386 @@ +; RUN: llc -mtriple=thumbv7-apple-darwin10 -mcpu=cortex-a8 < %s | FileCheck %s + +; LSR should recognize that this is an unrolled loop which can use +; constant offset addressing, so that each of the following stores +; uses the same register. 
+ +; CHECK: vstr.32 s0, [r12, #-128] +; CHECK: vstr.32 s0, [r12, #-96] +; CHECK: vstr.32 s0, [r12, #-64] +; CHECK: vstr.32 s0, [r12, #-32] +; CHECK: vstr.32 s0, [r12] +; CHECK: vstr.32 s0, [r12, #32] +; CHECK: vstr.32 s0, [r12, #64] +; CHECK: vstr.32 s0, [r12, #96] + +target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32-n32" + +%0 = type { %1*, %3*, %6*, i8*, i32, i32, %8*, i32, i32, i32, i32, i32, i32, i32, double, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i8**, i32, i32, i32, i32, i32, [64 x i32]*, [4 x %9*], [4 x %10*], [4 x %10*], i32, %11*, i32, i32, [16 x i8], [16 x i8], [16 x i8], i32, i32, i8, i8, i8, i16, i16, i32, i8, i32, %12*, i32, i32, i32, i32, i8*, i32, [4 x %11*], i32, i32, i32, [10 x i32], i32, i32, i32, i32, i32, %13*, %14*, %15*, %16*, %17*, %18*, %19*, %20*, %21*, %22*, %23* } +%1 = type { void (%2*)*, void (%2*, i32)*, void (%2*)*, void (%2*, i8*)*, void (%2*)*, i32, %7, i32, i32, i8**, i32, i8**, i32, i32 } +%2 = type { %1*, %3*, %6*, i8*, i32, i32 } +%3 = type { i8* (%2*, i32, i32)*, i8* (%2*, i32, i32)*, i8** (%2*, i32, i32, i32)*, [64 x i16]** (%2*, i32, i32, i32)*, %4* (%2*, i32, i32, i32, i32, i32)*, %5* (%2*, i32, i32, i32, i32, i32)*, void (%2*)*, i8** (%2*, %4*, i32, i32, i32)*, [64 x i16]** (%2*, %5*, i32, i32, i32)*, void (%2*, i32)*, void (%2*)*, i32, i32 } +%4 = type opaque +%5 = type opaque +%6 = type { void (%2*)*, i32, i32, i32, i32 } +%7 = type { [8 x i32], [12 x i32] } +%8 = type { i8*, i32, void (%0*)*, i32 (%0*)*, void (%0*, i32)*, i32 (%0*, i32)*, void (%0*)* } +%9 = type { [64 x i16], i32 } +%10 = type { [17 x i8], [256 x i8], i32 } +%11 = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, %9*, i8* } +%12 = type { %12*, i8, i32, i32, i8* } +%13 = type { void (%0*)*, void (%0*)*, i32 } +%14 = type { void (%0*, i32)*, void (%0*, i8**, i32*, i32)* } 
+%15 = type { void (%0*)*, i32 (%0*)*, void (%0*)*, i32 (%0*, i8***)*, %5** } +%16 = type { void (%0*, i32)*, void (%0*, i8***, i32*, i32, i8**, i32*, i32)* } +%17 = type { i32 (%0*)*, void (%0*)*, void (%0*)*, void (%0*)*, i32, i32 } +%18 = type { void (%0*)*, i32 (%0*)*, i32 (%0*)*, i32, i32, i32, i32 } +%19 = type { void (%0*)*, i32 (%0*, [64 x i16]**)*, i32 } +%20 = type { void (%0*)*, [10 x void (%0*, %11*, i16*, i8**, i32)*] } +%21 = type { void (%0*)*, void (%0*, i8***, i32*, i32, i8**, i32*, i32)*, i32 } +%22 = type { void (%0*)*, void (%0*, i8***, i32, i8**, i32)* } +%23 = type { void (%0*, i32)*, void (%0*, i8**, i8**, i32)*, void (%0*)*, void (%0*)* } + +define arm_apcscc void @test(%0* nocapture %a0, %11* nocapture %a1, i16* nocapture %a2, i8** nocapture %a3, i32 %a4) nounwind { +bb: + %t = alloca [64 x float], align 4 + %t5 = getelementptr inbounds %0* %a0, i32 0, i32 65 + %t6 = load i8** %t5, align 4 + %t7 = getelementptr inbounds %11* %a1, i32 0, i32 20 + %t8 = load i8** %t7, align 4 + br label %bb9 + +bb9: + %t10 = phi i32 [ 0, %bb ], [ %t157, %bb156 ] + %t11 = add i32 %t10, 8 + %t12 = getelementptr [64 x float]* %t, i32 0, i32 %t11 + %t13 = add i32 %t10, 16 + %t14 = getelementptr [64 x float]* %t, i32 0, i32 %t13 + %t15 = add i32 %t10, 24 + %t16 = getelementptr [64 x float]* %t, i32 0, i32 %t15 + %t17 = add i32 %t10, 32 + %t18 = getelementptr [64 x float]* %t, i32 0, i32 %t17 + %t19 = add i32 %t10, 40 + %t20 = getelementptr [64 x float]* %t, i32 0, i32 %t19 + %t21 = add i32 %t10, 48 + %t22 = getelementptr [64 x float]* %t, i32 0, i32 %t21 + %t23 = add i32 %t10, 56 + %t24 = getelementptr [64 x float]* %t, i32 0, i32 %t23 + %t25 = getelementptr [64 x float]* %t, i32 0, i32 %t10 + %t26 = shl i32 %t10, 5 + %t27 = or i32 %t26, 8 + %t28 = getelementptr i8* %t8, i32 %t27 + %t29 = bitcast i8* %t28 to float* + %t30 = or i32 %t26, 16 + %t31 = getelementptr i8* %t8, i32 %t30 + %t32 = bitcast i8* %t31 to float* + %t33 = or i32 %t26, 24 + %t34 = getelementptr 
i8* %t8, i32 %t33 + %t35 = bitcast i8* %t34 to float* + %t36 = or i32 %t26, 4 + %t37 = getelementptr i8* %t8, i32 %t36 + %t38 = bitcast i8* %t37 to float* + %t39 = or i32 %t26, 12 + %t40 = getelementptr i8* %t8, i32 %t39 + %t41 = bitcast i8* %t40 to float* + %t42 = or i32 %t26, 20 + %t43 = getelementptr i8* %t8, i32 %t42 + %t44 = bitcast i8* %t43 to float* + %t45 = or i32 %t26, 28 + %t46 = getelementptr i8* %t8, i32 %t45 + %t47 = bitcast i8* %t46 to float* + %t48 = getelementptr i8* %t8, i32 %t26 + %t49 = bitcast i8* %t48 to float* + %t50 = shl i32 %t10, 3 + %t51 = or i32 %t50, 1 + %t52 = getelementptr i16* %a2, i32 %t51 + %t53 = or i32 %t50, 2 + %t54 = getelementptr i16* %a2, i32 %t53 + %t55 = or i32 %t50, 3 + %t56 = getelementptr i16* %a2, i32 %t55 + %t57 = or i32 %t50, 4 + %t58 = getelementptr i16* %a2, i32 %t57 + %t59 = or i32 %t50, 5 + %t60 = getelementptr i16* %a2, i32 %t59 + %t61 = or i32 %t50, 6 + %t62 = getelementptr i16* %a2, i32 %t61 + %t63 = or i32 %t50, 7 + %t64 = getelementptr i16* %a2, i32 %t63 + %t65 = getelementptr i16* %a2, i32 %t50 + %t66 = load i16* %t52, align 2 + %t67 = icmp eq i16 %t66, 0 + %t68 = load i16* %t54, align 2 + %t69 = icmp eq i16 %t68, 0 + %t70 = and i1 %t67, %t69 + br i1 %t70, label %bb71, label %bb91 + +bb71: + %t72 = load i16* %t56, align 2 + %t73 = icmp eq i16 %t72, 0 + br i1 %t73, label %bb74, label %bb91 + +bb74: + %t75 = load i16* %t58, align 2 + %t76 = icmp eq i16 %t75, 0 + br i1 %t76, label %bb77, label %bb91 + +bb77: + %t78 = load i16* %t60, align 2 + %t79 = icmp eq i16 %t78, 0 + br i1 %t79, label %bb80, label %bb91 + +bb80: + %t81 = load i16* %t62, align 2 + %t82 = icmp eq i16 %t81, 0 + br i1 %t82, label %bb83, label %bb91 + +bb83: + %t84 = load i16* %t64, align 2 + %t85 = icmp eq i16 %t84, 0 + br i1 %t85, label %bb86, label %bb91 + +bb86: + %t87 = load i16* %t65, align 2 + %t88 = sitofp i16 %t87 to float + %t89 = load float* %t49, align 4 + %t90 = fmul float %t88, %t89 + store float %t90, float* %t25, align 4 + store 
float %t90, float* %t12, align 4 + store float %t90, float* %t14, align 4 + store float %t90, float* %t16, align 4 + store float %t90, float* %t18, align 4 + store float %t90, float* %t20, align 4 + store float %t90, float* %t22, align 4 + store float %t90, float* %t24, align 4 + br label %bb156 + +bb91: + %t92 = load i16* %t65, align 2 + %t93 = sitofp i16 %t92 to float + %t94 = load float* %t49, align 4 + %t95 = fmul float %t93, %t94 + %t96 = sitofp i16 %t68 to float + %t97 = load float* %t29, align 4 + %t98 = fmul float %t96, %t97 + %t99 = load i16* %t58, align 2 + %t100 = sitofp i16 %t99 to float + %t101 = load float* %t32, align 4 + %t102 = fmul float %t100, %t101 + %t103 = load i16* %t62, align 2 + %t104 = sitofp i16 %t103 to float + %t105 = load float* %t35, align 4 + %t106 = fmul float %t104, %t105 + %t107 = fadd float %t95, %t102 + %t108 = fsub float %t95, %t102 + %t109 = fadd float %t98, %t106 + %t110 = fsub float %t98, %t106 + %t111 = fmul float %t110, 0x3FF6A09E60000000 + %t112 = fsub float %t111, %t109 + %t113 = fadd float %t107, %t109 + %t114 = fsub float %t107, %t109 + %t115 = fadd float %t108, %t112 + %t116 = fsub float %t108, %t112 + %t117 = sitofp i16 %t66 to float + %t118 = load float* %t38, align 4 + %t119 = fmul float %t117, %t118 + %t120 = load i16* %t56, align 2 + %t121 = sitofp i16 %t120 to float + %t122 = load float* %t41, align 4 + %t123 = fmul float %t121, %t122 + %t124 = load i16* %t60, align 2 + %t125 = sitofp i16 %t124 to float + %t126 = load float* %t44, align 4 + %t127 = fmul float %t125, %t126 + %t128 = load i16* %t64, align 2 + %t129 = sitofp i16 %t128 to float + %t130 = load float* %t47, align 4 + %t131 = fmul float %t129, %t130 + %t132 = fadd float %t127, %t123 + %t133 = fsub float %t127, %t123 + %t134 = fadd float %t119, %t131 + %t135 = fsub float %t119, %t131 + %t136 = fadd float %t134, %t132 + %t137 = fsub float %t134, %t132 + %t138 = fmul float %t137, 0x3FF6A09E60000000 + %t139 = fadd float %t133, %t135 + %t140 = fmul float 
%t139, 0x3FFD906BC0000000 + %t141 = fmul float %t135, 0x3FF1517A80000000 + %t142 = fsub float %t141, %t140 + %t143 = fmul float %t133, 0xC004E7AEA0000000 + %t144 = fadd float %t143, %t140 + %t145 = fsub float %t144, %t136 + %t146 = fsub float %t138, %t145 + %t147 = fadd float %t142, %t146 + %t148 = fadd float %t113, %t136 + store float %t148, float* %t25, align 4 + %t149 = fsub float %t113, %t136 + store float %t149, float* %t24, align 4 + %t150 = fadd float %t115, %t145 + store float %t150, float* %t12, align 4 + %t151 = fsub float %t115, %t145 + store float %t151, float* %t22, align 4 + %t152 = fadd float %t116, %t146 + store float %t152, float* %t14, align 4 + %t153 = fsub float %t116, %t146 + store float %t153, float* %t20, align 4 + %t154 = fadd float %t114, %t147 + store float %t154, float* %t18, align 4 + %t155 = fsub float %t114, %t147 + store float %t155, float* %t16, align 4 + br label %bb156 + +bb156: + %t157 = add i32 %t10, 1 + %t158 = icmp eq i32 %t157, 8 + br i1 %t158, label %bb159, label %bb9 + +bb159: + %t160 = add i32 %a4, 7 + %t161 = add i32 %a4, 1 + %t162 = add i32 %a4, 6 + %t163 = add i32 %a4, 2 + %t164 = add i32 %a4, 5 + %t165 = add i32 %a4, 4 + %t166 = add i32 %a4, 3 + br label %bb167 + +bb167: + %t168 = phi i32 [ 0, %bb159 ], [ %t293, %bb167 ] + %t169 = getelementptr i8** %a3, i32 %t168 + %t170 = shl i32 %t168, 3 + %t171 = or i32 %t170, 4 + %t172 = getelementptr [64 x float]* %t, i32 0, i32 %t171 + %t173 = or i32 %t170, 2 + %t174 = getelementptr [64 x float]* %t, i32 0, i32 %t173 + %t175 = or i32 %t170, 6 + %t176 = getelementptr [64 x float]* %t, i32 0, i32 %t175 + %t177 = or i32 %t170, 5 + %t178 = getelementptr [64 x float]* %t, i32 0, i32 %t177 + %t179 = or i32 %t170, 3 + %t180 = getelementptr [64 x float]* %t, i32 0, i32 %t179 + %t181 = or i32 %t170, 1 + %t182 = getelementptr [64 x float]* %t, i32 0, i32 %t181 + %t183 = or i32 %t170, 7 + %t184 = getelementptr [64 x float]* %t, i32 0, i32 %t183 + %t185 = getelementptr [64 x float]* %t, i32 
0, i32 %t170 + %t186 = load i8** %t169, align 4 + %t187 = getelementptr inbounds i8* %t186, i32 %a4 + %t188 = load float* %t185, align 4 + %t189 = load float* %t172, align 4 + %t190 = fadd float %t188, %t189 + %t191 = fsub float %t188, %t189 + %t192 = load float* %t174, align 4 + %t193 = load float* %t176, align 4 + %t194 = fadd float %t192, %t193 + %t195 = fsub float %t192, %t193 + %t196 = fmul float %t195, 0x3FF6A09E60000000 + %t197 = fsub float %t196, %t194 + %t198 = fadd float %t190, %t194 + %t199 = fsub float %t190, %t194 + %t200 = fadd float %t191, %t197 + %t201 = fsub float %t191, %t197 + %t202 = load float* %t178, align 4 + %t203 = load float* %t180, align 4 + %t204 = fadd float %t202, %t203 + %t205 = fsub float %t202, %t203 + %t206 = load float* %t182, align 4 + %t207 = load float* %t184, align 4 + %t208 = fadd float %t206, %t207 + %t209 = fsub float %t206, %t207 + %t210 = fadd float %t208, %t204 + %t211 = fsub float %t208, %t204 + %t212 = fmul float %t211, 0x3FF6A09E60000000 + %t213 = fadd float %t205, %t209 + %t214 = fmul float %t213, 0x3FFD906BC0000000 + %t215 = fmul float %t209, 0x3FF1517A80000000 + %t216 = fsub float %t215, %t214 + %t217 = fmul float %t205, 0xC004E7AEA0000000 + %t218 = fadd float %t217, %t214 + %t219 = fsub float %t218, %t210 + %t220 = fsub float %t212, %t219 + %t221 = fadd float %t216, %t220 + %t222 = fadd float %t198, %t210 + %t223 = fptosi float %t222 to i32 + %t224 = add nsw i32 %t223, 4 + %t225 = lshr i32 %t224, 3 + %t226 = and i32 %t225, 1023 + %t227 = add i32 %t226, 128 + %t228 = getelementptr inbounds i8* %t6, i32 %t227 + %t229 = load i8* %t228, align 1 + store i8 %t229, i8* %t187, align 1 + %t230 = fsub float %t198, %t210 + %t231 = fptosi float %t230 to i32 + %t232 = add nsw i32 %t231, 4 + %t233 = lshr i32 %t232, 3 + %t234 = and i32 %t233, 1023 + %t235 = add i32 %t234, 128 + %t236 = getelementptr inbounds i8* %t6, i32 %t235 + %t237 = load i8* %t236, align 1 + %t238 = getelementptr inbounds i8* %t186, i32 %t160 + store i8 
%t237, i8* %t238, align 1 + %t239 = fadd float %t200, %t219 + %t240 = fptosi float %t239 to i32 + %t241 = add nsw i32 %t240, 4 + %t242 = lshr i32 %t241, 3 + %t243 = and i32 %t242, 1023 + %t244 = add i32 %t243, 128 + %t245 = getelementptr inbounds i8* %t6, i32 %t244 + %t246 = load i8* %t245, align 1 + %t247 = getelementptr inbounds i8* %t186, i32 %t161 + store i8 %t246, i8* %t247, align 1 + %t248 = fsub float %t200, %t219 + %t249 = fptosi float %t248 to i32 + %t250 = add nsw i32 %t249, 4 + %t251 = lshr i32 %t250, 3 + %t252 = and i32 %t251, 1023 + %t253 = add i32 %t252, 128 + %t254 = getelementptr inbounds i8* %t6, i32 %t253 + %t255 = load i8* %t254, align 1 + %t256 = getelementptr inbounds i8* %t186, i32 %t162 + store i8 %t255, i8* %t256, align 1 + %t257 = fadd float %t201, %t220 + %t258 = fptosi float %t257 to i32 + %t259 = add nsw i32 %t258, 4 + %t260 = lshr i32 %t259, 3 + %t261 = and i32 %t260, 1023 + %t262 = add i32 %t261, 128 + %t263 = getelementptr inbounds i8* %t6, i32 %t262 + %t264 = load i8* %t263, align 1 + %t265 = getelementptr inbounds i8* %t186, i32 %t163 + store i8 %t264, i8* %t265, align 1 + %t266 = fsub float %t201, %t220 + %t267 = fptosi float %t266 to i32 + %t268 = add nsw i32 %t267, 4 + %t269 = lshr i32 %t268, 3 + %t270 = and i32 %t269, 1023 + %t271 = add i32 %t270, 128 + %t272 = getelementptr inbounds i8* %t6, i32 %t271 + %t273 = load i8* %t272, align 1 + %t274 = getelementptr inbounds i8* %t186, i32 %t164 + store i8 %t273, i8* %t274, align 1 + %t275 = fadd float %t199, %t221 + %t276 = fptosi float %t275 to i32 + %t277 = add nsw i32 %t276, 4 + %t278 = lshr i32 %t277, 3 + %t279 = and i32 %t278, 1023 + %t280 = add i32 %t279, 128 + %t281 = getelementptr inbounds i8* %t6, i32 %t280 + %t282 = load i8* %t281, align 1 + %t283 = getelementptr inbounds i8* %t186, i32 %t165 + store i8 %t282, i8* %t283, align 1 + %t284 = fsub float %t199, %t221 + %t285 = fptosi float %t284 to i32 + %t286 = add nsw i32 %t285, 4 + %t287 = lshr i32 %t286, 3 + %t288 = and i32 
%t287, 1023 + %t289 = add i32 %t288, 128 + %t290 = getelementptr inbounds i8* %t6, i32 %t289 + %t291 = load i8* %t290, align 1 + %t292 = getelementptr inbounds i8* %t186, i32 %t166 + store i8 %t291, i8* %t292, align 1 + %t293 = add nsw i32 %t168, 1 + %t294 = icmp eq i32 %t293, 8 + br i1 %t294, label %bb295, label %bb167 + +bb295: + ret void +} |