From c82fba27fe6205e5ecdc80fb53e10675db07dfff Mon Sep 17 00:00:00 2001 From: Chad Rosier Date: Mon, 25 Mar 2013 16:29:20 +0000 Subject: [arm load/store optimizer] When trying to merge a base update load/store, make sure the base register and would-be writeback register don't conflict for stores. This was already being done for loads. Unfortunately, it is rather difficult to create a test case for this issue. It was exposed in 450.soplex at LTO and requires unlucky register allocation. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@177874 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/ARM/ARMLoadStoreOptimizer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib/Target/ARM/ARMLoadStoreOptimizer.cpp') diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index 98bd6c168e..e4e683c2a0 100644 --- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -865,7 +865,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB, bool isLd = isi32Load(Opcode) || Opcode == ARM::VLDRS || Opcode == ARM::VLDRD; // Can't do the merge if the destination register is the same as the would-be // writeback register. - if (isLd && MI->getOperand(0).getReg() == Base) + if (MI->getOperand(0).getReg() == Base) return false; unsigned PredReg = 0; -- cgit v1.2.3-70-g09d2 From 89becbb97423fb608a4dd85ec10c3fde4398d956 Mon Sep 17 00:00:00 2001 From: Stepan Dyatkovskiy Date: Fri, 5 Apr 2013 05:52:14 +0000 Subject: Fix for PR14824: "Optimization arm_ldst_opt inserts newly generated instruction vldmia at incorrect position". Patch introduces memory operands tracking in ARMLoadStoreOpt::LoadStoreMultipleOpti. For each register it keeps the order of load operations as it was before optimization pass. It is kind of deep improvement of fix proposed by Hao: http://llvm.org/bugs/show_bug.cgi?id=14824#c4 But it also tracks conflicts between different register classes (e.g. D2 and S5). For more details see: Bug description: http://llvm.org/bugs/show_bug.cgi?id=14824 LLVM Commits discussion: http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20130311/167936.html http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20130318/168688.html http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20130325/169376.html http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20130401/170238.html git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@178851 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/ARM/ARMLoadStoreOptimizer.cpp | 171 +++++++++++++++++++-- .../ARM/2013-04-05-overridden-loads-PR14824.ll | 110 +++++++++++++ 2 files changed, 271 insertions(+), 10 deletions(-) create mode 100644 test/CodeGen/ARM/2013-04-05-overridden-loads-PR14824.ll (limited to 'lib/Target/ARM/ARMLoadStoreOptimizer.cpp') diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index e4e683c2a0..0682459e1c 100644 --- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -87,6 +87,53 @@ namespace { MachineBasicBlock::iterator i) : Offset(o), Reg(r), isKill(k), Position(p), MBBI(i), Merged(false) {} }; + class UnitRegsMap { + public: + UnitRegsMap(const TargetRegisterInfo* _TRI) : TRI(_TRI) {} + const SmallVector& operator[](unsigned Reg) { + DenseMap >::iterator found = + Cache.find(Reg); + if (found != Cache.end()) + return found->second; + else + return Cache.insert(std::make_pair(Reg, this->getUnitRegs(Reg))) + .first->second; + } + private: + SmallVector getUnitRegs(unsigned Reg) { + SmallVector Res; + + const TargetRegisterClass* TRC = TRI->getRegClass(Reg); + if (TRC == &ARM::QPRRegClass) { + if (Reg > ARM::Q7) { + Res.push_back(TRI->getSubReg(Reg, ARM::dsub_0)); + Res.push_back(TRI->getSubReg(Reg, ARM::dsub_1)); + return Res; + } + + Res.push_back(TRI->getSubReg(Reg, ARM::ssub_0)); + Res.push_back(TRI->getSubReg(Reg, ARM::ssub_1)); + Res.push_back(TRI->getSubReg(Reg, ARM::ssub_2)); + Res.push_back(TRI->getSubReg(Reg, ARM::ssub_3)); + + return Res; + } + + if (TRC == &ARM::DPRRegClass && Reg < ARM::D15) { + Res.push_back(TRI->getSubReg(Reg, ARM::ssub_0)); + Res.push_back(TRI->getSubReg(Reg, ARM::ssub_1)); + + return Res; + } + + Res.push_back(Reg); + + return Res; + + } + const TargetRegisterInfo* TRI; + DenseMap > Cache; + }; typedef SmallVector MemOpQueue; typedef MemOpQueue::iterator MemOpQueueIter; @@ -128,6 +175,11 @@ namespace { MachineBasicBlock::iterator MBBI, bool &Advance, MachineBasicBlock::iterator &I); + unsigned AddMemOp(MemOpQueue& MemOps, + const MemOpQueueEntry newEntry, + UnitRegsMap& UnitRegsInfo, + SmallSet& UsedUnitRegs, + unsigned At = -1U); bool LoadStoreMultipleOpti(MachineBasicBlock &MBB); bool MergeReturnIntoLDM(MachineBasicBlock &MBB); }; @@ -1213,12 +1265,103 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, return false; } +/// AddMemOp - helper for ARMLoadStoreOpt::LoadStoreMultipleOpti. +/// It adds store mem ops with simple push_back/insert method, +/// without any additional logic. +/// For load operation it does the next: +/// 1. Adds new load operation into MemOp collection at "At" position. +/// 2. Removes any "load" operations from MemOps, that changes "Reg" register +/// contents, prior to "At". +/// UnitRegsInfo - Map of type Map< Register, UnitRegisters-vector > +/// UsedUnitRegs - set of unit-registers currently in use. +/// At - position at which it would added, and prior which the clean-up +/// should be made (for load operation). +/// FIXME: The clean-up also should be made for store operations, +/// but the memory address should be analyzed instead of unit registers. +unsigned ARMLoadStoreOpt::AddMemOp(MemOpQueue& MemOps, + const MemOpQueueEntry NewEntry, + UnitRegsMap& UnitRegsInfo, + SmallSet& UsedUnitRegs, + unsigned At) { + unsigned Cleaned = 0; + + if (At == -1U) { + At = MemOps.size(); + MemOps.push_back(NewEntry); + } else + MemOps.insert(&MemOps[At], NewEntry); + + // FIXME: + // If operation is not load, leave it as is by now, + // So 0 overridden ops would cleaned in this case. + if (!NewEntry.MBBI->mayLoad()) + return 0; + + const SmallVector& NewEntryUnitRegs = UnitRegsInfo[NewEntry.Reg]; + + bool FoundOverriddenLoads = false; + + for (unsigned i = 0, e = NewEntryUnitRegs.size(); i != e; ++i) + if (UsedUnitRegs.count(NewEntryUnitRegs[i])) { + FoundOverriddenLoads = true; + break; + } + + // If we detect that this register is used by load operations that are + // predecessors for the new one, remove them from MemOps then. + if (FoundOverriddenLoads) { + MemOpQueue UpdatedMemOps; + + // Scan through MemOps entries. + for (unsigned i = 0; i != At; ++i) { + MemOpQueueEntry& MemOpEntry = MemOps[i]; + + // FIXME: Skip non-load operations by now. + if (!MemOpEntry.MBBI->mayLoad()) + continue; + + const SmallVector& MemOpUnitRegs = + UnitRegsInfo[MemOpEntry.Reg]; + + // Lookup entry that loads contents into register used by new entry. + bool ReleaseThisEntry = false; + for (unsigned m = 0, em = MemOpUnitRegs.size(); m != em; ++m) { + if (std::find(NewEntryUnitRegs.begin(), NewEntryUnitRegs.end(), + MemOpUnitRegs[m]) != NewEntryUnitRegs.end()) { + ReleaseThisEntry = true; + ++Cleaned; + break; + } + } + + if (ReleaseThisEntry) { + const SmallVector& RelesedRegs = UnitRegsInfo[MemOpEntry.Reg]; + for (unsigned r = 0, er = RelesedRegs.size(); r != er; ++r) + UsedUnitRegs.erase(RelesedRegs[r]); + } else + UpdatedMemOps.push_back(MemOpEntry); + } + + // Keep anything without changes after At position. + for (unsigned i = At, e = MemOps.size(); i != e; ++i) + UpdatedMemOps.push_back(MemOps[i]); + + MemOps.swap(UpdatedMemOps); + } + + UsedUnitRegs.insert(NewEntryUnitRegs.begin(), NewEntryUnitRegs.end()); + + return Cleaned; +} + /// LoadStoreMultipleOpti - An optimization pass to turn multiple LDR / STR /// ops of the same base and incrementing offset into LDM / STM ops. bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { unsigned NumMerges = 0; unsigned NumMemOps = 0; MemOpQueue MemOps; + UnitRegsMap UnitRegsInfo(TRI); + SmallSet UsedRegUnits; unsigned CurrBase = 0; int CurrOpc = -1; unsigned CurrSize = 0; @@ -1265,8 +1408,11 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { CurrSize = Size; CurrPred = Pred; CurrPredReg = PredReg; + MemOps.push_back(MemOpQueueEntry(Offset, Reg, isKill, Position, MBBI)); ++NumMemOps; + const SmallVector& EntryUnitRegs = UnitRegsInfo[Reg]; + UsedRegUnits.insert(EntryUnitRegs.begin(), EntryUnitRegs.end()); Advance = true; } else { if (Clobber) { @@ -1278,20 +1424,24 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { // No need to match PredReg. // Continue adding to the queue. if (Offset > MemOps.back().Offset) { - MemOps.push_back(MemOpQueueEntry(Offset, Reg, isKill, - Position, MBBI)); - ++NumMemOps; + unsigned OverridesCleaned = + AddMemOp(MemOps, + MemOpQueueEntry(Offset, Reg, isKill, Position, MBBI), + UnitRegsInfo, UsedRegUnits) != 0; + NumMemOps += 1 - OverridesCleaned; Advance = true; } else { - for (MemOpQueueIter I = MemOps.begin(), E = MemOps.end(); - I != E; ++I) { - if (Offset < I->Offset) { - MemOps.insert(I, MemOpQueueEntry(Offset, Reg, isKill, - Position, MBBI)); - ++NumMemOps; + for (unsigned I = 0; I != NumMemOps; ++I) { + if (Offset < MemOps[I].Offset) { + MemOpQueueEntry entry(Offset, Reg, isKill, Position, MBBI); + unsigned OverridesCleaned = + AddMemOp(MemOps, entry, UnitRegsInfo, + UsedRegUnits, I) != 0; + NumMemOps += 1 - OverridesCleaned; + Advance = true; break; - } else if (Offset == I->Offset) { + } else if (Offset == MemOps[I].Offset) { // Collision! This can't be merged! break; } @@ -1362,6 +1512,7 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { CurrPredReg = 0; if (NumMemOps) { MemOps.clear(); + UsedRegUnits.clear(); NumMemOps = 0; } diff --git a/test/CodeGen/ARM/2013-04-05-overridden-loads-PR14824.ll b/test/CodeGen/ARM/2013-04-05-overridden-loads-PR14824.ll new file mode 100644 index 0000000000..2561686c1f --- /dev/null +++ b/test/CodeGen/ARM/2013-04-05-overridden-loads-PR14824.ll @@ -0,0 +1,110 @@ +; RUN: llc < %s -mtriple=thumbv7-none-linux-gnueabi -mcpu=cortex-a9 -mattr=+neon,+neonfp | FileCheck %s +; The test is presented by Jiangning Liu. +;CHECK-NOT: vldmia + +define void @sample_test(<8 x i64> * %secondSource, <8 x i64> * %source, <8 x i64> * %dest) nounwind { +entry: + %s0 = load <8 x i64> * %source, align 64 + %s1 = load <8 x i64> * %secondSource, align 64 + %s2 = bitcast <8 x i64> %s0 to i512 + %data.i.i.48.extract.shift = lshr i512 %s2, 384 + %data.i.i.48.extract.trunc = trunc i512 %data.i.i.48.extract.shift to i64 + %arrayidx64 = getelementptr inbounds <8 x i64> * %source, i32 6 + %s120 = load <8 x i64> * %arrayidx64, align 64 + %arrayidx67 = getelementptr inbounds <8 x i64> * %secondSource, i32 6 + %s121 = load <8 x i64> * %arrayidx67, align 64 + %s122 = bitcast <8 x i64> %s120 to i512 + %data.i.i677.48.extract.shift = lshr i512 %s122, 384 + %data.i.i677.48.extract.trunc = trunc i512 %data.i.i677.48.extract.shift to i64 + %s123 = insertelement <8 x i64> undef, i64 %data.i.i677.48.extract.trunc, i32 0 + %data.i.i677.32.extract.shift = lshr i512 %s122, 256 + %data.i.i677.32.extract.trunc = trunc i512 %data.i.i677.32.extract.shift to i64 + %s124 = insertelement <8 x i64> %s123, i64 %data.i.i677.32.extract.trunc, i32 1 + %data.i.i677.16.extract.shift = lshr i512 %s122, 128 + %data.i.i677.16.extract.trunc = trunc i512 %data.i.i677.16.extract.shift to i64 + %s125 = insertelement <8 x i64> %s124, i64 %data.i.i677.16.extract.trunc, i32 2 + %data.i.i677.56.extract.shift = lshr i512 %s122, 448 + %data.i.i677.56.extract.trunc = trunc i512 %data.i.i677.56.extract.shift to i64 + %s126 = insertelement <8 x i64> %s125, i64 %data.i.i677.56.extract.trunc, i32 3 + %data.i.i677.24.extract.shift = lshr i512 %s122, 192 + %data.i.i677.24.extract.trunc = trunc i512 %data.i.i677.24.extract.shift to i64 + %s127 = insertelement <8 x i64> %s126, i64 %data.i.i677.24.extract.trunc, i32 4 + %s128 = insertelement <8 x i64> %s127, i64 %data.i.i677.32.extract.trunc, i32 5 + %s129 = insertelement <8 x i64> %s128, i64 %data.i.i677.16.extract.trunc, i32 6 + %s130 = insertelement <8 x i64> %s129, i64 %data.i.i677.56.extract.trunc, i32 7 + %s131 = bitcast <8 x i64> %s121 to i512 + %data.i1.i676.48.extract.shift = lshr i512 %s131, 384 + %data.i1.i676.48.extract.trunc = trunc i512 %data.i1.i676.48.extract.shift to i64 + %s132 = insertelement <8 x i64> undef, i64 %data.i1.i676.48.extract.trunc, i32 0 + %data.i1.i676.32.extract.shift = lshr i512 %s131, 256 + %data.i1.i676.32.extract.trunc = trunc i512 %data.i1.i676.32.extract.shift to i64 + %s133 = insertelement <8 x i64> %s132, i64 %data.i1.i676.32.extract.trunc, i32 1 + %data.i1.i676.16.extract.shift = lshr i512 %s131, 128 + %data.i1.i676.16.extract.trunc = trunc i512 %data.i1.i676.16.extract.shift to i64 + %s134 = insertelement <8 x i64> %s133, i64 %data.i1.i676.16.extract.trunc, i32 2 + %data.i1.i676.56.extract.shift = lshr i512 %s131, 448 + %data.i1.i676.56.extract.trunc = trunc i512 %data.i1.i676.56.extract.shift to i64 + %s135 = insertelement <8 x i64> %s134, i64 %data.i1.i676.56.extract.trunc, i32 3 + %data.i1.i676.24.extract.shift = lshr i512 %s131, 192 + %data.i1.i676.24.extract.trunc = trunc i512 %data.i1.i676.24.extract.shift to i64 + %s136 = insertelement <8 x i64> %s135, i64 %data.i1.i676.24.extract.trunc, i32 4 + %s137 = insertelement <8 x i64> %s136, i64 %data.i1.i676.32.extract.trunc, i32 5 + %s138 = insertelement <8 x i64> %s137, i64 %data.i1.i676.16.extract.trunc, i32 6 + %s139 = insertelement <8 x i64> %s138, i64 %data.i1.i676.56.extract.trunc, i32 7 + %vecinit28.i.i699 = shufflevector <8 x i64> %s139, <8 x i64> %s130, <8 x i32> + %vecinit35.i.i700 = shufflevector <8 x i64> %vecinit28.i.i699, <8 x i64> %s139, <8 x i32> + %vecinit42.i.i701 = shufflevector <8 x i64> %vecinit35.i.i700, <8 x i64> %s139, <8 x i32> + %vecinit49.i.i702 = shufflevector <8 x i64> %vecinit42.i.i701, <8 x i64> %s130, <8 x i32> + %arrayidx72 = getelementptr inbounds <8 x i64> * %dest, i32 6 + store <8 x i64> %vecinit49.i.i702, <8 x i64> * %arrayidx72, align 64 + %arrayidx75 = getelementptr inbounds <8 x i64> * %source, i32 7 + %s140 = load <8 x i64> * %arrayidx75, align 64 + %arrayidx78 = getelementptr inbounds <8 x i64> * %secondSource, i32 7 + %s141 = load <8 x i64> * %arrayidx78, align 64 + %s142 = bitcast <8 x i64> %s140 to i512 + %data.i.i650.32.extract.shift = lshr i512 %s142, 256 + %data.i.i650.32.extract.trunc = trunc i512 %data.i.i650.32.extract.shift to i64 + %s143 = insertelement <8 x i64> undef, i64 %data.i.i650.32.extract.trunc, i32 0 + %s144 = insertelement <8 x i64> %s143, i64 %data.i.i650.32.extract.trunc, i32 1 + %data.i.i650.16.extract.shift = lshr i512 %s142, 128 + %data.i.i650.16.extract.trunc = trunc i512 %data.i.i650.16.extract.shift to i64 + %s145 = insertelement <8 x i64> %s144, i64 %data.i.i650.16.extract.trunc, i32 2 + %data.i.i650.8.extract.shift = lshr i512 %s142, 64 + %data.i.i650.8.extract.trunc = trunc i512 %data.i.i650.8.extract.shift to i64 + %s146 = insertelement <8 x i64> %s145, i64 %data.i.i650.8.extract.trunc, i32 3 + %s147 = insertelement <8 x i64> %s146, i64 %data.i.i650.8.extract.trunc, i32 4 + %data.i.i650.48.extract.shift = lshr i512 %s142, 384 + %data.i.i650.48.extract.trunc = trunc i512 %data.i.i650.48.extract.shift to i64 + %s148 = insertelement <8 x i64> %s147, i64 %data.i.i650.48.extract.trunc, i32 5 + %s149 = insertelement <8 x i64> %s148, i64 %data.i.i650.16.extract.trunc, i32 6 + %data.i.i650.0.extract.trunc = trunc i512 %s142 to i64 + %s150 = insertelement <8 x i64> %s149, i64 %data.i.i650.0.extract.trunc, i32 7 + %s151 = bitcast <8 x i64> %s141 to i512 + %data.i1.i649.32.extract.shift = lshr i512 %s151, 256 + %data.i1.i649.32.extract.trunc = trunc i512 %data.i1.i649.32.extract.shift to i64 + %s152 = insertelement <8 x i64> undef, i64 %data.i1.i649.32.extract.trunc, i32 0 + %s153 = insertelement <8 x i64> %s152, i64 %data.i1.i649.32.extract.trunc, i32 1 + %data.i1.i649.16.extract.shift = lshr i512 %s151, 128 + %data.i1.i649.16.extract.trunc = trunc i512 %data.i1.i649.16.extract.shift to i64 + %s154 = insertelement <8 x i64> %s153, i64 %data.i1.i649.16.extract.trunc, i32 2 + %data.i1.i649.8.extract.shift = lshr i512 %s151, 64 + %data.i1.i649.8.extract.trunc = trunc i512 %data.i1.i649.8.extract.shift to i64 + %s155 = insertelement <8 x i64> %s154, i64 %data.i1.i649.8.extract.trunc, i32 3 + %s156 = insertelement <8 x i64> %s155, i64 %data.i1.i649.8.extract.trunc, i32 4 + %data.i1.i649.48.extract.shift = lshr i512 %s151, 384 + %data.i1.i649.48.extract.trunc = trunc i512 %data.i1.i649.48.extract.shift to i64 + %s157 = insertelement <8 x i64> %s156, i64 %data.i1.i649.48.extract.trunc, i32 5 + %s158 = insertelement <8 x i64> %s157, i64 %data.i1.i649.16.extract.trunc, i32 6 + %data.i1.i649.0.extract.trunc = trunc i512 %s151 to i64 + %s159 = insertelement <8 x i64> %s158, i64 %data.i1.i649.0.extract.trunc, i32 7 + %vecinit7.i.i669 = shufflevector <8 x i64> %s159, <8 x i64> %s150, <8 x i32> + %vecinit14.i.i670 = shufflevector <8 x i64> %vecinit7.i.i669, <8 x i64> %s150, <8 x i32> + %vecinit21.i.i671 = shufflevector <8 x i64> %vecinit14.i.i670, <8 x i64> %s150, <8 x i32> + %vecinit28.i.i672 = shufflevector <8 x i64> %vecinit21.i.i671, <8 x i64> %s150, <8 x i32> + %vecinit35.i.i673 = shufflevector <8 x i64> %vecinit28.i.i672, <8 x i64> %s159, <8 x i32> + %vecinit42.i.i674 = shufflevector <8 x i64> %vecinit35.i.i673, <8 x i64> %s159, <8 x i32> + %vecinit49.i.i675 = shufflevector <8 x i64> %vecinit42.i.i674, <8 x i64> %s159, <8 x i32> + %arrayidx83 = getelementptr inbounds <8 x i64> * %dest, i32 7 + store <8 x i64> %vecinit49.i.i675, <8 x i64> * %arrayidx83, align 64 + ret void +} -- cgit v1.2.3-70-g09d2 From 992347f27131a403043a1e2f1bec4da82568df35 Mon Sep 17 00:00:00 2001 From: Stepan Dyatkovskiy Date: Fri, 5 Apr 2013 07:34:08 +0000 Subject: Buildbot fix for r178851: mistake was in wrong TargetRegisterInfo::getRegClass usage. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@178854 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/ARM/ARMLoadStoreOptimizer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'lib/Target/ARM/ARMLoadStoreOptimizer.cpp') diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index 0682459e1c..b7ac5d57c3 100644 --- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -103,7 +103,7 @@ namespace { SmallVector getUnitRegs(unsigned Reg) { SmallVector Res; - const TargetRegisterClass* TRC = TRI->getRegClass(Reg); + const TargetRegisterClass* TRC = TRI->getMinimalPhysRegClass(Reg); if (TRC == &ARM::QPRRegClass) { if (Reg > ARM::Q7) { Res.push_back(TRI->getSubReg(Reg, ARM::dsub_0)); -- cgit v1.2.3-70-g09d2 From 84581daf2058eb9d37e24a50ad3be198529bdf28 Mon Sep 17 00:00:00 2001 From: Renato Golin Date: Fri, 5 Apr 2013 16:39:53 +0000 Subject: Reverting 178851 as it broke buildbots git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@178883 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/ARM/ARMLoadStoreOptimizer.cpp | 171 ++------------------- .../ARM/2013-04-05-overridden-loads-PR14824.ll | 110 ------------- 2 files changed, 10 insertions(+), 271 deletions(-) delete mode 100644 test/CodeGen/ARM/2013-04-05-overridden-loads-PR14824.ll (limited to 'lib/Target/ARM/ARMLoadStoreOptimizer.cpp') diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index b7ac5d57c3..e4e683c2a0 100644 --- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -87,53 +87,6 @@ namespace { MachineBasicBlock::iterator i) : Offset(o), Reg(r), isKill(k), Position(p), MBBI(i), Merged(false) {} }; - class UnitRegsMap { - public: - UnitRegsMap(const TargetRegisterInfo* _TRI) : TRI(_TRI) {} - const SmallVector& operator[](unsigned Reg) { - DenseMap >::iterator found = - Cache.find(Reg); - if (found != Cache.end()) - return found->second; - else - return Cache.insert(std::make_pair(Reg, this->getUnitRegs(Reg))) - .first->second; - } - private: - SmallVector getUnitRegs(unsigned Reg) { - SmallVector Res; - - const TargetRegisterClass* TRC = TRI->getMinimalPhysRegClass(Reg); - if (TRC == &ARM::QPRRegClass) { - if (Reg > ARM::Q7) { - Res.push_back(TRI->getSubReg(Reg, ARM::dsub_0)); - Res.push_back(TRI->getSubReg(Reg, ARM::dsub_1)); - return Res; - } - - Res.push_back(TRI->getSubReg(Reg, ARM::ssub_0)); - Res.push_back(TRI->getSubReg(Reg, ARM::ssub_1)); - Res.push_back(TRI->getSubReg(Reg, ARM::ssub_2)); - Res.push_back(TRI->getSubReg(Reg, ARM::ssub_3)); - - return Res; - } - - if (TRC == &ARM::DPRRegClass && Reg < ARM::D15) { - Res.push_back(TRI->getSubReg(Reg, ARM::ssub_0)); - Res.push_back(TRI->getSubReg(Reg, ARM::ssub_1)); - - return Res; - } - - Res.push_back(Reg); - - return Res; - - } - const TargetRegisterInfo* TRI; - DenseMap > Cache; - }; typedef SmallVector MemOpQueue; typedef MemOpQueue::iterator MemOpQueueIter; @@ -175,11 +128,6 @@ namespace { MachineBasicBlock::iterator MBBI, bool &Advance, MachineBasicBlock::iterator &I); - unsigned AddMemOp(MemOpQueue& MemOps, - const MemOpQueueEntry newEntry, - UnitRegsMap& UnitRegsInfo, - SmallSet& UsedUnitRegs, - unsigned At = -1U); bool LoadStoreMultipleOpti(MachineBasicBlock &MBB); bool MergeReturnIntoLDM(MachineBasicBlock &MBB); }; @@ -1265,103 +1213,12 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB, return false; } -/// AddMemOp - helper for ARMLoadStoreOpt::LoadStoreMultipleOpti. -/// It adds store mem ops with simple push_back/insert method, -/// without any additional logic. -/// For load operation it does the next: -/// 1. Adds new load operation into MemOp collection at "At" position. -/// 2. Removes any "load" operations from MemOps, that changes "Reg" register -/// contents, prior to "At". -/// UnitRegsInfo - Map of type Map< Register, UnitRegisters-vector > -/// UsedUnitRegs - set of unit-registers currently in use. -/// At - position at which it would added, and prior which the clean-up -/// should be made (for load operation). -/// FIXME: The clean-up also should be made for store operations, -/// but the memory address should be analyzed instead of unit registers. -unsigned ARMLoadStoreOpt::AddMemOp(MemOpQueue& MemOps, - const MemOpQueueEntry NewEntry, - UnitRegsMap& UnitRegsInfo, - SmallSet& UsedUnitRegs, - unsigned At) { - unsigned Cleaned = 0; - - if (At == -1U) { - At = MemOps.size(); - MemOps.push_back(NewEntry); - } else - MemOps.insert(&MemOps[At], NewEntry); - - // FIXME: - // If operation is not load, leave it as is by now, - // So 0 overridden ops would cleaned in this case. - if (!NewEntry.MBBI->mayLoad()) - return 0; - - const SmallVector& NewEntryUnitRegs = UnitRegsInfo[NewEntry.Reg]; - - bool FoundOverriddenLoads = false; - - for (unsigned i = 0, e = NewEntryUnitRegs.size(); i != e; ++i) - if (UsedUnitRegs.count(NewEntryUnitRegs[i])) { - FoundOverriddenLoads = true; - break; - } - - // If we detect that this register is used by load operations that are - // predecessors for the new one, remove them from MemOps then. - if (FoundOverriddenLoads) { - MemOpQueue UpdatedMemOps; - - // Scan through MemOps entries. - for (unsigned i = 0; i != At; ++i) { - MemOpQueueEntry& MemOpEntry = MemOps[i]; - - // FIXME: Skip non-load operations by now. - if (!MemOpEntry.MBBI->mayLoad()) - continue; - - const SmallVector& MemOpUnitRegs = - UnitRegsInfo[MemOpEntry.Reg]; - - // Lookup entry that loads contents into register used by new entry. - bool ReleaseThisEntry = false; - for (unsigned m = 0, em = MemOpUnitRegs.size(); m != em; ++m) { - if (std::find(NewEntryUnitRegs.begin(), NewEntryUnitRegs.end(), - MemOpUnitRegs[m]) != NewEntryUnitRegs.end()) { - ReleaseThisEntry = true; - ++Cleaned; - break; - } - } - - if (ReleaseThisEntry) { - const SmallVector& RelesedRegs = UnitRegsInfo[MemOpEntry.Reg]; - for (unsigned r = 0, er = RelesedRegs.size(); r != er; ++r) - UsedUnitRegs.erase(RelesedRegs[r]); - } else - UpdatedMemOps.push_back(MemOpEntry); - } - - // Keep anything without changes after At position. - for (unsigned i = At, e = MemOps.size(); i != e; ++i) - UpdatedMemOps.push_back(MemOps[i]); - - MemOps.swap(UpdatedMemOps); - } - - UsedUnitRegs.insert(NewEntryUnitRegs.begin(), NewEntryUnitRegs.end()); - - return Cleaned; -} - /// LoadStoreMultipleOpti - An optimization pass to turn multiple LDR / STR /// ops of the same base and incrementing offset into LDM / STM ops. bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { unsigned NumMerges = 0; unsigned NumMemOps = 0; MemOpQueue MemOps; - UnitRegsMap UnitRegsInfo(TRI); - SmallSet UsedRegUnits; unsigned CurrBase = 0; int CurrOpc = -1; unsigned CurrSize = 0; @@ -1408,11 +1265,8 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { CurrSize = Size; CurrPred = Pred; CurrPredReg = PredReg; - MemOps.push_back(MemOpQueueEntry(Offset, Reg, isKill, Position, MBBI)); ++NumMemOps; - const SmallVector& EntryUnitRegs = UnitRegsInfo[Reg]; - UsedRegUnits.insert(EntryUnitRegs.begin(), EntryUnitRegs.end()); Advance = true; } else { if (Clobber) { @@ -1424,24 +1278,20 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { // No need to match PredReg. // Continue adding to the queue. if (Offset > MemOps.back().Offset) { - unsigned OverridesCleaned = - AddMemOp(MemOps, - MemOpQueueEntry(Offset, Reg, isKill, Position, MBBI), - UnitRegsInfo, UsedRegUnits) != 0; - NumMemOps += 1 - OverridesCleaned; + MemOps.push_back(MemOpQueueEntry(Offset, Reg, isKill, + Position, MBBI)); + ++NumMemOps; Advance = true; } else { - for (unsigned I = 0; I != NumMemOps; ++I) { - if (Offset < MemOps[I].Offset) { - MemOpQueueEntry entry(Offset, Reg, isKill, Position, MBBI); - unsigned OverridesCleaned = - AddMemOp(MemOps, entry, UnitRegsInfo, - UsedRegUnits, I) != 0; - NumMemOps += 1 - OverridesCleaned; - + for (MemOpQueueIter I = MemOps.begin(), E = MemOps.end(); + I != E; ++I) { + if (Offset < I->Offset) { + MemOps.insert(I, MemOpQueueEntry(Offset, Reg, isKill, + Position, MBBI)); + ++NumMemOps; Advance = true; break; - } else if (Offset == MemOps[I].Offset) { + } else if (Offset == I->Offset) { // Collision! This can't be merged! break; } @@ -1512,7 +1362,6 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { CurrPredReg = 0; if (NumMemOps) { MemOps.clear(); - UsedRegUnits.clear(); NumMemOps = 0; } diff --git a/test/CodeGen/ARM/2013-04-05-overridden-loads-PR14824.ll b/test/CodeGen/ARM/2013-04-05-overridden-loads-PR14824.ll deleted file mode 100644 index 2561686c1f..0000000000 --- a/test/CodeGen/ARM/2013-04-05-overridden-loads-PR14824.ll +++ /dev/null @@ -1,110 +0,0 @@ -; RUN: llc < %s -mtriple=thumbv7-none-linux-gnueabi -mcpu=cortex-a9 -mattr=+neon,+neonfp | FileCheck %s -; The test is presented by Jiangning Liu. -;CHECK-NOT: vldmia - -define void @sample_test(<8 x i64> * %secondSource, <8 x i64> * %source, <8 x i64> * %dest) nounwind { -entry: - %s0 = load <8 x i64> * %source, align 64 - %s1 = load <8 x i64> * %secondSource, align 64 - %s2 = bitcast <8 x i64> %s0 to i512 - %data.i.i.48.extract.shift = lshr i512 %s2, 384 - %data.i.i.48.extract.trunc = trunc i512 %data.i.i.48.extract.shift to i64 - %arrayidx64 = getelementptr inbounds <8 x i64> * %source, i32 6 - %s120 = load <8 x i64> * %arrayidx64, align 64 - %arrayidx67 = getelementptr inbounds <8 x i64> * %secondSource, i32 6 - %s121 = load <8 x i64> * %arrayidx67, align 64 - %s122 = bitcast <8 x i64> %s120 to i512 - %data.i.i677.48.extract.shift = lshr i512 %s122, 384 - %data.i.i677.48.extract.trunc = trunc i512 %data.i.i677.48.extract.shift to i64 - %s123 = insertelement <8 x i64> undef, i64 %data.i.i677.48.extract.trunc, i32 0 - %data.i.i677.32.extract.shift = lshr i512 %s122, 256 - %data.i.i677.32.extract.trunc = trunc i512 %data.i.i677.32.extract.shift to i64 - %s124 = insertelement <8 x i64> %s123, i64 %data.i.i677.32.extract.trunc, i32 1 - %data.i.i677.16.extract.shift = lshr i512 %s122, 128 - %data.i.i677.16.extract.trunc = trunc i512 %data.i.i677.16.extract.shift to i64 - %s125 = insertelement <8 x i64> %s124, i64 %data.i.i677.16.extract.trunc, i32 2 - %data.i.i677.56.extract.shift = lshr i512 %s122, 448 - %data.i.i677.56.extract.trunc = trunc i512 %data.i.i677.56.extract.shift to i64 - %s126 = insertelement <8 x i64> %s125, i64 %data.i.i677.56.extract.trunc, i32 3 - %data.i.i677.24.extract.shift = lshr i512 %s122, 192 - %data.i.i677.24.extract.trunc = trunc i512 %data.i.i677.24.extract.shift to i64 - %s127 = insertelement <8 x i64> %s126, i64 %data.i.i677.24.extract.trunc, i32 4 - %s128 = insertelement <8 x i64> %s127, i64 %data.i.i677.32.extract.trunc, i32 5 - %s129 = insertelement <8 x i64> %s128, i64 %data.i.i677.16.extract.trunc, i32 6 - %s130 = insertelement <8 x i64> %s129, i64 %data.i.i677.56.extract.trunc, i32 7 - %s131 = bitcast <8 x i64> %s121 to i512 - %data.i1.i676.48.extract.shift = lshr i512 %s131, 384 - %data.i1.i676.48.extract.trunc = trunc i512 %data.i1.i676.48.extract.shift to i64 - %s132 = insertelement <8 x i64> undef, i64 %data.i1.i676.48.extract.trunc, i32 0 - %data.i1.i676.32.extract.shift = lshr i512 %s131, 256 - %data.i1.i676.32.extract.trunc = trunc i512 %data.i1.i676.32.extract.shift to i64 - %s133 = insertelement <8 x i64> %s132, i64 %data.i1.i676.32.extract.trunc, i32 1 - %data.i1.i676.16.extract.shift = lshr i512 %s131, 128 - %data.i1.i676.16.extract.trunc = trunc i512 %data.i1.i676.16.extract.shift to i64 - %s134 = insertelement <8 x i64> %s133, i64 %data.i1.i676.16.extract.trunc, i32 2 - %data.i1.i676.56.extract.shift = lshr i512 %s131, 448 - %data.i1.i676.56.extract.trunc = trunc i512 %data.i1.i676.56.extract.shift to i64 - %s135 = insertelement <8 x i64> %s134, i64 %data.i1.i676.56.extract.trunc, i32 3 - %data.i1.i676.24.extract.shift = lshr i512 %s131, 192 - %data.i1.i676.24.extract.trunc = trunc i512 %data.i1.i676.24.extract.shift to i64 - %s136 = insertelement <8 x i64> %s135, i64 %data.i1.i676.24.extract.trunc, i32 4 - %s137 = insertelement <8 x i64> %s136, i64 %data.i1.i676.32.extract.trunc, i32 5 - %s138 = insertelement <8 x i64> %s137, i64 %data.i1.i676.16.extract.trunc, i32 6 - %s139 = insertelement <8 x i64> %s138, i64 %data.i1.i676.56.extract.trunc, i32 7 - %vecinit28.i.i699 = shufflevector <8 x i64> %s139, <8 x i64> %s130, <8 x i32> - %vecinit35.i.i700 = shufflevector <8 x i64> %vecinit28.i.i699, <8 x i64> %s139, <8 x i32> - %vecinit42.i.i701 = shufflevector <8 x i64> %vecinit35.i.i700, <8 x i64> %s139, <8 x i32> - %vecinit49.i.i702 = shufflevector <8 x i64> %vecinit42.i.i701, <8 x i64> %s130, <8 x i32> - %arrayidx72 = getelementptr inbounds <8 x i64> * %dest, i32 6 - store <8 x i64> %vecinit49.i.i702, <8 x i64> * %arrayidx72, align 64 - %arrayidx75 = getelementptr inbounds <8 x i64> * %source, i32 7 - %s140 = load <8 x i64> * %arrayidx75, align 64 - %arrayidx78 = getelementptr inbounds <8 x i64> * %secondSource, i32 7 - %s141 = load <8 x i64> * %arrayidx78, align 64 - %s142 = bitcast <8 x i64> %s140 to i512 - %data.i.i650.32.extract.shift = lshr i512 %s142, 256 - %data.i.i650.32.extract.trunc = trunc i512 %data.i.i650.32.extract.shift to i64 - %s143 = insertelement <8 x i64> undef, i64 %data.i.i650.32.extract.trunc, i32 0 - %s144 = insertelement <8 x i64> %s143, i64 %data.i.i650.32.extract.trunc, i32 1 - %data.i.i650.16.extract.shift = lshr i512 %s142, 128 - %data.i.i650.16.extract.trunc = trunc i512 %data.i.i650.16.extract.shift to i64 - %s145 = insertelement <8 x i64> %s144, i64 %data.i.i650.16.extract.trunc, i32 2 - %data.i.i650.8.extract.shift = lshr i512 %s142, 64 - %data.i.i650.8.extract.trunc = trunc i512 %data.i.i650.8.extract.shift to i64 - %s146 = insertelement <8 x i64> %s145, i64 %data.i.i650.8.extract.trunc, i32 3 - %s147 = insertelement <8 x i64> %s146, i64 %data.i.i650.8.extract.trunc, i32 4 - %data.i.i650.48.extract.shift = lshr i512 %s142, 384 - %data.i.i650.48.extract.trunc = trunc i512 %data.i.i650.48.extract.shift to i64 - %s148 = insertelement <8 x i64> %s147, i64 %data.i.i650.48.extract.trunc, i32 5 - %s149 = insertelement <8 x i64> %s148, i64 %data.i.i650.16.extract.trunc, i32 6 - %data.i.i650.0.extract.trunc = trunc i512 %s142 to i64 - %s150 = insertelement <8 x i64> %s149, i64 %data.i.i650.0.extract.trunc, i32 7 - %s151 = bitcast <8 x i64> %s141 to i512 - %data.i1.i649.32.extract.shift = lshr i512 %s151, 256 - %data.i1.i649.32.extract.trunc = trunc i512 %data.i1.i649.32.extract.shift to i64 - %s152 = insertelement <8 x i64> undef, i64 %data.i1.i649.32.extract.trunc, i32 0 - %s153 = insertelement <8 x i64> %s152, i64 %data.i1.i649.32.extract.trunc, i32 1 - %data.i1.i649.16.extract.shift = lshr i512 %s151, 128 - %data.i1.i649.16.extract.trunc = trunc i512 %data.i1.i649.16.extract.shift to i64 - %s154 = insertelement <8 x i64> %s153, i64 %data.i1.i649.16.extract.trunc, i32 2 - %data.i1.i649.8.extract.shift = lshr i512 %s151, 64 - %data.i1.i649.8.extract.trunc = trunc i512 %data.i1.i649.8.extract.shift to i64 - %s155 = insertelement <8 x i64> %s154, i64 %data.i1.i649.8.extract.trunc, i32 3 - %s156 = insertelement <8 x i64> %s155, i64 %data.i1.i649.8.extract.trunc, i32 4 - %data.i1.i649.48.extract.shift = lshr i512 %s151, 384 - %data.i1.i649.48.extract.trunc = trunc i512 %data.i1.i649.48.extract.shift to i64 - %s157 = insertelement <8 x i64> %s156, i64 %data.i1.i649.48.extract.trunc, i32 5 - %s158 = insertelement <8 x i64> %s157, i64 %data.i1.i649.16.extract.trunc, i32 6 - %data.i1.i649.0.extract.trunc = trunc i512 %s151 to i64 - %s159 = insertelement <8 x i64> %s158, i64 %data.i1.i649.0.extract.trunc, i32 7 - %vecinit7.i.i669 = shufflevector <8 x i64> %s159, <8 x i64> %s150, <8 x i32> - %vecinit14.i.i670 = shufflevector <8 x i64> %vecinit7.i.i669, <8 x i64> %s150, <8 x i32> - %vecinit21.i.i671 = shufflevector <8 x i64> %vecinit14.i.i670, <8 x i64> %s150, <8 x i32> - %vecinit28.i.i672 = shufflevector <8 x i64> %vecinit21.i.i671, <8 x i64> %s150, <8 x i32> - %vecinit35.i.i673 = shufflevector <8 x i64> %vecinit28.i.i672, <8 x i64> %s159, <8 x i32> - %vecinit42.i.i674 = shufflevector <8 x i64> %vecinit35.i.i673, <8 x i64> %s159, <8 x i32> - %vecinit49.i.i675 = shufflevector <8 x i64> %vecinit42.i.i674, <8 x i64> %s159, <8 x i32> - %arrayidx83 = getelementptr inbounds <8 x i64> * %dest, i32 7 - store <8 x i64> %vecinit49.i.i675, <8 x i64> * %arrayidx83, align 64 - ret void -} -- cgit v1.2.3-70-g09d2 From d050e96133fac8565e3bb1eabe9a587dd5a6ac4d Mon Sep 17 00:00:00 2001 From: Hao Liu Date: Thu, 18 Apr 2013 09:11:08 +0000 Subject: Fix for PR14824, An ARM Load/Store Optimization bug git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@179751 91177308-0d34-0410-b5e6-96231b3b80d8 --- lib/Target/ARM/ARMLoadStoreOptimizer.cpp | 18 ++++- .../CodeGen/ARM/2013-04-18-load-overlap-PR14824.ll | 82 ++++++++++++++++++++++ 2 files changed, 99 insertions(+), 1 deletion(-) create mode 100644 test/CodeGen/ARM/2013-04-18-load-overlap-PR14824.ll (limited to 'lib/Target/ARM/ARMLoadStoreOptimizer.cpp') diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index e4e683c2a0..c8ed5760f9 100644 --- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -1258,6 +1258,22 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { // merge the ldr's so far, including this one. But don't try to // combine the following ldr(s). Clobber = (isi32Load(Opcode) && Base == MBBI->getOperand(0).getReg()); + + // Watch out for: + // r4 := ldr [r0, #8] + // r4 := ldr [r0, #4] + // + // The optimization may reorder the second ldr in front of the first + // ldr, which violates write after write(WAW) dependence. The same as + // str. Try to merge inst(s) already in MemOps. + bool Overlap = false; + for (MemOpQueueIter I = MemOps.begin(), E = MemOps.end(); I != E; ++I) { + if (TRI->regsOverlap(Reg, I->MBBI->getOperand(0).getReg())) { + Overlap = true; + break; + } + } + if (CurrBase == 0 && !Clobber) { // Start of a new chain. CurrBase = Base; @@ -1268,7 +1284,7 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { MemOps.push_back(MemOpQueueEntry(Offset, Reg, isKill, Position, MBBI)); ++NumMemOps; Advance = true; - } else { + } else if (!Overlap) { if (Clobber) { TryMerge = true; Advance = true; diff --git a/test/CodeGen/ARM/2013-04-18-load-overlap-PR14824.ll b/test/CodeGen/ARM/2013-04-18-load-overlap-PR14824.ll new file mode 100644 index 0000000000..4599928187 --- /dev/null +++ b/test/CodeGen/ARM/2013-04-18-load-overlap-PR14824.ll @@ -0,0 +1,82 @@ +; RUN: llc < %s -mtriple=thumbv7-none-linux-gnueabi -mcpu=cortex-a9 -mattr=+neon,+neonfp | FileCheck %s +; PR14824. The test is presented by Jiangning Liu. If the ld/st optimization algorithm is changed, this test case may fail. +; Also if the machine code for ld/st optimizor is changed, this test case may fail. If so, remove this test. + +define void @sample_test(<8 x i64> * %secondSource, <8 x i64> * %source, <8 x i64> * %dest) nounwind { +; CHECK: sample_test +; CHECK-NOT: vldmia +; CHECK: add +entry: + +; Load %source + %s0 = load <8 x i64> * %source, align 64 + %arrayidx64 = getelementptr inbounds <8 x i64> * %source, i32 6 + %s120 = load <8 x i64> * %arrayidx64, align 64 + %s122 = bitcast <8 x i64> %s120 to i512 + %data.i.i677.48.extract.shift = lshr i512 %s122, 384 + %data.i.i677.48.extract.trunc = trunc i512 %data.i.i677.48.extract.shift to i64 + %s123 = insertelement <8 x i64> undef, i64 %data.i.i677.48.extract.trunc, i32 0 + %data.i.i677.32.extract.shift = lshr i512 %s122, 256 + %data.i.i677.32.extract.trunc = trunc i512 %data.i.i677.32.extract.shift to i64 + %s124 = insertelement <8 x i64> %s123, i64 %data.i.i677.32.extract.trunc, i32 1 + %data.i.i677.16.extract.shift = lshr i512 %s122, 128 + %data.i.i677.16.extract.trunc = trunc i512 %data.i.i677.16.extract.shift to i64 + %s125 = insertelement <8 x i64> %s124, i64 %data.i.i677.16.extract.trunc, i32 2 + %data.i.i677.56.extract.shift = lshr i512 %s122, 448 + %data.i.i677.56.extract.trunc = trunc i512 %data.i.i677.56.extract.shift to i64 + %s126 = insertelement <8 x i64> %s125, i64 %data.i.i677.56.extract.trunc, i32 3 + %data.i.i677.24.extract.shift = lshr i512 %s122, 192 + %data.i.i677.24.extract.trunc = trunc i512 %data.i.i677.24.extract.shift to i64 + %s127 = insertelement <8 x i64> %s126, i64 %data.i.i677.24.extract.trunc, i32 4 + %s128 = insertelement <8 x i64> %s127, i64 %data.i.i677.32.extract.trunc, i32 5 + %s129 = insertelement <8 x i64> %s128, i64 %data.i.i677.16.extract.trunc, i32 6 + %s130 = insertelement <8 x i64> %s129, i64 %data.i.i677.56.extract.trunc, i32 7 + +; Load %secondSource + %s1 = load <8 x i64> * %secondSource, align 64 + %arrayidx67 = getelementptr inbounds <8 x i64> * %secondSource, i32 6 + %s121 = load <8 x i64> * %arrayidx67, align 64 + %s131 = bitcast <8 x i64> %s121 to i512 + %data.i1.i676.48.extract.shift = lshr i512 %s131, 384 + %data.i1.i676.48.extract.trunc = trunc i512 %data.i1.i676.48.extract.shift to i64 + %s132 = insertelement <8 x i64> undef, i64 %data.i1.i676.48.extract.trunc, i32 0 + %data.i1.i676.32.extract.shift = lshr i512 %s131, 256 + %data.i1.i676.32.extract.trunc = trunc i512 %data.i1.i676.32.extract.shift to i64 + %s133 = insertelement <8 x i64> %s132, i64 %data.i1.i676.32.extract.trunc, i32 1 + %data.i1.i676.16.extract.shift = lshr i512 %s131, 128 + %data.i1.i676.16.extract.trunc = trunc i512 %data.i1.i676.16.extract.shift to i64 + %s134 = insertelement <8 x i64> %s133, i64 %data.i1.i676.16.extract.trunc, i32 2 + %data.i1.i676.56.extract.shift = lshr i512 %s131, 448 + %data.i1.i676.56.extract.trunc = trunc i512 %data.i1.i676.56.extract.shift to i64 + %s135 = insertelement <8 x i64> %s134, i64 %data.i1.i676.56.extract.trunc, i32 3 + %data.i1.i676.24.extract.shift = lshr i512 %s131, 192 + %data.i1.i676.24.extract.trunc = trunc i512 %data.i1.i676.24.extract.shift to i64 + %s136 = insertelement <8 x i64> %s135, i64 %data.i1.i676.24.extract.trunc, i32 4 + %s137 = insertelement <8 x i64> %s136, i64 %data.i1.i676.32.extract.trunc, i32 5 + %s138 = insertelement <8 x i64> %s137, i64 %data.i1.i676.16.extract.trunc, i32 6 + %s139 = insertelement <8 x i64> %s138, i64 %data.i1.i676.56.extract.trunc, i32 7 + +; Operations about %Source and %secondSource + %vecinit28.i.i699 = shufflevector <8 x i64> %s139, <8 x i64> %s130, <8 x i32> + %vecinit35.i.i700 = shufflevector <8 x i64> %vecinit28.i.i699, <8 x i64> %s139, <8 x i32> + %vecinit42.i.i701 = shufflevector <8 x i64> %vecinit35.i.i700, <8 x i64> %s139, <8 x i32> + %vecinit49.i.i702 = shufflevector <8 x i64> %vecinit42.i.i701, <8 x i64> %s130, <8 x i32> + %arrayidx72 = getelementptr inbounds <8 x i64> * %dest, i32 6 + store <8 x i64> %vecinit49.i.i702, <8 x i64> * %arrayidx72, align 64 + %arrayidx78 = getelementptr inbounds <8 x i64> * %secondSource, i32 7 + %s141 = load <8 x i64> * %arrayidx78, align 64 + %s151 = bitcast <8 x i64> %s141 to i512 + %data.i1.i649.32.extract.shift = lshr i512 %s151, 256 + %data.i1.i649.32.extract.trunc = trunc i512 %data.i1.i649.32.extract.shift to i64 + %s152 = insertelement <8 x i64> undef, i64 %data.i1.i649.32.extract.trunc, i32 0 + %s153 = insertelement <8 x i64> %s152, i64 %data.i1.i649.32.extract.trunc, i32 1 + %data.i1.i649.16.extract.shift = lshr i512 %s151, 128 + %data.i1.i649.16.extract.trunc = trunc i512 %data.i1.i649.16.extract.shift to i64 + %s154 = insertelement <8 x i64> %s153, i64 %data.i1.i649.16.extract.trunc, i32 2 + %data.i1.i649.8.extract.shift = lshr i512 %s151, 64 + %data.i1.i649.8.extract.trunc = trunc i512 %data.i1.i649.8.extract.shift to i64 + %s155 = insertelement <8 x i64> %s154, i64 %data.i1.i649.8.extract.trunc, i32 3 + %arrayidx83 = getelementptr inbounds <8 x i64> * %dest, i32 7 + store <8 x i64> %s155, <8 x i64> * %arrayidx83, align 64 + ret void +} -- cgit v1.2.3-70-g09d2