author     Evan Cheng <evan.cheng@apple.com>   2009-04-17 01:29:40 +0000
committer  Evan Cheng <evan.cheng@apple.com>   2009-04-17 01:29:40 +0000
commit     276b77e66c538264d79b78c00bbad9f890f58011 (patch)
tree       f668c05c94b70225a5bc83d12f1e1912cb485226
parent     d10a4ce5825d0981107c0106c49089b9e5792e40 (diff)
Teach the spiller to unfold instructions that mod/ref a spill slot when a scratch
register is available and when it's profitable. e.g.

    xorq %r12<kill>, %r13
    addq %rax, -184(%rbp)
    addq %r13, -184(%rbp)
==>
    xorq %r12<kill>, %r13
    movq -184(%rbp), %r12
    addq %rax, %r12
    addq %r13, %r12
    movq %r12, -184(%rbp)

Two more instructions, but fewer memory accesses. It can also open up
opportunities for further optimizations.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@69341 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r--   lib/CodeGen/Spiller.cpp                        219
-rw-r--r--   lib/CodeGen/Spiller.h                           19
-rw-r--r--   test/CodeGen/X86/2009-04-16-SpillerUnfold.ll   139
3 files changed, 366 insertions, 11 deletions
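Below is a minimal, self-contained sketch (not LLVM code, and not part of this commit) of the scratch-register scan the patch introduces in FindFreeRegister: walk backwards over at most the last two instructions and pick an allocatable register that was killed there and is neither redefined nor re-used before the current point. The Operand and Instr types, findFreeRegister, and the register numbers are hypothetical stand-ins for MachineOperand, MachineInstr, and physical registers; register-class and alias-set checks from the real code are omitted for brevity.

// Hypothetical model, not the LLVM API: Operand/Instr stand in for
// MachineOperand/MachineInstr, register numbers are arbitrary.
#include <bitset>
#include <cstdio>
#include <vector>

constexpr unsigned NumRegs = 16;

struct Operand {
  unsigned Reg;   // 0 means "no register"
  bool IsDef;
  bool IsKill;
};

struct Instr {
  std::vector<Operand> Ops;
};

// Return a register that is killed within the last two instructions before
// CurIdx and is neither defined nor used again before CurIdx; 0 if none.
unsigned findFreeRegister(const std::vector<Instr> &Block, size_t CurIdx,
                          const std::bitset<NumRegs> &Allocatable) {
  std::bitset<NumRegs> Defs, Uses;
  for (unsigned Count = 0; Count < 2 && CurIdx > 0; ++Count) {
    const Instr &Prev = Block[--CurIdx];
    std::vector<unsigned> LocalUses, Kills;
    for (const Operand &MO : Prev.Ops) {
      if (MO.Reg == 0)
        continue;
      if (MO.IsDef) {
        Defs.set(MO.Reg);              // a later def means the reg is busy
      } else {
        LocalUses.push_back(MO.Reg);
        if (MO.IsKill && Allocatable[MO.Reg])
          Kills.push_back(MO.Reg);     // candidate scratch register
      }
    }
    // A killed register is only usable if nothing after the kill defines or
    // uses it.
    for (unsigned Kill : Kills)
      if (!Defs[Kill] && !Uses[Kill])
        return Kill;
    for (unsigned Reg : LocalUses)
      Uses.set(Reg);
  }
  return 0; // no scratch register available
}

int main() {
  std::bitset<NumRegs> Allocatable;
  Allocatable.set();
  // Modeled after the example in the commit message:
  //   xorq %r12<kill>, %r13   -> defines 13, uses and kills 12
  //   addq %rax, -184(%rbp)   -> uses 1; the folded operand is memory
  std::vector<Instr> Block = {
      {{{13, true, false}, {12, false, true}, {13, false, false}}},
      {{{1, false, false}}},
  };
  std::printf("scratch register: %u\n", findFreeRegister(Block, 2, Allocatable));
  return 0;
}

With the operands from the commit-message example, the scan returns register 12, mirroring how %r12, killed by the xorq, becomes the scratch register that holds the spill-slot value across the unfolded adds.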
diff --git a/lib/CodeGen/Spiller.cpp b/lib/CodeGen/Spiller.cpp
index 5edde38170..92bb785de6 100644
--- a/lib/CodeGen/Spiller.cpp
+++ b/lib/CodeGen/Spiller.cpp
@@ -29,6 +29,7 @@ STATISTIC(NumLoads , "Number of loads added");
STATISTIC(NumReused , "Number of values reused");
STATISTIC(NumDCE , "Number of copies elided");
STATISTIC(NumSUnfold , "Number of stores unfolded");
+STATISTIC(NumModRefUnfold, "Number of modref unfolded");
namespace {
enum SpillerName { simple, local };
@@ -524,6 +525,7 @@ bool LocalSpiller::runOnMachineFunction(MachineFunction &MF, VirtRegMap &VRM) {
RegInfo = &MF.getRegInfo();
TRI = MF.getTarget().getRegisterInfo();
TII = MF.getTarget().getInstrInfo();
+ AllocatableRegs = TRI->getAllocatableSet(MF);
DOUT << "\n**** Local spiller rewriting function '"
<< MF.getFunction()->getName() << "':\n";
DOUT << "**** Machine Instrs (NOTE! Does not include spills and reloads!)"
@@ -595,7 +597,201 @@ bool LocalSpiller::runOnMachineFunction(MachineFunction &MF, VirtRegMap &VRM) {
}
-/// PrepForUnfoldOpti - Turn a store folding instruction into a load folding
+/// FoldsStackSlotModRef - Return true if the specified MI folds the specified
+/// stack slot mod/ref. It also checks if it's possible to unfold the
+/// instruction by having it define a specified physical register instead.
+static bool FoldsStackSlotModRef(MachineInstr &MI, int SS, unsigned PhysReg,
+ const TargetInstrInfo *TII,
+ const TargetRegisterInfo *TRI,
+ VirtRegMap &VRM) {
+ if (VRM.hasEmergencySpills(&MI) || VRM.isSpillPt(&MI))
+ return false;
+
+ bool Found = false;
+ VirtRegMap::MI2VirtMapTy::const_iterator I, End;
+ for (tie(I, End) = VRM.getFoldedVirts(&MI); I != End; ++I) {
+ unsigned VirtReg = I->second.first;
+ VirtRegMap::ModRef MR = I->second.second;
+ if (MR & VirtRegMap::isModRef)
+ if (VRM.getStackSlot(VirtReg) == SS) {
+ Found = TII->getOpcodeAfterMemoryUnfold(MI.getOpcode(), true, true) != 0;
+ break;
+ }
+ }
+ if (!Found)
+ return false;
+
+ // Does the instruction use a register that overlaps the scratch register?
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
+ if (!MO.isReg() || MO.getReg() == 0)
+ continue;
+ unsigned Reg = MO.getReg();
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (!VRM.hasPhys(Reg))
+ continue;
+ Reg = VRM.getPhys(Reg);
+ }
+ if (TRI->regsOverlap(PhysReg, Reg))
+ return false;
+ }
+ return true;
+}
+
+/// FindFreeRegister - Find a free register of a given register class by looking
+/// at (at most) the last two machine instructions.
+static unsigned FindFreeRegister(MachineBasicBlock::iterator MII,
+ MachineBasicBlock &MBB,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI,
+ BitVector &AllocatableRegs) {
+ BitVector Defs(TRI->getNumRegs());
+ BitVector Uses(TRI->getNumRegs());
+ SmallVector<unsigned, 4> LocalUses;
+ SmallVector<unsigned, 4> Kills;
+
+ // Take a look at 2 instructions at most.
+ for (unsigned Count = 0; Count < 2; ++Count) {
+ if (MII == MBB.begin())
+ break;
+ MachineInstr *PrevMI = prior(MII);
+ for (unsigned i = 0, e = PrevMI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = PrevMI->getOperand(i);
+ if (!MO.isReg() || MO.getReg() == 0)
+ continue;
+ unsigned Reg = MO.getReg();
+ if (MO.isDef()) {
+ Defs.set(Reg);
+ for (const unsigned *AS = TRI->getAliasSet(Reg); *AS; ++AS)
+ Defs.set(*AS);
+ } else {
+ LocalUses.push_back(Reg);
+ if (MO.isKill() && AllocatableRegs[Reg])
+ Kills.push_back(Reg);
+ }
+ }
+
+ for (unsigned i = 0, e = Kills.size(); i != e; ++i) {
+ unsigned Kill = Kills[i];
+ if (!Defs[Kill] && !Uses[Kill] &&
+ TRI->getPhysicalRegisterRegClass(Kill) == RC)
+ return Kill;
+ }
+ for (unsigned i = 0, e = LocalUses.size(); i != e; ++i) {
+ unsigned Reg = LocalUses[i];
+ Uses.set(Reg);
+ for (const unsigned *AS = TRI->getAliasSet(Reg); *AS; ++AS)
+ Uses.set(*AS);
+ }
+
+ MII = PrevMI;
+ }
+
+ return 0;
+}
+
+static
+void AssignPhysToVirtReg(MachineInstr *MI, unsigned VirtReg, unsigned PhysReg) {
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg() && MO.getReg() == VirtReg)
+ MO.setReg(PhysReg);
+ }
+}
+
+/// OptimizeByUnfold2 - Unfold a series of load / store folding instructions if
+/// a scratch register is available.
+/// xorq %r12<kill>, %r13
+/// addq %rax, -184(%rbp)
+/// addq %r13, -184(%rbp)
+/// ==>
+/// xorq %r12<kill>, %r13
+/// movq -184(%rbp), %r12
+/// addq %rax, %r12
+/// addq %r13, %r12
+/// movq %r12, -184(%rbp)
+bool LocalSpiller::OptimizeByUnfold2(unsigned VirtReg, int SS,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MII,
+ std::vector<MachineInstr*> &MaybeDeadStores,
+ AvailableSpills &Spills,
+ BitVector &RegKills,
+ std::vector<MachineOperand*> &KillOps,
+ VirtRegMap &VRM) {
+ MachineBasicBlock::iterator NextMII = next(MII);
+ if (NextMII == MBB.end())
+ return false;
+
+ if (TII->getOpcodeAfterMemoryUnfold(MII->getOpcode(), true, true) == 0)
+ return false;
+
+ // Now let's see if the last couple of instructions happen to have freed up
+ // a register.
+ const TargetRegisterClass* RC = RegInfo->getRegClass(VirtReg);
+ unsigned PhysReg = FindFreeRegister(MII, MBB, RC, TRI, AllocatableRegs);
+ if (!PhysReg)
+ return false;
+
+ MachineFunction &MF = *MBB.getParent();
+ TRI = MF.getTarget().getRegisterInfo();
+ MachineInstr &MI = *MII;
+ if (!FoldsStackSlotModRef(MI, SS, PhysReg, TII, TRI, VRM))
+ return false;
+
+ // If the next instruction also folds the same SS modref and can be unfolded,
+ // then it's worthwhile to issue a load from SS into the free register and
+ // then unfold these instructions.
+ if (!FoldsStackSlotModRef(*NextMII, SS, PhysReg, TII, TRI, VRM))
+ return false;
+
+ // Load from SS to the spare physical register.
+ TII->loadRegFromStackSlot(MBB, MII, PhysReg, SS, RC);
+ // This invalidates PhysReg.
+ Spills.ClobberPhysReg(PhysReg);
+ // Remember it's available.
+ Spills.addAvailable(SS, PhysReg);
+ MaybeDeadStores[SS] = NULL;
+
+ // Unfold current MI.
+ SmallVector<MachineInstr*, 4> NewMIs;
+ if (!TII->unfoldMemoryOperand(MF, &MI, VirtReg, false, false, NewMIs))
+ assert(0 && "Unable to unfold the load / store folding instruction!");
+ assert(NewMIs.size() == 1);
+ AssignPhysToVirtReg(NewMIs[0], VirtReg, PhysReg);
+ VRM.transferRestorePts(&MI, NewMIs[0]);
+ MII = MBB.insert(MII, NewMIs[0]);
+ InvalidateKills(MI, RegKills, KillOps);
+ VRM.RemoveMachineInstrFromMaps(&MI);
+ MBB.erase(&MI);
+ ++NumModRefUnfold;
+
+ // Unfold next instructions that fold the same SS.
+ do {
+ MachineInstr &NextMI = *NextMII;
+ NextMII = next(NextMII);
+ NewMIs.clear();
+ if (!TII->unfoldMemoryOperand(MF, &NextMI, VirtReg, false, false, NewMIs))
+ assert(0 && "Unable to unfold the load / store folding instruction!");
+ assert(NewMIs.size() == 1);
+ AssignPhysToVirtReg(NewMIs[0], VirtReg, PhysReg);
+ VRM.transferRestorePts(&NextMI, NewMIs[0]);
+ MBB.insert(NextMII, NewMIs[0]);
+ InvalidateKills(NextMI, RegKills, KillOps);
+ VRM.RemoveMachineInstrFromMaps(&NextMI);
+ MBB.erase(&NextMI);
+ ++NumModRefUnfold;
+ } while (FoldsStackSlotModRef(*NextMII, SS, PhysReg, TII, TRI, VRM));
+
+ // Store the value back into SS.
+ TII->storeRegToStackSlot(MBB, NextMII, PhysReg, true, SS, RC);
+ MachineInstr *StoreMI = prior(NextMII);
+ VRM.addSpillSlotUse(SS, StoreMI);
+ VRM.virtFolded(VirtReg, StoreMI, VirtRegMap::isMod);
+
+ return true;
+}
+
+/// OptimizeByUnfold - Turn a store folding instruction into a load folding
/// instruction. e.g.
/// xorl %edi, %eax
/// movl %eax, -32(%ebp)
@@ -607,7 +803,7 @@ bool LocalSpiller::runOnMachineFunction(MachineFunction &MF, VirtRegMap &VRM) {
/// mov %eax, -32(%ebp)
/// This enables unfolding optimization for a subsequent instruction which will
/// also eliminate the newly introduced store instruction.
-bool LocalSpiller::PrepForUnfoldOpti(MachineBasicBlock &MBB,
+bool LocalSpiller::OptimizeByUnfold(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MII,
std::vector<MachineInstr*> &MaybeDeadStores,
AvailableSpills &Spills,
@@ -646,8 +842,14 @@ bool LocalSpiller::PrepForUnfoldOpti(MachineBasicBlock &MBB,
}
}
- if (!UnfoldedOpc)
- return false;
+ if (!UnfoldedOpc) {
+ if (!UnfoldVR)
+ return false;
+
+ // Look for other unfolding opportunities.
+ return OptimizeByUnfold2(UnfoldVR, FoldedSS, MBB, MII,
+ MaybeDeadStores, Spills, RegKills, KillOps, VRM);
+ }
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
MachineOperand &MO = MI.getOperand(i);
@@ -705,6 +907,7 @@ bool LocalSpiller::PrepForUnfoldOpti(MachineBasicBlock &MBB,
MF.DeleteMachineInstr(NewMI);
}
}
+
return false;
}
@@ -770,7 +973,7 @@ bool LocalSpiller::CommuteToFoldReload(MachineBasicBlock &MBB,
VRM.addSpillSlotUse(SS, FoldedMI);
VRM.virtFolded(VirtReg, FoldedMI, VirtRegMap::isRef);
// Insert new def MI and spill MI.
- const TargetRegisterClass* RC = MF.getRegInfo().getRegClass(VirtReg);
+ const TargetRegisterClass* RC = RegInfo->getRegClass(VirtReg);
TII->storeRegToStackSlot(MBB, &MI, NewReg, true, SS, RC);
MII = prior(MII);
MachineInstr *StoreMI = MII;
@@ -935,13 +1138,13 @@ void LocalSpiller::RewriteMBB(MachineBasicBlock &MBB, VirtRegMap &VRM,
DistanceMap.clear();
for (MachineBasicBlock::iterator MII = MBB.begin(), E = MBB.end();
MII != E; ) {
- MachineBasicBlock::iterator NextMII = MII; ++NextMII;
+ MachineBasicBlock::iterator NextMII = next(MII);
VirtRegMap::MI2VirtMapTy::const_iterator I, End;
bool Erased = false;
bool BackTracked = false;
- if (PrepForUnfoldOpti(MBB, MII,
- MaybeDeadStores, Spills, RegKills, KillOps, VRM))
+ if (OptimizeByUnfold(MBB, MII,
+ MaybeDeadStores, Spills, RegKills, KillOps, VRM))
NextMII = next(MII);
MachineInstr &MI = *MII;
diff --git a/lib/CodeGen/Spiller.h b/lib/CodeGen/Spiller.h
index 5a42a8279d..c0d0837960 100644
--- a/lib/CodeGen/Spiller.h
+++ b/lib/CodeGen/Spiller.h
@@ -97,7 +97,7 @@ namespace llvm {
const TargetRegisterInfo *getRegInfo() const { return TRI; }
/// getSpillSlotOrReMatPhysReg - If the specified stack slot or remat is
- /// available in a physical register, return that PhysReg, otherwise
+ /// available in a physical register, return that PhysReg, otherwise
/// return 0.
unsigned getSpillSlotOrReMatPhysReg(int Slot) const {
std::map<int, unsigned>::const_iterator I =
@@ -284,6 +284,7 @@ namespace llvm {
MachineRegisterInfo *RegInfo;
const TargetRegisterInfo *TRI;
const TargetInstrInfo *TII;
+ BitVector AllocatableRegs;
DenseMap<MachineInstr*, unsigned> DistanceMap;
public:
bool runOnMachineFunction(MachineFunction &MF, VirtRegMap &VRM);
@@ -291,12 +292,22 @@ namespace llvm {
void TransferDeadness(MachineBasicBlock *MBB, unsigned CurDist,
unsigned Reg, BitVector &RegKills,
std::vector<MachineOperand*> &KillOps);
- bool PrepForUnfoldOpti(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MII,
+
+ bool OptimizeByUnfold(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MII,
+ std::vector<MachineInstr*> &MaybeDeadStores,
+ AvailableSpills &Spills, BitVector &RegKills,
+ std::vector<MachineOperand*> &KillOps,
+ VirtRegMap &VRM);
+
+ bool OptimizeByUnfold2(unsigned VirtReg, int SS,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &MII,
std::vector<MachineInstr*> &MaybeDeadStores,
AvailableSpills &Spills, BitVector &RegKills,
std::vector<MachineOperand*> &KillOps,
VirtRegMap &VRM);
+
bool CommuteToFoldReload(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MII,
unsigned VirtReg, unsigned SrcReg, int SS,
@@ -305,6 +316,7 @@ namespace llvm {
std::vector<MachineOperand*> &KillOps,
const TargetRegisterInfo *TRI,
VirtRegMap &VRM);
+
void SpillRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MII,
int Idx, unsigned PhysReg, int StackSlot,
@@ -315,6 +327,7 @@ namespace llvm {
BitVector &RegKills,
std::vector<MachineOperand*> &KillOps,
VirtRegMap &VRM);
+
void RewriteMBB(MachineBasicBlock &MBB, VirtRegMap &VRM,
AvailableSpills &Spills,
BitVector &RegKills, std::vector<MachineOperand*> &KillOps);
diff --git a/test/CodeGen/X86/2009-04-16-SpillerUnfold.ll b/test/CodeGen/X86/2009-04-16-SpillerUnfold.ll
new file mode 100644
index 0000000000..e9e2e4aaa1
--- /dev/null
+++ b/test/CodeGen/X86/2009-04-16-SpillerUnfold.ll
@@ -0,0 +1,139 @@
+; RUN: llvm-as < %s | llc -mtriple=x86_64-apple-darwin10.0 -relocation-model=pic -disable-fp-elim -stats |& grep {Number of modref unfolded}
+
+ %struct.SHA512_CTX = type { [8 x i64], i64, i64, %struct.anon, i32, i32 }
+ %struct.anon = type { [16 x i64] }
+@K512 = external constant [80 x i64], align 32 ; <[80 x i64]*> [#uses=2]
+
+define fastcc void @sha512_block_data_order(%struct.SHA512_CTX* nocapture %ctx, i8* nocapture %in, i64 %num) nounwind ssp {
+entry:
+ br label %bb349
+
+bb349: ; preds = %bb349, %entry
+ %e.0489 = phi i64 [ 0, %entry ], [ %e.0, %bb349 ] ; <i64> [#uses=3]
+ %b.0472 = phi i64 [ 0, %entry ], [ %87, %bb349 ] ; <i64> [#uses=2]
+ %asmtmp356 = call i64 asm "rorq $1,$0", "=r,J,0,~{dirflag},~{fpsr},~{flags},~{cc}"(i32 41, i64 %e.0489) nounwind ; <i64> [#uses=1]
+ %0 = xor i64 0, %asmtmp356 ; <i64> [#uses=1]
+ %1 = add i64 0, %0 ; <i64> [#uses=1]
+ %2 = add i64 %1, 0 ; <i64> [#uses=1]
+ %3 = add i64 %2, 0 ; <i64> [#uses=1]
+ %4 = add i64 %3, 0 ; <i64> [#uses=5]
+ %asmtmp372 = call i64 asm "rorq $1,$0", "=r,J,0,~{dirflag},~{fpsr},~{flags},~{cc}"(i32 34, i64 %4) nounwind ; <i64> [#uses=1]
+ %asmtmp373 = call i64 asm "rorq $1,$0", "=r,J,0,~{dirflag},~{fpsr},~{flags},~{cc}"(i32 39, i64 %4) nounwind ; <i64> [#uses=0]
+ %5 = xor i64 %asmtmp372, 0 ; <i64> [#uses=0]
+ %6 = xor i64 0, %b.0472 ; <i64> [#uses=1]
+ %7 = and i64 %4, %6 ; <i64> [#uses=1]
+ %8 = xor i64 %7, 0 ; <i64> [#uses=1]
+ %9 = add i64 0, %8 ; <i64> [#uses=1]
+ %10 = add i64 %9, 0 ; <i64> [#uses=2]
+ %asmtmp377 = call i64 asm "rorq $1,$0", "=r,J,0,~{dirflag},~{fpsr},~{flags},~{cc}"(i32 61, i64 0) nounwind ; <i64> [#uses=1]
+ %11 = xor i64 0, %asmtmp377 ; <i64> [#uses=1]
+ %12 = add i64 0, %11 ; <i64> [#uses=1]
+ %13 = add i64 %12, 0 ; <i64> [#uses=1]
+ %not381 = xor i64 0, -1 ; <i64> [#uses=1]
+ %14 = and i64 %e.0489, %not381 ; <i64> [#uses=1]
+ %15 = xor i64 0, %14 ; <i64> [#uses=1]
+ %16 = add i64 %15, 0 ; <i64> [#uses=1]
+ %17 = add i64 %16, %13 ; <i64> [#uses=1]
+ %18 = add i64 %17, 0 ; <i64> [#uses=1]
+ %19 = add i64 %18, 0 ; <i64> [#uses=2]
+ %20 = add i64 %19, %b.0472 ; <i64> [#uses=3]
+ %21 = add i64 %19, 0 ; <i64> [#uses=1]
+ %22 = add i64 %21, 0 ; <i64> [#uses=1]
+ %23 = add i32 0, 12 ; <i32> [#uses=1]
+ %24 = and i32 %23, 12 ; <i32> [#uses=1]
+ %25 = zext i32 %24 to i64 ; <i64> [#uses=1]
+ %26 = getelementptr [16 x i64]* null, i64 0, i64 %25 ; <i64*> [#uses=0]
+ %27 = add i64 0, %e.0489 ; <i64> [#uses=1]
+ %28 = add i64 %27, 0 ; <i64> [#uses=1]
+ %29 = add i64 %28, 0 ; <i64> [#uses=1]
+ %30 = add i64 %29, 0 ; <i64> [#uses=2]
+ %31 = and i64 %10, %4 ; <i64> [#uses=1]
+ %32 = xor i64 0, %31 ; <i64> [#uses=1]
+ %33 = add i64 %30, 0 ; <i64> [#uses=3]
+ %34 = add i64 %30, %32 ; <i64> [#uses=1]
+ %35 = add i64 %34, 0 ; <i64> [#uses=1]
+ %36 = and i64 %33, %20 ; <i64> [#uses=1]
+ %37 = xor i64 %36, 0 ; <i64> [#uses=1]
+ %38 = add i64 %37, 0 ; <i64> [#uses=1]
+ %39 = add i64 %38, 0 ; <i64> [#uses=1]
+ %40 = add i64 %39, 0 ; <i64> [#uses=1]
+ %41 = add i64 %40, 0 ; <i64> [#uses=1]
+ %42 = add i64 %41, %4 ; <i64> [#uses=3]
+ %43 = or i32 0, 6 ; <i32> [#uses=1]
+ %44 = and i32 %43, 14 ; <i32> [#uses=1]
+ %45 = zext i32 %44 to i64 ; <i64> [#uses=1]
+ %46 = getelementptr [16 x i64]* null, i64 0, i64 %45 ; <i64*> [#uses=1]
+ %not417 = xor i64 %42, -1 ; <i64> [#uses=1]
+ %47 = and i64 %20, %not417 ; <i64> [#uses=1]
+ %48 = xor i64 0, %47 ; <i64> [#uses=1]
+ %49 = getelementptr [80 x i64]* @K512, i64 0, i64 0 ; <i64*> [#uses=1]
+ %50 = load i64* %49, align 8 ; <i64> [#uses=1]
+ %51 = add i64 %48, 0 ; <i64> [#uses=1]
+ %52 = add i64 %51, 0 ; <i64> [#uses=1]
+ %53 = add i64 %52, 0 ; <i64> [#uses=1]
+ %54 = add i64 %53, %50 ; <i64> [#uses=2]
+ %asmtmp420 = call i64 asm "rorq $1,$0", "=r,J,0,~{dirflag},~{fpsr},~{flags},~{cc}"(i32 34, i64 0) nounwind ; <i64> [#uses=1]
+ %asmtmp421 = call i64 asm "rorq $1,$0", "=r,J,0,~{dirflag},~{fpsr},~{flags},~{cc}"(i32 39, i64 0) nounwind ; <i64> [#uses=1]
+ %55 = xor i64 %asmtmp420, 0 ; <i64> [#uses=1]
+ %56 = xor i64 %55, %asmtmp421 ; <i64> [#uses=1]
+ %57 = add i64 %54, %10 ; <i64> [#uses=5]
+ %58 = add i64 %54, 0 ; <i64> [#uses=1]
+ %59 = add i64 %58, %56 ; <i64> [#uses=2]
+ %60 = or i32 0, 7 ; <i32> [#uses=1]
+ %61 = and i32 %60, 15 ; <i32> [#uses=1]
+ %62 = zext i32 %61 to i64 ; <i64> [#uses=1]
+ %63 = getelementptr [16 x i64]* null, i64 0, i64 %62 ; <i64*> [#uses=2]
+ %64 = load i64* null, align 8 ; <i64> [#uses=1]
+ %65 = lshr i64 %64, 6 ; <i64> [#uses=1]
+ %66 = xor i64 0, %65 ; <i64> [#uses=1]
+ %67 = xor i64 %66, 0 ; <i64> [#uses=1]
+ %68 = load i64* %46, align 8 ; <i64> [#uses=1]
+ %69 = load i64* null, align 8 ; <i64> [#uses=1]
+ %70 = add i64 %68, 0 ; <i64> [#uses=1]
+ %71 = add i64 %70, %67 ; <i64> [#uses=1]
+ %72 = add i64 %71, %69 ; <i64> [#uses=1]
+ %asmtmp427 = call i64 asm "rorq $1,$0", "=r,J,0,~{dirflag},~{fpsr},~{flags},~{cc}"(i32 18, i64 %57) nounwind ; <i64> [#uses=1]
+ %asmtmp428 = call i64 asm "rorq $1,$0", "=r,J,0,~{dirflag},~{fpsr},~{flags},~{cc}"(i32 41, i64 %57) nounwind ; <i64> [#uses=1]
+ %73 = xor i64 %asmtmp427, 0 ; <i64> [#uses=1]
+ %74 = xor i64 %73, %asmtmp428 ; <i64> [#uses=1]
+ %75 = and i64 %57, %42 ; <i64> [#uses=1]
+ %not429 = xor i64 %57, -1 ; <i64> [#uses=1]
+ %76 = and i64 %33, %not429 ; <i64> [#uses=1]
+ %77 = xor i64 %75, %76 ; <i64> [#uses=1]
+ %78 = getelementptr [80 x i64]* @K512, i64 0, i64 0 ; <i64*> [#uses=1]
+ %79 = load i64* %78, align 16 ; <i64> [#uses=1]
+ %80 = add i64 %77, %20 ; <i64> [#uses=1]
+ %81 = add i64 %80, %72 ; <i64> [#uses=1]
+ %82 = add i64 %81, %74 ; <i64> [#uses=1]
+ %83 = add i64 %82, %79 ; <i64> [#uses=1]
+ %asmtmp432 = call i64 asm "rorq $1,$0", "=r,J,0,~{dirflag},~{fpsr},~{flags},~{cc}"(i32 34, i64 %59) nounwind ; <i64> [#uses=1]
+ %asmtmp433 = call i64 asm "rorq $1,$0", "=r,J,0,~{dirflag},~{fpsr},~{flags},~{cc}"(i32 39, i64 %59) nounwind ; <i64> [#uses=1]
+ %84 = xor i64 %asmtmp432, 0 ; <i64> [#uses=1]
+ %85 = xor i64 %84, %asmtmp433 ; <i64> [#uses=1]
+ %86 = add i64 %83, %22 ; <i64> [#uses=2]
+ %87 = add i64 0, %85 ; <i64> [#uses=1]
+ %asmtmp435 = call i64 asm "rorq $1,$0", "=r,J,0,~{dirflag},~{fpsr},~{flags},~{cc}"(i32 8, i64 0) nounwind ; <i64> [#uses=1]
+ %88 = xor i64 0, %asmtmp435 ; <i64> [#uses=1]
+ %89 = load i64* null, align 8 ; <i64> [#uses=3]
+ %asmtmp436 = call i64 asm "rorq $1,$0", "=r,J,0,~{dirflag},~{fpsr},~{flags},~{cc}"(i32 19, i64 %89) nounwind ; <i64> [#uses=1]
+ %asmtmp437 = call i64 asm "rorq $1,$0", "=r,J,0,~{dirflag},~{fpsr},~{flags},~{cc}"(i32 61, i64 %89) nounwind ; <i64> [#uses=1]
+ %90 = lshr i64 %89, 6 ; <i64> [#uses=1]
+ %91 = xor i64 %asmtmp436, %90 ; <i64> [#uses=1]
+ %92 = xor i64 %91, %asmtmp437 ; <i64> [#uses=1]
+ %93 = load i64* %63, align 8 ; <i64> [#uses=1]
+ %94 = load i64* null, align 8 ; <i64> [#uses=1]
+ %95 = add i64 %93, %88 ; <i64> [#uses=1]
+ %96 = add i64 %95, %92 ; <i64> [#uses=1]
+ %97 = add i64 %96, %94 ; <i64> [#uses=2]
+ store i64 %97, i64* %63, align 8
+ %98 = and i64 %86, %57 ; <i64> [#uses=1]
+ %not441 = xor i64 %86, -1 ; <i64> [#uses=1]
+ %99 = and i64 %42, %not441 ; <i64> [#uses=1]
+ %100 = xor i64 %98, %99 ; <i64> [#uses=1]
+ %101 = add i64 %100, %33 ; <i64> [#uses=1]
+ %102 = add i64 %101, %97 ; <i64> [#uses=1]
+ %103 = add i64 %102, 0 ; <i64> [#uses=1]
+ %104 = add i64 %103, 0 ; <i64> [#uses=1]
+ %e.0 = add i64 %104, %35 ; <i64> [#uses=1]
+ br label %bb349
+}