aboutsummaryrefslogtreecommitdiff
path: root/lib/Target
diff options
context:
space:
mode:
authorArnold Schwaighofer <arnold.schwaighofer@gmail.com>2007-10-11 19:40:01 +0000
committerArnold Schwaighofer <arnold.schwaighofer@gmail.com>2007-10-11 19:40:01 +0000
commitc85e1716f0e45e4c18a9ef2fbe431a51ac3a4252 (patch)
treee6e745a81699085d757db166e645f8a6639cde7a /lib/Target
parent68c8411ef591818d89c6a81098a328fcd75be72a (diff)
Added tail call optimization to the x86 back end. It can be
enabled by passing -tailcallopt to llc. The optimization is performed if the following conditions are satisfied: * caller/callee are fastcc * elf/pic is disabled OR elf/pic enabled + callee is in module + callee has visibility protected or hidden git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@42870 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib/Target')
-rw-r--r--lib/Target/TargetMachine.cpp7
-rw-r--r--lib/Target/X86/README.txt80
-rw-r--r--lib/Target/X86/X86CallingConv.td50
-rw-r--r--lib/Target/X86/X86ISelLowering.cpp511
-rw-r--r--lib/Target/X86/X86ISelLowering.h24
-rw-r--r--lib/Target/X86/X86InstrInfo.cpp2
-rw-r--r--lib/Target/X86/X86InstrInfo.td43
-rw-r--r--lib/Target/X86/X86InstrX86-64.td35
-rw-r--r--lib/Target/X86/X86MachineFunctionInfo.h15
-rw-r--r--lib/Target/X86/X86RegisterInfo.cpp129
10 files changed, 831 insertions, 65 deletions
diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp
index 6c00a3f492..9caea11dd3 100644
--- a/lib/Target/TargetMachine.cpp
+++ b/lib/Target/TargetMachine.cpp
@@ -33,6 +33,7 @@ namespace llvm {
bool ExceptionHandling;
Reloc::Model RelocationModel;
CodeModel::Model CMModel;
+ bool PerformTailCallOpt;
}
namespace {
cl::opt<bool, true> PrintCode("print-machineinstrs",
@@ -116,6 +117,12 @@ namespace {
clEnumValN(CodeModel::Large, "large",
" Large code model"),
clEnumValEnd));
+
+ cl::opt<bool, true>
+ EnablePerformTailCallOpt("tailcallopt",
+ cl::desc("Turn on tail call optimization."),
+ cl::location(PerformTailCallOpt),
+ cl::init(false));
}
//---------------------------------------------------------------------------
diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt
index 9bafff73d5..0d4dce32d8 100644
--- a/lib/Target/X86/README.txt
+++ b/lib/Target/X86/README.txt
@@ -1368,3 +1368,83 @@ L7:
L5:
//===---------------------------------------------------------------------===//
+Tail call optimization improvements: Tail call optimization currently
+pushes all arguments on the top of the stack (their normal place if
+that was a not tail call optimized functiong call ) before moving them
+to actual stack slot. this is done to prevent overwriting of paramters
+(see example below) that might be used, since the arguments of the
+callee overwrites callers arguments.
+
+ example:
+
+int callee(int32, int64);
+int caller(int32 arg1, int32 arg2) {
+ int64 local = arg2 * 2;
+ return callee(arg2, (int64)local);
+}
+
+[arg1] [!arg2 no longer valid since we moved local onto it]
+[arg2] -> [(int64)
+[RETADDR] local ]
+
+moving arg1 onto the stack slot of callee function would overwrite
+arg2 of the caller.
+
+Possible optimizations:
+
+ - only push those arguments to the top of the stack that are actual
+ parameters of the caller function and have no local value in the
+ caller
+
+ in above example local does not need to be pushed onto the top of
+ the stack as it is definitetly not a caller's function parameter
+
+ - analyse the actual parameters of the callee to see which would
+ overwrite a caller paramter which is used by the callee and only
+ push them onto the top of the stack
+
+ int callee (int32 arg1, int32 arg2);
+ int caller (int32 arg1, int32 arg2) {
+ return callee(arg1,arg2);
+ }
+
+ here we don't need to write any variables to the top of the stack
+ since they don't overwrite each other
+
+ int callee (int32 arg1, int32 arg2);
+ int caller (int32 arg1, int32 arg2) {
+ return callee(arg2,arg1);
+ }
+
+ here we need to push the arguments because they overwrite each other
+
+
+ code for lowering directly onto callers arguments:
++ SmallVector<std::pair<unsigned, SDOperand>, 8> RegsToPass;
++ SmallVector<SDOperand, 8> MemOpChains;
++
++ SDOperand FramePtr;
++ SDOperand PtrOff;
++ SDOperand FIN;
++ int FI = 0;
++ // Walk the register/memloc assignments, inserting copies/loads.
++ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
++ CCValAssign &VA = ArgLocs[i];
++ SDOperand Arg = Op.getOperand(5+2*VA.getValNo());
++
++ ....
++
++ if (VA.isRegLoc()) {
++ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
++ } else {
++ assert(VA.isMemLoc());
++ // create frame index
++ int32_t Offset = VA.getLocMemOffset()+FPDiff;
++ uint32_t OpSize = (MVT::getSizeInBits(VA.getLocVT())+7)/8;
++ FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset);
++ FIN = DAG.getFrameIndex(FI, MVT::i32);
++ // store relative to framepointer
++ MemOpChains.push_back(DAG.getStore(Chain, Arg, FIN, NULL, 0));
++ }
++ }
+//===---------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td
index 9c2d95a199..f23e580656 100644
--- a/lib/Target/X86/X86CallingConv.td
+++ b/lib/Target/X86/X86CallingConv.td
@@ -127,6 +127,40 @@ def CC_X86_64_C : CallingConv<[
CCIfType<[v8i8, v4i16, v2i32, v1i64], CCAssignToStack<8, 8>>
]>;
+// tail call convetion (fast) one register is reserved for target address
+// namely R9
+def CC_X86_64_TailCall : CallingConv<[
+ // Promote i8/i16 arguments to i32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ CCIfStruct<CCStructAssign<[RDI, RSI, RDX, RCX, R8]>>,
+
+ // The first 6 integer arguments are passed in integer registers.
+ CCIfType<[i32], CCAssignToReg<[EDI, ESI, EDX, ECX, R8D]>>,
+ CCIfType<[i64], CCAssignToReg<[RDI, RSI, RDX, RCX, R8]>>,
+
+ // The first 8 FP/Vector arguments are passed in XMM registers.
+ CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>,
+
+ // The first 8 MMX vector arguments are passed in GPRs.
+ CCIfType<[v8i8, v4i16, v2i32, v1i64],
+ CCAssignToReg<[RDI, RSI, RDX, RCX, R8]>>,
+
+ // The 'nest' parameter, if any, is passed in R10.
+ CCIfNest<CCAssignToReg<[R10]>>,
+
+ // Integer/FP values get stored in stack slots that are 8 bytes in size and
+ // 8-byte aligned if there are no more registers to hold them.
+ CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>,
+
+ // Vectors get 16-byte stack slots that are 16-byte aligned.
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>,
+
+ // __m64 vectors get 8-byte stack slots that are 8-byte aligned.
+ CCIfType<[v8i8, v4i16, v2i32, v1i64], CCAssignToStack<8, 8>>
+]>;
+
//===----------------------------------------------------------------------===//
// X86 C Calling Convention
@@ -173,6 +207,22 @@ def CC_X86_32_C : CallingConv<[
CCDelegateTo<CC_X86_32_Common>
]>;
+/// Same as C calling convention up to nonfree ECX which is used for storing
+/// potential pointer to tail called function
+def CC_X86_32_TailCall : CallingConv<[
+ // Promote i8/i16 arguments to i32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // The 'nest' parameter, if any, is passed in ECX.
+ CCIfNest<CCAssignToReg<[ECX]>>,
+
+ // The first 3 integer arguments, if marked 'inreg' and if the call is not
+ // a vararg call, are passed in integer registers.
+ CCIfNotVarArg<CCIfInReg<CCIfType<[i32], CCAssignToReg<[EAX, EDX]>>>>,
+
+ // Otherwise, same as everything else.
+ CCDelegateTo<CC_X86_32_Common>
+]>;
def CC_X86_32_FastCall : CallingConv<[
// Promote i8/i16 arguments to i32.
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 1917a6a291..8767d8d33b 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -32,6 +32,8 @@
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SSARegMap.h"
#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ParameterAttributes.h"
@@ -43,6 +45,7 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM)
X86ScalarSSEf64 = Subtarget->hasSSE2();
X86ScalarSSEf32 = Subtarget->hasSSE1();
X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;
+
RegInfo = TM.getRegisterInfo();
@@ -641,6 +644,19 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM)
//===----------------------------------------------------------------------===//
#include "X86GenCallingConv.inc"
+
+/// GetPossiblePreceedingTailCall - Get preceeding X86ISD::TAILCALL node if it
+/// exists skip possible ISD:TokenFactor.
+static SDOperand GetPossiblePreceedingTailCall(SDOperand Chain) {
+ if (Chain.getOpcode()==X86ISD::TAILCALL) {
+ return Chain;
+ } else if (Chain.getOpcode()==ISD::TokenFactor) {
+ if (Chain.getNumOperands() &&
+ Chain.getOperand(0).getOpcode()==X86ISD::TAILCALL)
+ return Chain.getOperand(0);
+ }
+ return Chain;
+}
/// LowerRET - Lower an ISD::RET node.
SDOperand X86TargetLowering::LowerRET(SDOperand Op, SelectionDAG &DAG) {
@@ -651,8 +667,7 @@ SDOperand X86TargetLowering::LowerRET(SDOperand Op, SelectionDAG &DAG) {
bool isVarArg = DAG.getMachineFunction().getFunction()->isVarArg();
CCState CCInfo(CC, isVarArg, getTargetMachine(), RVLocs);
CCInfo.AnalyzeReturn(Op.Val, RetCC_X86);
-
-
+
// If this is the first return lowered for this function, add the regs to the
// liveout set for the function.
if (DAG.getMachineFunction().liveout_empty()) {
@@ -660,10 +675,38 @@ SDOperand X86TargetLowering::LowerRET(SDOperand Op, SelectionDAG &DAG) {
if (RVLocs[i].isRegLoc())
DAG.getMachineFunction().addLiveOut(RVLocs[i].getLocReg());
}
-
SDOperand Chain = Op.getOperand(0);
- SDOperand Flag;
+ // Handle tail call return.
+ Chain = GetPossiblePreceedingTailCall(Chain);
+ if (Chain.getOpcode() == X86ISD::TAILCALL) {
+ SDOperand TailCall = Chain;
+ SDOperand TargetAddress = TailCall.getOperand(1);
+ SDOperand StackAdjustment = TailCall.getOperand(2);
+ assert ( ((TargetAddress.getOpcode() == ISD::Register &&
+ (cast<RegisterSDNode>(TargetAddress)->getReg() == X86::ECX ||
+ cast<RegisterSDNode>(TargetAddress)->getReg() == X86::R9)) ||
+ TargetAddress.getOpcode() == ISD::TargetExternalSymbol ||
+ TargetAddress.getOpcode() == ISD::TargetGlobalAddress) &&
+ "Expecting an global address, external symbol, or register");
+ assert( StackAdjustment.getOpcode() == ISD::Constant &&
+ "Expecting a const value");
+
+ SmallVector<SDOperand,8> Operands;
+ Operands.push_back(Chain.getOperand(0));
+ Operands.push_back(TargetAddress);
+ Operands.push_back(StackAdjustment);
+ // Copy registers used by the call. Last operand is a flag so it is not
+ // copied.
+ for(unsigned i=3; i < TailCall.getNumOperands()-1;i++) {
+ Operands.push_back(Chain.getOperand(i));
+ }
+ return DAG.getNode(X86ISD::TC_RETURN, MVT::Other, &Operands[0], Operands.size());
+ }
+
+ // Regular return.
+ SDOperand Flag;
+
// Copy the result values into the output registers.
if (RVLocs.size() != 1 || !RVLocs[0].isRegLoc() ||
RVLocs[0].getLocReg() != X86::ST0) {
@@ -684,7 +727,7 @@ SDOperand X86TargetLowering::LowerRET(SDOperand Op, SelectionDAG &DAG) {
if ((X86ScalarSSEf32 && RVLocs[0].getValVT()==MVT::f32) ||
(X86ScalarSSEf64 && RVLocs[0].getValVT()==MVT::f64)) {
SDOperand MemLoc;
-
+
// If this is a load into a scalarsse value, don't store the loaded value
// back to the stack, only to reload it: just replace the scalar-sse load.
if (ISD::isNON_EXTLoad(Value.Val) &&
@@ -784,12 +827,14 @@ LowerCallResult(SDOperand Chain, SDOperand InFlag, SDNode *TheCall,
//===----------------------------------------------------------------------===//
-// C & StdCall Calling Convention implementation
+// C & StdCall & Fast Calling Convention implementation
//===----------------------------------------------------------------------===//
// StdCall calling convention seems to be standard for many Windows' API
// routines and around. It differs from C calling convention just a little:
// callee should clean up the stack, not caller. Symbols should be also
// decorated in some fancy way :) It doesn't support any vector arguments.
+// For info on fast calling convention see Fast Calling Convention (tail call)
+// implementation LowerX86_32FastCCCallTo.
/// AddLiveIn - This helper function adds the specified physical register to the
/// MachineFunction as a live in value. It also creates a corresponding virtual
@@ -802,6 +847,9 @@ static unsigned AddLiveIn(MachineFunction &MF, unsigned PReg,
return VReg;
}
+// align stack arguments according to platform alignment needed for tail calls
+unsigned GetAlignedArgumentStackSize(unsigned StackSize, SelectionDAG& DAG);
+
SDOperand X86TargetLowering::LowerMemArgument(SDOperand Op, SelectionDAG &DAG,
const CCValAssign &VA,
MachineFrameInfo *MFI,
@@ -826,13 +874,17 @@ SDOperand X86TargetLowering::LowerCCCArguments(SDOperand Op, SelectionDAG &DAG,
MachineFrameInfo *MFI = MF.getFrameInfo();
SDOperand Root = Op.getOperand(0);
bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
-
+ unsigned CC = MF.getFunction()->getCallingConv();
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(MF.getFunction()->getCallingConv(), isVarArg,
+ CCState CCInfo(CC, isVarArg,
getTargetMachine(), ArgLocs);
- CCInfo.AnalyzeFormalArguments(Op.Val, CC_X86_32_C);
-
+ // Check for possible tail call calling convention.
+ if (CC == CallingConv::Fast && PerformTailCallOpt)
+ CCInfo.AnalyzeFormalArguments(Op.Val, CC_X86_32_TailCall);
+ else
+ CCInfo.AnalyzeFormalArguments(Op.Val, CC_X86_32_C);
+
SmallVector<SDOperand, 8> ArgValues;
unsigned LastVal = ~0U;
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
@@ -877,6 +929,9 @@ SDOperand X86TargetLowering::LowerCCCArguments(SDOperand Op, SelectionDAG &DAG,
}
unsigned StackSize = CCInfo.getNextStackOffset();
+ // align stack specially for tail calls
+ if (CC==CallingConv::Fast)
+ StackSize = GetAlignedArgumentStackSize(StackSize,DAG);
ArgValues.push_back(Root);
@@ -885,7 +940,12 @@ SDOperand X86TargetLowering::LowerCCCArguments(SDOperand Op, SelectionDAG &DAG,
if (isVarArg)
VarArgsFrameIndex = MFI->CreateFixedObject(1, StackSize);
- if (isStdCall && !isVarArg) {
+ // Tail call calling convention (CallingConv::Fast) does not support varargs.
+ assert( !(isVarArg && CC == CallingConv::Fast) &&
+ "CallingConv::Fast does not support varargs.");
+
+ if (isStdCall && !isVarArg &&
+ (CC==CallingConv::Fast && PerformTailCallOpt || CC!=CallingConv::Fast)) {
BytesToPopOnReturn = StackSize; // Callee pops everything..
BytesCallerReserves = 0;
} else {
@@ -914,17 +974,21 @@ SDOperand X86TargetLowering::LowerCCCCallTo(SDOperand Op, SelectionDAG &DAG,
unsigned CC) {
SDOperand Chain = Op.getOperand(0);
bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
- bool isTailCall = cast<ConstantSDNode>(Op.getOperand(3))->getValue() != 0;
SDOperand Callee = Op.getOperand(4);
unsigned NumOps = (Op.getNumOperands() - 5) / 2;
-
+
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
- CCInfo.AnalyzeCallOperands(Op.Val, CC_X86_32_C);
+ if(CC==CallingConv::Fast && PerformTailCallOpt)
+ CCInfo.AnalyzeCallOperands(Op.Val, CC_X86_32_TailCall);
+ else
+ CCInfo.AnalyzeCallOperands(Op.Val, CC_X86_32_C);
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getNextStackOffset();
+ if (CC==CallingConv::Fast)
+ NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
Chain = DAG.getCALLSEQ_START(Chain,DAG.getConstant(NumBytes, getPointerTy()));
@@ -1023,19 +1087,21 @@ SDOperand X86TargetLowering::LowerCCCCallTo(SDOperand Op, SelectionDAG &DAG,
if (InFlag.Val)
Ops.push_back(InFlag);
-
- Chain = DAG.getNode(isTailCall ? X86ISD::TAILCALL : X86ISD::CALL,
- NodeTys, &Ops[0], Ops.size());
+
+ Chain = DAG.getNode(X86ISD::CALL, NodeTys, &Ops[0], Ops.size());
InFlag = Chain.getValue(1);
// Create the CALLSEQ_END node.
unsigned NumBytesForCalleeToPush = 0;
- if (CC == CallingConv::X86_StdCall) {
+ if (CC == CallingConv::X86_StdCall ||
+ (CC == CallingConv::Fast && PerformTailCallOpt)) {
if (isVarArg)
NumBytesForCalleeToPush = isSRet ? 4 : 0;
else
NumBytesForCalleeToPush = NumBytes;
+ assert(!(isVarArg && CC==CallingConv::Fast) &&
+ "CallingConv::Fast does not support varargs.");
} else {
// If this is is a call to a struct-return function, the callee
// pops the hidden struct pointer, so we have to push it back.
@@ -1132,7 +1198,8 @@ X86TargetLowering::LowerFastCCArguments(SDOperand Op, SelectionDAG &DAG) {
if (!Subtarget->isTargetCygMing() && !Subtarget->isTargetWindows()) {
// Make sure the instruction takes 8n+4 bytes to make sure the start of the
- // arguments and the arguments after the retaddr has been pushed are aligned.
+ // arguments and the arguments after the retaddr has been pushed are
+ // aligned.
if ((StackSize & 7) == 0)
StackSize += 4;
}
@@ -1194,7 +1261,8 @@ SDOperand X86TargetLowering::LowerFastCCCallTo(SDOperand Op, SelectionDAG &DAG,
if (!Subtarget->isTargetCygMing() && !Subtarget->isTargetWindows()) {
// Make sure the instruction takes 8n+4 bytes to make sure the start of the
- // arguments and the arguments after the retaddr has been pushed are aligned.
+ // arguments and the arguments after the retaddr has been pushed are
+ // aligned.
if ((NumBytes & 7) == 0)
NumBytes += 4;
}
@@ -1292,8 +1360,8 @@ SDOperand X86TargetLowering::LowerFastCCCallTo(SDOperand Op, SelectionDAG &DAG,
if (InFlag.Val)
Ops.push_back(InFlag);
- // FIXME: Do not generate X86ISD::TAILCALL for now.
- Chain = DAG.getNode(isTailCall ? X86ISD::TAILCALL : X86ISD::CALL,
+ assert(isTailCall==false && "no tail call here");
+ Chain = DAG.getNode(X86ISD::CALL,
NodeTys, &Ops[0], Ops.size());
InFlag = Chain.getValue(1);
@@ -1312,6 +1380,314 @@ SDOperand X86TargetLowering::LowerFastCCCallTo(SDOperand Op, SelectionDAG &DAG,
return SDOperand(LowerCallResult(Chain, InFlag, Op.Val, CC, DAG), Op.ResNo);
}
+//===----------------------------------------------------------------------===//
+// Fast Calling Convention (tail call) implementation
+//===----------------------------------------------------------------------===//
+
+// Like std call, callee cleans arguments, convention except that ECX is
+// reserved for storing the tail called function address. Only 2 registers are
+// free for argument passing (inreg). Tail call optimization is performed
+// provided:
+// * tailcallopt is enabled
+// * caller/callee are fastcc
+// * elf/pic is disabled OR
+// * elf/pic enabled + callee is in module + callee has
+// visibility protected or hidden
+// To ensure the stack is aligned according to platform abi pass
+// tail-call-align-stack. This makes sure that argument delta is always
+// multiples of stack alignment. (Dynamic linkers need this - darwin's dyld for
+// example)
+// If a tail called function callee has more arguments than the caller the
+// caller needs to make sure that there is room to move the RETADDR to. This is
+// achived by reserving an area the size of the argument delta right after the
+// original REtADDR, but before the saved framepointer or the spilled registers
+// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
+// stack layout:
+// arg1
+// arg2
+// RETADDR
+// [ new RETADDR
+// move area ]
+// (possible EBP)
+// ESI
+// EDI
+// local1 ..
+
+/// GetAlignedArgumentStackSize - Make the stack size align e.g 16n + 12 aligned
+/// for a 16 byte align requirement.
+unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
+ SelectionDAG& DAG) {
+ if (PerformTailCallOpt) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ const TargetMachine &TM = MF.getTarget();
+ const TargetFrameInfo &TFI = *TM.getFrameInfo();
+ unsigned StackAlignment = TFI.getStackAlignment();
+ uint64_t AlignMask = StackAlignment - 1;
+ int64_t Offset = StackSize;
+ unsigned SlotSize = Subtarget->is64Bit() ? 8 : 4;
+ if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
+ // Number smaller than 12 so just add the difference.
+ Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
+ } else {
+ // Mask out lower bits, add stackalignment once plus the 12 bytes.
+ Offset = ((~AlignMask) & Offset) + StackAlignment +
+ (StackAlignment-SlotSize);
+ }
+ StackSize = Offset;
+ }
+ return StackSize;
+}
+
+/// IsEligibleForTailCallElimination - Check to see whether the next instruction
+// following the call is a return. A function is eligible if caller/callee
+// calling conventions match, currently only fastcc supports tail calls, and the
+// function CALL is immediatly followed by a RET.
+bool X86TargetLowering::IsEligibleForTailCallOptimization(SDOperand Call,
+ SDOperand Ret,
+ SelectionDAG& DAG) const {
+ bool IsEligible = false;
+
+ // Check whether CALL node immediatly preceeds the RET node and whether the
+ // return uses the result of the node or is a void return.
+ if ((Ret.getNumOperands() == 1 &&
+ (Ret.getOperand(0)== SDOperand(Call.Val,1) ||
+ Ret.getOperand(0)== SDOperand(Call.Val,0))) ||
+ (Ret.getOperand(0)== SDOperand(Call.Val,Call.Val->getNumValues()-1) &&
+ Ret.getOperand(1)== SDOperand(Call.Val,0))) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ unsigned CallerCC = MF.getFunction()->getCallingConv();
+ unsigned CalleeCC = cast<ConstantSDNode>(Call.getOperand(1))->getValue();
+ if (CalleeCC == CallingConv::Fast && CallerCC == CalleeCC) {
+ SDOperand Callee = Call.getOperand(4);
+ // On elf/pic %ebx needs to be livein.
+ if(getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+ Subtarget->isPICStyleGOT()) {
+ // Can only do local tail calls with PIC.
+ GlobalValue * GV = 0;
+ GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
+ if(G != 0 &&
+ (GV = G->getGlobal()) &&
+ (GV->hasHiddenVisibility() || GV->hasProtectedVisibility()))
+ IsEligible=true;
+ } else {
+ IsEligible=true;
+ }
+ }
+ }
+ return IsEligible;
+}
+
+SDOperand X86TargetLowering::LowerX86_TailCallTo(SDOperand Op,
+ SelectionDAG &DAG,
+ unsigned CC) {
+ SDOperand Chain = Op.getOperand(0);
+ bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
+ bool isTailCall = cast<ConstantSDNode>(Op.getOperand(3))->getValue() != 0;
+ SDOperand Callee = Op.getOperand(4);
+ bool is64Bit = Subtarget->is64Bit();
+
+ assert(isTailCall && PerformTailCallOpt && "Should only emit tail calls.");
+
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
+ if (is64Bit)
+ CCInfo.AnalyzeCallOperands(Op.Val, CC_X86_64_TailCall);
+ else
+ CCInfo.AnalyzeCallOperands(Op.Val, CC_X86_32_TailCall);
+
+
+ // Lower arguments at fp - stackoffset + fpdiff.
+ MachineFunction &MF = DAG.getMachineFunction();
+
+ unsigned NumBytesToBePushed =
+ GetAlignedArgumentStackSize(CCInfo.getNextStackOffset(), DAG);
+
+ unsigned NumBytesCallerPushed =
+ MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn();
+ int FPDiff = NumBytesCallerPushed - NumBytesToBePushed;
+
+ // Set the delta of movement of the returnaddr stackslot.
+ // But only set if delta is greater than previous delta.
+ if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta()))
+ MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff);
+
+ // Adjust the ret address stack slot.
+ if (FPDiff) {
+ MVT::ValueType VT = is64Bit ? MVT::i64 : MVT::i32;
+ SDOperand RetAddrFrIdx = getReturnAddressFrameIndex(DAG);
+ RetAddrFrIdx =
+ DAG.getLoad(VT, DAG.getEntryNode(),RetAddrFrIdx, NULL, 0);
+ // Emit a store of the saved ret value to the new location.
+ int SlotSize = is64Bit ? 8 : 4;
+ int NewReturnAddrFI =
+ MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize);
+ SDOperand NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT);
+ Chain = DAG.getStore(Chain,RetAddrFrIdx, NewRetAddrFrIdx, NULL, 0);
+ }
+
+ Chain = DAG.
+ getCALLSEQ_START(Chain, DAG.getConstant(NumBytesToBePushed, getPointerTy()));
+
+ SmallVector<std::pair<unsigned, SDOperand>, 8> RegsToPass;
+ SmallVector<SDOperand, 8> MemOpChains;
+ SmallVector<SDOperand, 8> MemOpChains2;
+ SDOperand FramePtr, StackPtr;
+ SDOperand PtrOff;
+ SDOperand FIN;
+ int FI = 0;
+
+ // Walk the register/memloc assignments, inserting copies/loads. Lower
+ // arguments first to the stack slot where they would normally - in case of a
+ // normal function call - be.
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ SDOperand Arg = Op.getOperand(5+2*VA.getValNo());
+
+ // Promote the value if needed.
+ switch (VA.getLocInfo()) {
+ default: assert(0 && "Unknown loc info!");
+ case CCValAssign::Full: break;
+ case CCValAssign::SExt:
+ Arg = DAG.getNode(ISD::SIGN_EXTEND, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::ZExt:
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::AExt:
+ Arg = DAG.getNode(ISD::ANY_EXTEND, VA.getLocVT(), Arg);
+ break;
+ }
+
+ if (VA.isRegLoc()) {
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+ } else {
+ assert(VA.isMemLoc());
+ if (StackPtr.Val == 0)
+ StackPtr = DAG.getRegister(getStackPtrReg(), getPointerTy());
+
+ MemOpChains.push_back(LowerMemOpCallTo(Op, DAG, StackPtr, VA, Chain,
+ Arg));
+ }
+ }
+
+ if (!MemOpChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, MVT::Other,
+ &MemOpChains[0], MemOpChains.size());
+
+ // Build a sequence of copy-to-reg nodes chained together with token chain
+ // and flag operands which copy the outgoing args into registers.
+ SDOperand InFlag;
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+ Chain = DAG.getCopyToReg(Chain, RegsToPass[i].first, RegsToPass[i].second,
+ InFlag);
+ InFlag = Chain.getValue(1);
+ }
+ InFlag = SDOperand();
+ // Copy from stack slots to stack slot of a tail called function. This needs
+ // to be done because if we would lower the arguments directly to their real
+ // stack slot we might end up overwriting each other.
+ // TODO: To make this more efficient (sometimes saving a store/load) we could
+ // analyse the arguments and emit this store/load/store sequence only for
+ // arguments which would be overwritten otherwise.
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ if (!VA.isRegLoc()) {
+ SDOperand FlagsOp = Op.getOperand(6+2*VA.getValNo());
+ unsigned Flags = cast<ConstantSDNode>(FlagsOp)->getValue();
+
+ // Get source stack slot.
+ SDOperand PtrOff = DAG.getConstant(VA.getLocMemOffset(), getPointerTy());
+ PtrOff = DAG.getNode(ISD::ADD, getPointerTy(), StackPtr, PtrOff);
+ // Create frame index.
+ int32_t Offset = VA.getLocMemOffset()+FPDiff;
+ uint32_t OpSize = (MVT::getSizeInBits(VA.getLocVT())+7)/8;
+ FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset);
+ FIN = DAG.getFrameIndex(FI, MVT::i32);
+ if (Flags & ISD::ParamFlags::ByVal) {
+ // Copy relative to framepointer.
+ unsigned Align = 1 << ((Flags & ISD::ParamFlags::ByValAlign) >>
+ ISD::ParamFlags::ByValAlignOffs);
+
+ unsigned Size = (Flags & ISD::ParamFlags::ByValSize) >>
+ ISD::ParamFlags::ByValSizeOffs;
+
+ SDOperand AlignNode = DAG.getConstant(Align, MVT::i32);
+ SDOperand SizeNode = DAG.getConstant(Size, MVT::i32);
+ // Copy relative to framepointer.
+ MemOpChains2.push_back(DAG.getNode(ISD::MEMCPY, MVT::Other, Chain, FIN,
+ PtrOff, SizeNode, AlignNode));
+ } else {
+ SDOperand LoadedArg = DAG.getLoad(VA.getValVT(), Chain, PtrOff, NULL,0);
+ // Store relative to framepointer.
+ MemOpChains2.push_back(DAG.getStore(Chain, LoadedArg, FIN, NULL, 0));
+ }
+ }
+ }
+
+ if (!MemOpChains2.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, MVT::Other,
+ &MemOpChains2[0], MemOpChains.size());
+
+ // ELF / PIC requires GOT in the EBX register before function calls via PLT
+ // GOT pointer.
+ // Does not work with tail call since ebx is not restored correctly by
+ // tailcaller. TODO: at least for x86 - verify for x86-64
+
+ // If the callee is a GlobalAddress node (quite common, every direct call is)
+ // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ // We should use extra load for direct calls to dllimported functions in
+ // non-JIT mode.
+ if (!Subtarget->GVRequiresExtraLoad(G->getGlobal(),
+ getTargetMachine(), true))
+ Callee = DAG.getTargetGlobalAddress(G->getGlobal(), getPointerTy());
+ } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee))
+ Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy());
+ else {
+ assert(Callee.getOpcode() == ISD::LOAD &&
+ "Function destination must be loaded into virtual register");
+ unsigned Opc = is64Bit ? X86::R9 : X86::ECX;
+
+ Chain = DAG.getCopyToReg(Chain,
+ DAG.getRegister(Opc, getPointerTy()) ,
+ Callee,InFlag);
+ Callee = DAG.getRegister(Opc, getPointerTy());
+ // Add register as live out.
+ DAG.getMachineFunction().addLiveOut(Opc);
+ }
+
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+ SmallVector<SDOperand, 8> Ops;
+
+ Ops.push_back(Chain);
+ Ops.push_back(DAG.getConstant(NumBytesToBePushed, getPointerTy()));
+ Ops.push_back(DAG.getConstant(0, getPointerTy()));
+ if (InFlag.Val)
+ Ops.push_back(InFlag);
+ Chain = DAG.getNode(ISD::CALLSEQ_END, NodeTys, &Ops[0], Ops.size());
+ InFlag = Chain.getValue(1);
+
+ // Returns a chain & a flag for retval copy to use.
+ NodeTys = DAG.getVTList(MVT::Other, MVT::Flag);
+ Ops.clear();
+ Ops.push_back(Chain);
+ Ops.push_back(Callee);
+ Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));
+ // Add argument registers to the end of the list so that they are known live
+ // into the call.
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+ Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+ RegsToPass[i].second.getValueType()));
+ if (InFlag.Val)
+ Ops.push_back(InFlag);
+ assert(InFlag.Val &&
+ "Flag must be set. Depend on flag being set in LowerRET");
+ Chain = DAG.getNode(X86ISD::TAILCALL,
+ Op.Val->getVTList(), &Ops[0], Ops.size());
+
+ return SDOperand(Chain.Val, Op.ResNo);
+}
//===----------------------------------------------------------------------===//
// X86-64 C Calling Convention implementation
@@ -1323,6 +1699,7 @@ X86TargetLowering::LowerX86_64CCCArguments(SDOperand Op, SelectionDAG &DAG) {
MachineFrameInfo *MFI = MF.getFrameInfo();
SDOperand Root = Op.getOperand(0);
bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
+ unsigned CC= MF.getFunction()->getCallingConv();
static const unsigned GPR64ArgRegs[] = {
X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
@@ -1335,9 +1712,12 @@ X86TargetLowering::LowerX86_64CCCArguments(SDOperand Op, SelectionDAG &DAG) {
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(MF.getFunction()->getCallingConv(), isVarArg,
+ CCState CCInfo(CC, isVarArg,
getTargetMachine(), ArgLocs);
- CCInfo.AnalyzeFormalArguments(Op.Val, CC_X86_64_C);
+ if (CC == CallingConv::Fast && PerformTailCallOpt)
+ CCInfo.AnalyzeFormalArguments(Op.Val, CC_X86_64_TailCall);
+ else
+ CCInfo.AnalyzeFormalArguments(Op.Val, CC_X86_64_C);
SmallVector<SDOperand, 8> ArgValues;
unsigned LastVal = ~0U;
@@ -1398,10 +1778,14 @@ X86TargetLowering::LowerX86_64CCCArguments(SDOperand Op, SelectionDAG &DAG) {
}
unsigned StackSize = CCInfo.getNextStackOffset();
+ if (CC==CallingConv::Fast)
+ StackSize =GetAlignedArgumentStackSize(StackSize, DAG);
// If the function takes variable number of arguments, make a frame index for
// the start of the first vararg value... for expansion of llvm.va_start.
if (isVarArg) {
+ assert(CC!=CallingConv::Fast
+ && "Var arg not supported with calling convention fastcc");
unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs, 6);
unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
@@ -1446,10 +1830,14 @@ X86TargetLowering::LowerX86_64CCCArguments(SDOperand Op, SelectionDAG &DAG) {
}
ArgValues.push_back(Root);
-
- BytesToPopOnReturn = 0; // Callee pops nothing.
- BytesCallerReserves = StackSize;
-
+ // Tail call convention (fastcc) needs callee pop.
+ if (CC == CallingConv::Fast && PerformTailCallOpt){
+ BytesToPopOnReturn = StackSize; // Callee pops everything.
+ BytesCallerReserves = 0;
+ } else {
+ BytesToPopOnReturn = 0; // Callee pops nothing.
+ BytesCallerReserves = StackSize;
+ }
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
FuncInfo->setBytesToPopOnReturn(BytesToPopOnReturn);
@@ -1463,16 +1851,21 @@ X86TargetLowering::LowerX86_64CCCCallTo(SDOperand Op, SelectionDAG &DAG,
unsigned CC) {
SDOperand Chain = Op.getOperand(0);
bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
- bool isTailCall = cast<ConstantSDNode>(Op.getOperand(3))->getValue() != 0;
SDOperand Callee = Op.getOperand(4);
<