aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorArnold Schwaighofer <arnold.schwaighofer@gmail.com>2007-10-11 19:40:01 +0000
committerArnold Schwaighofer <arnold.schwaighofer@gmail.com>2007-10-11 19:40:01 +0000
commitc85e1716f0e45e4c18a9ef2fbe431a51ac3a4252 (patch)
treee6e745a81699085d757db166e645f8a6639cde7a
parent68c8411ef591818d89c6a81098a328fcd75be72a (diff)
Added tail call optimization to the x86 back end. It can be
enabled by passing -tailcallopt to llc. The optimization is performed if the following conditions are satisfied: * caller/callee are fastcc * elf/pic is disabled OR elf/pic enabled + callee is in module + callee has visibility protected or hidden git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@42870 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r--include/llvm/Target/TargetLowering.h9
-rw-r--r--include/llvm/Target/TargetOptions.h5
-rw-r--r--lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp48
-rw-r--r--lib/Target/TargetMachine.cpp7
-rw-r--r--lib/Target/X86/README.txt80
-rw-r--r--lib/Target/X86/X86CallingConv.td50
-rw-r--r--lib/Target/X86/X86ISelLowering.cpp511
-rw-r--r--lib/Target/X86/X86ISelLowering.h24
-rw-r--r--lib/Target/X86/X86InstrInfo.cpp2
-rw-r--r--lib/Target/X86/X86InstrInfo.td43
-rw-r--r--lib/Target/X86/X86InstrX86-64.td35
-rw-r--r--lib/Target/X86/X86MachineFunctionInfo.h15
-rw-r--r--lib/Target/X86/X86RegisterInfo.cpp129
-rw-r--r--test/CodeGen/X86/tailcall1.ll11
-rw-r--r--test/CodeGen/X86/tailcallpic1.ll12
-rw-r--r--test/CodeGen/X86/tailcallpic2.ll12
16 files changed, 928 insertions, 65 deletions
diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h
index 8586d7f091..1352eaff73 100644
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@@ -860,6 +860,15 @@ public:
/// implement this. The default implementation of this aborts.
virtual SDOperand LowerOperation(SDOperand Op, SelectionDAG &DAG);
+ /// IsEligibleForTailCallOptimization - Check whether the call is eligible for
+ /// tail call optimization. Target which want to do tail call optimization
+ /// should implement this function.
+ virtual bool IsEligibleForTailCallOptimization(SDOperand Call,
+ SDOperand Ret,
+ SelectionDAG &DAG) const {
+ return false;
+ }
+
/// CustomPromoteOperation - This callback is invoked for operations that are
/// unsupported by the target, are registered to use 'custom' lowering, and
/// whose type needs to be promoted.
diff --git a/include/llvm/Target/TargetOptions.h b/include/llvm/Target/TargetOptions.h
index 7421f96b7b..dd54432442 100644
--- a/include/llvm/Target/TargetOptions.h
+++ b/include/llvm/Target/TargetOptions.h
@@ -73,6 +73,11 @@ namespace llvm {
/// ExceptionHandling - This flag indicates that exception information should
/// be emitted.
extern bool ExceptionHandling;
+
+ /// PerformTailCallOpt - This flag is enabled when the -tailcallopt is
+ /// specified on the commandline. When the flag is on, the target will perform
+ /// tail call optimization (pop the caller's stack) providing it supports it.
+ extern bool PerformTailCallOpt;
} // End llvm namespace
#endif
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 5f321654d8..b1bf475c47 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -4444,6 +4444,48 @@ static void copyCatchInfo(BasicBlock *SrcBB, BasicBlock *DestBB,
}
}
+/// CheckDAGForTailCallsAndFixThem - This Function looks for CALL nodes in the
+/// DAG and fixes their tailcall attribute operand
+static void CheckDAGForTailCallsAndFixThem(SelectionDAG &DAG,
+ TargetLowering& TLI) {
+ SDNode * Ret = NULL;
+ SDOperand Terminator = DAG.getRoot();
+
+ // Find RET node.
+ if (Terminator.getOpcode() == ISD::RET) {
+ Ret = Terminator.Val;
+ }
+
+ // Fix tail call attribute of CALL nodes.
+ for (SelectionDAG::allnodes_iterator BE = DAG.allnodes_begin(),
+ BI = prior(DAG.allnodes_end()); BI != BE; --BI) {
+ if (BI->getOpcode() == ISD::CALL) {
+ SDOperand OpRet(Ret, 0);
+ SDOperand OpCall(static_cast<SDNode*>(BI), 0);
+ bool isMarkedTailCall =
+ cast<ConstantSDNode>(OpCall.getOperand(3))->getValue() != 0;
+ // If CALL node has tail call attribute set to true and the call is not
+ // eligible (no RET or the target rejects) the attribute is fixed to
+ // false. The TargetLowering::IsEligibleForTailCallOptimization function
+ // must correctly identify tail call optimizable calls.
+ if (isMarkedTailCall &&
+ (Ret==NULL ||
+ !TLI.IsEligibleForTailCallOptimization(OpCall, OpRet, DAG))) {
+ SmallVector<SDOperand, 32> Ops;
+ unsigned idx=0;
+ for(SDNode::op_iterator I =OpCall.Val->op_begin(),
+ E=OpCall.Val->op_end(); I!=E; I++, idx++) {
+ if (idx!=3)
+ Ops.push_back(*I);
+ else
+ Ops.push_back(DAG.getConstant(false, TLI.getPointerTy()));
+ }
+ DAG.UpdateNodeOperands(OpCall, Ops.begin(), Ops.size());
+ }
+ }
+ }
+}
+
void SelectionDAGISel::BuildSelectionDAG(SelectionDAG &DAG, BasicBlock *LLVMBB,
std::vector<std::pair<MachineInstr*, unsigned> > &PHINodesToUpdate,
FunctionLoweringInfo &FuncInfo) {
@@ -4621,6 +4663,12 @@ void SelectionDAGISel::BuildSelectionDAG(SelectionDAG &DAG, BasicBlock *LLVMBB,
// Make sure the root of the DAG is up-to-date.
DAG.setRoot(SDL.getRoot());
+
+ // Check whether calls in this block are real tail calls. Fix up CALL nodes
+ // with correct tailcall attribute so that the target can rely on the tailcall
+ // attribute indicating whether the call is really eligible for tail call
+ // optimization.
+ CheckDAGForTailCallsAndFixThem(DAG, TLI);
}
void SelectionDAGISel::CodeGenAndEmitDAG(SelectionDAG &DAG) {
diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp
index 6c00a3f492..9caea11dd3 100644
--- a/lib/Target/TargetMachine.cpp
+++ b/lib/Target/TargetMachine.cpp
@@ -33,6 +33,7 @@ namespace llvm {
bool ExceptionHandling;
Reloc::Model RelocationModel;
CodeModel::Model CMModel;
+ bool PerformTailCallOpt;
}
namespace {
cl::opt<bool, true> PrintCode("print-machineinstrs",
@@ -116,6 +117,12 @@ namespace {
clEnumValN(CodeModel::Large, "large",
" Large code model"),
clEnumValEnd));
+
+ cl::opt<bool, true>
+ EnablePerformTailCallOpt("tailcallopt",
+ cl::desc("Turn on tail call optimization."),
+ cl::location(PerformTailCallOpt),
+ cl::init(false));
}
//---------------------------------------------------------------------------
diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt
index 9bafff73d5..0d4dce32d8 100644
--- a/lib/Target/X86/README.txt
+++ b/lib/Target/X86/README.txt
@@ -1368,3 +1368,83 @@ L7:
L5:
//===---------------------------------------------------------------------===//
+Tail call optimization improvements: Tail call optimization currently
+pushes all arguments on the top of the stack (their normal place if
+that was a not tail call optimized functiong call ) before moving them
+to actual stack slot. this is done to prevent overwriting of paramters
+(see example below) that might be used, since the arguments of the
+callee overwrites callers arguments.
+
+ example:
+
+int callee(int32, int64);
+int caller(int32 arg1, int32 arg2) {
+ int64 local = arg2 * 2;
+ return callee(arg2, (int64)local);
+}
+
+[arg1] [!arg2 no longer valid since we moved local onto it]
+[arg2] -> [(int64)
+[RETADDR] local ]
+
+moving arg1 onto the stack slot of callee function would overwrite
+arg2 of the caller.
+
+Possible optimizations:
+
+ - only push those arguments to the top of the stack that are actual
+ parameters of the caller function and have no local value in the
+ caller
+
+ in above example local does not need to be pushed onto the top of
+ the stack as it is definitetly not a caller's function parameter
+
+ - analyse the actual parameters of the callee to see which would
+ overwrite a caller paramter which is used by the callee and only
+ push them onto the top of the stack
+
+ int callee (int32 arg1, int32 arg2);
+ int caller (int32 arg1, int32 arg2) {
+ return callee(arg1,arg2);
+ }
+
+ here we don't need to write any variables to the top of the stack
+ since they don't overwrite each other
+
+ int callee (int32 arg1, int32 arg2);
+ int caller (int32 arg1, int32 arg2) {
+ return callee(arg2,arg1);
+ }
+
+ here we need to push the arguments because they overwrite each other
+
+
+ code for lowering directly onto callers arguments:
++ SmallVector<std::pair<unsigned, SDOperand>, 8> RegsToPass;
++ SmallVector<SDOperand, 8> MemOpChains;
++
++ SDOperand FramePtr;
++ SDOperand PtrOff;
++ SDOperand FIN;
++ int FI = 0;
++ // Walk the register/memloc assignments, inserting copies/loads.
++ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
++ CCValAssign &VA = ArgLocs[i];
++ SDOperand Arg = Op.getOperand(5+2*VA.getValNo());
++
++ ....
++
++ if (VA.isRegLoc()) {
++ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
++ } else {
++ assert(VA.isMemLoc());
++ // create frame index
++ int32_t Offset = VA.getLocMemOffset()+FPDiff;
++ uint32_t OpSize = (MVT::getSizeInBits(VA.getLocVT())+7)/8;
++ FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset);
++ FIN = DAG.getFrameIndex(FI, MVT::i32);
++ // store relative to framepointer
++ MemOpChains.push_back(DAG.getStore(Chain, Arg, FIN, NULL, 0));
++ }
++ }
+//===---------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td
index 9c2d95a199..f23e580656 100644
--- a/lib/Target/X86/X86CallingConv.td
+++ b/lib/Target/X86/X86CallingConv.td
@@ -127,6 +127,40 @@ def CC_X86_64_C : CallingConv<[
CCIfType<[v8i8, v4i16, v2i32, v1i64], CCAssignToStack<8, 8>>
]>;
+// tail call convetion (fast) one register is reserved for target address
+// namely R9
+def CC_X86_64_TailCall : CallingConv<[
+ // Promote i8/i16 arguments to i32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ CCIfStruct<CCStructAssign<[RDI, RSI, RDX, RCX, R8]>>,
+
+ // The first 6 integer arguments are passed in integer registers.
+ CCIfType<[i32], CCAssignToReg<[EDI, ESI, EDX, ECX, R8D]>>,
+ CCIfType<[i64], CCAssignToReg<[RDI, RSI, RDX, RCX, R8]>>,
+
+ // The first 8 FP/Vector arguments are passed in XMM registers.
+ CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+ CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>,
+
+ // The first 8 MMX vector arguments are passed in GPRs.
+ CCIfType<[v8i8, v4i16, v2i32, v1i64],
+ CCAssignToReg<[RDI, RSI, RDX, RCX, R8]>>,
+
+ // The 'nest' parameter, if any, is passed in R10.
+ CCIfNest<CCAssignToReg<[R10]>>,
+
+ // Integer/FP values get stored in stack slots that are 8 bytes in size and
+ // 8-byte aligned if there are no more registers to hold them.
+ CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>,
+
+ // Vectors get 16-byte stack slots that are 16-byte aligned.
+ CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>,
+
+ // __m64 vectors get 8-byte stack slots that are 8-byte aligned.
+ CCIfType<[v8i8, v4i16, v2i32, v1i64], CCAssignToStack<8, 8>>
+]>;
+
//===----------------------------------------------------------------------===//
// X86 C Calling Convention
@@ -173,6 +207,22 @@ def CC_X86_32_C : CallingConv<[
CCDelegateTo<CC_X86_32_Common>
]>;
+/// Same as C calling convention up to nonfree ECX which is used for storing
+/// potential pointer to tail called function
+def CC_X86_32_TailCall : CallingConv<[
+ // Promote i8/i16 arguments to i32.
+ CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+ // The 'nest' parameter, if any, is passed in ECX.
+ CCIfNest<CCAssignToReg<[ECX]>>,
+
+ // The first 3 integer arguments, if marked 'inreg' and if the call is not
+ // a vararg call, are passed in integer registers.
+ CCIfNotVarArg<CCIfInReg<CCIfType<[i32], CCAssignToReg<[EAX, EDX]>>>>,
+
+ // Otherwise, same as everything else.
+ CCDelegateTo<CC_X86_32_Common>
+]>;
def CC_X86_32_FastCall : CallingConv<[
// Promote i8/i16 arguments to i32.
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 1917a6a291..8767d8d33b 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -32,6 +32,8 @@
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SSARegMap.h"
#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Target/TargetOptions.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ParameterAttributes.h"
@@ -43,6 +45,7 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM)
X86ScalarSSEf64 = Subtarget->hasSSE2();
X86ScalarSSEf32 = Subtarget->hasSSE1();
X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;
+
RegInfo = TM.getRegisterInfo();
@@ -641,6 +644,19 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM)
//===----------------------------------------------------------------------===//
#include "X86GenCallingConv.inc"
+
+/// GetPossiblePreceedingTailCall - Get preceeding X86ISD::TAILCALL node if it
+/// exists skip possible ISD:TokenFactor.
+static SDOperand GetPossiblePreceedingTailCall(SDOperand Chain) {
+ if (Chain.getOpcode()==X86ISD::TAILCALL) {
+ return Chain;
+ } else if (Chain.getOpcode()==ISD::TokenFactor) {
+ if (Chain.getNumOperands() &&
+ Chain.getOperand(0).getOpcode()==X86ISD::TAILCALL)
+ return Chain.getOperand(0);
+ }
+ return Chain;
+}
/// LowerRET - Lower an ISD::RET node.
SDOperand X86TargetLowering::LowerRET(SDOperand Op, SelectionDAG &DAG) {
@@ -651,8 +667,7 @@ SDOperand X86TargetLowering::LowerRET(SDOperand Op, SelectionDAG &DAG) {
bool isVarArg = DAG.getMachineFunction().getFunction()->isVarArg();
CCState CCInfo(CC, isVarArg, getTargetMachine(), RVLocs);
CCInfo.AnalyzeReturn(Op.Val, RetCC_X86);
-
-
+
// If this is the first return lowered for this function, add the regs to the
// liveout set for the function.
if (DAG.getMachineFunction().liveout_empty()) {
@@ -660,10 +675,38 @@ SDOperand X86TargetLowering::LowerRET(SDOperand Op, SelectionDAG &DAG) {
if (RVLocs[i].isRegLoc())
DAG.getMachineFunction().addLiveOut(RVLocs[i].getLocReg());
}
-
SDOperand Chain = Op.getOperand(0);
- SDOperand Flag;
+ // Handle tail call return.
+ Chain = GetPossiblePreceedingTailCall(Chain);
+ if (Chain.getOpcode() == X86ISD::TAILCALL) {
+ SDOperand TailCall = Chain;
+ SDOperand TargetAddress = TailCall.getOperand(1);
+ SDOperand StackAdjustment = TailCall.getOperand(2);
+ assert ( ((TargetAddress.getOpcode() == ISD::Register &&
+ (cast<RegisterSDNode>(TargetAddress)->getReg() == X86::ECX ||
+ cast<RegisterSDNode>(TargetAddress)->getReg() == X86::R9)) ||
+ TargetAddress.getOpcode() == ISD::TargetExternalSymbol ||
+ TargetAddress.getOpcode() == ISD::TargetGlobalAddress) &&
+ "Expecting an global address, external symbol, or register");
+ assert( StackAdjustment.getOpcode() == ISD::Constant &&
+ "Expecting a const value");
+
+ SmallVector<SDOperand,8> Operands;
+ Operands.push_back(Chain.getOperand(0));
+ Operands.push_back(TargetAddress);
+ Operands.push_back(StackAdjustment);
+ // Copy registers used by the call. Last operand is a flag so it is not
+ // copied.
+ for(unsigned i=3; i < TailCall.getNumOperands()-1;i++) {
+ Operands.push_back(Chain.getOperand(i));
+ }
+ return DAG.getNode(X86ISD::TC_RETURN, MVT::Other, &Operands[0], Operands.size());
+ }
+
+ // Regular return.
+ SDOperand Flag;
+
// Copy the result values into the output registers.
if (RVLocs.size() != 1 || !RVLocs[0].isRegLoc() ||
RVLocs[0].getLocReg() != X86::ST0) {
@@ -684,7 +727,7 @@ SDOperand X86TargetLowering::LowerRET(SDOperand Op, SelectionDAG &DAG) {
if ((X86ScalarSSEf32 && RVLocs[0].getValVT()==MVT::f32) ||
(X86ScalarSSEf64 && RVLocs[0].getValVT()==MVT::f64)) {
SDOperand MemLoc;
-
+
// If this is a load into a scalarsse value, don't store the loaded value
// back to the stack, only to reload it: just replace the scalar-sse load.
if (ISD::isNON_EXTLoad(Value.Val) &&
@@ -784,12 +827,14 @@ LowerCallResult(SDOperand Chain, SDOperand InFlag, SDNode *TheCall,
//===----------------------------------------------------------------------===//
-// C & StdCall Calling Convention implementation
+// C & StdCall & Fast Calling Convention implementation
//===----------------------------------------------------------------------===//
// StdCall calling convention seems to be standard for many Windows' API
// routines and around. It differs from C calling convention just a little:
// callee should clean up the stack, not caller. Symbols should be also
// decorated in some fancy way :) It doesn't support any vector arguments.
+// For info on fast calling convention see Fast Calling Convention (tail call)
+// implementation LowerX86_32FastCCCallTo.
/// AddLiveIn - This helper function adds the specified physical register to the
/// MachineFunction as a live in value. It also creates a corresponding virtual
@@ -802,6 +847,9 @@ static unsigned AddLiveIn(MachineFunction &MF, unsigned PReg,
return VReg;
}
+// align stack arguments according to platform alignment needed for tail calls
+unsigned GetAlignedArgumentStackSize(unsigned StackSize, SelectionDAG& DAG);
+
SDOperand X86TargetLowering::LowerMemArgument(SDOperand Op, SelectionDAG &DAG,
const CCValAssign &VA,
MachineFrameInfo *MFI,
@@ -826,13 +874,17 @@ SDOperand X86TargetLowering::LowerCCCArguments(SDOperand Op, SelectionDAG &DAG,
MachineFrameInfo *MFI = MF.getFrameInfo();
SDOperand Root = Op.getOperand(0);
bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
-
+ unsigned CC = MF.getFunction()->getCallingConv();
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(MF.getFunction()->getCallingConv(), isVarArg,
+ CCState CCInfo(CC, isVarArg,
getTargetMachine(), ArgLocs);
- CCInfo.AnalyzeFormalArguments(Op.Val, CC_X86_32_C);
-
+ // Check for possible tail call calling convention.
+ if (CC == CallingConv::Fast && PerformTailCallOpt)
+ CCInfo.AnalyzeFormalArguments(Op.Val, CC_X86_32_TailCall);
+ else
+ CCInfo.AnalyzeFormalArguments(Op.Val, CC_X86_32_C);
+
SmallVector<SDOperand, 8> ArgValues;
unsigned LastVal = ~0U;
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
@@ -877,6 +929,9 @@ SDOperand X86TargetLowering::LowerCCCArguments(SDOperand Op, SelectionDAG &DAG,
}
unsigned StackSize = CCInfo.getNextStackOffset();
+ // align stack specially for tail calls
+ if (CC==CallingConv::Fast)
+ StackSize = GetAlignedArgumentStackSize(StackSize,DAG);
ArgValues.push_back(Root);
@@ -885,7 +940,12 @@ SDOperand X86TargetLowering::LowerCCCArguments(SDOperand Op, SelectionDAG &DAG,
if (isVarArg)
VarArgsFrameIndex = MFI->CreateFixedObject(1, StackSize);
- if (isStdCall && !isVarArg) {
+ // Tail call calling convention (CallingConv::Fast) does not support varargs.
+ assert( !(isVarArg && CC == CallingConv::Fast) &&
+ "CallingConv::Fast does not support varargs.");
+
+ if (isStdCall && !isVarArg &&
+ (CC==CallingConv::Fast && PerformTailCallOpt || CC!=CallingConv::Fast)) {
BytesToPopOnReturn = StackSize; // Callee pops everything..
BytesCallerReserves = 0;
} else {
@@ -914,17 +974,21 @@ SDOperand X86TargetLowering::LowerCCCCallTo(SDOperand Op, SelectionDAG &DAG,
unsigned CC) {
SDOperand Chain = Op.getOperand(0);
bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
- bool isTailCall = cast<ConstantSDNode>(Op.getOperand(3))->getValue() != 0;
SDOperand Callee = Op.getOperand(4);
unsigned NumOps = (Op.getNumOperands() - 5) / 2;
-
+
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
- CCInfo.AnalyzeCallOperands(Op.Val, CC_X86_32_C);
+ if(CC==CallingConv::Fast && PerformTailCallOpt)
+ CCInfo.AnalyzeCallOperands(Op.Val, CC_X86_32_TailCall);
+ else
+ CCInfo.AnalyzeCallOperands(Op.Val, CC_X86_32_C);
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getNextStackOffset();
+ if (CC==CallingConv::Fast)
+ NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
Chain = DAG.getCALLSEQ_START(Chain,DAG.getConstant(NumBytes, getPointerTy()));
@@ -1023,19 +1087,21 @@ SDOperand X86TargetLowering::LowerCCCCallTo(SDOperand Op, SelectionDAG &DAG,
if (InFlag.Val)
Ops.push_back(InFlag);
-
- Chain = DAG.getNode(isTailCall ? X86ISD::TAILCALL : X86ISD::CALL,
- NodeTys, &Ops[0], Ops.size());
+
+ Chain = DAG.getNode(X86ISD::CALL, NodeTys, &Ops[0], Ops.size());
InFlag = Chain.getValue(1);
// Create the CALLSEQ_END node.
unsigned NumBytesForCalleeToPush = 0;
- if (CC == CallingConv::X86_StdCall) {
+ if (CC == CallingConv::X86_StdCall ||
+ (CC == CallingConv::Fast && PerformTailCallOpt)) {
if (isVarArg)
NumBytesForCalleeToPush = isSRet ? 4 : 0;
else
NumBytesForCalleeToPush = NumBytes;
+ assert(!(isVarArg && CC==CallingConv::Fast) &&
+ "CallingConv::Fast does not support varargs.");
} else {
// If this is is a call to a struct-return function, the callee
// pops the hidden struct pointer, so we have to push it back.
@@ -1132,7 +1198,8 @@ X86TargetLowering::LowerFastCCArguments(SDOperand Op, SelectionDAG &DAG) {
if (!Subtarget->isTargetCygMing() && !Subtarget->isTargetWindows()) {
// Make sure the instruction takes 8n+4 bytes to make sure the start of the
- // arguments and the arguments after the retaddr has been pushed are aligned.
+ // arguments and the arguments after the retaddr has been pushed are
+ // aligned.
if ((StackSize & 7) == 0)
StackSize += 4;
}
@@ -1194,7 +1261,8 @@ SDOperand X86TargetLowering::LowerFastCCCallTo(SDOperand Op, SelectionDAG &DAG,
if (!Subtarget->isTargetCygMing() && !Subtarget->isTargetWindows()) {
// Make sure the instruction takes 8n+4 bytes to make sure the start of the
- // arguments and the arguments after the retaddr has been pushed are aligned.
+ // arguments and the arguments after the retaddr has been pushed are
+ // aligned.
if ((NumBytes & 7) == 0)
NumBytes += 4;
}
@@ -1292,8 +1360,8 @@ SDOperand X86TargetLowering::LowerFastCCCallTo(SDOperand Op, SelectionDAG &DAG,
if (InFlag.Val)
Ops.push_back(InFlag);
- // FIXME: Do not generate X86ISD::TAILCALL for now.
- Chain = DAG.getNode(isTailCall ? X86ISD::TAILCALL : X86ISD::CALL,
+ assert(isTailCall==false && "no tail call here");
+ Chain = DAG.getNode(X86ISD::CALL,
NodeTys, &Ops[0], Ops.size());
InFlag = Chain.getValue(1);
@@ -1312,6 +1380,314 @@ SDOperand X86TargetLowering::LowerFastCCCallTo(SDOperand Op, SelectionDAG &DAG,
return SDOperand(LowerCallResult(Chain, InFlag, Op.Val, CC, DAG), Op.ResNo);
}
+//===----------------------------------------------------------------------===//
+// Fast Calling Convention (tail call) implementation
+//===----------------------------------------------------------------------===//
+
+// Like std call, callee cleans arguments, convention except that ECX is
+// reserved for storing the tail called function address. Only 2 registers are
+// free for argument passing (inreg). Tail call optimization is performed
+// provided:
+// * tailcallopt is enabled
+// * caller/callee are fastcc
+// * elf/pic is disabled OR
+// * elf/pic enabled + callee is in module + callee has
+// visibility protected or hidden
+// To ensure the stack is aligned according to platform abi pass
+// tail-call-align-stack. This makes sure that argument delta is always
+// multiples of stack alignment. (Dynamic linkers need this - darwin's dyld for
+// example)
+// If a tail called function callee has more arguments than the caller the
+// caller needs to make sure that there is room to move the RETADDR to. This is
+// achived by reserving an area the size of the argument delta right after the
+// original REtADDR, but before the saved framepointer or the spilled registers
+// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
+// stack layout:
+// arg1
+// arg2
+// RETADDR
+// [ new RETADDR
+// move area ]
+// (possible EBP)
+// ESI
+// EDI
+// local1 ..
+
+/// GetAlignedArgumentStackSize - Make the stack size align e.g 16n + 12 aligned
+/// for a 16 byte align requirement.
+unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
+ SelectionDAG& DAG) {
+ if (PerformTailCallOpt) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ const TargetMachine &TM = MF.getTarget();
+ const TargetFrameInfo &TFI = *TM.getFrameInfo();
+ unsigned StackAlignment = TFI.getStackAlignment();
+ uint64_t AlignMask = StackAlignment - 1;
+ int64_t Offset = StackSize;
+ unsigned SlotSize = Subtarget->is64Bit() ? 8 : 4;
+ if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
+ // Number smaller than 12 so just add the difference.
+ Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
+ } else {
+ // Mask out lower bits, add stackalignment once plus the 12 bytes.
+ Offset = ((~AlignMask) & Offset) + StackAlignment +
+ (StackAlignment-SlotSize);
+ }
+ StackSize = Offset;
+ }
+ return StackSize;
+}
+
+/// IsEligibleForTailCallElimination - Check to see whether the next instruction
+// following the call is a return. A function is eligible if caller/callee
+// calling conventions match, currently only fastcc supports tail calls, and the
+// function CALL is immediatly followed by a RET.
+bool X86TargetLowering::IsEligibleForTailCallOptimization(SDOperand Call,
+ SDOperand Ret,
+ SelectionDAG& DAG) const {
+ bool IsEligible = false;
+
+ // Check whether CALL node immediatly preceeds the RET node and whether the
+ // return uses the result of the node or is a void return.
+ if ((Ret.getNumOperands() == 1 &&
+ (Ret.getOperand(0)== SDOperand(Call.Val,1) ||
+ Ret.getOperand(0)== SDOperand(Call.Val,0))) ||
+ (Ret.getOperand(0)== SDOperand(Call.Val,Call.Val->getNumValues()-1) &&
+ Ret.getOperand(1)== SDOperand(Call.Val,0))) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ unsigned CallerCC = MF.getFunction()->getCallingConv();
+ unsigned CalleeCC = cast<ConstantSDNode>(Call.getOperand(1))->getValue();
+ if (CalleeCC == CallingConv::Fast && CallerCC == CalleeCC) {
+ SDOperand Callee = Call.getOperand(4);
+ // On elf/pic %ebx needs to be livein.
+ if(getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+ Subtarget->isPICStyleGOT()) {
+ // Can only do local tail calls with PIC.
+ GlobalValue * GV = 0;
+ GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
+ if(G != 0 &&
+ (GV = G->getGlobal()) &&
+ (GV->hasHiddenVisibility() || GV->hasProtectedVisibility()))
+ IsEligible=true;
+ } else {
+ IsEligible=true;
+ }
+ }
+ }
+ return IsEligible;
+}
+
+SDOperand X86TargetLowering::LowerX86_TailCallTo(SDOperand Op,
+ SelectionDAG &DAG,
+ unsigned CC) {
+ SDOperand Chain = Op.getOperand(0);
+ bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
+ bool isTailCall = cast<ConstantSDNode>(Op.getOperand(3))->getValue() != 0;
+ SDOperand Callee = Op.getOperand(4);
+ bool is64Bit = Subtarget->is64Bit();
+
+ assert(isTailCall && PerformTailCallOpt && "Should only emit tail calls.");
+
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
+ if (is64Bit)
+ CCInfo.AnalyzeCallOperands(Op.Val, CC_X86_64_TailCall);
+ else
+ CCInfo.AnalyzeCallOperands(Op.Val, CC_X86_32_TailCall);
+
+
+ // Lower arguments at fp - stackoffset + fpdiff.
+ MachineFunction &MF = DAG.getMachineFunction();
+
+ unsigned NumBytesToBePushed =
+ GetAlignedArgumentStackSize(CCInfo.getNextStackOffset(), DAG);
+
+ unsigned NumBytesCallerPushed =
+ MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn();
+ int FPDiff = NumBytesCallerPushed - NumBytesToBePushed;
+
+ // Set the delta of movement of the returnaddr stackslot.
+ // But only set if delta is greater than previous delta.
+ if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta()))
+ MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff);
+
+ // Adjust the ret address stack slot.
+ if (FPDiff) {
+ MVT::ValueType VT = is64Bit ? MVT::i64 : MVT::i32;
+ SDOperand RetAddrFrIdx = getReturnAddressFrameIndex(DAG);
+ RetAddrFrIdx =
+ DAG.getLoad(VT, DAG.getEntryNode(),RetAddrFrIdx, NULL, 0);
+ // Emit a store of the saved ret value to the new location.
+ int SlotSize = is64Bit ? 8 : 4;
+ int NewReturnAddrFI =
+ MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize);
+ SDOperand NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT);
+ Chain = DAG.getStore(Chain,RetAddrFrIdx, NewRetAddrFrIdx, NULL, 0);
+ }
+
+ Chain = DAG.
+ getCALLSEQ_START(Chain, DAG.getConstant(NumBytesToBePushed, getPointerTy()));
+
+ SmallVector<std::pair<unsigned, SDOperand>, 8> RegsToPass;
+ SmallVector<SDOperand, 8> MemOpChains;
+ SmallVector<SDOperand, 8> MemOpChains2;
+ SDOperand FramePtr, StackPtr;
+ SDOperand PtrOff;
+ SDOperand FIN;
+ int FI = 0;
+
+ // Walk the register/memloc assignments, inserting copies/loads. Lower
+ // arguments first to the stack slot where they would normally - in case of a
+ // normal function call - be.
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ SDOperand Arg = Op.getOperand(5+2*VA.getValNo());
+
+ // Promote the value if needed.
+ switch (VA.getLocInfo()) {
+ default: assert(0 && "Unknown loc info!");
+ case CCValAssign::Full: break;
+ case CCValAssign::SExt:
+ Arg = DAG.getNode(ISD::SIGN_EXTEND, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::ZExt:
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::AExt:
+ Arg = DAG.getNode(ISD::ANY_EXTEND, VA.getLocVT(), Arg);
+ break;
+ }
+
+ if (VA.isRegLoc()) {
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+ } else {
+ assert(VA.isMemLoc());
+ if (StackPtr.Val == 0)
+ StackPtr = DAG.getRegister(getStackPtrReg(), getPointerTy());
+
+ MemOpChains.push_back(LowerMemOpCallTo(Op, DAG, StackPtr, VA, Chain,
+ Arg));
+ }
+ }
+
+ if (!MemOpChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, MVT::Other,
+ &MemOpChains[0], MemOpChains.size());
+
+ // Build a sequence of copy-to-reg nodes chained together with token chain
+ // and flag operands which copy the outgoing args into registers.
+ SDOperand InFlag;
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+ Chain = DAG.getCopyToReg(Chain, RegsToPass[i].first, RegsToPass[i].second,
+ InFlag);
+ InFlag = Chain.getValue(1);
+ }
+ InFlag = SDOperand();
+ // Copy from stack slots to stack slot of a tail called function. This needs
+ // to be done because if we would lower the arguments directly to their real
+ // stack slot we might end up overwriting each other.
+ // TODO: To make this more efficient (sometimes saving a store/load) we could
+ // analyse the arguments and emit this store/load/store sequence only for
+ // arguments which would be overwritten otherwise.
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ if (!VA.isRegLoc()) {
+ SDOperand FlagsOp = Op.getOperand(6+2*VA.getValNo());
+ unsigned Flags = cast<ConstantSDNode>(FlagsOp)->getValue();
+
+ // Get source stack slot.
+ SDOperand PtrOff = DAG.getConstant(VA.getLocMemOffset(), getPointerTy());
+ PtrOff = DAG.getNode(ISD::ADD, getPointerTy(), StackPtr, PtrOff);
+ // Create frame index.
+ int32_t Offset = VA.getLocMemOffset()+FPDiff;
+ uint32_t OpSize = (MVT::getSizeInBits(VA.getLocVT())+7)/8;
+ FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset);
+ FIN = DAG.getFrameIndex(FI, MVT::i32);
+ if (Flags & ISD::ParamFlags::ByVal) {
+ // Copy relative to framepointer.
+ unsigned Align = 1 << ((Flags & ISD::ParamFlags::ByValAlign) >>
+ ISD::ParamFlags::ByValAlignOffs);
+
+ unsigned Size = (Flags & ISD::ParamFlags::ByValSize) >>
+ ISD::ParamFlags::ByValSizeOffs;
+