author     Arnold Schwaighofer <arnold.schwaighofer@gmail.com>   2007-10-11 19:40:01 +0000
committer  Arnold Schwaighofer <arnold.schwaighofer@gmail.com>   2007-10-11 19:40:01 +0000
commit     c85e1716f0e45e4c18a9ef2fbe431a51ac3a4252
tree       e6e745a81699085d757db166e645f8a6639cde7a
parent     68c8411ef591818d89c6a81098a328fcd75be72a
Added tail call optimization to the x86 back end. It can be
enabled by passing -tailcallopt to llc. The optimization is
performed if the following conditions are satisfied:
* caller/callee are fastcc
* elf/pic is disabled, OR
  elf/pic is enabled + the callee is in the module + the callee has
  protected or hidden visibility
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@42870 91177308-0d34-0410-b5e6-96231b3b80d8
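As an illustration of these conditions, here is a minimal .ll sketch modeled
on the tests this commit adds (test/CodeGen/X86/tailcall1.ll); the function
names are invented:

    ; Both caller and callee are fastcc, and the call is immediately
    ; followed by a ret, so -tailcallopt can turn it into a tail call:
    ;   llc -march=x86 -tailcallopt example.ll
    define fastcc i32 @tailcallee(i32 %a1, i32 %a2) {
    entry:
            ret i32 %a1
    }

    define fastcc i32 @tailcaller(i32 %in1, i32 %in2) {
    entry:
            %res = tail call fastcc i32 @tailcallee(i32 %in1, i32 %in2)
            ret i32 %res
    }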
 include/llvm/Target/TargetLowering.h          |   9
 include/llvm/Target/TargetOptions.h           |   5
 lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp |  48
 lib/Target/TargetMachine.cpp                  |   7
 lib/Target/X86/README.txt                     |  80
 lib/Target/X86/X86CallingConv.td              |  50
 lib/Target/X86/X86ISelLowering.cpp            | 511
 lib/Target/X86/X86ISelLowering.h              |  24
 lib/Target/X86/X86InstrInfo.cpp               |   2
 lib/Target/X86/X86InstrInfo.td                |  43
 lib/Target/X86/X86InstrX86-64.td              |  35
 lib/Target/X86/X86MachineFunctionInfo.h       |  15
 lib/Target/X86/X86RegisterInfo.cpp            | 129
 test/CodeGen/X86/tailcall1.ll                 |  11
 test/CodeGen/X86/tailcallpic1.ll              |  12
 test/CodeGen/X86/tailcallpic2.ll              |  12
16 files changed, 928 insertions, 65 deletions
diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h
index 8586d7f091..1352eaff73 100644
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@@ -860,6 +860,15 @@ public:
   /// implement this.  The default implementation of this aborts.
   virtual SDOperand LowerOperation(SDOperand Op, SelectionDAG &DAG);
 
+  /// IsEligibleForTailCallOptimization - Check whether the call is eligible
+  /// for tail call optimization. Targets that want to do tail call
+  /// optimization should implement this function.
+  virtual bool IsEligibleForTailCallOptimization(SDOperand Call,
+                                                 SDOperand Ret,
+                                                 SelectionDAG &DAG) const {
+    return false;
+  }
+
   /// CustomPromoteOperation - This callback is invoked for operations that are
   /// unsupported by the target, are registered to use 'custom' lowering, and
   /// whose type needs to be promoted.
diff --git a/include/llvm/Target/TargetOptions.h b/include/llvm/Target/TargetOptions.h
index 7421f96b7b..dd54432442 100644
--- a/include/llvm/Target/TargetOptions.h
+++ b/include/llvm/Target/TargetOptions.h
@@ -73,6 +73,11 @@ namespace llvm {
   /// ExceptionHandling - This flag indicates that exception information should
   /// be emitted.
   extern bool ExceptionHandling;
+
+  /// PerformTailCallOpt - This flag is enabled when -tailcallopt is specified
+  /// on the command line. When the flag is on, the target will perform tail
+  /// call optimization (pop the caller's stack), provided it supports it.
+  extern bool PerformTailCallOpt;
 } // End llvm namespace
 
 #endif
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 5f321654d8..b1bf475c47 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -4444,6 +4444,48 @@ static void copyCatchInfo(BasicBlock *SrcBB, BasicBlock *DestBB,
   }
 }
 
+/// CheckDAGForTailCallsAndFixThem - This function looks for CALL nodes in the
+/// DAG and fixes their tailcall attribute operand.
+static void CheckDAGForTailCallsAndFixThem(SelectionDAG &DAG,
+                                           TargetLowering& TLI) {
+  SDNode *Ret = NULL;
+  SDOperand Terminator = DAG.getRoot();
+
+  // Find RET node.
+  if (Terminator.getOpcode() == ISD::RET) {
+    Ret = Terminator.Val;
+  }
+
+  // Fix tail call attribute of CALL nodes.
+  for (SelectionDAG::allnodes_iterator BE = DAG.allnodes_begin(),
+         BI = prior(DAG.allnodes_end()); BI != BE; --BI) {
+    if (BI->getOpcode() == ISD::CALL) {
+      SDOperand OpRet(Ret, 0);
+      SDOperand OpCall(static_cast<SDNode*>(BI), 0);
+      bool isMarkedTailCall =
+        cast<ConstantSDNode>(OpCall.getOperand(3))->getValue() != 0;
+      // If the CALL node has its tail call attribute set to true and the call
+      // is not eligible (there is no RET, or the target rejects it), the
+      // attribute is fixed to false. TargetLowering::
+      // IsEligibleForTailCallOptimization must correctly identify calls that
+      // are tail call optimizable.
+      if (isMarkedTailCall &&
+          (Ret==NULL ||
+           !TLI.IsEligibleForTailCallOptimization(OpCall, OpRet, DAG))) {
+        SmallVector<SDOperand, 32> Ops;
+        unsigned idx=0;
+        for(SDNode::op_iterator I =OpCall.Val->op_begin(),
+              E=OpCall.Val->op_end(); I!=E; I++, idx++) {
+          if (idx!=3)
+            Ops.push_back(*I);
+          else
+            Ops.push_back(DAG.getConstant(false, TLI.getPointerTy()));
+        }
+        DAG.UpdateNodeOperands(OpCall, Ops.begin(), Ops.size());
+      }
+    }
+  }
+}
+
 void SelectionDAGISel::BuildSelectionDAG(SelectionDAG &DAG, BasicBlock *LLVMBB,
        std::vector<std::pair<MachineInstr*, unsigned> > &PHINodesToUpdate,
                                          FunctionLoweringInfo &FuncInfo) {
@@ -4621,6 +4663,12 @@ void SelectionDAGISel::BuildSelectionDAG(SelectionDAG &DAG, BasicBlock *LLVMBB,
 
   // Make sure the root of the DAG is up-to-date.
   DAG.setRoot(SDL.getRoot());
+
+  // Check whether calls in this block are real tail calls. Fix up CALL nodes
+  // with a correct tailcall attribute so that the target can rely on the
+  // tailcall attribute indicating whether the call is really eligible for
+  // tail call optimization.
+  CheckDAGForTailCallsAndFixThem(DAG, TLI);
 }
 
 void SelectionDAGISel::CodeGenAndEmitDAG(SelectionDAG &DAG) {
diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp
index 6c00a3f492..9caea11dd3 100644
--- a/lib/Target/TargetMachine.cpp
+++ b/lib/Target/TargetMachine.cpp
@@ -33,6 +33,7 @@ namespace llvm {
   bool ExceptionHandling;
   Reloc::Model RelocationModel;
   CodeModel::Model CMModel;
+  bool PerformTailCallOpt;
 }
 namespace {
   cl::opt<bool, true> PrintCode("print-machineinstrs",
@@ -116,6 +117,12 @@ namespace {
                 clEnumValN(CodeModel::Large, "large",
                            "  Large code model"),
                 clEnumValEnd));
+
+  cl::opt<bool, true>
+  EnablePerformTailCallOpt("tailcallopt",
+                           cl::desc("Turn on tail call optimization."),
+                           cl::location(PerformTailCallOpt),
+                           cl::init(false));
 }
 
 //---------------------------------------------------------------------------
diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt
index 9bafff73d5..0d4dce32d8 100644
--- a/lib/Target/X86/README.txt
+++ b/lib/Target/X86/README.txt
@@ -1368,3 +1368,83 @@ L7:
 L5:
 
 //===---------------------------------------------------------------------===//
+
+Tail call optimization improvements: Tail call optimization currently
+pushes all arguments on the top of the stack (their normal place if
+that was not a tail call optimized function call) before moving them
+to the actual stack slot. This is done to prevent overwriting of
+parameters (see example below) that might still be used, since the
+arguments of the callee overwrite the caller's arguments.
+
+   example:
+
+int callee(int32, int64);
+int caller(int32 arg1, int32 arg2) {
+  int64 local = arg2 * 2;
+  return callee(arg2, (int64)local);
+}
+
+[arg1]          [!arg2 no longer valid since we moved local onto it]
+[arg2]      ->  [(int64)
+[RETADDR]        local  ]
+
+Moving arg1 onto the stack slot of the callee function would overwrite
+arg2 of the caller.
+
+Possible optimizations:
+
+ - Only push those arguments to the top of the stack that are actual
+   parameters of the caller function and have no local value in the
+   caller.
+
+   In the above example, local does not need to be pushed onto the top
+   of the stack as it is definitely not a parameter of the caller
+   function.
+
+ - Analyse the actual parameters of the callee to see which would
+   overwrite a caller parameter that is used by the callee, and only
+   push those onto the top of the stack.
+
+   int callee (int32 arg1, int32 arg2);
+   int caller (int32 arg1, int32 arg2) {
+     return callee(arg1,arg2);
+   }
+
+   Here we don't need to write any variables to the top of the stack
+   since they don't overwrite each other.
+
+   int callee (int32 arg1, int32 arg2);
+   int caller (int32 arg1, int32 arg2) {
+     return callee(arg2,arg1);
+   }
+
+   Here we need to push the arguments because they overwrite each
+   other.
+
+   Code for lowering directly onto the caller's arguments:
++  SmallVector<std::pair<unsigned, SDOperand>, 8> RegsToPass;
++  SmallVector<SDOperand, 8> MemOpChains;
++
++  SDOperand FramePtr;
++  SDOperand PtrOff;
++  SDOperand FIN;
++  int FI = 0;
++  // Walk the register/memloc assignments, inserting copies/loads.
++  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
++    CCValAssign &VA = ArgLocs[i];
++    SDOperand Arg = Op.getOperand(5+2*VA.getValNo());
++
++    ....
++
++    if (VA.isRegLoc()) {
++      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
++    } else {
++      assert(VA.isMemLoc());
++      // create frame index
++      int32_t Offset = VA.getLocMemOffset()+FPDiff;
++      uint32_t OpSize = (MVT::getSizeInBits(VA.getLocVT())+7)/8;
++      FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset);
++      FIN = DAG.getFrameIndex(FI, MVT::i32);
++      // store relative to framepointer
++      MemOpChains.push_back(DAG.getStore(Chain, Arg, FIN, NULL, 0));
++    }
++  }
+//===---------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td
index 9c2d95a199..f23e580656 100644
--- a/lib/Target/X86/X86CallingConv.td
+++ b/lib/Target/X86/X86CallingConv.td
@@ -127,6 +127,40 @@ def CC_X86_64_C : CallingConv<[
   CCIfType<[v8i8, v4i16, v2i32, v1i64], CCAssignToStack<8, 8>>
 ]>;
 
+// Tail call convention (fast): one register is reserved for the target
+// address, namely R9.
+def CC_X86_64_TailCall : CallingConv<[
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  CCIfStruct<CCStructAssign<[RDI, RSI, RDX, RCX, R8]>>,
+
+  // The first 5 integer arguments are passed in integer registers.
+  CCIfType<[i32], CCAssignToReg<[EDI, ESI, EDX, ECX, R8D]>>,
+  CCIfType<[i64], CCAssignToReg<[RDI, RSI, RDX, RCX, R8]>>,
+
+  // The first 8 FP/Vector arguments are passed in XMM registers.
+  CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+           CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>,
+
+  // The first 5 MMX vector arguments are passed in GPRs.
+  CCIfType<[v8i8, v4i16, v2i32, v1i64],
+           CCAssignToReg<[RDI, RSI, RDX, RCX, R8]>>,
+
+  // The 'nest' parameter, if any, is passed in R10.
+  CCIfNest<CCAssignToReg<[R10]>>,
+
+  // Integer/FP values get stored in stack slots that are 8 bytes in size and
+  // 8-byte aligned if there are no more registers to hold them.
+  CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>,
+
+  // Vectors get 16-byte stack slots that are 16-byte aligned.
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>,
+
+  // __m64 vectors get 8-byte stack slots that are 8-byte aligned.
+  CCIfType<[v8i8, v4i16, v2i32, v1i64], CCAssignToStack<8, 8>>
+]>;
+
 //===----------------------------------------------------------------------===//
 // X86 C Calling Convention
 //===----------------------------------------------------------------------===//
@@ -173,6 +207,22 @@ def CC_X86_32_C : CallingConv<[
   CCDelegateTo<CC_X86_32_Common>
 ]>;
 
+/// Same as the C calling convention, except that ECX is not free: it is used
+/// to hold a potential pointer to the tail-called function.
+def CC_X86_32_TailCall : CallingConv<[
+  // Promote i8/i16 arguments to i32.
+  CCIfType<[i8, i16], CCPromoteToType<i32>>,
+
+  // The 'nest' parameter, if any, is passed in ECX.
+  CCIfNest<CCAssignToReg<[ECX]>>,
+
+  // The first 2 integer arguments, if marked 'inreg' and if the call is not
+  // a vararg call, are passed in integer registers.
+  CCIfNotVarArg<CCIfInReg<CCIfType<[i32], CCAssignToReg<[EAX, EDX]>>>>,
+
+  // Otherwise, same as everything else.
+  CCDelegateTo<CC_X86_32_Common>
+]>;
 
 def CC_X86_32_FastCall : CallingConv<[
   // Promote i8/i16 arguments to i32.
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 1917a6a291..8767d8d33b 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -32,6 +32,8 @@
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/SSARegMap.h"
 #include "llvm/Support/MathExtras.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ParameterAttributes.h"
@@ -43,6 +45,7 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM)
   X86ScalarSSEf64 = Subtarget->hasSSE2();
   X86ScalarSSEf32 = Subtarget->hasSSE1();
   X86StackPtr = Subtarget->is64Bit() ? X86::RSP : X86::ESP;
+  RegInfo = TM.getRegisterInfo();
 
@@ -641,6 +644,19 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM)
 //===----------------------------------------------------------------------===//
 
 #include "X86GenCallingConv.inc"
+
+/// GetPossiblePreceedingTailCall - Get the preceding X86ISD::TAILCALL node if
+/// it exists, skipping a possible ISD::TokenFactor.
+static SDOperand GetPossiblePreceedingTailCall(SDOperand Chain) {
+  if (Chain.getOpcode()==X86ISD::TAILCALL) {
+    return Chain;
+  } else if (Chain.getOpcode()==ISD::TokenFactor) {
+    if (Chain.getNumOperands() &&
+        Chain.getOperand(0).getOpcode()==X86ISD::TAILCALL)
+      return Chain.getOperand(0);
+  }
+  return Chain;
+}
 
 /// LowerRET - Lower an ISD::RET node.
 SDOperand X86TargetLowering::LowerRET(SDOperand Op, SelectionDAG &DAG) {
@@ -651,8 +667,7 @@ SDOperand X86TargetLowering::LowerRET(SDOperand Op, SelectionDAG &DAG) {
   bool isVarArg = DAG.getMachineFunction().getFunction()->isVarArg();
   CCState CCInfo(CC, isVarArg, getTargetMachine(), RVLocs);
   CCInfo.AnalyzeReturn(Op.Val, RetCC_X86);
-
-
+
   // If this is the first return lowered for this function, add the regs to the
   // liveout set for the function.
   if (DAG.getMachineFunction().liveout_empty()) {
@@ -660,10 +675,38 @@ SDOperand X86TargetLowering::LowerRET(SDOperand Op, SelectionDAG &DAG) {
     if (RVLocs[i].isRegLoc())
       DAG.getMachineFunction().addLiveOut(RVLocs[i].getLocReg());
   }
-  SDOperand Chain = Op.getOperand(0);
-  SDOperand Flag;
+  SDOperand Chain = Op.getOperand(0);
+
+  // Handle tail call return.
+  Chain = GetPossiblePreceedingTailCall(Chain);
+  if (Chain.getOpcode() == X86ISD::TAILCALL) {
+    SDOperand TailCall = Chain;
+    SDOperand TargetAddress = TailCall.getOperand(1);
+    SDOperand StackAdjustment = TailCall.getOperand(2);
+    assert(((TargetAddress.getOpcode() == ISD::Register &&
+             (cast<RegisterSDNode>(TargetAddress)->getReg() == X86::ECX ||
+              cast<RegisterSDNode>(TargetAddress)->getReg() == X86::R9)) ||
+            TargetAddress.getOpcode() == ISD::TargetExternalSymbol ||
+            TargetAddress.getOpcode() == ISD::TargetGlobalAddress) &&
+           "Expecting a global address, external symbol, or register");
+    assert(StackAdjustment.getOpcode() == ISD::Constant &&
+           "Expecting a const value");
+
+    SmallVector<SDOperand,8> Operands;
+    Operands.push_back(Chain.getOperand(0));
+    Operands.push_back(TargetAddress);
+    Operands.push_back(StackAdjustment);
+    // Copy registers used by the call. The last operand is a flag, so it is
+    // not copied.
+    for (unsigned i=3; i < TailCall.getNumOperands()-1; i++) {
+      Operands.push_back(Chain.getOperand(i));
+    }
+    return DAG.getNode(X86ISD::TC_RETURN, MVT::Other, &Operands[0],
+                       Operands.size());
+  }
+
+  // Regular return.
+  SDOperand Flag;
+
   // Copy the result values into the output registers.
   if (RVLocs.size() != 1 || !RVLocs[0].isRegLoc() ||
       RVLocs[0].getLocReg() != X86::ST0) {
@@ -684,7 +727,7 @@ SDOperand X86TargetLowering::LowerRET(SDOperand Op, SelectionDAG &DAG) {
     if ((X86ScalarSSEf32 && RVLocs[0].getValVT()==MVT::f32) ||
         (X86ScalarSSEf64 && RVLocs[0].getValVT()==MVT::f64)) {
       SDOperand MemLoc;
-
+
       // If this is a load into a scalarsse value, don't store the loaded value
       // back to the stack, only to reload it: just replace the scalar-sse load.
       if (ISD::isNON_EXTLoad(Value.Val) &&
@@ -784,12 +827,14 @@ LowerCallResult(SDOperand Chain, SDOperand InFlag, SDNode *TheCall,
 
 //===----------------------------------------------------------------------===//
-//                C & StdCall Calling Convention implementation
+//                C & StdCall & Fast Calling Convention implementation
 //===----------------------------------------------------------------------===//
 //  StdCall calling convention seems to be standard for many Windows' API
 //  routines and around. It differs from C calling convention just a little:
 //  callee should clean up the stack, not caller. Symbols should be also
 //  decorated in some fancy way :) It doesn't support any vector arguments.
+//  For info on fast calling convention see Fast Calling Convention (tail call)
+//  implementation LowerX86_32FastCCCallTo.
 
 /// AddLiveIn - This helper function adds the specified physical register to the
 /// MachineFunction as a live in value.  It also creates a corresponding virtual
@@ -802,6 +847,9 @@ static unsigned AddLiveIn(MachineFunction &MF, unsigned PReg,
   return VReg;
 }
 
+// Align stack arguments according to the platform alignment needed for tail
+// calls.
+unsigned GetAlignedArgumentStackSize(unsigned StackSize, SelectionDAG& DAG);
+
 SDOperand X86TargetLowering::LowerMemArgument(SDOperand Op, SelectionDAG &DAG,
                                               const CCValAssign &VA,
                                               MachineFrameInfo *MFI,
@@ -826,13 +874,17 @@ SDOperand X86TargetLowering::LowerCCCArguments(SDOperand Op, SelectionDAG &DAG,
   MachineFrameInfo *MFI = MF.getFrameInfo();
   SDOperand Root = Op.getOperand(0);
   bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
-
+  unsigned CC = MF.getFunction()->getCallingConv();
+
   // Assign locations to all of the incoming arguments.
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(MF.getFunction()->getCallingConv(), isVarArg,
+  CCState CCInfo(CC, isVarArg,
                  getTargetMachine(), ArgLocs);
-  CCInfo.AnalyzeFormalArguments(Op.Val, CC_X86_32_C);
-
+  // Check for the possible tail call calling convention.
+  if (CC == CallingConv::Fast && PerformTailCallOpt)
+    CCInfo.AnalyzeFormalArguments(Op.Val, CC_X86_32_TailCall);
+  else
+    CCInfo.AnalyzeFormalArguments(Op.Val, CC_X86_32_C);
+
   SmallVector<SDOperand, 8> ArgValues;
   unsigned LastVal = ~0U;
   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
@@ -877,6 +929,9 @@ SDOperand X86TargetLowering::LowerCCCArguments(SDOperand Op, SelectionDAG &DAG,
   }
 
   unsigned StackSize = CCInfo.getNextStackOffset();
+  // Align the stack specially for tail calls.
+  if (CC==CallingConv::Fast)
+    StackSize = GetAlignedArgumentStackSize(StackSize,DAG);
 
   ArgValues.push_back(Root);
 
@@ -885,7 +940,12 @@ SDOperand X86TargetLowering::LowerCCCArguments(SDOperand Op, SelectionDAG &DAG,
   if (isVarArg)
     VarArgsFrameIndex = MFI->CreateFixedObject(1, StackSize);
 
-  if (isStdCall && !isVarArg) {
+  // The tail call calling convention (CallingConv::Fast) does not support
+  // varargs.
+  assert(!(isVarArg && CC == CallingConv::Fast) &&
+         "CallingConv::Fast does not support varargs.");
+
+  if (isStdCall && !isVarArg &&
+      (CC==CallingConv::Fast && PerformTailCallOpt || CC!=CallingConv::Fast)) {
     BytesToPopOnReturn  = StackSize;    // Callee pops everything..
     BytesCallerReserves = 0;
   } else {
@@ -914,17 +974,21 @@ SDOperand X86TargetLowering::LowerCCCCallTo(SDOperand Op, SelectionDAG &DAG,
                                             unsigned CC) {
   SDOperand Chain     = Op.getOperand(0);
   bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
-  bool isTailCall = cast<ConstantSDNode>(Op.getOperand(3))->getValue() != 0;
   SDOperand Callee    = Op.getOperand(4);
   unsigned NumOps     = (Op.getNumOperands() - 5) / 2;
-
+
   // Analyze operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
-  CCInfo.AnalyzeCallOperands(Op.Val, CC_X86_32_C);
+  if (CC==CallingConv::Fast && PerformTailCallOpt)
+    CCInfo.AnalyzeCallOperands(Op.Val, CC_X86_32_TailCall);
+  else
+    CCInfo.AnalyzeCallOperands(Op.Val, CC_X86_32_C);
 
   // Get a count of how many bytes are to be pushed on the stack.
   unsigned NumBytes = CCInfo.getNextStackOffset();
+  if (CC==CallingConv::Fast)
+    NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
 
   Chain = DAG.getCALLSEQ_START(Chain,DAG.getConstant(NumBytes, getPointerTy()));
 
@@ -1023,19 +1087,21 @@ SDOperand X86TargetLowering::LowerCCCCallTo(SDOperand Op, SelectionDAG &DAG,
 
   if (InFlag.Val)
     Ops.push_back(InFlag);
-
-  Chain = DAG.getNode(isTailCall ? X86ISD::TAILCALL : X86ISD::CALL,
-                      NodeTys, &Ops[0], Ops.size());
+
+  Chain = DAG.getNode(X86ISD::CALL, NodeTys, &Ops[0], Ops.size());
   InFlag = Chain.getValue(1);
 
   // Create the CALLSEQ_END node.
   unsigned NumBytesForCalleeToPush = 0;
 
-  if (CC == CallingConv::X86_StdCall) {
+  if (CC == CallingConv::X86_StdCall ||
+      (CC == CallingConv::Fast && PerformTailCallOpt)) {
     if (isVarArg)
       NumBytesForCalleeToPush = isSRet ? 4 : 0;
     else
      NumBytesForCalleeToPush = NumBytes;
+    assert(!(isVarArg && CC==CallingConv::Fast) &&
+           "CallingConv::Fast does not support varargs.");
  } else {
    // If this is a call to a struct-return function, the callee
    // pops the hidden struct pointer, so we have to push it back.
@@ -1132,7 +1198,8 @@ X86TargetLowering::LowerFastCCArguments(SDOperand Op, SelectionDAG &DAG) {
 
   if (!Subtarget->isTargetCygMing() && !Subtarget->isTargetWindows()) {
     // Make sure the instruction takes 8n+4 bytes to make sure the start of the
-    // arguments and the arguments after the retaddr has been pushed are aligned.
+    // arguments and the arguments after the retaddr has been pushed are
+    // aligned.
     if ((StackSize & 7) == 0)
       StackSize += 4;
   }
@@ -1194,7 +1261,8 @@ SDOperand X86TargetLowering::LowerFastCCCallTo(SDOperand Op, SelectionDAG &DAG,
 
   if (!Subtarget->isTargetCygMing() && !Subtarget->isTargetWindows()) {
     // Make sure the instruction takes 8n+4 bytes to make sure the start of the
-    // arguments and the arguments after the retaddr has been pushed are aligned.
+    // arguments and the arguments after the retaddr has been pushed are
+    // aligned.
     if ((NumBytes & 7) == 0)
       NumBytes += 4;
   }
@@ -1292,8 +1360,8 @@ SDOperand X86TargetLowering::LowerFastCCCallTo(SDOperand Op, SelectionDAG &DAG,
   if (InFlag.Val)
     Ops.push_back(InFlag);
 
-  // FIXME: Do not generate X86ISD::TAILCALL for now.
-  Chain = DAG.getNode(isTailCall ? X86ISD::TAILCALL : X86ISD::CALL,
+  assert(isTailCall==false && "no tail call here");
+  Chain = DAG.getNode(X86ISD::CALL,
                       NodeTys, &Ops[0], Ops.size());
   InFlag = Chain.getValue(1);
 
@@ -1312,6 +1380,314 @@ SDOperand X86TargetLowering::LowerFastCCCallTo(SDOperand Op, SelectionDAG &DAG,
   return SDOperand(LowerCallResult(Chain, InFlag, Op.Val, CC, DAG), Op.ResNo);
 }
 
+//===----------------------------------------------------------------------===//
+//                Fast Calling Convention (tail call) implementation
+//===----------------------------------------------------------------------===//
+
+//  Like the StdCall convention (the callee cleans up the arguments), except
+//  that ECX is reserved for storing the address of the tail-called function.
+//  Only 2 registers are free for argument passing (inreg). Tail call
+//  optimization is performed provided:
+//                * tailcallopt is enabled
+//                * caller/callee are fastcc
+//                * elf/pic is disabled OR
+//                * elf/pic enabled + callee is in module + callee has
+//                  visibility protected or hidden
+//  To ensure the stack is aligned according to the platform ABI, pass
+//  tail-call-align-stack. This makes sure that the argument delta is always
+//  a multiple of the stack alignment. (Dynamic linkers need this - darwin's
+//  dyld, for example.)
+//  If a tail-called function (callee) has more arguments than the caller, the
+//  caller needs to make sure that there is room to move the RETADDR to. This
+//  is achieved by reserving an area the size of the argument delta right
+//  after the original RETADDR, but before the saved framepointer or the
+//  spilled registers, e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3,
+//  arg4):
+//  stack layout:
+//    arg1
+//    arg2
+//    RETADDR
+//    [ new RETADDR
+//      move area ]
+//    (possible EBP)
+//    ESI
+//    EDI
+//    local1 ..
+
+/// GetAlignedArgumentStackSize - Make the stack size aligned, e.g. 16n + 12
+/// bytes for a 16-byte alignment requirement.
+unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
+                                                        SelectionDAG& DAG) {
+  if (PerformTailCallOpt) {
+    MachineFunction &MF = DAG.getMachineFunction();
+    const TargetMachine &TM = MF.getTarget();
+    const TargetFrameInfo &TFI = *TM.getFrameInfo();
+
+    unsigned StackAlignment = TFI.getStackAlignment();
+    uint64_t AlignMask = StackAlignment - 1;
+    int64_t Offset = StackSize;
+    unsigned SlotSize = Subtarget->is64Bit() ? 8 : 4;
+    if ((Offset & AlignMask) <= (StackAlignment - SlotSize)) {
+      // Number smaller than 12, so just add the difference.
+      Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
+    } else {
+      // Mask out the lower bits, add the stack alignment once plus the 12
+      // bytes.
+      Offset = ((~AlignMask) & Offset) + StackAlignment +
+        (StackAlignment-SlotSize);
+    }
+    StackSize = Offset;
+  }
+  return StackSize;
+}
+
+/// IsEligibleForTailCallOptimization - Check to see whether the next
+/// instruction following the call is a return. A function is eligible if
+/// caller/callee calling conventions match (currently only fastcc supports
+/// tail calls) and the function CALL is immediately followed by a RET.
+bool X86TargetLowering::IsEligibleForTailCallOptimization(SDOperand Call,
+                                                          SDOperand Ret,
+                                                          SelectionDAG& DAG) const {
+  bool IsEligible = false;
+
+  // Check whether the CALL node immediately precedes the RET node and whether
+  // the return uses the result of the node or is a void return.
+  if ((Ret.getNumOperands() == 1 &&
+       (Ret.getOperand(0)== SDOperand(Call.Val,1) ||
+        Ret.getOperand(0)== SDOperand(Call.Val,0))) ||
+      (Ret.getOperand(0)== SDOperand(Call.Val,Call.Val->getNumValues()-1) &&
+       Ret.getOperand(1)== SDOperand(Call.Val,0))) {
+    MachineFunction &MF = DAG.getMachineFunction();
+    unsigned CallerCC = MF.getFunction()->getCallingConv();
+    unsigned CalleeCC = cast<ConstantSDNode>(Call.getOperand(1))->getValue();
+    if (CalleeCC == CallingConv::Fast && CallerCC == CalleeCC) {
+      SDOperand Callee = Call.getOperand(4);
+      // On elf/pic %ebx needs to be livein.
+      if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
+          Subtarget->isPICStyleGOT()) {
+        // Can only do local tail calls with PIC.
+        GlobalValue *GV = 0;
+        GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
+        if (G != 0 &&
+            (GV = G->getGlobal()) &&
+            (GV->hasHiddenVisibility() || GV->hasProtectedVisibility()))
+          IsEligible=true;
+      } else {
+        IsEligible=true;
+      }
+    }
+  }
+  return IsEligible;
+}
+
+SDOperand X86TargetLowering::LowerX86_TailCallTo(SDOperand Op,
+                                                 SelectionDAG &DAG,
+                                                 unsigned CC) {
+  SDOperand Chain     = Op.getOperand(0);
+  bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0;
+  bool isTailCall = cast<ConstantSDNode>(Op.getOperand(3))->getValue() != 0;
+  SDOperand Callee    = Op.getOperand(4);
+  bool is64Bit        = Subtarget->is64Bit();
+
+  assert(isTailCall && PerformTailCallOpt && "Should only emit tail calls.");
+
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs);
+  if (is64Bit)
+    CCInfo.AnalyzeCallOperands(Op.Val, CC_X86_64_TailCall);
+  else
+    CCInfo.AnalyzeCallOperands(Op.Val, CC_X86_32_TailCall);
+
+  // Lower arguments at fp - stackoffset + fpdiff.
+  MachineFunction &MF = DAG.getMachineFunction();
+
+  unsigned NumBytesToBePushed =
+    GetAlignedArgumentStackSize(CCInfo.getNextStackOffset(), DAG);
+
+  unsigned NumBytesCallerPushed =
+    MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn();
+  int FPDiff = NumBytesCallerPushed - NumBytesToBePushed;
+
+  // Set the delta of movement of the returnaddr stackslot.
+  // But only set if delta is greater than previous delta.
+  if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta()))
+    MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff);
+
+  // Adjust the ret address stack slot.
+  if (FPDiff) {
+    MVT::ValueType VT = is64Bit ? MVT::i64 : MVT::i32;
+    SDOperand RetAddrFrIdx = getReturnAddressFrameIndex(DAG);
+    RetAddrFrIdx =
+      DAG.getLoad(VT, DAG.getEntryNode(), RetAddrFrIdx, NULL, 0);
+    // Emit a store of the saved ret value to the new location.
+    int SlotSize = is64Bit ? 8 : 4;
+    int NewReturnAddrFI =
+      MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize);
+    SDOperand NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT);
+    Chain = DAG.getStore(Chain, RetAddrFrIdx, NewRetAddrFrIdx, NULL, 0);
+  }
+
+  Chain = DAG.
+    getCALLSEQ_START(Chain, DAG.getConstant(NumBytesToBePushed,
+                                            getPointerTy()));
+
+  SmallVector<std::pair<unsigned, SDOperand>, 8> RegsToPass;
+  SmallVector<SDOperand, 8> MemOpChains;
+  SmallVector<SDOperand, 8> MemOpChains2;
+  SDOperand FramePtr, StackPtr;
+  SDOperand PtrOff;
+  SDOperand FIN;
+  int FI = 0;
+
+  // Walk the register/memloc assignments, inserting copies/loads. Lower
+  // arguments first to the stack slot where they would normally - in case of
+  // a normal function call - be.
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    SDOperand Arg = Op.getOperand(5+2*VA.getValNo());
+
+    // Promote the value if needed.
+    switch (VA.getLocInfo()) {
+    default: assert(0 && "Unknown loc info!");
+    case CCValAssign::Full: break;
+    case CCValAssign::SExt:
+      Arg = DAG.getNode(ISD::SIGN_EXTEND, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::ZExt:
+      Arg = DAG.getNode(ISD::ZERO_EXTEND, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::AExt:
+      Arg = DAG.getNode(ISD::ANY_EXTEND, VA.getLocVT(), Arg);
+      break;
+    }
+
+    if (VA.isRegLoc()) {
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+    } else {
+      assert(VA.isMemLoc());
+      if (StackPtr.Val == 0)
+        StackPtr = DAG.getRegister(getStackPtrReg(), getPointerTy());
+
+      MemOpChains.push_back(LowerMemOpCallTo(Op, DAG, StackPtr, VA, Chain,
+                                             Arg));
+    }
+  }
+
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, MVT::Other,
+                        &MemOpChains[0], MemOpChains.size());
+
+  // Build a sequence of copy-to-reg nodes chained together with token chain
+  // and flag operands which copy the outgoing args into registers.
+  SDOperand InFlag;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, RegsToPass[i].first, RegsToPass[i].second,
+                             InFlag);
+    InFlag = Chain.getValue(1);
+  }
+  InFlag = SDOperand();
+
+  // Copy from stack slots to the stack slots of the tail-called function.
+  // This needs to be done because if we lowered the arguments directly to
+  // their real stack slots we might end up overwriting each other.
+  // TODO: To make this more efficient (sometimes saving a store/load) we
+  // could analyse the arguments and emit this store/load/store sequence only
+  // for arguments which would be overwritten otherwise.
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    if (!VA.isRegLoc()) {
+      SDOperand FlagsOp = Op.getOperand(6+2*VA.getValNo());
+      unsigned Flags    = cast<ConstantSDNode>(FlagsOp)->getValue();
+
+      // Get the source stack slot.
+      SDOperand PtrOff = DAG.getConstant(VA.getLocMemOffset(), getPointerTy());
+      PtrOff = DAG.getNode(ISD::ADD, getPointerTy(), StackPtr, PtrOff);
+      // Create the frame index.
+      int32_t Offset = VA.getLocMemOffset()+FPDiff;
+      uint32_t OpSize = (MVT::getSizeInBits(VA.getLocVT())+7)/8;
+      FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset);
+      FIN = DAG.getFrameIndex(FI, MVT::i32);
+      if (Flags & ISD::ParamFlags::ByVal) {
+        // Copy relative to framepointer.
+        unsigned Align = 1 << ((Flags & ISD::ParamFlags::ByValAlign) >>
+                               ISD::ParamFlags::ByValAlignOffs);
+
+        unsigned Size = (Flags & ISD::ParamFlags::ByValSize) >>
+                        ISD::ParamFlags::ByValSizeOffs;
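As a sanity check on the GetAlignedArgumentStackSize rounding shown above,
here is a standalone sketch of the same arithmetic. The helper name and the
test values (16-byte stack alignment, 4-byte slots, i.e. a typical 32-bit
target) are assumptions for illustration, not part of the commit:

    #include <cassert>
    #include <cstdint>

    // Mirrors the rounding in GetAlignedArgumentStackSize: returns the
    // smallest size >= StackSize with size % StackAlignment ==
    // StackAlignment - SlotSize, so that after the SlotSize-byte return
    // address is pushed the stack pointer is StackAlignment-aligned.
    static unsigned alignArgStackSize(unsigned StackSize,
                                      unsigned StackAlignment,
                                      unsigned SlotSize) {
      uint64_t AlignMask = StackAlignment - 1;
      int64_t Offset = StackSize;
      if ((Offset & AlignMask) <= (StackAlignment - SlotSize))
        Offset += (StackAlignment - SlotSize) - (Offset & AlignMask);
      else
        Offset = (~AlignMask & Offset) + StackAlignment +
                 (StackAlignment - SlotSize);
      return Offset;
    }

    int main() {
      assert(alignArgStackSize(0,  16, 4) == 12);  // 12 + 4 == 16
      assert(alignArgStackSize(20, 16, 4) == 28);  // 20 & 15 == 4, <= 12
      assert(alignArgStackSize(30, 16, 4) == 44);  // 30 & 15 == 14, > 12
      return 0;
    }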