diff options
author | Arnold Schwaighofer <arnold.schwaighofer@gmail.com> | 2007-10-11 19:40:01 +0000 |
---|---|---|
committer | Arnold Schwaighofer <arnold.schwaighofer@gmail.com> | 2007-10-11 19:40:01 +0000 |
commit | c85e1716f0e45e4c18a9ef2fbe431a51ac3a4252 (patch) | |
tree | e6e745a81699085d757db166e645f8a6639cde7a /lib/Target | |
parent | 68c8411ef591818d89c6a81098a328fcd75be72a (diff) |
Added tail call optimization to the x86 back end. It can be
enabled by passing -tailcallopt to llc. The optimization is
performed if the following conditions are satisfied:
* caller/callee are fastcc
* elf/pic is disabled OR
elf/pic enabled + callee is in module + callee has
visibility protected or hidden
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@42870 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib/Target')
-rw-r--r-- | lib/Target/TargetMachine.cpp | 7 | ||||
-rw-r--r-- | lib/Target/X86/README.txt | 80 | ||||
-rw-r--r-- | lib/Target/X86/X86CallingConv.td | 50 | ||||
-rw-r--r-- | lib/Target/X86/X86ISelLowering.cpp | 511 | ||||
-rw-r--r-- | lib/Target/X86/X86ISelLowering.h | 24 | ||||
-rw-r--r-- | lib/Target/X86/X86InstrInfo.cpp | 2 | ||||
-rw-r--r-- | lib/Target/X86/X86InstrInfo.td | 43 | ||||
-rw-r--r-- | lib/Target/X86/X86InstrX86-64.td | 35 | ||||
-rw-r--r-- | lib/Target/X86/X86MachineFunctionInfo.h | 15 | ||||
-rw-r--r-- | lib/Target/X86/X86RegisterInfo.cpp | 129 |
10 files changed, 831 insertions, 65 deletions
diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp index 6c00a3f492..9caea11dd3 100644 --- a/lib/Target/TargetMachine.cpp +++ b/lib/Target/TargetMachine.cpp @@ -33,6 +33,7 @@ namespace llvm { bool ExceptionHandling; Reloc::Model RelocationModel; CodeModel::Model CMModel; + bool PerformTailCallOpt; } namespace { cl::opt<bool, true> PrintCode("print-machineinstrs", @@ -116,6 +117,12 @@ namespace { clEnumValN(CodeModel::Large, "large", " Large code model"), clEnumValEnd)); + + cl::opt<bool, true> + EnablePerformTailCallOpt("tailcallopt", + cl::desc("Turn on tail call optimization."), + cl::location(PerformTailCallOpt), + cl::init(false)); } //--------------------------------------------------------------------------- diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt index 9bafff73d5..0d4dce32d8 100644 --- a/lib/Target/X86/README.txt +++ b/lib/Target/X86/README.txt @@ -1368,3 +1368,83 @@ L7: L5: //===---------------------------------------------------------------------===// +Tail call optimization improvements: Tail call optimization currently +pushes all arguments on the top of the stack (their normal place if +that was not a tail call optimized function call) before moving them +to the actual stack slot. This is done to prevent overwriting of parameters +(see example below) that might be used, since the arguments of the +callee overwrite the caller's arguments. + + example: + +int callee(int32, int64); +int caller(int32 arg1, int32 arg2) { + int64 local = arg2 * 2; + return callee(arg2, (int64)local); +} + +[arg1] [!arg2 no longer valid since we moved local onto it] +[arg2] -> [(int64) +[RETADDR] local ] + +moving arg1 onto the stack slot of the callee function would overwrite +arg2 of the caller. 
+ +Possible optimizations: + + - only push those arguments to the top of the stack that are actual + parameters of the caller function and have no local value in the + caller + + in the above example local does not need to be pushed onto the top of + the stack as it is definitely not a caller's function parameter + + - analyse the actual parameters of the callee to see which would + overwrite a caller parameter which is used by the callee and only + push them onto the top of the stack + + int callee (int32 arg1, int32 arg2); + int caller (int32 arg1, int32 arg2) { + return callee(arg1,arg2); + } + + here we don't need to write any variables to the top of the stack + since they don't overwrite each other + + int callee (int32 arg1, int32 arg2); + int caller (int32 arg1, int32 arg2) { + return callee(arg2,arg1); + } + + here we need to push the arguments because they overwrite each other + + + code for lowering directly onto the caller's arguments: ++ SmallVector<std::pair<unsigned, SDOperand>, 8> RegsToPass; ++ SmallVector<SDOperand, 8> MemOpChains; ++ ++ SDOperand FramePtr; ++ SDOperand PtrOff; ++ SDOperand FIN; ++ int FI = 0; ++ // Walk the register/memloc assignments, inserting copies/loads. ++ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { ++ CCValAssign &VA = ArgLocs[i]; ++ SDOperand Arg = Op.getOperand(5+2*VA.getValNo()); ++ ++ .... 
++ ++ if (VA.isRegLoc()) { ++ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); ++ } else { ++ assert(VA.isMemLoc()); ++ // create frame index ++ int32_t Offset = VA.getLocMemOffset()+FPDiff; ++ uint32_t OpSize = (MVT::getSizeInBits(VA.getLocVT())+7)/8; ++ FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset); ++ FIN = DAG.getFrameIndex(FI, MVT::i32); ++ // store relative to framepointer ++ MemOpChains.push_back(DAG.getStore(Chain, Arg, FIN, NULL, 0)); ++ } ++ } +//===---------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td index 9c2d95a199..f23e580656 100644 --- a/lib/Target/X86/X86CallingConv.td +++ b/lib/Target/X86/X86CallingConv.td @@ -127,6 +127,40 @@ def CC_X86_64_C : CallingConv<[ CCIfType<[v8i8, v4i16, v2i32, v1i64], CCAssignToStack<8, 8>> ]>; +// tail call convetion (fast) one register is reserved for target address +// namely R9 +def CC_X86_64_TailCall : CallingConv<[ + // Promote i8/i16 arguments to i32. + CCIfType<[i8, i16], CCPromoteToType<i32>>, + + CCIfStruct<CCStructAssign<[RDI, RSI, RDX, RCX, R8]>>, + + // The first 6 integer arguments are passed in integer registers. + CCIfType<[i32], CCAssignToReg<[EDI, ESI, EDX, ECX, R8D]>>, + CCIfType<[i64], CCAssignToReg<[RDI, RSI, RDX, RCX, R8]>>, + + // The first 8 FP/Vector arguments are passed in XMM registers. + CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>, + + // The first 8 MMX vector arguments are passed in GPRs. + CCIfType<[v8i8, v4i16, v2i32, v1i64], + CCAssignToReg<[RDI, RSI, RDX, RCX, R8]>>, + + // The 'nest' parameter, if any, is passed in R10. + CCIfNest<CCAssignToReg<[R10]>>, + + // Integer/FP values get stored in stack slots that are 8 bytes in size and + // 8-byte aligned if there are no more registers to hold them. 
+ CCIfType<[i32, i64, f32, f64], CCAssignToStack<8, 8>>, + + // Vectors get 16-byte stack slots that are 16-byte aligned. + CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCAssignToStack<16, 16>>, + + // __m64 vectors get 8-byte stack slots that are 8-byte aligned. + CCIfType<[v8i8, v4i16, v2i32, v1i64], CCAssignToStack<8, 8>> +]>; + //===----------------------------------------------------------------------===// // X86 C Calling Convention @@ -173,6 +207,22 @@ def CC_X86_32_C : CallingConv<[ CCDelegateTo<CC_X86_32_Common> ]>; +/// Same as C calling convention up to nonfree ECX which is used for storing +/// potential pointer to tail called function +def CC_X86_32_TailCall : CallingConv<[ + // Promote i8/i16 arguments to i32. + CCIfType<[i8, i16], CCPromoteToType<i32>>, + + // The 'nest' parameter, if any, is passed in ECX. + CCIfNest<CCAssignToReg<[ECX]>>, + + // The first 3 integer arguments, if marked 'inreg' and if the call is not + // a vararg call, are passed in integer registers. + CCIfNotVarArg<CCIfInReg<CCIfType<[i32], CCAssignToReg<[EAX, EDX]>>>>, + + // Otherwise, same as everything else. + CCDelegateTo<CC_X86_32_Common> +]>; def CC_X86_32_FastCall : CallingConv<[ // Promote i8/i16 arguments to i32. diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 1917a6a291..8767d8d33b 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -32,6 +32,8 @@ #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SSARegMap.h" #include "llvm/Support/MathExtras.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" #include "llvm/Target/TargetOptions.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ParameterAttributes.h" @@ -43,6 +45,7 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM) X86ScalarSSEf64 = Subtarget->hasSSE2(); X86ScalarSSEf32 = Subtarget->hasSSE1(); X86StackPtr = Subtarget->is64Bit() ? 
X86::RSP : X86::ESP; + RegInfo = TM.getRegisterInfo(); @@ -641,6 +644,19 @@ X86TargetLowering::X86TargetLowering(TargetMachine &TM) //===----------------------------------------------------------------------===// #include "X86GenCallingConv.inc" + +/// GetPossiblePreceedingTailCall - Get preceeding X86ISD::TAILCALL node if it +/// exists skip possible ISD:TokenFactor. +static SDOperand GetPossiblePreceedingTailCall(SDOperand Chain) { + if (Chain.getOpcode()==X86ISD::TAILCALL) { + return Chain; + } else if (Chain.getOpcode()==ISD::TokenFactor) { + if (Chain.getNumOperands() && + Chain.getOperand(0).getOpcode()==X86ISD::TAILCALL) + return Chain.getOperand(0); + } + return Chain; +} /// LowerRET - Lower an ISD::RET node. SDOperand X86TargetLowering::LowerRET(SDOperand Op, SelectionDAG &DAG) { @@ -651,8 +667,7 @@ SDOperand X86TargetLowering::LowerRET(SDOperand Op, SelectionDAG &DAG) { bool isVarArg = DAG.getMachineFunction().getFunction()->isVarArg(); CCState CCInfo(CC, isVarArg, getTargetMachine(), RVLocs); CCInfo.AnalyzeReturn(Op.Val, RetCC_X86); - - + // If this is the first return lowered for this function, add the regs to the // liveout set for the function. if (DAG.getMachineFunction().liveout_empty()) { @@ -660,10 +675,38 @@ SDOperand X86TargetLowering::LowerRET(SDOperand Op, SelectionDAG &DAG) { if (RVLocs[i].isRegLoc()) DAG.getMachineFunction().addLiveOut(RVLocs[i].getLocReg()); } - SDOperand Chain = Op.getOperand(0); - SDOperand Flag; + // Handle tail call return. 
+ Chain = GetPossiblePreceedingTailCall(Chain); + if (Chain.getOpcode() == X86ISD::TAILCALL) { + SDOperand TailCall = Chain; + SDOperand TargetAddress = TailCall.getOperand(1); + SDOperand StackAdjustment = TailCall.getOperand(2); + assert ( ((TargetAddress.getOpcode() == ISD::Register && + (cast<RegisterSDNode>(TargetAddress)->getReg() == X86::ECX || + cast<RegisterSDNode>(TargetAddress)->getReg() == X86::R9)) || + TargetAddress.getOpcode() == ISD::TargetExternalSymbol || + TargetAddress.getOpcode() == ISD::TargetGlobalAddress) && + "Expecting an global address, external symbol, or register"); + assert( StackAdjustment.getOpcode() == ISD::Constant && + "Expecting a const value"); + + SmallVector<SDOperand,8> Operands; + Operands.push_back(Chain.getOperand(0)); + Operands.push_back(TargetAddress); + Operands.push_back(StackAdjustment); + // Copy registers used by the call. Last operand is a flag so it is not + // copied. + for(unsigned i=3; i < TailCall.getNumOperands()-1;i++) { + Operands.push_back(Chain.getOperand(i)); + } + return DAG.getNode(X86ISD::TC_RETURN, MVT::Other, &Operands[0], Operands.size()); + } + + // Regular return. + SDOperand Flag; + // Copy the result values into the output registers. if (RVLocs.size() != 1 || !RVLocs[0].isRegLoc() || RVLocs[0].getLocReg() != X86::ST0) { @@ -684,7 +727,7 @@ SDOperand X86TargetLowering::LowerRET(SDOperand Op, SelectionDAG &DAG) { if ((X86ScalarSSEf32 && RVLocs[0].getValVT()==MVT::f32) || (X86ScalarSSEf64 && RVLocs[0].getValVT()==MVT::f64)) { SDOperand MemLoc; - + // If this is a load into a scalarsse value, don't store the loaded value // back to the stack, only to reload it: just replace the scalar-sse load. 
if (ISD::isNON_EXTLoad(Value.Val) && @@ -784,12 +827,14 @@ LowerCallResult(SDOperand Chain, SDOperand InFlag, SDNode *TheCall, //===----------------------------------------------------------------------===// -// C & StdCall Calling Convention implementation +// C & StdCall & Fast Calling Convention implementation //===----------------------------------------------------------------------===// // StdCall calling convention seems to be standard for many Windows' API // routines and around. It differs from C calling convention just a little: // callee should clean up the stack, not caller. Symbols should be also // decorated in some fancy way :) It doesn't support any vector arguments. +// For info on fast calling convention see Fast Calling Convention (tail call) +// implementation LowerX86_32FastCCCallTo. /// AddLiveIn - This helper function adds the specified physical register to the /// MachineFunction as a live in value. It also creates a corresponding virtual @@ -802,6 +847,9 @@ static unsigned AddLiveIn(MachineFunction &MF, unsigned PReg, return VReg; } +// align stack arguments according to platform alignment needed for tail calls +unsigned GetAlignedArgumentStackSize(unsigned StackSize, SelectionDAG& DAG); + SDOperand X86TargetLowering::LowerMemArgument(SDOperand Op, SelectionDAG &DAG, const CCValAssign &VA, MachineFrameInfo *MFI, @@ -826,13 +874,17 @@ SDOperand X86TargetLowering::LowerCCCArguments(SDOperand Op, SelectionDAG &DAG, MachineFrameInfo *MFI = MF.getFrameInfo(); SDOperand Root = Op.getOperand(0); bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0; - + unsigned CC = MF.getFunction()->getCallingConv(); // Assign locations to all of the incoming arguments. SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(MF.getFunction()->getCallingConv(), isVarArg, + CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs); - CCInfo.AnalyzeFormalArguments(Op.Val, CC_X86_32_C); - + // Check for possible tail call calling convention. 
+ if (CC == CallingConv::Fast && PerformTailCallOpt) + CCInfo.AnalyzeFormalArguments(Op.Val, CC_X86_32_TailCall); + else + CCInfo.AnalyzeFormalArguments(Op.Val, CC_X86_32_C); + SmallVector<SDOperand, 8> ArgValues; unsigned LastVal = ~0U; for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { @@ -877,6 +929,9 @@ SDOperand X86TargetLowering::LowerCCCArguments(SDOperand Op, SelectionDAG &DAG, } unsigned StackSize = CCInfo.getNextStackOffset(); + // align stack specially for tail calls + if (CC==CallingConv::Fast) + StackSize = GetAlignedArgumentStackSize(StackSize,DAG); ArgValues.push_back(Root); @@ -885,7 +940,12 @@ SDOperand X86TargetLowering::LowerCCCArguments(SDOperand Op, SelectionDAG &DAG, if (isVarArg) VarArgsFrameIndex = MFI->CreateFixedObject(1, StackSize); - if (isStdCall && !isVarArg) { + // Tail call calling convention (CallingConv::Fast) does not support varargs. + assert( !(isVarArg && CC == CallingConv::Fast) && + "CallingConv::Fast does not support varargs."); + + if (isStdCall && !isVarArg && + (CC==CallingConv::Fast && PerformTailCallOpt || CC!=CallingConv::Fast)) { BytesToPopOnReturn = StackSize; // Callee pops everything.. BytesCallerReserves = 0; } else { @@ -914,17 +974,21 @@ SDOperand X86TargetLowering::LowerCCCCallTo(SDOperand Op, SelectionDAG &DAG, unsigned CC) { SDOperand Chain = Op.getOperand(0); bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0; - bool isTailCall = cast<ConstantSDNode>(Op.getOperand(3))->getValue() != 0; SDOperand Callee = Op.getOperand(4); unsigned NumOps = (Op.getNumOperands() - 5) / 2; - + // Analyze operands of the call, assigning locations to each operand. 
SmallVector<CCValAssign, 16> ArgLocs; CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs); - CCInfo.AnalyzeCallOperands(Op.Val, CC_X86_32_C); + if(CC==CallingConv::Fast && PerformTailCallOpt) + CCInfo.AnalyzeCallOperands(Op.Val, CC_X86_32_TailCall); + else + CCInfo.AnalyzeCallOperands(Op.Val, CC_X86_32_C); // Get a count of how many bytes are to be pushed on the stack. unsigned NumBytes = CCInfo.getNextStackOffset(); + if (CC==CallingConv::Fast) + NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG); Chain = DAG.getCALLSEQ_START(Chain,DAG.getConstant(NumBytes, getPointerTy())); @@ -1023,19 +1087,21 @@ SDOperand X86TargetLowering::LowerCCCCallTo(SDOperand Op, SelectionDAG &DAG, if (InFlag.Val) Ops.push_back(InFlag); - - Chain = DAG.getNode(isTailCall ? X86ISD::TAILCALL : X86ISD::CALL, - NodeTys, &Ops[0], Ops.size()); + + Chain = DAG.getNode(X86ISD::CALL, NodeTys, &Ops[0], Ops.size()); InFlag = Chain.getValue(1); // Create the CALLSEQ_END node. unsigned NumBytesForCalleeToPush = 0; - if (CC == CallingConv::X86_StdCall) { + if (CC == CallingConv::X86_StdCall || + (CC == CallingConv::Fast && PerformTailCallOpt)) { if (isVarArg) NumBytesForCalleeToPush = isSRet ? 4 : 0; else NumBytesForCalleeToPush = NumBytes; + assert(!(isVarArg && CC==CallingConv::Fast) && + "CallingConv::Fast does not support varargs."); } else { // If this is is a call to a struct-return function, the callee // pops the hidden struct pointer, so we have to push it back. @@ -1132,7 +1198,8 @@ X86TargetLowering::LowerFastCCArguments(SDOperand Op, SelectionDAG &DAG) { if (!Subtarget->isTargetCygMing() && !Subtarget->isTargetWindows()) { // Make sure the instruction takes 8n+4 bytes to make sure the start of the - // arguments and the arguments after the retaddr has been pushed are aligned. + // arguments and the arguments after the retaddr has been pushed are + // aligned. 
if ((StackSize & 7) == 0) StackSize += 4; } @@ -1194,7 +1261,8 @@ SDOperand X86TargetLowering::LowerFastCCCallTo(SDOperand Op, SelectionDAG &DAG, if (!Subtarget->isTargetCygMing() && !Subtarget->isTargetWindows()) { // Make sure the instruction takes 8n+4 bytes to make sure the start of the - // arguments and the arguments after the retaddr has been pushed are aligned. + // arguments and the arguments after the retaddr has been pushed are + // aligned. if ((NumBytes & 7) == 0) NumBytes += 4; } @@ -1292,8 +1360,8 @@ SDOperand X86TargetLowering::LowerFastCCCallTo(SDOperand Op, SelectionDAG &DAG, if (InFlag.Val) Ops.push_back(InFlag); - // FIXME: Do not generate X86ISD::TAILCALL for now. - Chain = DAG.getNode(isTailCall ? X86ISD::TAILCALL : X86ISD::CALL, + assert(isTailCall==false && "no tail call here"); + Chain = DAG.getNode(X86ISD::CALL, NodeTys, &Ops[0], Ops.size()); InFlag = Chain.getValue(1); @@ -1312,6 +1380,314 @@ SDOperand X86TargetLowering::LowerFastCCCallTo(SDOperand Op, SelectionDAG &DAG, return SDOperand(LowerCallResult(Chain, InFlag, Op.Val, CC, DAG), Op.ResNo); } +//===----------------------------------------------------------------------===// +// Fast Calling Convention (tail call) implementation +//===----------------------------------------------------------------------===// + +// Like std call, callee cleans arguments, convention except that ECX is +// reserved for storing the tail called function address. Only 2 registers are +// free for argument passing (inreg). Tail call optimization is performed +// provided: +// * tailcallopt is enabled +// * caller/callee are fastcc +// * elf/pic is disabled OR +// * elf/pic enabled + callee is in module + callee has +// visibility protected or hidden +// To ensure the stack is aligned according to platform abi pass +// tail-call-align-stack. This makes sure that argument delta is always +// multiples of stack alignment. 
(Dynamic linkers need this - darwin's dyld for +// example) +// If a tail called function callee has more arguments than the caller the +// caller needs to make sure that there is room to move the RETADDR to. This is +// achived by reserving an area the size of the argument delta right after the +// original REtADDR, but before the saved framepointer or the spilled registers +// e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4) +// stack layout: +// arg1 +// arg2 +// RETADDR +// [ new RETADDR +// move area ] +// (possible EBP) +// ESI +// EDI +// local1 .. + +/// GetAlignedArgumentStackSize - Make the stack size align e.g 16n + 12 aligned +/// for a 16 byte align requirement. +unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, + SelectionDAG& DAG) { + if (PerformTailCallOpt) { + MachineFunction &MF = DAG.getMachineFunction(); + const TargetMachine &TM = MF.getTarget(); + const TargetFrameInfo &TFI = *TM.getFrameInfo(); + unsigned StackAlignment = TFI.getStackAlignment(); + uint64_t AlignMask = StackAlignment - 1; + int64_t Offset = StackSize; + unsigned SlotSize = Subtarget->is64Bit() ? 8 : 4; + if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) { + // Number smaller than 12 so just add the difference. + Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask)); + } else { + // Mask out lower bits, add stackalignment once plus the 12 bytes. + Offset = ((~AlignMask) & Offset) + StackAlignment + + (StackAlignment-SlotSize); + } + StackSize = Offset; + } + return StackSize; +} + +/// IsEligibleForTailCallElimination - Check to see whether the next instruction +// following the call is a return. A function is eligible if caller/callee +// calling conventions match, currently only fastcc supports tail calls, and the +// function CALL is immediatly followed by a RET. 
+bool X86TargetLowering::IsEligibleForTailCallOptimization(SDOperand Call, + SDOperand Ret, + SelectionDAG& DAG) const { + bool IsEligible = false; + + // Check whether CALL node immediatly preceeds the RET node and whether the + // return uses the result of the node or is a void return. + if ((Ret.getNumOperands() == 1 && + (Ret.getOperand(0)== SDOperand(Call.Val,1) || + Ret.getOperand(0)== SDOperand(Call.Val,0))) || + (Ret.getOperand(0)== SDOperand(Call.Val,Call.Val->getNumValues()-1) && + Ret.getOperand(1)== SDOperand(Call.Val,0))) { + MachineFunction &MF = DAG.getMachineFunction(); + unsigned CallerCC = MF.getFunction()->getCallingConv(); + unsigned CalleeCC = cast<ConstantSDNode>(Call.getOperand(1))->getValue(); + if (CalleeCC == CallingConv::Fast && CallerCC == CalleeCC) { + SDOperand Callee = Call.getOperand(4); + // On elf/pic %ebx needs to be livein. + if(getTargetMachine().getRelocationModel() == Reloc::PIC_ && + Subtarget->isPICStyleGOT()) { + // Can only do local tail calls with PIC. + GlobalValue * GV = 0; + GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee); + if(G != 0 && + (GV = G->getGlobal()) && + (GV->hasHiddenVisibility() || GV->hasProtectedVisibility())) + IsEligible=true; + } else { + IsEligible=true; + } + } + } + return IsEligible; +} + +SDOperand X86TargetLowering::LowerX86_TailCallTo(SDOperand Op, + SelectionDAG &DAG, + unsigned CC) { + SDOperand Chain = Op.getOperand(0); + bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0; + bool isTailCall = cast<ConstantSDNode>(Op.getOperand(3))->getValue() != 0; + SDOperand Callee = Op.getOperand(4); + bool is64Bit = Subtarget->is64Bit(); + + assert(isTailCall && PerformTailCallOpt && "Should only emit tail calls."); + + // Analyze operands of the call, assigning locations to each operand. 
+ SmallVector<CCValAssign, 16> ArgLocs; + CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs); + if (is64Bit) + CCInfo.AnalyzeCallOperands(Op.Val, CC_X86_64_TailCall); + else + CCInfo.AnalyzeCallOperands(Op.Val, CC_X86_32_TailCall); + + + // Lower arguments at fp - stackoffset + fpdiff. + MachineFunction &MF = DAG.getMachineFunction(); + + unsigned NumBytesToBePushed = + GetAlignedArgumentStackSize(CCInfo.getNextStackOffset(), DAG); + + unsigned NumBytesCallerPushed = + MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn(); + int FPDiff = NumBytesCallerPushed - NumBytesToBePushed; + + // Set the delta of movement of the returnaddr stackslot. + // But only set if delta is greater than previous delta. + if (FPDiff < (MF.getInfo<X86MachineFunctionInfo>()->getTCReturnAddrDelta())) + MF.getInfo<X86MachineFunctionInfo>()->setTCReturnAddrDelta(FPDiff); + + // Adjust the ret address stack slot. + if (FPDiff) { + MVT::ValueType VT = is64Bit ? MVT::i64 : MVT::i32; + SDOperand RetAddrFrIdx = getReturnAddressFrameIndex(DAG); + RetAddrFrIdx = + DAG.getLoad(VT, DAG.getEntryNode(),RetAddrFrIdx, NULL, 0); + // Emit a store of the saved ret value to the new location. + int SlotSize = is64Bit ? 8 : 4; + int NewReturnAddrFI = + MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize); + SDOperand NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT); + Chain = DAG.getStore(Chain,RetAddrFrIdx, NewRetAddrFrIdx, NULL, 0); + } + + Chain = DAG. + getCALLSEQ_START(Chain, DAG.getConstant(NumBytesToBePushed, getPointerTy())); + + SmallVector<std::pair<unsigned, SDOperand>, 8> RegsToPass; + SmallVector<SDOperand, 8> MemOpChains; + SmallVector<SDOperand, 8> MemOpChains2; + SDOperand FramePtr, StackPtr; + SDOperand PtrOff; + SDOperand FIN; + int FI = 0; + + // Walk the register/memloc assignments, inserting copies/loads. Lower + // arguments first to the stack slot where they would normally - in case of a + // normal function call - be. 
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + SDOperand Arg = Op.getOperand(5+2*VA.getValNo()); + + // Promote the value if needed. + switch (VA.getLocInfo()) { + default: assert(0 && "Unknown loc info!"); + case CCValAssign::Full: break; + case CCValAssign::SExt: + Arg = DAG.getNode(ISD::SIGN_EXTEND, VA.getLocVT(), Arg); + break; + case CCValAssign::ZExt: + Arg = DAG.getNode(ISD::ZERO_EXTEND, VA.getLocVT(), Arg); + break; + case CCValAssign::AExt: + Arg = DAG.getNode(ISD::ANY_EXTEND, VA.getLocVT(), Arg); + break; + } + + if (VA.isRegLoc()) { + RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg)); + } else { + assert(VA.isMemLoc()); + if (StackPtr.Val == 0) + StackPtr = DAG.getRegister(getStackPtrReg(), getPointerTy()); + + MemOpChains.push_back(LowerMemOpCallTo(Op, DAG, StackPtr, VA, Chain, + Arg)); + } + } + + if (!MemOpChains.empty()) + Chain = DAG.getNode(ISD::TokenFactor, MVT::Other, + &MemOpChains[0], MemOpChains.size()); + + // Build a sequence of copy-to-reg nodes chained together with token chain + // and flag operands which copy the outgoing args into registers. + SDOperand InFlag; + for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) { + Chain = DAG.getCopyToReg(Chain, RegsToPass[i].first, RegsToPass[i].second, + InFlag); + InFlag = Chain.getValue(1); + } + InFlag = SDOperand(); + // Copy from stack slots to stack slot of a tail called function. This needs + // to be done because if we would lower the arguments directly to their real + // stack slot we might end up overwriting each other. + // TODO: To make this more efficient (sometimes saving a store/load) we could + // analyse the arguments and emit this store/load/store sequence only for + // arguments which would be overwritten otherwise. 
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { + CCValAssign &VA = ArgLocs[i]; + if (!VA.isRegLoc()) { + SDOperand FlagsOp = Op.getOperand(6+2*VA.getValNo()); + unsigned Flags = cast<ConstantSDNode>(FlagsOp)->getValue(); + + // Get source stack slot. + SDOperand PtrOff = DAG.getConstant(VA.getLocMemOffset(), getPointerTy()); + PtrOff = DAG.getNode(ISD::ADD, getPointerTy(), StackPtr, PtrOff); + // Create frame index. + int32_t Offset = VA.getLocMemOffset()+FPDiff; + uint32_t OpSize = (MVT::getSizeInBits(VA.getLocVT())+7)/8; + FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset); + FIN = DAG.getFrameIndex(FI, MVT::i32); + if (Flags & ISD::ParamFlags::ByVal) { + // Copy relative to framepointer. + unsigned Align = 1 << ((Flags & ISD::ParamFlags::ByValAlign) >> + ISD::ParamFlags::ByValAlignOffs); + + unsigned Size = (Flags & ISD::ParamFlags::ByValSize) >> + ISD::ParamFlags::ByValSizeOffs; + + SDOperand AlignNode = DAG.getConstant(Align, MVT::i32); + SDOperand SizeNode = DAG.getConstant(Size, MVT::i32); + // Copy relative to framepointer. + MemOpChains2.push_back(DAG.getNode(ISD::MEMCPY, MVT::Other, Chain, FIN, + PtrOff, SizeNode, AlignNode)); + } else { + SDOperand LoadedArg = DAG.getLoad(VA.getValVT(), Chain, PtrOff, NULL,0); + // Store relative to framepointer. + MemOpChains2.push_back(DAG.getStore(Chain, LoadedArg, FIN, NULL, 0)); + } + } + } + + if (!MemOpChains2.empty()) + Chain = DAG.getNode(ISD::TokenFactor, MVT::Other, + &MemOpChains2[0], MemOpChains.size()); + + // ELF / PIC requires GOT in the EBX register before function calls via PLT + // GOT pointer. + // Does not work with tail call since ebx is not restored correctly by + // tailcaller. TODO: at least for x86 - verify for x86-64 + + // If the callee is a GlobalAddress node (quite common, every direct call is) + // turn it into a TargetGlobalAddress node so that legalize doesn't hack it. 
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { + // We should use extra load for direct calls to dllimported functions in + // non-JIT mode. + if (!Subtarget->GVRequiresExtraLoad(G->getGlobal(), + getTargetMachine(), true)) + Callee = DAG.getTargetGlobalAddress(G->getGlobal(), getPointerTy()); + } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) + Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy()); + else { + assert(Callee.getOpcode() == ISD::LOAD && + "Function destination must be loaded into virtual register"); + unsigned Opc = is64Bit ? X86::R9 : X86::ECX; + + Chain = DAG.getCopyToReg(Chain, + DAG.getRegister(Opc, getPointerTy()) , + Callee,InFlag); + Callee = DAG.getRegister(Opc, getPointerTy()); + // Add register as live out. + DAG.getMachineFunction().addLiveOut(Opc); + } + + SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); + SmallVector<SDOperand, 8> Ops; + + Ops.push_back(Chain); + Ops.push_back(DAG.getConstant(NumBytesToBePushed, getPointerTy())); + Ops.push_back(DAG.getConstant(0, getPointerTy())); + if (InFlag.Val) + Ops.push_back(InFlag); + Chain = DAG.getNode(ISD::CALLSEQ_END, NodeTys, &Ops[0], Ops.size()); + InFlag = Chain.getValue(1); + + // Returns a chain & a flag for retval copy to use. + NodeTys = DAG.getVTList(MVT::Other, MVT::Flag); + Ops.clear(); + Ops.push_back(Chain); + Ops.push_back(Callee); + Ops.push_back(DAG.getConstant(FPDiff, MVT::i32)); + // Add argument registers to the end of the list so that they are known live + // into the call. + for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) + Ops.push_back(DAG.getRegister(RegsToPass[i].first, + RegsToPass[i].second.getValueType())); + if (InFlag.Val) + Ops.push_back(InFlag); + assert(InFlag.Val && + "Flag must be set. 
Depend on flag being set in LowerRET"); + Chain = DAG.getNode(X86ISD::TAILCALL, + Op.Val->getVTList(), &Ops[0], Ops.size()); + + return SDOperand(Chain.Val, Op.ResNo); +} //===----------------------------------------------------------------------===// // X86-64 C Calling Convention implementation @@ -1323,6 +1699,7 @@ X86TargetLowering::LowerX86_64CCCArguments(SDOperand Op, SelectionDAG &DAG) { MachineFrameInfo *MFI = MF.getFrameInfo(); SDOperand Root = Op.getOperand(0); bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0; + unsigned CC= MF.getFunction()->getCallingConv(); static const unsigned GPR64ArgRegs[] = { X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9 @@ -1335,9 +1712,12 @@ X86TargetLowering::LowerX86_64CCCArguments(SDOperand Op, SelectionDAG &DAG) { // Assign locations to all of the incoming arguments. SmallVector<CCValAssign, 16> ArgLocs; - CCState CCInfo(MF.getFunction()->getCallingConv(), isVarArg, + CCState CCInfo(CC, isVarArg, getTargetMachine(), ArgLocs); - CCInfo.AnalyzeFormalArguments(Op.Val, CC_X86_64_C); + if (CC == CallingConv::Fast && PerformTailCallOpt) + CCInfo.AnalyzeFormalArguments(Op.Val, CC_X86_64_TailCall); + else + CCInfo.AnalyzeFormalArguments(Op.Val, CC_X86_64_C); SmallVector<SDOperand, 8> ArgValues; unsigned LastVal = ~0U; @@ -1398,10 +1778,14 @@ X86TargetLowering::LowerX86_64CCCArguments(SDOperand Op, SelectionDAG &DAG) { } unsigned StackSize = CCInfo.getNextStackOffset(); + if (CC==CallingConv::Fast) + StackSize =GetAlignedArgumentStackSize(StackSize, DAG); // If the function takes variable number of arguments, make a frame index for // the start of the first vararg value... for expansion of llvm.va_start. 
if (isVarArg) { + assert(CC!=CallingConv::Fast + && "Var arg not supported with calling convention fastcc"); unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs, 6); unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8); @@ -1446,10 +1830,14 @@ X86TargetLowering::LowerX86_64CCCArguments(SDOperand Op, SelectionDAG &DAG) { } ArgValues.push_back(Root); - - BytesToPopOnReturn = 0; // Callee pops nothing. - BytesCallerReserves = StackSize; - + // Tail call convention (fastcc) needs callee pop. + if (CC == CallingConv::Fast && PerformTailCallOpt){ + BytesToPopOnReturn = StackSize; // Callee pops everything. + BytesCallerReserves = 0; + } else { + BytesToPopOnReturn = 0; // Callee pops nothing. + BytesCallerReserves = StackSize; + } X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); FuncInfo->setBytesToPopOnReturn(BytesToPopOnReturn); @@ -1463,16 +1851,21 @@ X86TargetLowering::LowerX86_64CCCCallTo(SDOperand Op, SelectionDAG &DAG, unsigned CC) { SDOperand Chain = Op.getOperand(0); bool isVarArg = cast<ConstantSDNode>(Op.getOperand(2))->getValue() != 0; - bool isTailCall = cast<ConstantSDNode>(Op.getOperand(3))->getValue() != 0; SDOperand Callee = Op.getOperand(4); < |