diff options
-rw-r--r-- | include/llvm/IntrinsicsX86.td | 10 |
-rw-r--r-- | lib/Target/X86/MCTargetDesc/X86BaseInfo.h | 10 |
-rw-r--r-- | lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp | 34 |
-rw-r--r-- | lib/Target/X86/X86InstrFMA.td | 39 |
-rw-r--r-- | lib/Target/X86/X86InstrFormats.td | 10 |
-rw-r--r-- | test/CodeGen/X86/fma4-intrinsics-x86_64.ll | 9 |
-rw-r--r-- | test/MC/X86/x86_64-fma4-encoding.s | 13 |
7 files changed, 118 insertions, 7 deletions
diff --git a/include/llvm/IntrinsicsX86.td b/include/llvm/IntrinsicsX86.td index cba6599d66..ea7597ca2c 100644 --- a/include/llvm/IntrinsicsX86.td +++ b/include/llvm/IntrinsicsX86.td @@ -1822,6 +1822,16 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". } //===----------------------------------------------------------------------===// +// FMA4 + +let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". + def int_x86_fma4_vfmadd_sd : GCCBuiltin<"__builtin_ia32_vfmaddsd">, + Intrinsic<[llvm_v2f64_ty], + [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty], + [IntrNoMem]>; +} + +//===----------------------------------------------------------------------===// // MMX // Empty MMX state op. diff --git a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h index c50f785172..213a79d670 100644 --- a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h +++ b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h @@ -418,7 +418,12 @@ namespace X86II { /// storing a classifier in the imm8 field. To simplify our implementation, /// we handle this by storeing the classifier in the opcode field and using /// this flag to indicate that the encoder should do the wacky 3DNow! thing. - Has3DNow0F0FOpcode = 1U << 7 + Has3DNow0F0FOpcode = 1U << 7, + + /// XOP_W - Same bit as VEX_W. Used to indicate swapping of + /// operand 3 and 4 to be encoded in ModRM or I8IMM. This is used + /// for FMA4 and XOP instructions. + XOP_W = 1U << 8 }; // getBaseOpcodeFor - This function returns the "base" X86 opcode for the @@ -488,9 +493,12 @@ namespace X86II { return 0; case X86II::MRMSrcMem: { bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V; + bool HasXOP_W = (TSFlags >> X86II::VEXShift) & X86II::XOP_W; unsigned FirstMemOp = 1; if (HasVEX_4V) ++FirstMemOp;// Skip the register source (which is encoded in VEX_VVVV). + if (HasXOP_W) + ++FirstMemOp;// Skip the register source (which is encoded in I8IMM). // FIXME: Maybe lea should have its own form? 
This is a horrible hack. //if (Opcode == X86::LEA64r || Opcode == X86::LEA64_32r || diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index 1ab469cc00..dbd52078d8 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -415,6 +415,10 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // opcode extension, or ignored, depending on the opcode byte) unsigned char VEX_W = 0; + // XOP_W: opcode specific, same bit as VEX_W, but used to + // swap operand 3 and 4 for FMA4 and XOP instructions + unsigned char XOP_W = 0; + // VEX_5M (VEX m-mmmmm field): // // 0b00000: Reserved for future use @@ -453,6 +457,9 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, if ((TSFlags >> X86II::VEXShift) & X86II::VEX_W) VEX_W = 1; + if ((TSFlags >> X86II::VEXShift) & X86II::XOP_W) + XOP_W = 1; + if ((TSFlags >> X86II::VEXShift) & X86II::VEX_L) VEX_L = 1; @@ -529,6 +536,9 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // src1(ModR/M), MemAddr, imm8 // src1(ModR/M), MemAddr, src2(VEX_I8IMM) // + // FMA4: + // dst(ModR/M.reg), src1(VEX_4V), src2(ModR/M), src3(VEX_I8IMM) + // dst(ModR/M.reg), src1(VEX_4V), src2(VEX_I8IMM), src3(ModR/M), if (X86II::isX86_64ExtendedReg(MI.getOperand(0).getReg())) VEX_R = 0x0; @@ -629,7 +639,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, // 3 byte VEX prefix EmitByte(0xC4, CurByte, OS); EmitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M, CurByte, OS); - EmitByte(LastByte | (VEX_W << 7), CurByte, OS); + EmitByte(LastByte | ((VEX_W | XOP_W) << 7), CurByte, OS); } /// DetermineREXPrefix - Determine if the MCInst has to be encoded with a X86-64 @@ -889,6 +899,8 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, // It uses the VEX.VVVV field? 
bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V; bool HasVEX_4VOp3 = (TSFlags >> X86II::VEXShift) & X86II::VEX_4VOp3; + bool HasXOP_W = (TSFlags >> X86II::VEXShift) & X86II::XOP_W; + unsigned XOP_W_I8IMMOperand = 2; // Determine where the memory operand starts, if present. int MemoryOperand = X86II::getMemoryOperandNo(TSFlags, Opcode); @@ -961,6 +973,10 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV) SrcRegNum++; + // GAS sets the XOP_W even with register operands, we want to match this. + // XOP_W is ignored, so there is no swapping of the operands + XOP_W_I8IMMOperand = 3; + EmitRegModRMByte(MI.getOperand(SrcRegNum), GetX86RegNum(MI.getOperand(CurOp)), CurByte, OS); CurOp = SrcRegNum + 1; @@ -975,14 +991,20 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, ++AddrOperands; ++FirstMemOp; // Skip the register source (which is encoded in VEX_VVVV). } + if(HasXOP_W) // Skip second register source (encoded in I8IMM) + ++FirstMemOp; EmitByte(BaseOpcode, CurByte, OS); EmitMemModRMByte(MI, FirstMemOp, GetX86RegNum(MI.getOperand(CurOp)), TSFlags, CurByte, OS, Fixups); - CurOp += AddrOperands + 1; - if (HasVEX_4VOp3) - ++CurOp; + if(HasXOP_W) { + CurOp = NumOps - 1; // We have consumed all except one operand (third) + } else { + CurOp += AddrOperands + 1; + if (HasVEX_4VOp3) + ++CurOp; + } break; } @@ -1064,7 +1086,9 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, // The last source register of a 4 operand instruction in AVX is encoded // in bits[7:4] of a immediate byte, and bits[3:0] are ignored. if ((TSFlags >> X86II::VEXShift) & X86II::VEX_I8IMM) { - const MCOperand &MO = MI.getOperand(CurOp++); + const MCOperand &MO = MI.getOperand(HasXOP_W ? XOP_W_I8IMMOperand + : CurOp); + CurOp++; bool IsExtReg = X86II::isX86_64ExtendedReg(MO.getReg()); unsigned RegNum = (IsExtReg ? 
(1 << 7) : 0); RegNum |= GetX86RegNum(MO) << 4; diff --git a/lib/Target/X86/X86InstrFMA.td b/lib/Target/X86/X86InstrFMA.td index d868773d2d..bdf797d5e1 100644 --- a/lib/Target/X86/X86InstrFMA.td +++ b/lib/Target/X86/X86InstrFMA.td @@ -58,3 +58,42 @@ let isAsmParserOnly = 1 in { defm VFNMSUBPS : fma_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps">; defm VFNMSUBPD : fma_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "pd">, VEX_W; } + +//===----------------------------------------------------------------------===// +// FMA4 - AMD 4 operand Fused Multiply-Add instructions +//===----------------------------------------------------------------------===// + + +multiclass fma4s<bits<8> opc, string OpcodeStr> { + def rr : FMA4<opc, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src2, $src3, $src1, $dst|$dst, $src1, $src3, $src2}"), + []>, XOP_W; + def rm : FMA4<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, f128mem:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>, XOP_W; + def mr : FMA4<opc, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, f128mem:$src2, VR128:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), + []>; + +} + +let isAsmParserOnly = 1 in { + defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd">; +} + +// FMA4 Intrinsics patterns + +def : Pat<(int_x86_fma4_vfmadd_sd VR128:$src1, VR128:$src2, VR128:$src3), + (VFMADDSD4rr VR128:$src1, VR128:$src2, VR128:$src3)>; +def : Pat<(int_x86_fma4_vfmadd_sd VR128:$src1, VR128:$src2, + (alignedloadv2f64 addr:$src3)), + (VFMADDSD4rm VR128:$src1, VR128:$src2, addr:$src3)>; +def : Pat<(int_x86_fma4_vfmadd_sd VR128:$src1, (alignedloadv2f64 addr:$src2), + VR128:$src3), + (VFMADDSD4mr VR128:$src1, addr:$src2, VR128:$src3)>; diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td index ecd6a93ef0..08c56c2e69 100644 --- a/lib/Target/X86/X86InstrFormats.td +++ 
b/lib/Target/X86/X86InstrFormats.td @@ -118,7 +118,7 @@ class VEX_I8IMM { bit hasVEX_i8ImmReg = 1; } class VEX_L { bit hasVEX_L = 1; } class VEX_LIG { bit ignoresVEX_L = 1; } class Has3DNow0F0FOpcode { bit has3DNow0F0FOpcode = 1; } - +class XOP_W { bit hasXOP_WPrefix = 1; } class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, string AsmStr, Domain d = GenericDomain> : Instruction { @@ -158,6 +158,7 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, bit hasVEX_L = 0; // Does this inst use large (256-bit) registers? bit ignoresVEX_L = 0; // Does this instruction ignore the L-bit bit has3DNow0F0FOpcode =0;// Wacky 3dNow! encoding? + bit hasXOP_WPrefix = 0; // Same bit as VEX_W, but used for swapping operands // TSFlags layout should be kept in sync with X86InstrInfo.h. let TSFlags{5-0} = FormBits; @@ -179,6 +180,7 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, let TSFlags{38} = hasVEX_L; let TSFlags{39} = ignoresVEX_L; let TSFlags{40} = has3DNow0F0FOpcode; + let TSFlags{41} = hasXOP_WPrefix; } class PseudoI<dag oops, dag iops, list<dag> pattern> @@ -496,6 +498,12 @@ class FMA3<bits<8> o, Format F, dag outs, dag ins, string asm, : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, T8, OpSize, VEX_4V, Requires<[HasFMA3]>; +// FMA4 Instruction Templates +class FMA4<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag>pattern> + : I<o, F, outs, ins, asm, pattern, SSEPackedInt>, TA, + OpSize, VEX_4V, VEX_I8IMM, Requires<[HasFMA4]>; + // X86-64 Instruction templates... 
// diff --git a/test/CodeGen/X86/fma4-intrinsics-x86_64.ll b/test/CodeGen/X86/fma4-intrinsics-x86_64.ll new file mode 100644 index 0000000000..39c2311eb5 --- /dev/null +++ b/test/CodeGen/X86/fma4-intrinsics-x86_64.ll @@ -0,0 +1,9 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mattr=fma4 | FileCheck %s + +define < 2 x double > @test_x86_fma4_vfmadd_sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) { + ; CHECK: vfmaddsd + %res = call < 2 x double > @llvm.x86.fma4.vfmadd.sd(< 2 x double > %a0, < 2 x double > %a1, < 2 x double > %a2) ; <i64> [#uses=1] + ret < 2 x double > %res +} +declare < 2 x double > @llvm.x86.fma4.vfmadd.sd(< 2 x double >, < 2 x double >, < 2 x double >) nounwind readnone + diff --git a/test/MC/X86/x86_64-fma4-encoding.s b/test/MC/X86/x86_64-fma4-encoding.s new file mode 100644 index 0000000000..e0d2602901 --- /dev/null +++ b/test/MC/X86/x86_64-fma4-encoding.s @@ -0,0 +1,13 @@ +// RUN: llvm-mc -triple x86_64-unknown-unknown --show-encoding %s | FileCheck %s + +// CHECK: vfmaddsd (%rcx), %xmm1, %xmm0, %xmm0 +// CHECK: encoding: [0xc4,0xe3,0xf9,0x6b,0x01,0x10] + vfmaddsd (%rcx), %xmm1, %xmm0, %xmm0 + +// CHECK: vfmaddsd %xmm1, (%rcx), %xmm0, %xmm0 +// CHECK: encoding: [0xc4,0xe3,0x79,0x6b,0x01,0x10] + vfmaddsd %xmm1, (%rcx),%xmm0, %xmm0 + +// CHECK: vfmaddsd %xmm2, %xmm1, %xmm0, %xmm0 +// CHECK: encoding: [0xc4,0xe3,0xf9,0x6b,0xc2,0x10] + vfmaddsd %xmm2, %xmm1, %xmm0, %xmm0 |