author    Alexander Kornienko <alexfh@google.com>    2013-03-14 10:51:38 +0000
committer Alexander Kornienko <alexfh@google.com>    2013-03-14 10:51:38 +0000
commit    647735c781c5b37061ee03d6e9e6c7dda92218e2 (patch)
tree      5a5e56606d41060263048b5a5586b3d2380898ba /lib/Target/X86
parent    6aed25d93d1cfcde5809a73ffa7dc1b0d6396f66 (diff)
parent    f635ef401786c84df32090251a8cf45981ecca33 (diff)
Updating branches/google/stable to r176857
git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/google/stable@177040 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib/Target/X86')
-rw-r--r--  lib/Target/X86/AsmParser/CMakeLists.txt | 1
-rw-r--r--  lib/Target/X86/AsmParser/X86AsmLexer.cpp | 159
-rw-r--r--  lib/Target/X86/AsmParser/X86AsmParser.cpp | 545
-rw-r--r--  lib/Target/X86/CMakeLists.txt | 3
-rw-r--r--  lib/Target/X86/Disassembler/X86Disassembler.cpp | 6
-rw-r--r--  lib/Target/X86/Disassembler/X86Disassembler.h | 4
-rw-r--r--  lib/Target/X86/InstPrinter/X86InstComments.cpp | 135
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp | 34
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86BaseInfo.h | 4
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp | 14
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp | 3
-rw-r--r--  lib/Target/X86/Makefile | 3
-rw-r--r--  lib/Target/X86/README-SSE.txt | 9
-rw-r--r--  lib/Target/X86/README.txt | 37
-rw-r--r--  lib/Target/X86/TargetInfo/X86TargetInfo.cpp | 2
-rw-r--r--  lib/Target/X86/Utils/X86ShuffleDecode.cpp | 18
-rw-r--r--  lib/Target/X86/Utils/X86ShuffleDecode.h | 2
-rw-r--r--  lib/Target/X86/X86.h | 7
-rw-r--r--  lib/Target/X86/X86.td | 14
-rw-r--r--  lib/Target/X86/X86AsmPrinter.cpp | 55
-rw-r--r--  lib/Target/X86/X86AsmPrinter.h | 8
-rw-r--r--  lib/Target/X86/X86CallingConv.td | 3
-rw-r--r--  lib/Target/X86/X86CodeEmitter.cpp | 4
-rw-r--r--  lib/Target/X86/X86CompilationCallback_Win64.asm | 4
-rw-r--r--  lib/Target/X86/X86FastISel.cpp | 144
-rw-r--r--  lib/Target/X86/X86FloatingPoint.cpp | 2
-rw-r--r--  lib/Target/X86/X86FrameLowering.cpp | 338
-rw-r--r--  lib/Target/X86/X86FrameLowering.h | 6
-rw-r--r--  lib/Target/X86/X86ISelDAGToDAG.cpp | 23
-rw-r--r--  lib/Target/X86/X86ISelLowering.cpp | 2431
-rw-r--r--  lib/Target/X86/X86ISelLowering.h | 91
-rw-r--r--  lib/Target/X86/X86Instr3DNow.td | 6
-rw-r--r--  lib/Target/X86/X86InstrArithmetic.td | 392
-rw-r--r--  lib/Target/X86/X86InstrCMovSetCC.td | 12
-rw-r--r--  lib/Target/X86/X86InstrCompiler.td | 277
-rw-r--r--  lib/Target/X86/X86InstrFMA.td | 18
-rw-r--r--  lib/Target/X86/X86InstrFormats.td | 2
-rw-r--r--  lib/Target/X86/X86InstrFragmentsSIMD.td | 8
-rw-r--r--  lib/Target/X86/X86InstrInfo.cpp | 711
-rw-r--r--  lib/Target/X86/X86InstrInfo.td | 164
-rw-r--r--  lib/Target/X86/X86InstrSSE.td | 1743
-rw-r--r--  lib/Target/X86/X86InstrShiftRotate.td | 6
-rw-r--r--  lib/Target/X86/X86InstrSystem.td | 8
-rw-r--r--  lib/Target/X86/X86InstrTSX.td | 2
-rw-r--r--  lib/Target/X86/X86JITInfo.cpp | 36
-rw-r--r--  lib/Target/X86/X86JITInfo.h | 2
-rw-r--r--  lib/Target/X86/X86MCInstLower.cpp | 5
-rw-r--r--  lib/Target/X86/X86PadShortFunction.cpp | 212
-rw-r--r--  lib/Target/X86/X86RegisterInfo.cpp | 234
-rw-r--r--  lib/Target/X86/X86RegisterInfo.h | 7
-rw-r--r--  lib/Target/X86/X86Schedule.td | 5
-rw-r--r--  lib/Target/X86/X86ScheduleAtom.td | 1
-rw-r--r--  lib/Target/X86/X86SelectionDAGInfo.cpp | 10
-rw-r--r--  lib/Target/X86/X86Subtarget.cpp | 112
-rw-r--r--  lib/Target/X86/X86Subtarget.h | 40
-rw-r--r--  lib/Target/X86/X86TargetMachine.cpp | 53
-rw-r--r--  lib/Target/X86/X86TargetMachine.h | 26
-rw-r--r--  lib/Target/X86/X86TargetObjectFile.cpp | 6
-rw-r--r--  lib/Target/X86/X86TargetTransformInfo.cpp | 373
-rw-r--r--  lib/Target/X86/X86VZeroUpper.cpp | 10
60 files changed, 4996 insertions, 3594 deletions
diff --git a/lib/Target/X86/AsmParser/CMakeLists.txt b/lib/Target/X86/AsmParser/CMakeLists.txt
index 47489bb06c..54204d4b63 100644
--- a/lib/Target/X86/AsmParser/CMakeLists.txt
+++ b/lib/Target/X86/AsmParser/CMakeLists.txt
@@ -1,7 +1,6 @@
include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
add_llvm_library(LLVMX86AsmParser
- X86AsmLexer.cpp
X86AsmParser.cpp
)
diff --git a/lib/Target/X86/AsmParser/X86AsmLexer.cpp b/lib/Target/X86/AsmParser/X86AsmLexer.cpp
deleted file mode 100644
index b12399d447..0000000000
--- a/lib/Target/X86/AsmParser/X86AsmLexer.cpp
+++ /dev/null
@@ -1,159 +0,0 @@
-//===-- X86AsmLexer.cpp - Tokenize X86 assembly to AsmTokens --------------===//
-//
-// The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "MCTargetDesc/X86BaseInfo.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCParser/MCAsmLexer.h"
-#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
-#include "llvm/MC/MCTargetAsmLexer.h"
-#include "llvm/Support/TargetRegistry.h"
-
-using namespace llvm;
-
-namespace {
-
-class X86AsmLexer : public MCTargetAsmLexer {
- const MCAsmInfo &AsmInfo;
-
- bool tentativeIsValid;
- AsmToken tentativeToken;
-
- const AsmToken &lexTentative() {
- tentativeToken = getLexer()->Lex();
- tentativeIsValid = true;
- return tentativeToken;
- }
-
- const AsmToken &lexDefinite() {
- if (tentativeIsValid) {
- tentativeIsValid = false;
- return tentativeToken;
- }
- return getLexer()->Lex();
- }
-
- AsmToken LexTokenATT();
- AsmToken LexTokenIntel();
-protected:
- AsmToken LexToken() {
- if (!Lexer) {
- SetError(SMLoc(), "No MCAsmLexer installed");
- return AsmToken(AsmToken::Error, "", 0);
- }
-
- switch (AsmInfo.getAssemblerDialect()) {
- default:
- SetError(SMLoc(), "Unhandled dialect");
- return AsmToken(AsmToken::Error, "", 0);
- case 0:
- return LexTokenATT();
- case 1:
- return LexTokenIntel();
- }
- }
-public:
- X86AsmLexer(const Target &T, const MCRegisterInfo &MRI, const MCAsmInfo &MAI)
- : MCTargetAsmLexer(T), AsmInfo(MAI), tentativeIsValid(false) {
- }
-};
-
-} // end anonymous namespace
-
-#define GET_REGISTER_MATCHER
-#include "X86GenAsmMatcher.inc"
-
-AsmToken X86AsmLexer::LexTokenATT() {
- AsmToken lexedToken = lexDefinite();
-
- switch (lexedToken.getKind()) {
- default:
- return lexedToken;
- case AsmToken::Error:
- SetError(Lexer->getErrLoc(), Lexer->getErr());
- return lexedToken;
-
- case AsmToken::Percent: {
- const AsmToken &nextToken = lexTentative();
- if (nextToken.getKind() != AsmToken::Identifier)
- return lexedToken;
-
- if (unsigned regID = MatchRegisterName(nextToken.getString())) {
- lexDefinite();
-
- // FIXME: This is completely wrong when there is a space or other
- // punctuation between the % and the register name.
- StringRef regStr(lexedToken.getString().data(),
- lexedToken.getString().size() +
- nextToken.getString().size());
-
- return AsmToken(AsmToken::Register, regStr,
- static_cast<int64_t>(regID));
- }
-
- // Match register name failed. If this is "db[0-7]", match it as an alias
- // for dr[0-7].
- if (nextToken.getString().size() == 3 &&
- nextToken.getString().startswith("db")) {
- int RegNo = -1;
- switch (nextToken.getString()[2]) {
- case '0': RegNo = X86::DR0; break;
- case '1': RegNo = X86::DR1; break;
- case '2': RegNo = X86::DR2; break;
- case '3': RegNo = X86::DR3; break;
- case '4': RegNo = X86::DR4; break;
- case '5': RegNo = X86::DR5; break;
- case '6': RegNo = X86::DR6; break;
- case '7': RegNo = X86::DR7; break;
- }
-
- if (RegNo != -1) {
- lexDefinite();
-
- // FIXME: This is completely wrong when there is a space or other
- // punctuation between the % and the register name.
- StringRef regStr(lexedToken.getString().data(),
- lexedToken.getString().size() +
- nextToken.getString().size());
- return AsmToken(AsmToken::Register, regStr,
- static_cast<int64_t>(RegNo));
- }
- }
-
-
- return lexedToken;
- }
- }
-}
-
-AsmToken X86AsmLexer::LexTokenIntel() {
- const AsmToken &lexedToken = lexDefinite();
-
- switch(lexedToken.getKind()) {
- default:
- return lexedToken;
- case AsmToken::Error:
- SetError(Lexer->getErrLoc(), Lexer->getErr());
- return lexedToken;
- case AsmToken::Identifier: {
- unsigned regID = MatchRegisterName(lexedToken.getString().lower());
-
- if (regID)
- return AsmToken(AsmToken::Register,
- lexedToken.getString(),
- static_cast<int64_t>(regID));
- return lexedToken;
- }
- }
-}
-
-extern "C" void LLVMInitializeX86AsmLexer() {
- RegisterMCAsmLexer<X86AsmLexer> X(TheX86_32Target);
- RegisterMCAsmLexer<X86AsmLexer> Y(TheX86_64Target);
-}
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index fdb7583ed0..ee5c2b2bfd 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -57,7 +57,7 @@ private:
X86Operand *ParseATTOperand();
X86Operand *ParseIntelOperand();
X86Operand *ParseIntelOffsetOfOperator(SMLoc StartLoc);
- X86Operand *ParseIntelTypeOperator(SMLoc StartLoc);
+ X86Operand *ParseIntelOperator(SMLoc StartLoc, unsigned OpKind);
X86Operand *ParseIntelMemOperand(unsigned SegReg, SMLoc StartLoc);
X86Operand *ParseIntelBracExpression(unsigned SegReg, unsigned Size);
X86Operand *ParseMemOperand(unsigned SegReg, SMLoc StartLoc);
@@ -168,6 +168,7 @@ struct X86Operand : public MCParsedAsmOperand {
SMLoc StartLoc, EndLoc;
SMLoc OffsetOfLoc;
+ bool AddressOf;
union {
struct {
@@ -340,6 +341,10 @@ struct X86Operand : public MCParsedAsmOperand {
return OffsetOfLoc.getPointer();
}
+ bool needAddressOf() const {
+ return AddressOf;
+ }
+
bool needSizeDirective() const {
assert(Kind == Memory && "Invalid access!");
return Mem.NeedSizeDir;
@@ -463,7 +468,7 @@ struct X86Operand : public MCParsedAsmOperand {
}
static X86Operand *CreateToken(StringRef Str, SMLoc Loc) {
- SMLoc EndLoc = SMLoc::getFromPointer(Loc.getPointer() + Str.size() - 1);
+ SMLoc EndLoc = SMLoc::getFromPointer(Loc.getPointer() + Str.size());
X86Operand *Res = new X86Operand(Token, Loc, EndLoc);
Res->Tok.Data = Str.data();
Res->Tok.Length = Str.size();
@@ -471,9 +476,11 @@ struct X86Operand : public MCParsedAsmOperand {
}
static X86Operand *CreateReg(unsigned RegNo, SMLoc StartLoc, SMLoc EndLoc,
+ bool AddressOf = false,
SMLoc OffsetOfLoc = SMLoc()) {
X86Operand *Res = new X86Operand(Register, StartLoc, EndLoc);
Res->Reg.RegNo = RegNo;
+ Res->AddressOf = AddressOf;
Res->OffsetOfLoc = OffsetOfLoc;
return Res;
}
@@ -488,7 +495,7 @@ struct X86Operand : public MCParsedAsmOperand {
/// Create an absolute memory operand.
static X86Operand *CreateMem(const MCExpr *Disp, SMLoc StartLoc, SMLoc EndLoc,
- unsigned Size = 0, bool NeedSizeDir = false){
+ unsigned Size = 0, bool NeedSizeDir = false) {
X86Operand *Res = new X86Operand(Memory, StartLoc, EndLoc);
Res->Mem.SegReg = 0;
Res->Mem.Disp = Disp;
@@ -497,6 +504,7 @@ struct X86Operand : public MCParsedAsmOperand {
Res->Mem.Scale = 1;
Res->Mem.Size = Size;
Res->Mem.NeedSizeDir = NeedSizeDir;
+ Res->AddressOf = false;
return Res;
}
@@ -520,6 +528,7 @@ struct X86Operand : public MCParsedAsmOperand {
Res->Mem.Scale = Scale;
Res->Mem.Size = Size;
Res->Mem.NeedSizeDir = NeedSizeDir;
+ Res->AddressOf = false;
return Res;
}
};
@@ -558,10 +567,12 @@ bool X86AsmParser::ParseRegister(unsigned &RegNo,
Parser.Lex(); // Eat percent token.
const AsmToken &Tok = Parser.getTok();
+ EndLoc = Tok.getEndLoc();
+
if (Tok.isNot(AsmToken::Identifier)) {
if (isParsingIntelSyntax()) return true;
return Error(StartLoc, "invalid register name",
- SMRange(StartLoc, Tok.getEndLoc()));
+ SMRange(StartLoc, EndLoc));
}
RegNo = MatchRegisterName(Tok.getString());
@@ -582,13 +593,12 @@ bool X86AsmParser::ParseRegister(unsigned &RegNo,
X86II::isX86_64ExtendedReg(RegNo))
return Error(StartLoc, "register %"
+ Tok.getString() + " is only available in 64-bit mode",
- SMRange(StartLoc, Tok.getEndLoc()));
+ SMRange(StartLoc, EndLoc));
}
// Parse "%st" as "%st(0)" and "%st(1)", which is multiple tokens.
if (RegNo == 0 && (Tok.getString() == "st" || Tok.getString() == "ST")) {
RegNo = X86::ST0;
- EndLoc = Tok.getLoc();
Parser.Lex(); // Eat 'st'
// Check to see if we have '(4)' after %st.
@@ -615,11 +625,13 @@ bool X86AsmParser::ParseRegister(unsigned &RegNo,
if (getParser().Lex().isNot(AsmToken::RParen))
return Error(Parser.getTok().getLoc(), "expected ')'");
- EndLoc = Tok.getLoc();
+ EndLoc = Parser.getTok().getEndLoc();
Parser.Lex(); // Eat ')'
return false;
}
+ EndLoc = Parser.getTok().getEndLoc();
+
// If this is "db[0-7]", match it as an alias
// for dr[0-7].
if (RegNo == 0 && Tok.getString().size() == 3 &&
@@ -636,7 +648,7 @@ bool X86AsmParser::ParseRegister(unsigned &RegNo,
}
if (RegNo != 0) {
- EndLoc = Tok.getLoc();
+ EndLoc = Parser.getTok().getEndLoc();
Parser.Lex(); // Eat it.
return false;
}
@@ -645,10 +657,9 @@ bool X86AsmParser::ParseRegister(unsigned &RegNo,
if (RegNo == 0) {
if (isParsingIntelSyntax()) return true;
return Error(StartLoc, "invalid register name",
- SMRange(StartLoc, Tok.getEndLoc()));
+ SMRange(StartLoc, EndLoc));
}
- EndLoc = Tok.getEndLoc();
Parser.Lex(); // Eat identifier token.
return false;
}
@@ -673,115 +684,299 @@ static unsigned getIntelMemOperandSize(StringRef OpStr) {
return Size;
}
+enum IntelBracExprState {
+ IBES_START,
+ IBES_LBRAC,
+ IBES_RBRAC,
+ IBES_REGISTER,
+ IBES_REGISTER_STAR,
+ IBES_REGISTER_STAR_INTEGER,
+ IBES_INTEGER,
+ IBES_INTEGER_STAR,
+ IBES_INDEX_REGISTER,
+ IBES_IDENTIFIER,
+ IBES_DISP_EXPR,
+ IBES_MINUS,
+ IBES_ERROR
+};
+
+class IntelBracExprStateMachine {
+ IntelBracExprState State;
+ unsigned BaseReg, IndexReg, Scale;
+ int64_t Disp;
+
+ unsigned TmpReg;
+ int64_t TmpInteger;
+
+ bool isPlus;
+
+public:
+ IntelBracExprStateMachine(MCAsmParser &parser) :
+ State(IBES_START), BaseReg(0), IndexReg(0), Scale(1), Disp(0),
+ TmpReg(0), TmpInteger(0), isPlus(true) {}
+
+ unsigned getBaseReg() { return BaseReg; }
+ unsigned getIndexReg() { return IndexReg; }
+ unsigned getScale() { return Scale; }
+ int64_t getDisp() { return Disp; }
+ bool isValidEndState() { return State == IBES_RBRAC; }
+
+ void onPlus() {
+ switch (State) {
+ default:
+ State = IBES_ERROR;
+ break;
+ case IBES_INTEGER:
+ State = IBES_START;
+ if (isPlus)
+ Disp += TmpInteger;
+ else
+ Disp -= TmpInteger;
+ break;
+ case IBES_REGISTER:
+ State = IBES_START;
+ // If we already have a BaseReg, then assume this is the IndexReg with a
+ // scale of 1.
+ if (!BaseReg) {
+ BaseReg = TmpReg;
+ } else {
+ assert (!IndexReg && "BaseReg/IndexReg already set!");
+ IndexReg = TmpReg;
+ Scale = 1;
+ }
+ break;
+ case IBES_INDEX_REGISTER:
+ State = IBES_START;
+ break;
+ }
+ isPlus = true;
+ }
+ void onMinus() {
+ switch (State) {
+ default:
+ State = IBES_ERROR;
+ break;
+ case IBES_START:
+ State = IBES_MINUS;
+ break;
+ case IBES_INTEGER:
+ State = IBES_START;
+ if (isPlus)
+ Disp += TmpInteger;
+ else
+ Disp -= TmpInteger;
+ break;
+ case IBES_REGISTER:
+ State = IBES_START;
+ // If we already have a BaseReg, then assume this is the IndexReg with a
+ // scale of 1.
+ if (!BaseReg) {
+ BaseReg = TmpReg;
+ } else {
+ assert (!IndexReg && "BaseReg/IndexReg already set!");
+ IndexReg = TmpReg;
+ Scale = 1;
+ }
+ break;
+ case IBES_INDEX_REGISTER:
+ State = IBES_START;
+ break;
+ }
+ isPlus = false;
+ }
+ void onRegister(unsigned Reg) {
+ switch (State) {
+ default:
+ State = IBES_ERROR;
+ break;
+ case IBES_START:
+ State = IBES_REGISTER;
+ TmpReg = Reg;
+ break;
+ case IBES_INTEGER_STAR:
+ assert (!IndexReg && "IndexReg already set!");
+ State = IBES_INDEX_REGISTER;
+ IndexReg = Reg;
+ Scale = TmpInteger;
+ break;
+ }
+ }
+ void onDispExpr() {
+ switch (State) {
+ default:
+ State = IBES_ERROR;
+ break;
+ case IBES_START:
+ State = IBES_DISP_EXPR;
+ break;
+ }
+ }
+ void onInteger(int64_t TmpInt) {
+ switch (State) {
+ default:
+ State = IBES_ERROR;
+ break;
+ case IBES_START:
+ State = IBES_INTEGER;
+ TmpInteger = TmpInt;
+ break;
+ case IBES_MINUS:
+ State = IBES_INTEGER;
+ TmpInteger = TmpInt;
+ break;
+ case IBES_REGISTER_STAR:
+ assert (!IndexReg && "IndexReg already set!");
+ State = IBES_INDEX_REGISTER;
+ IndexReg = TmpReg;
+ Scale = TmpInt;
+ break;
+ }
+ }
+ void onStar() {
+ switch (State) {
+ default:
+ State = IBES_ERROR;
+ break;
+ case IBES_INTEGER:
+ State = IBES_INTEGER_STAR;
+ break;
+ case IBES_REGISTER:
+ State = IBES_REGISTER_STAR;
+ break;
+ }
+ }
+ void onLBrac() {
+ switch (State) {
+ default:
+ State = IBES_ERROR;
+ break;
+ case IBES_RBRAC:
+ State = IBES_START;
+ isPlus = true;
+ break;
+ }
+ }
+ void onRBrac() {
+ switch (State) {
+ default:
+ State = IBES_ERROR;
+ break;
+ case IBES_DISP_EXPR:
+ State = IBES_RBRAC;
+ break;
+ case IBES_INTEGER:
+ State = IBES_RBRAC;
+ if (isPlus)
+ Disp += TmpInteger;
+ else
+ Disp -= TmpInteger;
+ break;
+ case IBES_REGISTER:
+ State = IBES_RBRAC;
+ // If we already have a BaseReg, then assume this is the IndexReg with a
+ // scale of 1.
+ if (!BaseReg) {
+ BaseReg = TmpReg;
+ } else {
+ assert (!IndexReg && "BaseReg/IndexReg already set!");
+ IndexReg = TmpReg;
+ Scale = 1;
+ }
+ break;
+ case IBES_INDEX_REGISTER:
+ State = IBES_RBRAC;
+ break;
+ }
+ }
+};
+
X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg,
unsigned Size) {
- unsigned BaseReg = 0, IndexReg = 0, Scale = 1;
const AsmToken &Tok = Parser.getTok();
- SMLoc Start = Tok.getLoc(), End;
-
- const MCExpr *Disp = MCConstantExpr::Create(0, getContext());
- // Parse [ BaseReg + Scale*IndexReg + Disp ] or [ symbol ]
+ SMLoc Start = Tok.getLoc(), End = Tok.getEndLoc();
// Eat '['
if (getLexer().isNot(AsmToken::LBrac))
return ErrorOperand(Start, "Expected '[' token!");
Parser.Lex();
+ unsigned TmpReg = 0;
+
+ // Try to handle '[' 'symbol' ']'
if (getLexer().is(AsmToken::Identifier)) {
- // Parse BaseReg
- if (ParseRegister(BaseReg, Start, End)) {
- // Handle '[' 'symbol' ']'
- if (getParser().ParseExpression(Disp, End)) return 0;
+ if (ParseRegister(TmpReg, Start, End)) {
+ const MCExpr *Disp;
+ if (getParser().parseExpression(Disp, End))
+ return 0;
+
if (getLexer().isNot(AsmToken::RBrac))
- return ErrorOperand(Start, "Expected ']' token!");
+ return ErrorOperand(Parser.getTok().getLoc(), "Expected ']' token!");
+ // Adjust the EndLoc due to the ']'.
+ End = SMLoc::getFromPointer(Parser.getTok().getEndLoc().getPointer()-1);
Parser.Lex();
- End = Tok.getLoc();
return X86Operand::CreateMem(Disp, Start, End, Size);
}
- } else if (getLexer().is(AsmToken::Integer)) {
- int64_t Val = Tok.getIntVal();
- Parser.Lex();
- SMLoc Loc = Tok.getLoc();
- if (getLexer().is(AsmToken::RBrac)) {
- // Handle '[' number ']'
- Parser.Lex();
- End = Tok.getLoc();
- const MCExpr *Disp = MCConstantExpr::Create(Val, getContext());
- if (SegReg)
- return X86Operand::CreateMem(SegReg, Disp, 0, 0, Scale,
- Start, End, Size);
- return X86Operand::CreateMem(Disp, Start, End, Size);
- } else if (getLexer().is(AsmToken::Star)) {
- // Handle '[' Scale*IndexReg ']'
- Parser.Lex();
- SMLoc IdxRegLoc = Tok.getLoc();
- if (ParseRegister(IndexReg, IdxRegLoc, End))
- return ErrorOperand(IdxRegLoc, "Expected register");
- Scale = Val;
- } else
- return ErrorOperand(Loc, "Unexpected token");
}
- // Parse ][ as a plus.
- bool ExpectRBrac = true;
- if (getLexer().is(AsmToken::RBrac)) {
- ExpectRBrac = false;
- Parser.Lex();
- End = Tok.getLoc();
- }
+ // Parse [ BaseReg + Scale*IndexReg + Disp ].
+ bool Done = false;
+ IntelBracExprStateMachine SM(Parser);
+
+ // If we parsed a register, then the end loc has already been set and
+ // the identifier has already been lexed. We also need to update the
+ // state.
+ if (TmpReg)
+ SM.onRegister(TmpReg);
+
+ const MCExpr *Disp = 0;
+ while (!Done) {
+ bool UpdateLocLex = true;
- if (getLexer().is(AsmToken::Plus) || getLexer().is(AsmToken::Minus) ||
- getLexer().is(AsmToken::LBrac)) {
- ExpectRBrac = true;
- bool isPlus = getLexer().is(AsmToken::Plus) ||
- getLexer().is(AsmToken::LBrac);
- Parser.Lex();
- SMLoc PlusLoc = Tok.getLoc();
- if (getLexer().is(AsmToken::Integer)) {
+ // The period in the dot operator (e.g., [ebx].foo.bar) is parsed as an
+ // identifier. Don't try to parse it as a register.
+ if (Tok.getString().startswith("."))
+ break;
+
+ switch (getLexer().getKind()) {
+ default: {
+ if (SM.isValidEndState()) {
+ Done = true;
+ break;
+ }
+ return ErrorOperand(Tok.getLoc(), "Unexpected token!");
+ }
+ case AsmToken::Identifier: {
+ // This could be a register or a displacement expression.
+ if(!ParseRegister(TmpReg, Start, End)) {
+ SM.onRegister(TmpReg);
+ UpdateLocLex = false;
+ break;
+ } else if (!getParser().parseExpression(Disp, End)) {
+ SM.onDispExpr();
+ UpdateLocLex = false;
+ break;
+ }
+ return ErrorOperand(Tok.getLoc(), "Unexpected identifier!");
+ }
+ case AsmToken::Integer: {
int64_t Val = Tok.getIntVal();
- Parser.Lex();
- if (getLexer().is(AsmToken::Star)) {
- Parser.Lex();
- SMLoc IdxRegLoc = Tok.getLoc();
- if (ParseRegister(IndexReg, IdxRegLoc, End))
- return ErrorOperand(IdxRegLoc, "Expected register");
- Scale = Val;
- } else if (getLexer().is(AsmToken::RBrac)) {
- const MCExpr *ValExpr = MCConstantExpr::Create(Val, getContext());
- Disp = isPlus ? ValExpr : MCConstantExpr::Create(0-Val, getContext());
- } else
- return ErrorOperand(PlusLoc, "unexpected token after +");
- } else if (getLexer().is(AsmToken::Identifier)) {
- // This could be an index register or a displacement expression.
- End = Tok.getLoc();
- if (!IndexReg)
- ParseRegister(IndexReg, Start, End);
- else if (getParser().ParseExpression(Disp, End)) return 0;
+ SM.onInteger(Val);
+ break;
}
- }
-
- // Parse ][ as a plus.
- if (getLexer().is(AsmToken::RBrac)) {
- ExpectRBrac = false;
- Parser.Lex();
- End = Tok.getLoc();
- if (getLexer().is(AsmToken::LBrac)) {
- ExpectRBrac = true;
- Parser.Lex();
- if (getParser().ParseExpression(Disp, End))
- return 0;
+ case AsmToken::Plus: SM.onPlus(); break;
+ case AsmToken::Minus: SM.onMinus(); break;
+ case AsmToken::Star: SM.onStar(); break;
+ case AsmToken::LBrac: SM.onLBrac(); break;
+ case AsmToken::RBrac: SM.onRBrac(); break;
+ }
+ if (!Done && UpdateLocLex) {
+ End = Tok.getLoc();
+ Parser.Lex(); // Consume the token.
}
- } else if (ExpectRBrac) {
- if (getParser().ParseExpression(Disp, End))
- return 0;
}
- if (ExpectRBrac) {
- if (getLexer().isNot(AsmToken::RBrac))
- return ErrorOperand(End, "expected ']' token!");
- Parser.Lex();
- End = Tok.getLoc();
- }
+ if (!Disp)
+ Disp = MCConstantExpr::Create(SM.getDisp(), getContext());
// Parse the dot operator (e.g., [ebx].foo.bar).
if (Tok.getString().startswith(".")) {
@@ -790,16 +985,23 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg,
if (ParseIntelDotOperator(Disp, &NewDisp, Err))
return ErrorOperand(Tok.getLoc(), Err);
+ End = Parser.getTok().getEndLoc();
Parser.Lex(); // Eat the field.
Disp = NewDisp;
}
- End = Tok.getLoc();
+ int BaseReg = SM.getBaseReg();
+ int IndexReg = SM.getIndexReg();
// handle [-42]
- if (!BaseReg && !IndexReg)
- return X86Operand::CreateMem(Disp, Start, End, Size);
+ if (!BaseReg && !IndexReg) {
+ if (!SegReg)
+ return X86Operand::CreateMem(Disp, Start, End);
+ else
+ return X86Operand::CreateMem(SegReg, Disp, 0, 0, 1, Start, End, Size);
+ }
+ int Scale = SM.getScale();
return X86Operand::CreateMem(SegReg, Disp, BaseReg, IndexReg, Scale,
Start, End, Size);
}
@@ -831,28 +1033,43 @@ X86Operand *X86AsmParser::ParseIntelMemOperand(unsigned SegReg, SMLoc Start) {
}
const MCExpr *Disp = MCConstantExpr::Create(0, getParser().getContext());
- if (getParser().ParseExpression(Disp, End)) return 0;
- End = Parser.getTok().getLoc();
+ if (getParser().parseExpression(Disp, End))
+ return 0;
bool NeedSizeDir = false;
- if (!Size && isParsingInlineAsm()) {
+ bool IsVarDecl = false;
+ if (isParsingInlineAsm()) {
if (const MCSymbolRefExpr *SymRef = dyn_cast<MCSymbolRefExpr>(Disp)) {
const MCSymbol &Sym = SymRef->getSymbol();
// FIXME: The SemaLookup will fail if the name is anything other then an
// identifier.
// FIXME: Pass a valid SMLoc.
- SemaCallback->LookupInlineAsmIdentifier(Sym.getName(), NULL, Size);
+ unsigned tLength, tSize, tType;
+ SemaCallback->LookupInlineAsmIdentifier(Sym.getName(), NULL, tLength,
+ tSize, tType, IsVarDecl);
+ if (!Size)
+ Size = tType * 8; // Size is in terms of bits in this context.
NeedSizeDir = Size > 0;
}
}
if (!isParsingInlineAsm())
return X86Operand::CreateMem(Disp, Start, End, Size);
- else
+ else {
+ // If this is not a VarDecl then assume it is a FuncDecl or some other label
+ // reference. We need an 'r' constraint here, so we need to create register
+ // operand to ensure proper matching. Just pick a GPR based on the size of
+ // a pointer.
+ if (!IsVarDecl) {
+ unsigned RegNo = is64BitMode() ? X86::RBX : X86::EBX;
+ return X86Operand::CreateReg(RegNo, Start, End, /*AddressOf=*/true);
+ }
+
// When parsing inline assembly we set the base register to a non-zero value
// as we don't know the actual value at this time. This is necessary to
// get the matching correct in some cases.
return X86Operand::CreateMem(/*SegReg*/0, Disp, /*BaseReg*/1, /*IndexReg*/0,
/*Scale*/1, Start, End, Size, NeedSizeDir);
+ }
}
/// Parse the '.' operator.
@@ -918,11 +1135,9 @@ X86Operand *X86AsmParser::ParseIntelOffsetOfOperator(SMLoc Start) {
SMLoc End;
const MCExpr *Val;
- if (getParser().ParseExpression(Val, End))
+ if (getParser().parseExpression(Val, End))
return ErrorOperand(Start, "Unable to parse expression!");
- End = Parser.getTok().getLoc();
-
// Don't emit the offset operator.
InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Skip, OffsetOfLoc, 7));
@@ -930,13 +1145,23 @@ X86Operand *X86AsmParser::ParseIntelOffsetOfOperator(SMLoc Start) {
// register operand to ensure proper matching. Just pick a GPR based on
// the size of a pointer.
unsigned RegNo = is64BitMode() ? X86::RBX : X86::EBX;
- return X86Operand::CreateReg(RegNo, Start, End, OffsetOfLoc);
+ return X86Operand::CreateReg(RegNo, Start, End, /*GetAddress=*/true,
+ OffsetOfLoc);
}
-/// Parse the 'TYPE' operator. The TYPE operator returns the size of a C or
-/// C++ type or variable. If the variable is an array, TYPE returns the size of
-/// a single element of the array.
-X86Operand *X86AsmParser::ParseIntelTypeOperator(SMLoc Start) {
+enum IntelOperatorKind {
+ IOK_LENGTH,
+ IOK_SIZE,
+ IOK_TYPE
+};
+
+/// Parse the 'LENGTH', 'TYPE' and 'SIZE' operators. The LENGTH operator
+/// returns the number of elements in an array. It returns the value 1 for
+/// non-array variables. The SIZE operator returns the size of a C or C++
+/// variable. A variable's size is the product of its LENGTH and TYPE. The
+/// TYPE operator returns the size of a C or C++ type or variable. If the
+/// variable is an array, TYPE returns the size of a single element.
+X86Operand *X86AsmParser::ParseIntelOperator(SMLoc Start, unsigned OpKind) {
SMLoc TypeLoc = Start;
Parser.Lex(); // Eat offset.
Start = Parser.getTok().getLoc();
@@ -944,75 +1169,75 @@ X86Operand *X86AsmParser::ParseIntelTypeOperator(SMLoc Start) {
SMLoc End;
const MCExpr *Val;
- if (getParser().ParseExpression(Val, End))
+ if (getParser().parseExpression(Val, End))
return 0;
- End = Parser.getTok().getLoc();
-
- unsigned Size = 0;
+ unsigned Length = 0, Size = 0, Type = 0;
if (const MCSymbolRefExpr *SymRef = dyn_cast<MCSymbolRefExpr>(Val)) {
const MCSymbol &Sym = SymRef->getSymbol();
// FIXME: The SemaLookup will fail if the name is anything other then an
// identifier.
// FIXME: Pass a valid SMLoc.
- if (!SemaCallback->LookupInlineAsmIdentifier(Sym.getName(), NULL, Size))
- return ErrorOperand(Start, "Unable to lookup TYPE of expr!");
-
- Size /= 8; // Size is in terms of bits, but we want bytes in the context.
+ bool IsVarDecl;
+ if (!SemaCallback->LookupInlineAsmIdentifier(Sym.getName(), NULL, Length,
+ Size, Type, IsVarDecl))
+ return ErrorOperand(Start, "Unable to lookup expr!");
+ }
+ unsigned CVal;
+ switch(OpKind) {
+ default: llvm_unreachable("Unexpected operand kind!");
+ case IOK_LENGTH: CVal = Length; break;
+ case IOK_SIZE: CVal = Size; break;
+ case IOK_TYPE: CVal = Type; break;
}
// Rewrite the type operator and the C or C++ type or variable in terms of an
// immediate. E.g. TYPE foo -> $$4
unsigned Len = End.getPointer() - TypeLoc.getPointer();
- InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Imm, TypeLoc, Len, Size));
+ InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Imm, TypeLoc, Len, CVal));
- const MCExpr *Imm = MCConstantExpr::Create(Size, getContext());
+ const MCExpr *Imm = MCConstantExpr::Create(CVal, getContext());
return X86Operand::CreateImm(Imm, Start, End, /*NeedAsmRewrite*/false);
}
X86Operand *X86AsmParser::ParseIntelOperand() {
SMLoc Start = Parser.getTok().getLoc(), End;
-
- // offset operator.
StringRef AsmTokStr = Parser.getTok().getString();
- if ((AsmTokStr == "offset" || AsmTokStr == "OFFSET") &&
- isParsingInlineAsm())
- return ParseIntelOffsetOfOperator(Start);
-
- // Type directive.
- if ((AsmTokStr == "type" || AsmTokStr == "TYPE") &&
- isParsingInlineAsm())
- return ParseIntelTypeOperator(Start);
-
- // Unsupported directives.
- if (isParsingIntelSyntax() &&
- (AsmTokStr == "size" || AsmTokStr == "SIZE" ||
- AsmTokStr == "length" || AsmTokStr == "LENGTH"))
- return ErrorOperand(Start, "Unsupported directive!");
-
- // immediate.
+
+ // Offset, length, type and size operators.
+ if (isParsingInlineAsm()) {
+ if (AsmTokStr == "offset" || AsmTokStr == "OFFSET")
+ return ParseIntelOffsetOfOperator(Start);
+ if (AsmTokStr == "length" || AsmTokStr == "LENGTH")
+ return ParseIntelOperator(Start, IOK_LENGTH);
+ if (AsmTokStr == "size" || AsmTokStr == "SIZE")
+ return ParseIntelOperator(Start, IOK_SIZE);
+ if (AsmTokStr == "type" || AsmTokStr == "TYPE")
+ return ParseIntelOperator(Start, IOK_TYPE);
+ }
+
+ // Immediate.
if (getLexer().is(AsmToken::Integer) || getLexer().is(AsmToken::Real) ||
getLexer().is(AsmToken::Minus)) {
const MCExpr *Val;
- if (!getParser().ParseExpression(Val, End)) {
- End = Parser.getTok().getLoc();
+ if (!getParser().parseExpression(Val, End)) {
return X86Operand::CreateImm(Val, Start, End);
}
}
- // register
+ // Register.
unsigned RegNo = 0;
if (!ParseRegister(RegNo, Start, End)) {
// If this is a segment register followed by a ':', then this is the start
// of a memory reference, otherwise this is a normal register reference.
if (getLexer().isNot(AsmToken::Colon))
- return X86Operand::CreateReg(RegNo, Start, Parser.getTok().getLoc());
+ return X86Operand::CreateReg(RegNo, Start, End);
getParser().Lex(); // Eat the colon.
return ParseIntelMemOperand(RegNo, Start);
}
- // mem operand
+ // Memory operand.
return ParseIntelMemOperand(0, Start);
}
@@ -1046,7 +1271,7 @@ X86Operand *X86AsmParser::ParseATTOperand() {
SMLoc Start = Parser.getTok().getLoc(), End;
Parser.Lex();
const MCExpr *Val;
- if (getParser().ParseExpression(Val, End))
+ if (getParser().parseExpression(Val, End))
return 0;
return X86Operand::CreateImm(Val, Start, End);
}
@@ -1064,7 +1289,7 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) {
const MCExpr *Disp = MCConstantExpr::Create(0, getParser().getContext());
if (getLexer().isNot(AsmToken::LParen)) {
SMLoc ExprEnd;
- if (getParser().ParseExpression(Disp, ExprEnd)) return 0;
+ if (getParser().parseExpression(Disp, ExprEnd)) return 0;
// After parsing the base expression we could either have a parenthesized
// memory address or not. If not, return now. If so, eat the (.
@@ -1090,7 +1315,7 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) {
SMLoc ExprEnd;
// It must be an parenthesized expression, parse it now.
- if (getParser().ParseParenExpression(Disp, ExprEnd))
+ if (getParser().parseParenExpression(Disp, ExprEnd))
return 0;
// After parsing the base expression we could either have a parenthesized
@@ -1150,7 +1375,7 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) {
SMLoc Loc = Parser.getTok().getLoc();
int64_t ScaleVal;
- if (getParser().ParseAbsoluteExpression(ScaleVal)){
+ if (getParser().parseAbsoluteExpression(ScaleVal)){
Error(Loc, "expected scale expression");
return 0;
}
@@ -1169,7 +1394,7 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) {
SMLoc Loc = Parser.getTok().getLoc();
int64_t Value;
- if (getParser().ParseAbsoluteExpression(Value))
+ if (getParser().parseAbsoluteExpression(Value))
return 0;
if (Value != 1)
@@ -1183,7 +1408,7 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) {
Error(Parser.getTok().getLoc(), "unexpected token in memory operand");
return 0;
}
- SMLoc MemEnd = Parser.getTok().getLoc();
+ SMLoc MemEnd = Parser.getTok().getEndLoc();
Parser.Lex(); // Eat the ')'.
// If we have both a base register and an index register make sure they are
@@ -1310,7 +1535,7 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
if (X86Operand *Op = ParseOperand())
Operands.push_back(Op);
else {
- Parser.EatToEndOfStatement();
+ Parser.eatToEndOfStatement();
return true;
}
@@ -1321,14 +1546,14 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
if (X86Operand *Op = ParseOperand())
Operands.push_back(Op);
else {
- Parser.EatToEndOfStatement();
+ Parser.eatToEndOfStatement();
return true;
}
}
if (getLexer().isNot(AsmToken::EndOfStatement)) {
SMLoc Loc = getLexer().getLoc();
- Parser.EatToEndOfStatement();
+ Parser.eatToEndOfStatement();
return Error(Loc, "unexpected token in argument list");
}
}
@@ -1855,7 +2080,7 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
// Check for the various suffix matches.
Tmp[Base.size()] = Suffixes[0];
unsigned ErrorInfoIgnore;
- unsigned ErrorInfoMissingFeature;
+ unsigned ErrorInfoMissingFeature = 0; // Init suppresses compiler warnings.
unsigned Match1, Match2, Match3, Match4;
Match1 = MatchInstructionImpl(Operands, Inst, ErrorInfoIgnore,
@@ -2018,10 +2243,10 @@ bool X86AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) {
if (getLexer().isNot(AsmToken::EndOfStatement)) {
for (;;) {
const MCExpr *Value;
- if (getParser().ParseExpression(Value))
+ if (getParser().parseExpression(Value))
return true;
- getParser().getStreamer().EmitValue(Value, Size, 0 /*addrspace*/);
+ getParser().getStreamer().EmitValue(Value, Size);
if (getLexer().is(AsmToken::EndOfStatement))
break;
@@ -2059,14 +2284,10 @@ bool X86AsmParser::ParseDirectiveCode(StringRef IDVal, SMLoc L) {
return false;
}
-
-extern "C" void LLVMInitializeX86AsmLexer();
-
// Force static initialization.
extern "C" void LLVMInitializeX86AsmParser() {
RegisterMCAsmParser<X86AsmParser> X(TheX86_32Target);
RegisterMCAsmParser<X86AsmParser> Y(TheX86_64Target);
- LLVMInitializeX86AsmLexer();
}
#define GET_REGISTER_MATCHER
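An illustrative aside, not part of the patch: the snippet below sketches how the LENGTH, TYPE and SIZE operators handled by the new ParseIntelOperator are meant to behave in MS-style inline assembly, assuming a 4-byte int, a 32-bit x86 target and an MSVC-compatible compiler (for example clang with -fms-extensions); the variable and function names are invented for the example.

int arr[10];

unsigned demo() {
  unsigned L, T, S;
  __asm {
    mov eax, LENGTH arr   // number of array elements -> 10
    mov L, eax
    mov eax, TYPE arr     // size of one element in bytes -> 4
    mov T, eax
    mov eax, SIZE arr     // LENGTH * TYPE -> 40
    mov S, eax
  }
  return L + T + S;       // 10 + 4 + 40 = 54
}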
diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt
index f4d03a602c..d14899d28a 100644
--- a/lib/Target/X86/CMakeLists.txt
+++ b/lib/Target/X86/CMakeLists.txt
@@ -10,7 +10,6 @@ tablegen(LLVM X86GenDAGISel.inc -gen-dag-isel)
tablegen(LLVM X86GenFastISel.inc -gen-fast-isel)
tablegen(LLVM X86GenCallingConv.inc -gen-callingconv)
tablegen(LLVM X86GenSubtargetInfo.inc -gen-subtarget)
-tablegen(LLVM X86GenEDInfo.inc -gen-enhanced-disassembly-info)
add_public_tablegen_target(X86CommonTableGen)
set(sources
@@ -26,11 +25,13 @@ set(sources
X86JITInfo.cpp
X86MCInstLower.cpp
X86MachineFunctionInfo.cpp
+ X86PadShortFunction.cpp
X86RegisterInfo.cpp
X86SelectionDAGInfo.cpp
X86Subtarget.cpp
X86TargetMachine.cpp
X86TargetObjectFile.cpp
+ X86TargetTransformInfo.cpp
X86VZeroUpper.cpp
)
diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp
index ed61c01130..ca6f80ce3e 100644
--- a/lib/Target/X86/Disassembler/X86Disassembler.cpp
+++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp
@@ -16,7 +16,6 @@
#include "X86Disassembler.h"
#include "X86DisassemblerDecoder.h"
-#include "llvm/MC/EDInstInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDisassembler.h"
#include "llvm/MC/MCExpr.h"
@@ -32,7 +31,6 @@
#include "X86GenRegisterInfo.inc"
#define GET_INSTRINFO_ENUM
#include "X86GenInstrInfo.inc"
-#include "X86GenEDInfo.inc"
using namespace llvm;
using namespace llvm::X86Disassembler;
@@ -83,10 +81,6 @@ X86GenericDisassembler::~X86GenericDisassembler() {
delete MII;
}
-const EDInstInfo *X86GenericDisassembler::getEDInfo() const {
- return instInfoX86;
-}
-
/// regionReader - a callback function that wraps the readByte method from
/// MemoryObject.
///
diff --git a/lib/Target/X86/Disassembler/X86Disassembler.h b/lib/Target/X86/Disassembler/X86Disassembler.h
index 981701f527..b92427a7e9 100644
--- a/lib/Target/X86/Disassembler/X86Disassembler.h
+++ b/lib/Target/X86/Disassembler/X86Disassembler.h
@@ -95,8 +95,6 @@ class MCSubtargetInfo;
class MemoryObject;
class raw_ostream;
-struct EDInstInfo;
-
namespace X86Disassembler {
/// X86GenericDisassembler - Generic disassembler for all X86 platforms.
@@ -122,8 +120,6 @@ public:
raw_ostream &vStream,
raw_ostream &cStream) const;
- /// getEDInfo - See MCDisassembler.
- const EDInstInfo *getEDInfo() const;
private:
DisassemblerMode fMode;
};
diff --git a/lib/Target/X86/InstPrinter/X86InstComments.cpp b/lib/Target/X86/InstPrinter/X86InstComments.cpp
index 64ac5e685f..0f6eeb19bc 100644
--- a/lib/Target/X86/InstPrinter/X86InstComments.cpp
+++ b/lib/Target/X86/InstPrinter/X86InstComments.cpp
@@ -34,10 +34,6 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
switch (MI->getOpcode()) {
case X86::INSERTPSrr:
- Src1Name = getRegName(MI->getOperand(0).getReg());
- Src2Name = getRegName(MI->getOperand(2).getReg());
- DecodeINSERTPSMask(MI->getOperand(3).getImm(), ShuffleMask);
- break;
case X86::VINSERTPSrr:
DestName = getRegName(MI->getOperand(0).getReg());
Src1Name = getRegName(MI->getOperand(1).getReg());
@@ -46,10 +42,6 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
break;
case X86::MOVLHPSrr:
- Src2Name = getRegName(MI->getOperand(2).getReg());
- Src1Name = getRegName(MI->getOperand(0).getReg());
- DecodeMOVLHPSMask(2, ShuffleMask);
- break;
case X86::VMOVLHPSrr:
Src2Name = getRegName(MI->getOperand(2).getReg());
Src1Name = getRegName(MI->getOperand(1).getReg());
@@ -58,10 +50,6 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
break;
case X86::MOVHLPSrr:
- Src2Name = getRegName(MI->getOperand(2).getReg());
- Src1Name = getRegName(MI->getOperand(0).getReg());
- DecodeMOVHLPSMask(2, ShuffleMask);
- break;
case X86::VMOVHLPSrr:
Src2Name = getRegName(MI->getOperand(2).getReg());
Src1Name = getRegName(MI->getOperand(1).getReg());
@@ -69,6 +57,29 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
DecodeMOVHLPSMask(2, ShuffleMask);
break;
+ case X86::PALIGNR128rr:
+ case X86::VPALIGNR128rr:
+ Src1Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::PALIGNR128rm:
+ case X86::VPALIGNR128rm:
+ Src2Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodePALIGNRMask(MVT::v16i8,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ break;
+ case X86::VPALIGNR256rr:
+ Src1Name = getRegName(MI->getOperand(2).getReg());
+ // FALL THROUGH.
+ case X86::VPALIGNR256rm:
+ Src2Name = getRegName(MI->getOperand(1).getReg());
+ DestName = getRegName(MI->getOperand(0).getReg());
+ DecodePALIGNRMask(MVT::v32i8,
+ MI->getOperand(MI->getNumOperands()-1).getImm(),
+ ShuffleMask);
+ break;
+
case X86::PSHUFDri:
case X86::VPSHUFDri:
Src1Name = getRegName(MI->getOperand(1).getReg());
@@ -131,15 +142,10 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
break;
case X86::PUNPCKHBWrr:
- Src2Name = getRegName(MI->getOperand(2).getReg());
- // FALL THROUGH.
- case X86::PUNPCKHBWrm:
- Src1Name = getRegName(MI->getOperand(0).getReg());
- DecodeUNPCKHMask(MVT::v16i8, ShuffleMask);
- break;
case X86::VPUNPCKHBWrr:
Src2Name = getRegName(MI->getOperand(2).getReg());
// FALL THROUGH.
+ case X86::PUNPCKHBWrm:
case X86::VPUNPCKHBWrm:
Src1Name = getRegName(MI->getOperand(1).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
@@ -154,15 +160,10 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
DecodeUNPCKHMask(MVT::v32i8, ShuffleMask);
break;
case X86::PUNPCKHWDrr:
- Src2Name = getRegName(MI->getOperand(2).getReg());
- // FALL THROUGH.
- case X86::PUNPCKHWDrm:
- Src1Name = getRegName(MI->getOperand(0).getReg());
- DecodeUNPCKHMask(MVT::v8i16, ShuffleMask);
- break;
case X86::VPUNPCKHWDrr:
Src2Name = getRegName(MI->getOperand(2).getReg());
// FALL THROUGH.
+ case X86::PUNPCKHWDrm:
case X86::VPUNPCKHWDrm:
Src1Name = getRegName(MI->getOperand(1).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
@@ -177,15 +178,10 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
DecodeUNPCKHMask(MVT::v16i16, ShuffleMask);
break;
case X86::PUNPCKHDQrr:
- Src2Name = getRegName(MI->getOperand(2).getReg());
- // FALL THROUGH.
- case X86::PUNPCKHDQrm:
- Src1Name = getRegName(MI->getOperand(0).getReg());
- DecodeUNPCKHMask(MVT::v4i32, ShuffleMask);
- break;
case X86::VPUNPCKHDQrr:
Src2Name = getRegName(MI->getOperand(2).getReg());
// FALL THROUGH.
+ case X86::PUNPCKHDQrm:
case X86::VPUNPCKHDQrm:
Src1Name = getRegName(MI->getOperand(1).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
@@ -200,15 +196,10 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
DecodeUNPCKHMask(MVT::v8i32, ShuffleMask);
break;
case X86::PUNPCKHQDQrr:
- Src2Name = getRegName(MI->getOperand(2).getReg());
- // FALL THROUGH.
- case X86::PUNPCKHQDQrm:
- Src1Name = getRegName(MI->getOperand(0).getReg());
- DecodeUNPCKHMask(MVT::v2i64, ShuffleMask);
- break;
case X86::VPUNPCKHQDQrr:
Src2Name = getRegName(MI->getOperand(2).getReg());
// FALL THROUGH.
+ case X86::PUNPCKHQDQrm:
case X86::VPUNPCKHQDQrm:
Src1Name = getRegName(MI->getOperand(1).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
@@ -224,15 +215,10 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
break;
case X86::PUNPCKLBWrr:
- Src2Name = getRegName(MI->getOperand(2).getReg());
- // FALL THROUGH.
- case X86::PUNPCKLBWrm:
- Src1Name = getRegName(MI->getOperand(0).getReg());
- DecodeUNPCKLMask(MVT::v16i8, ShuffleMask);
- break;
case X86::VPUNPCKLBWrr:
Src2Name = getRegName(MI->getOperand(2).getReg());
// FALL THROUGH.
+ case X86::PUNPCKLBWrm:
case X86::VPUNPCKLBWrm:
Src1Name = getRegName(MI->getOperand(1).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
@@ -247,15 +233,10 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
DecodeUNPCKLMask(MVT::v32i8, ShuffleMask);
break;
case X86::PUNPCKLWDrr:
- Src2Name = getRegName(MI->getOperand(2).getReg());
- // FALL THROUGH.
- case X86::PUNPCKLWDrm:
- Src1Name = getRegName(MI->getOperand(0).getReg());
- DecodeUNPCKLMask(MVT::v8i16, ShuffleMask);
- break;
case X86::VPUNPCKLWDrr:
Src2Name = getRegName(MI->getOperand(2).getReg());
// FALL THROUGH.
+ case X86::PUNPCKLWDrm:
case X86::VPUNPCKLWDrm:
Src1Name = getRegName(MI->getOperand(1).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
@@ -270,15 +251,10 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
DecodeUNPCKLMask(MVT::v16i16, ShuffleMask);
break;
case X86::PUNPCKLDQrr:
- Src2Name = getRegName(MI->getOperand(2).getReg());
- // FALL THROUGH.
- case X86::PUNPCKLDQrm:
- Src1Name = getRegName(MI->getOperand(0).getReg());
- DecodeUNPCKLMask(MVT::v4i32, ShuffleMask);
- break;
case X86::VPUNPCKLDQrr:
Src2Name = getRegName(MI->getOperand(2).getReg());
// FALL THROUGH.
+ case X86::PUNPCKLDQrm:
case X86::VPUNPCKLDQrm:
Src1Name = getRegName(MI->getOperand(1).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
@@ -293,15 +269,10 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
DecodeUNPCKLMask(MVT::v8i32, ShuffleMask);
break;
case X86::PUNPCKLQDQrr:
- Src2Name = getRegName(MI->getOperand(2).getReg());
- // FALL THROUGH.
- case X86::PUNPCKLQDQrm:
- Src1Name = getRegName(MI->getOperand(0).getReg());
- DecodeUNPCKLMask(MVT::v2i64, ShuffleMask);
- break;
case X86::VPUNPCKLQDQrr:
Src2Name = getRegName(MI->getOperand(2).getReg());
// FALL THROUGH.
+ case X86::PUNPCKLQDQrm:
case X86::VPUNPCKLQDQrm:
Src1Name = getRegName(MI->getOperand(1).getReg());
DestName = getRegName(MI->getOperand(0).getReg());
@@ -317,16 +288,10 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
break;
case X86::SHUFPDrri:
- Src2Name = getRegName(MI->getOperand(2).getReg());
- // FALL THROUGH.
- case X86::SHUFPDrmi:
- DecodeSHUFPMask(MVT::v2f64, MI->getOperand(MI->getNumOperands()-1).getImm(),
- ShuffleMask);
- Src1Name = getRegName(MI->getOperand(0).getReg());
- break;
case X86::VSHUFPDrri:
Src2Name = getRegName(MI->getOperand(2).getReg());
// FALL THROUGH.
+ case X86::SHUFPDrmi:
case X86::VSHUFPDrmi:
DecodeSHUFPMask(MVT::v2f64, MI->getOperand(MI->getNumOperands()-1).getImm(),
ShuffleMask);
@@ -344,16 +309,10 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
break;
case X86::SHUFPSrri:
- Src2Name = getRegName(MI->getOperand(2).getReg());
- // FALL THROUGH.
- case X86::SHUFPSrmi:
- DecodeSHUFPMask(MVT::v4f32, MI->getOperand(MI->getNumOperands()-1).getImm(),
- ShuffleMask);
- Src1Name = getRegName(MI->getOperand(0).getReg());
- break;
case X86::VSHUFPSrri:
Src2Name = getRegName(MI->getOperand(2).getReg());
// FALL THROUGH.
+ case X86::SHUFPSrmi:
case X86::VSHUFPSrmi:
DecodeSHUFPMask(MVT::v4f32, MI->getOperand(MI->getNumOperands()-1).getImm(),
ShuffleMask);
@@ -371,15 +330,10 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
break;
case X86::UNPCKLPDrr:
- Src2Name = getRegName(MI->getOperand(2).getReg());
- // FALL THROUGH.
- case X86::UNPCKLPDrm:
- DecodeUNPCKLMask(MVT::v2f64, ShuffleMask);
- Src1Name = getRegName(MI->getOperand(0).getReg());
- break;
case X86::VUNPCKLPDrr:
Src2Name = getRegName(MI->getOperand(2).getReg());
// FALL THROUGH.
+ case X86::UNPCKLPDrm:
case X86::VUNPCKLPDrm:
DecodeUNPCKLMask(MVT::v2f64, ShuffleMask);
Src1Name = getRegName(MI->getOperand(1).getReg());
@@ -394,15 +348,10 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
DestName = getRegName(MI->getOperand(0).getReg());
break;
case X86::UNPCKLPSrr:
- Src2Name = getRegName(MI->getOperand(2).getReg());
- // FALL THROUGH.
- case X86::UNPCKLPSrm:
- DecodeUNPCKLMask(MVT::v4f32, ShuffleMask);
- Src1Name = getRegName(MI->getOperand(0).getReg());
- break;
case X86::VUNPCKLPSrr:
Src2Name = getRegName(MI->getOperand(2).getReg());
// FALL THROUGH.
+ case X86::UNPCKLPSrm:
case X86::VUNPCKLPSrm:
DecodeUNPCKLMask(MVT::v4f32, ShuffleMask);
Src1Name = getRegName(MI->getOperand(1).getReg());
@@ -417,15 +366,10 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
DestName = getRegName(MI->getOperand(0).getReg());
break;
case X86::UNPCKHPDrr:
- Src2Name = getRegName(MI->getOperand(2).getReg());
- // FALL THROUGH.
- case X86::UNPCKHPDrm:
- DecodeUNPCKHMask(MVT::v2f64, ShuffleMask);
- Src1Name = getRegName(MI->getOperand(0).getReg());
- break;
case X86::VUNPCKHPDrr:
Src2Name = getRegName(MI->getOperand(2).getReg());
// FALL THROUGH.
+ case X86::UNPCKHPDrm:
case X86::VUNPCKHPDrm:
DecodeUNPCKHMask(MVT::v2f64, ShuffleMask);
Src1Name = getRegName(MI->getOperand(1).getReg());
@@ -440,15 +384,10 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
DestName = getRegName(MI->getOperand(0).getReg());
break;
case X86::UNPCKHPSrr:
- Src2Name = getRegName(MI->getOperand(2).getReg());
- // FALL THROUGH.
- case X86::UNPCKHPSrm:
- DecodeUNPCKHMask(MVT::v4f32, ShuffleMask);
- Src1Name = getRegName(MI->getOperand(0).getReg());
- break;
case X86::VUNPCKHPSrr:
Src2Name = getRegName(MI->getOperand(2).getReg());
// FALL THROUGH.
+ case X86::UNPCKHPSrm:
case X86::VUNPCKHPSrm:
DecodeUNPCKHMask(MVT::v4f32, ShuffleMask);
Src1Name = getRegName(MI->getOperand(1).getReg());
diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index 467edadc7e..598ddee56d 100644
--- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -113,7 +113,7 @@ public:
bool fixupNeedsRelaxation(const MCFixup &Fixup,
uint64_t Value,
- const MCInstFragment *DF,
+ const MCRelaxableFragment *DF,
const MCAsmLayout &Layout) const;
void relaxInstruction(const MCInst &Inst, MCInst &Res) const;
@@ -255,7 +255,7 @@ bool X86AsmBackend::mayNeedRelaxation(const MCInst &Inst) const {
bool X86AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup,
uint64_t Value,
- const MCInstFragment *DF,
+ const MCRelaxableFragment *DF,
const MCAsmLayout &Layout) const {
// Relax if the value is too big for a (signed) i8.
return int64_t(Value) != int64_t(int8_t(Value));
@@ -279,9 +279,9 @@ void X86AsmBackend::relaxInstruction(const MCInst &Inst, MCInst &Res) const {
Res.setOpcode(RelaxedOp);
}
-/// writeNopData - Write optimal nops to the output file for the \p Count
-/// bytes. This returns the number of bytes written. It may return 0 if
-/// the \p Count is more than the maximum optimal nops.
+/// \brief Write a sequence of optimal nops to the output, covering \p Count
+/// bytes.
+/// \return - true on success, false on failure
bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
static const uint8_t Nops[10][10] = {
// nop
@@ -315,18 +315,18 @@ bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
return true;
}
- // Write an optimal sequence for the first 15 bytes.
- const uint64_t OptimalCount = (Count < 16) ? Count : 15;
- const uint64_t Prefixes = OptimalCount <= 10 ? 0 : OptimalCount - 10;
- for (uint64_t i = 0, e = Prefixes; i != e; i++)
- OW->Write8(0x66);
- const uint64_t Rest = OptimalCount - Prefixes;
- for (uint64_t i = 0, e = Rest; i != e; i++)
- OW->Write8(Nops[Rest - 1][i]);
-
- // Finish with single byte nops.
- for (uint64_t i = OptimalCount, e = Count; i != e; ++i)
- OW->Write8(0x90);
+ // 15 is the longest single nop instruction. Emit as many 15-byte nops as
+ // needed, then emit a nop of the remaining length.
+ do {
+ const uint8_t ThisNopLength = (uint8_t) std::min(Count, (uint64_t) 15);
+ const uint8_t Prefixes = ThisNopLength <= 10 ? 0 : ThisNopLength - 10;
+ for (uint8_t i = 0; i < Prefixes; i++)
+ OW->Write8(0x66);
+ const uint8_t Rest = ThisNopLength - Prefixes;
+ for (uint8_t i = 0; i < Rest; i++)
+ OW->Write8(Nops[Rest - 1][i]);
+ Count -= ThisNopLength;
+ } while (Count != 0);
return true;
}
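For reference, a small standalone sketch, not from the patch, of the chunking arithmetic the reworked writeNopData loop performs: pad Count bytes with at most 15-byte nops, where anything longer than the 10-byte nop in the table above is built by prepending 0x66 prefixes. Count = 23 is an arbitrary example value.

#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
  uint64_t Count = 23; // hypothetical padding size in bytes
  while (Count != 0) {
    const unsigned ThisNopLength = (unsigned)std::min<uint64_t>(Count, 15);
    const unsigned Prefixes = ThisNopLength <= 10 ? 0 : ThisNopLength - 10;
    const unsigned Rest = ThisNopLength - Prefixes;
    std::printf("emit %u prefix byte(s) + %u-byte nop\n", Prefixes, Rest);
    Count -= ThisNopLength;
  }
  return 0; // for Count = 23: "5 + 10" then "0 + 8"
}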
diff --git a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
index 7ea1961dec..9e68388cf2 100644
--- a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
+++ b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -104,7 +104,7 @@ namespace X86II {
/// MO_TLSLD - On a symbol operand this indicates that the immediate is
/// the offset of the GOT entry with the TLS index for the module that
- /// contains the symbol. When this index is passed to a call to to
+ /// contains the symbol. When this index is passed to a call to
/// __tls_get_addr, the function will return the base address of the TLS
/// block for the symbol. Used in the x86-64 local dynamic TLS access model.
///
@@ -114,7 +114,7 @@ namespace X86II {
/// MO_TLSLDM - On a symbol operand this indicates that the immediate is
/// the offset of the GOT entry with the TLS index for the module that
- /// contains the symbol. When this index is passed to a call to to
+ /// contains the symbol. When this index is passed to a call to
/// ___tls_get_addr, the function will return the base address of the TLS
/// block for the symbol. Used in the IA32 local dynamic TLS access model.
///
diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
index 16488eb7ae..7815ae98c9 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
@@ -44,7 +44,7 @@ void X86MCAsmInfoDarwin::anchor() { }
X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &T) {
bool is64Bit = T.getArch() == Triple::x86_64;
if (is64Bit)
- PointerSize = 8;
+ PointerSize = CalleeSaveStackSlotSize = 8;
AssemblerDialect = AsmWriterFlavor;
@@ -76,8 +76,16 @@ X86_64MCAsmInfoDarwin::X86_64MCAsmInfoDarwin(const Triple &Triple)
void X86ELFMCAsmInfo::anchor() { }
X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) {
- if (T.getArch() == Triple::x86_64)
- PointerSize = 8;
+ bool is64Bit = T.getArch() == Triple::x86_64;
+ bool isX32 = T.getEnvironment() == Triple::GNUX32;
+
+ // For ELF, x86-64 pointer size depends on the ABI.
+ // For x86-64 without the x32 ABI, pointer size is 8. For x86 and for x86-64
+ // with the x32 ABI, pointer size remains the default 4.
+ PointerSize = (is64Bit && !isX32) ? 8 : 4;
+
+ // OTOH, stack slot size is always 8 for x86-64, even with the x32 ABI.
+ CalleeSaveStackSlotSize = is64Bit ? 8 : 4;
AssemblerDialect = AsmWriterFlavor;
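An illustrative sketch, not part of the patch, enumerating the PointerSize / CalleeSaveStackSlotSize combinations the new ELF logic above produces; the labels are informal names for the three cases, not actual triple strings.

#include <cstdio>

int main() {
  struct { const char *Name; bool Is64Bit, IsX32; } Cases[] = {
    {"x86 (32-bit)", false, false},
    {"x86-64",       true,  false},
    {"x86-64 x32",   true,  true},
  };
  for (const auto &C : Cases) {
    unsigned PointerSize = (C.Is64Bit && !C.IsX32) ? 8 : 4;
    unsigned SlotSize = C.Is64Bit ? 8 : 4;
    std::printf("%-14s pointer=%u callee-save-slot=%u\n",
                C.Name, PointerSize, SlotSize);
  }
  return 0; // 4/4, 8/8, and 4/8 respectively
}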
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
index f66b203f0d..5e84530cd7 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
@@ -257,7 +257,8 @@ static MCRegisterInfo *createX86MCRegisterInfo(StringRef TT) {
MCRegisterInfo *X = new MCRegisterInfo();
InitX86MCRegisterInfo(X, RA,
X86_MC::getDwarfRegFlavour(TT, false),
- X86_MC::getDwarfRegFlavour(TT, true));
+ X86_MC::getDwarfRegFlavour(TT, true),
+ RA);
X86_MC::InitLLVM2SEHRegisterMapping(X);
return X;
}
diff --git a/lib/Target/X86/Makefile b/lib/Target/X86/Makefile
index 949661eb99..e518fecf04 100644
--- a/lib/Target/X86/Makefile
+++ b/lib/Target/X86/Makefile
@@ -16,8 +16,7 @@ BUILT_SOURCES = X86GenRegisterInfo.inc X86GenInstrInfo.inc \
X86GenAsmWriter.inc X86GenAsmMatcher.inc \
X86GenAsmWriter1.inc X86GenDAGISel.inc \
X86GenDisassemblerTables.inc X86GenFastISel.inc \
- X86GenCallingConv.inc X86GenSubtargetInfo.inc \
- X86GenEDInfo.inc
+ X86GenCallingConv.inc X86GenSubtargetInfo.inc
DIRS = InstPrinter AsmParser Disassembler TargetInfo MCTargetDesc Utils
diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt
index 40110353fc..496b704ee8 100644
--- a/lib/Target/X86/README-SSE.txt
+++ b/lib/Target/X86/README-SSE.txt
@@ -953,3 +953,12 @@ similarly, v[0]-v[1] should match to hsubpd, and {v[0]-v[1], w[0]-w[1]} should
turn into hsubpd also.
//===---------------------------------------------------------------------===//
+
+define <2 x i32> @foo(<2 x double> %in) {
+ %x = fptosi <2 x double> %in to <2 x i32>
+ ret <2 x i32> %x
+}
+
+Should compile into cvttpd2dq instead of being scalarized into 2 cvttsd2si.
+
+//===---------------------------------------------------------------------===//
diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt
index 6a8a4fdf25..b4285a0718 100644
--- a/lib/Target/X86/README.txt
+++ b/lib/Target/X86/README.txt
@@ -1568,43 +1568,6 @@ The second one is done for: Atom, Pentium Pro, all AMDs, Pentium 4, Nocona,
Core 2, and "Generic"
//===---------------------------------------------------------------------===//
-
-Testcase:
-int a(int x) { return (x & 127) > 31; }
-
-Current output:
- movl 4(%esp), %eax
- andl $127, %eax
- cmpl $31, %eax
- seta %al
- movzbl %al, %eax
- ret
-
-Ideal output:
- xorl %eax, %eax
- testl $96, 4(%esp)
- setne %al
- ret
-
-This should definitely be done in instcombine, canonicalizing the range
-condition into a != condition. We get this IR:
-
-define i32 @a(i32 %x) nounwind readnone {
-entry:
- %0 = and i32 %x, 127 ; <i32> [#uses=1]
- %1 = icmp ugt i32 %0, 31 ; <i1> [#uses=1]
- %2 = zext i1 %1 to i32 ; <i32> [#uses=1]
- ret i32 %2
-}
-
-Instcombine prefers to strength reduce relational comparisons to equality
-comparisons when possible, this should be another case of that. This could
-be handled pretty easily in InstCombiner::visitICmpInstWithInstAndIntCst, but it
-looks like InstCombiner::visitICmpInstWithInstAndIntCst should really already
-be redesigned to use ComputeMaskedBits and friends.
-
-
-//===---------------------------------------------------------------------===//
Testcase:
int x(int a) { return (a&0xf0)>>4; }
diff --git a/lib/Target/X86/TargetInfo/X86TargetInfo.cpp b/lib/Target/X86/TargetInfo/X86TargetInfo.cpp
index 52a67f763b..815d23588f 100644
--- a/lib/Target/X86/TargetInfo/X86TargetInfo.cpp
+++ b/lib/Target/X86/TargetInfo/X86TargetInfo.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
#include "X86.h"
-#include "llvm/Module.h"
+#include "llvm/IR/Module.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
index 8b87c1f9c8..bbd490411f 100644
--- a/lib/Target/X86/Utils/X86ShuffleDecode.cpp
+++ b/lib/Target/X86/Utils/X86ShuffleDecode.cpp
@@ -61,6 +61,24 @@ void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask) {
ShuffleMask.push_back(NElts+i);
}
+void DecodePALIGNRMask(MVT VT, unsigned Imm,
+ SmallVectorImpl<int> &ShuffleMask) {
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned Offset = Imm * (VT.getVectorElementType().getSizeInBits() / 8);
+
+ unsigned NumLanes = VT.getSizeInBits() / 128;
+ unsigned NumLaneElts = NumElts / NumLanes;
+
+ for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+ for (unsigned i = 0; i != NumLaneElts; ++i) {
+ unsigned Base = i + Offset;
+ // If i+Offset lands outside this lane, the element actually comes from the other source.
+ if (Base >= NumLaneElts) Base += NumElts - NumLaneElts;
+ ShuffleMask.push_back(Base + l);
+ }
+ }
+}
+
/// DecodePSHUFMask - This decodes the shuffle masks for pshufd, and vpermilp*.
/// VT indicates the type of the vector allowing it to handle different
/// datatypes and vector widths.
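As a quick illustration of the mask layout the new DecodePALIGNRMask produces, here is a standalone sketch (not the in-tree helper; byte elements and a fixed immediate are assumed) mirroring the per-lane computation for a 256-bit v32i8 VPALIGNR with immediate 5. Indices of NumElts and above refer to the second source operand, following the convention used by the other decoders in this file.

    #include <cstdio>

    int main() {
      const unsigned NumElts = 32;          // v32i8: two 128-bit lanes of bytes
      const unsigned NumLaneElts = 16;
      const unsigned Imm = 5;               // byte offset; Offset == Imm for i8
      for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
        for (unsigned i = 0; i != NumLaneElts; ++i) {
          unsigned Base = i + Imm;
          // Past the end of the lane: the element comes from the second source.
          if (Base >= NumLaneElts)
            Base += NumElts - NumLaneElts;
          std::printf("%u ", Base + l);
        }
        std::printf("\n");
      }
      // Prints 5..15 then 32..36 for lane 0, and 21..31 then 48..52 for lane 1.
      return 0;
    }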
diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.h b/lib/Target/X86/Utils/X86ShuffleDecode.h
index 70d8171a81..017ab325ec 100644
--- a/lib/Target/X86/Utils/X86ShuffleDecode.h
+++ b/lib/Target/X86/Utils/X86ShuffleDecode.h
@@ -35,6 +35,8 @@ void DecodeMOVHLPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask);
// <0,2> or <0,1,4,5>
void DecodeMOVLHPSMask(unsigned NElts, SmallVectorImpl<int> &ShuffleMask);
+void DecodePALIGNRMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
+
void DecodePSHUFMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
void DecodePSHUFHWMask(MVT VT, unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h
index 1e7b98d94f..1f9919f159 100644
--- a/lib/Target/X86/X86.h
+++ b/lib/Target/X86/X86.h
@@ -63,6 +63,13 @@ FunctionPass *createX86JITCodeEmitterPass(X86TargetMachine &TM,
///
FunctionPass *createEmitX86CodeToMemory();
+/// \brief Creates an X86-specific Target Transformation Info pass.
+ImmutablePass *createX86TargetTransformInfoPass(const X86TargetMachine *TM);
+
+/// createX86PadShortFunctions - Return a pass that pads short functions
+/// with NOOPs. This will prevent a stall when returning on the Atom.
+FunctionPass *createX86PadShortFunctions();
+
} // End llvm namespace
#endif
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index e3c22d9c3b..0216252c19 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -120,11 +120,16 @@ def FeatureBMI2 : SubtargetFeature<"bmi2", "HasBMI2", "true",
"Support BMI2 instructions">;
def FeatureRTM : SubtargetFeature<"rtm", "HasRTM", "true",
"Support RTM instructions">;
+def FeatureADX : SubtargetFeature<"adx", "HasADX", "true",
+ "Support ADX instructions">;
def FeatureLeaForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true",
"Use LEA for adjusting the stack pointer">;
def FeatureSlowDivide : SubtargetFeature<"idiv-to-divb",
- "HasSlowDivide", "true",
- "Use small divide for positive values less than 256">;
+ "HasSlowDivide", "true",
+ "Use small divide for positive values less than 256">;
+def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions",
+ "PadShortFunctions", "true",
+ "Pad short functions">;
//===----------------------------------------------------------------------===//
// X86 processors supported.
@@ -155,7 +160,8 @@ def : Proc<"pentium3m", [FeatureSSE1, FeatureSlowBTMem]>;
def : Proc<"pentium-m", [FeatureSSE2, FeatureSlowBTMem]>;
def : Proc<"pentium4", [FeatureSSE2]>;
def : Proc<"pentium4m", [FeatureSSE2, FeatureSlowBTMem]>;
-def : Proc<"x86-64", [FeatureSSE2, Feature64Bit, FeatureSlowBTMem]>;
+def : Proc<"x86-64", [FeatureSSE2, Feature64Bit, FeatureSlowBTMem,
+ FeatureFastUAMem]>;
def : Proc<"yonah", [FeatureSSE3, FeatureSlowBTMem]>;
def : Proc<"prescott", [FeatureSSE3, FeatureSlowBTMem]>;
def : Proc<"nocona", [FeatureSSE3, FeatureCMPXCHG16B,
@@ -166,7 +172,7 @@ def : Proc<"penryn", [FeatureSSE41, FeatureCMPXCHG16B,
FeatureSlowBTMem]>;
def : AtomProc<"atom", [ProcIntelAtom, FeatureSSSE3, FeatureCMPXCHG16B,
FeatureMOVBE, FeatureSlowBTMem, FeatureLeaForSP,
- FeatureSlowDivide]>;
+ FeatureSlowDivide, FeaturePadShortFunctions]>;
// "Arrandale" along with corei3 and corei5
def : Proc<"corei7", [FeatureSSE42, FeatureCMPXCHG16B,
FeatureSlowBTMem, FeatureFastUAMem,
diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp
index d275e1dac2..ac5daec2b2 100644
--- a/lib/Target/X86/X86AsmPrinter.cpp
+++ b/lib/Target/X86/X86AsmPrinter.cpp
@@ -20,26 +20,26 @@
#include "X86TargetMachine.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/Assembly/Writer.h"
-#include "llvm/CallingConv.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/DebugInfo.h"
-#include "llvm/DerivedTypes.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/Module.h"
#include "llvm/Support/COFF.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/Mangler.h"
#include "llvm/Target/TargetOptions.h"
-#include "llvm/Type.h"
using namespace llvm;
//===----------------------------------------------------------------------===//
@@ -252,14 +252,15 @@ void X86AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
}
case MachineOperand::MO_Immediate:
- O << '$' << MO.getImm();
+ if (AsmVariant == 0) O << '$';
+ O << MO.getImm();
return;
case MachineOperand::MO_JumpTableIndex:
case MachineOperand::MO_ConstantPoolIndex:
case MachineOperand::MO_GlobalAddress:
case MachineOperand::MO_ExternalSymbol: {
- O << '$';
+ if (AsmVariant == 0) O << '$';
printSymbolOperand(MO, O);
break;
}
@@ -355,19 +356,23 @@ void X86AsmPrinter::printIntelMemReference(const MachineInstr *MI, unsigned Op,
NeedPlus = true;
}
- assert (DispSpec.isImm() && "Displacement is not an immediate!");
- int64_t DispVal = DispSpec.getImm();
- if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) {
- if (NeedPlus) {
- if (DispVal > 0)
- O << " + ";
- else {
- O << " - ";
- DispVal = -DispVal;
+ if (!DispSpec.isImm()) {
+ if (NeedPlus) O << " + ";
+ printOperand(MI, Op+3, O, Modifier, AsmVariant);
+ } else {
+ int64_t DispVal = DispSpec.getImm();
+ if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) {
+ if (NeedPlus) {
+ if (DispVal > 0)
+ O << " + ";
+ else {
+ O << " - ";
+ DispVal = -DispVal;
+ }
}
+ O << DispVal;
}
- O << DispVal;
- }
+ }
O << ']';
}
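To make the displacement handling above concrete, here is a rough standalone sketch of the Intel-syntax formatting the printer now performs, with the register and symbol plumbing elided (names such as intelMem are made up for illustration; this is not the AsmPrinter code):

    #include <cstdint>
    #include <cstdio>
    #include <string>

    // Format "[base + scale*index +/- disp]", folding the sign of an immediate
    // displacement into the separator, as printIntelMemReference does.
    static std::string intelMem(const char *Base, const char *Index,
                                unsigned Scale, int64_t Disp) {
      std::string S = "[";
      bool NeedPlus = false;
      if (Base)  { S += Base; NeedPlus = true; }
      if (Index) {
        if (NeedPlus) S += " + ";
        if (Scale != 1) { S += std::to_string(Scale); S += "*"; }
        S += Index;
        NeedPlus = true;
      }
      if (Disp || !NeedPlus) {      // print 0 only if it is the whole address
        if (NeedPlus) {
          S += Disp >= 0 ? " + " : " - ";
          if (Disp < 0) Disp = -Disp;
        }
        S += std::to_string(Disp);
      }
      return S + "]";
    }

    int main() {
      std::printf("%s\n", intelMem("rbp", "rcx", 4, -8).c_str());    // [rbp + 4*rcx - 8]
      std::printf("%s\n", intelMem(nullptr, nullptr, 1, 16).c_str()); // [16]
      return 0;
    }

A symbolic displacement (the new non-immediate case) is simply printed through printOperand instead, which is also why the '$' prefix earlier in this file is now emitted only for the AT&T variant.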
@@ -543,7 +548,7 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
MCSA_IndirectSymbol);
// hlt; hlt; hlt; hlt; hlt hlt = 0xf4.
const char HltInsts[] = "\xf4\xf4\xf4\xf4\xf4";
- OutStreamer.EmitBytes(StringRef(HltInsts, 5), 0/*addrspace*/);
+ OutStreamer.EmitBytes(StringRef(HltInsts, 5));
}
Stubs.clear();
@@ -569,7 +574,7 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
// .long 0
if (MCSym.getInt())
// External to current translation unit.
- OutStreamer.EmitIntValue(0, 4/*size*/, 0/*addrspace*/);
+ OutStreamer.EmitIntValue(0, 4/*size*/);
else
// Internal to current translation unit.
//
@@ -578,8 +583,7 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
// using NLPs. However, sometimes the types are local to the file. So
// we need to fill in the value for the NLP in those cases.
OutStreamer.EmitValue(MCSymbolRefExpr::Create(MCSym.getPointer(),
- OutContext),
- 4/*size*/, 0/*addrspace*/);
+ OutContext), 4/*size*/);
}
Stubs.clear();
OutStreamer.AddBlankLine();
@@ -596,8 +600,7 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
// .long _foo
OutStreamer.EmitValue(MCSymbolRefExpr::
Create(Stubs[i].second.getPointer(),
- OutContext),
- 4/*size*/, 0/*addrspace*/);
+ OutContext), 4/*size*/);
}
Stubs.clear();
OutStreamer.AddBlankLine();
@@ -663,7 +666,7 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
name += ",DATA";
else
name += ",data";
- OutStreamer.EmitBytes(name, 0);
+ OutStreamer.EmitBytes(name);
}
for (unsigned i = 0, e = DLLExportedFns.size(); i != e; ++i) {
@@ -672,7 +675,7 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
else
name = " -export:";
name += DLLExportedFns[i]->getName();
- OutStreamer.EmitBytes(name, 0);
+ OutStreamer.EmitBytes(name);
}
}
}
@@ -692,7 +695,7 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
OutStreamer.EmitLabel(Stubs[i].first);
OutStreamer.EmitSymbolValue(Stubs[i].second.getPointer(),
- TD->getPointerSize(), 0);
+ TD->getPointerSize());
}
Stubs.clear();
}
diff --git a/lib/Target/X86/X86AsmPrinter.h b/lib/Target/X86/X86AsmPrinter.h
index 61eb14e036..bc7496bad1 100644
--- a/lib/Target/X86/X86AsmPrinter.h
+++ b/lib/Target/X86/X86AsmPrinter.h
@@ -1,4 +1,4 @@
-//===-- X86AsmPrinter.h - Convert X86 LLVM code to assembly -----*- C++ -*-===//
+//===-- X86AsmPrinter.h - X86 implementation of AsmPrinter ------*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -6,10 +6,6 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
-//
-// AT&T assembly code printer class.
-//
-//===----------------------------------------------------------------------===//
#ifndef X86ASMPRINTER_H
#define X86ASMPRINTER_H
@@ -35,7 +31,7 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
}
virtual const char *getPassName() const LLVM_OVERRIDE {
- return "X86 AT&T-Style Assembly Printer";
+ return "X86 Assembly / Object Emitter";
}
const X86Subtarget &getSubtarget() const { return *Subtarget; }
diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td
index 7ad2fdd259..b516be0696 100644
--- a/lib/Target/X86/X86CallingConv.td
+++ b/lib/Target/X86/X86CallingConv.td
@@ -519,6 +519,9 @@ def CSR_64EHRet : CalleeSavedRegs<(add RAX, RDX, CSR_64)>;
def CSR_Win64 : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12, R13, R14, R15,
(sequence "XMM%u", 6, 15))>;
+def CSR_MostRegs_64 : CalleeSavedRegs<(add RBX, RCX, RDX, RSI, RDI, R8, R9, R10,
+ R11, R12, R13, R14, R15, RBP,
+ (sequence "XMM%u", 0, 15))>;
// Standard C + YMM6-15
def CSR_Win64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12,
diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp
index df578fe098..ece38aa346 100644
--- a/lib/Target/X86/X86CodeEmitter.cpp
+++ b/lib/Target/X86/X86CodeEmitter.cpp
@@ -25,7 +25,7 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/Passes.h"
-#include "llvm/LLVMContext.h"
+#include "llvm/IR/LLVMContext.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
@@ -124,7 +124,7 @@ template<class CodeEmitter>
} // end anonymous namespace.
/// createX86CodeEmitterPass - Return a pass that emits the collected X86 code
-/// to the specified templated MachineCodeEmitter object.
+/// to the specified JITCodeEmitter object.
FunctionPass *llvm::createX86JITCodeEmitterPass(X86TargetMachine &TM,
JITCodeEmitter &JCE) {
return new Emitter<JITCodeEmitter>(TM, JCE);
diff --git a/lib/Target/X86/X86CompilationCallback_Win64.asm b/lib/Target/X86/X86CompilationCallback_Win64.asm
index f321778db2..69b4c71651 100644
--- a/lib/Target/X86/X86CompilationCallback_Win64.asm
+++ b/lib/Target/X86/X86CompilationCallback_Win64.asm
@@ -11,7 +11,7 @@
;;
;;===----------------------------------------------------------------------===
-extrn X86CompilationCallback2: PROC
+extrn LLVMX86CompilationCallback2: PROC
.code
X86CompilationCallback proc
@@ -42,7 +42,7 @@ X86CompilationCallback proc
; Pass prev frame and return address.
mov rcx, rbp
mov rdx, qword ptr [rbp+8]
- call X86CompilationCallback2
+ call LLVMX86CompilationCallback2
; Restore all XMM arg registers.
movaps xmm3, [rsp+48+32]
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index 3e78b4c0a1..b5c3270065 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -19,19 +19,19 @@
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
#include "X86TargetMachine.h"
-#include "llvm/CallingConv.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/FastISel.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/GlobalAlias.h"
-#include "llvm/GlobalVariable.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/Operator.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Operator.h"
#include "llvm/Support/CallSite.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/GetElementPtrTypeIterator.h"
@@ -75,6 +75,8 @@ public:
virtual bool TryToFoldLoad(MachineInstr *MI, unsigned OpNo,
const LoadInst *LI);
+ virtual bool FastLowerArguments();
+
#include "X86GenFastISel.inc"
private:
@@ -326,12 +328,11 @@ bool X86FastISel::X86FastEmitExtend(ISD::NodeType Opc, EVT DstVT,
unsigned &ResultReg) {
unsigned RR = FastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Opc,
Src, /*TODO: Kill=*/false);
-
- if (RR != 0) {
- ResultReg = RR;
- return true;
- } else
+ if (RR == 0)
return false;
+
+ ResultReg = RR;
+ return true;
}
/// X86SelectAddress - Attempt to fill in an address from the given value.
@@ -727,7 +728,7 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
// Don't handle popping bytes on return for now.
if (X86MFInfo->getBytesToPopOnReturn() != 0)
- return 0;
+ return false;
// fastcc with -tailcallopt is intended to provide a guaranteed
// tail call optimization. Fastisel doesn't know how to do that.
@@ -738,10 +739,12 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
if (F.isVarArg())
return false;
+ // Build a list of return value registers.
+ SmallVector<unsigned, 4> RetRegs;
+
if (Ret->getNumOperands() > 0) {
SmallVector<ISD::OutputArg, 4> Outs;
- GetReturnInfo(F.getReturnType(), F.getAttributes().getRetAttributes(),
- Outs, TLI);
+ GetReturnInfo(F.getReturnType(), F.getAttributes(), Outs, TLI);
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ValLocs;
@@ -806,8 +809,8 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
DstReg).addReg(SrcReg);
- // Mark the register as live out of the function.
- MRI.addLiveOut(VA.getLocReg());
+ // Add register to return instruction.
+ RetRegs.push_back(VA.getLocReg());
}
// The x86-64 ABI for returning structs by value requires that we copy
@@ -820,11 +823,14 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
"SRetReturnReg should have been set in LowerFormalArguments()!");
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
X86::RAX).addReg(Reg);
- MRI.addLiveOut(X86::RAX);
+ RetRegs.push_back(X86::RAX);
}
// Now emit the RET.
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::RET));
+ MachineInstrBuilder MIB =
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(X86::RET));
+ for (unsigned i = 0, e = RetRegs.size(); i != e; ++i)
+ MIB.addReg(RetRegs[i], RegState::Implicit);
return true;
}
@@ -1373,7 +1379,6 @@ bool X86FastISel::TryEmitSmallMemcpy(X86AddressMode DestAM,
else if (Len >= 2)
VT = MVT::i16;
else {
- assert(Len == 1);
VT = MVT::i8;
}
@@ -1517,6 +1522,78 @@ bool X86FastISel::X86VisitIntrinsicCall(const IntrinsicInst &I) {
}
}
+bool X86FastISel::FastLowerArguments() {
+ if (!FuncInfo.CanLowerReturn)
+ return false;
+
+ const Function *F = FuncInfo.Fn;
+ if (F->isVarArg())
+ return false;
+
+ CallingConv::ID CC = F->getCallingConv();
+ if (CC != CallingConv::C)
+ return false;
+
+ if (!Subtarget->is64Bit())
+ return false;
+
+ // Only handle simple cases, i.e. up to 6 i32/i64 scalar arguments.
+ unsigned Idx = 1;
+ for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
+ I != E; ++I, ++Idx) {
+ if (Idx > 6)
+ return false;
+
+ if (F->getAttributes().hasAttribute(Idx, Attribute::ByVal) ||
+ F->getAttributes().hasAttribute(Idx, Attribute::InReg) ||
+ F->getAttributes().hasAttribute(Idx, Attribute::StructRet) ||
+ F->getAttributes().hasAttribute(Idx, Attribute::Nest))
+ return false;
+
+ Type *ArgTy = I->getType();
+ if (ArgTy->isStructTy() || ArgTy->isArrayTy() || ArgTy->isVectorTy())
+ return false;
+
+ EVT ArgVT = TLI.getValueType(ArgTy);
+ if (!ArgVT.isSimple()) return false;
+ switch (ArgVT.getSimpleVT().SimpleTy) {
+ case MVT::i32:
+ case MVT::i64:
+ break;
+ default:
+ return false;
+ }
+ }
+
+ static const uint16_t GPR32ArgRegs[] = {
+ X86::EDI, X86::ESI, X86::EDX, X86::ECX, X86::R8D, X86::R9D
+ };
+ static const uint16_t GPR64ArgRegs[] = {
+ X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8 , X86::R9
+ };
+
+ Idx = 0;
+ const TargetRegisterClass *RC32 = TLI.getRegClassFor(MVT::i32);
+ const TargetRegisterClass *RC64 = TLI.getRegClassFor(MVT::i64);
+ for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
+ I != E; ++I, ++Idx) {
+ if (I->use_empty())
+ continue;
+ bool is32Bit = TLI.getValueType(I->getType()) == MVT::i32;
+ const TargetRegisterClass *RC = is32Bit ? RC32 : RC64;
+ unsigned SrcReg = is32Bit ? GPR32ArgRegs[Idx] : GPR64ArgRegs[Idx];
+ unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC);
+ // FIXME: Unfortunately it's necessary to emit a copy from the livein copy.
+ // Without this, EmitLiveInCopies may eliminate the livein if its only
+ // use is a bitcast (which isn't turned into an instruction).
+ unsigned ResultReg = createResultReg(RC);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY),
+ ResultReg).addReg(DstReg, getKillRegState(true));
+ UpdateValueMap(I, ResultReg);
+ }
+ return true;
+}
+
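A hedged sketch of the register assignment the new FastLowerArguments fast path hard-codes (the first six SysV x86-64 integer argument registers); an illustration only, not LLVM code:

    #include <cstdio>

    static const char *GPR32[] = {"edi", "esi", "edx", "ecx", "r8d", "r9d"};
    static const char *GPR64[] = {"rdi", "rsi", "rdx", "rcx", "r8",  "r9"};

    // Register for the Idx-th (0-based) integer argument, or null when the
    // argument would not be in a register and the fast path bails out.
    static const char *argReg(unsigned Idx, bool Is64BitArg) {
      if (Idx >= 6)
        return nullptr;
      return Is64BitArg ? GPR64[Idx] : GPR32[Idx];
    }

    int main() {
      // e.g. long f(int a, long b, int c): a -> edi, b -> rsi, c -> edx
      std::printf("%s %s %s\n", argReg(0, false), argReg(1, true), argReg(2, false));
      return 0;
    }

Anything outside that shape (vararg, non-C calling convention, 32-bit target, aggregate or vector arguments, byval/inreg/sret/nest attributes, more than six arguments) falls back to the normal argument lowering.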
bool X86FastISel::X86SelectCall(const Instruction *I) {
const CallInst *CI = cast<CallInst>(I);
const Value *Callee = CI->getCalledValue();
@@ -1545,9 +1622,9 @@ static unsigned computeBytesPoppedByCallee(const X86Subtarget &Subtarget,
CallingConv::ID CC = CS.getCallingConv();
if (CC == CallingConv::Fast || CC == CallingConv::GHC)
return 0;
- if (!CS.paramHasAttr(1, Attributes::StructRet))
+ if (!CS.paramHasAttr(1, Attribute::StructRet))
return 0;
- if (CS.paramHasAttr(1, Attributes::InReg))
+ if (CS.paramHasAttr(1, Attribute::InReg))
return 0;
return 4;
}
@@ -1585,8 +1662,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
// Check whether the function can return without sret-demotion.
SmallVector<ISD::OutputArg, 4> Outs;
- GetReturnInfo(I->getType(), CS.getAttributes().getRetAttributes(),
- Outs, TLI);
+ GetReturnInfo(I->getType(), CS.getAttributes(), Outs, TLI);
bool CanLowerReturn = TLI.CanLowerReturn(CS.getCallingConv(),
*FuncInfo.MF, FTy->isVarArg(),
Outs, FTy->getContext());
@@ -1626,12 +1702,12 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
Value *ArgVal = *i;
ISD::ArgFlagsTy Flags;
unsigned AttrInd = i - CS.arg_begin() + 1;
- if (CS.paramHasAttr(AttrInd, Attributes::SExt))
+ if (CS.paramHasAttr(AttrInd, Attribute::SExt))
Flags.setSExt();
- if (CS.paramHasAttr(AttrInd, Attributes::ZExt))
+ if (CS.paramHasAttr(AttrInd, Attribute::ZExt))
Flags.setZExt();
- if (CS.paramHasAttr(AttrInd, Attributes::ByVal)) {
+ if (CS.paramHasAttr(AttrInd, Attribute::ByVal)) {
PointerType *Ty = cast<PointerType>(ArgVal->getType());
Type *ElementTy = Ty->getElementType();
unsigned FrameSize = TD.getTypeAllocSize(ElementTy);
@@ -1645,9 +1721,9 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
return false;
}
- if (CS.paramHasAttr(AttrInd, Attributes::InReg))
+ if (CS.paramHasAttr(AttrInd, Attribute::InReg))
Flags.setInReg();
- if (CS.paramHasAttr(AttrInd, Attributes::Nest))
+ if (CS.paramHasAttr(AttrInd, Attribute::Nest))
Flags.setNest();
// If this is an i1/i8/i16 argument, promote to i32 to avoid an extra
@@ -1909,17 +1985,17 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
ComputeValueVTs(TLI, I->getType(), RetTys);
for (unsigned i = 0, e = RetTys.size(); i != e; ++i) {
EVT VT = RetTys[i];
- EVT RegisterVT = TLI.getRegisterType(I->getParent()->getContext(), VT);
+ MVT RegisterVT = TLI.getRegisterType(I->getParent()->getContext(), VT);
unsigned NumRegs = TLI.getNumRegisters(I->getParent()->getContext(), VT);
for (unsigned j = 0; j != NumRegs; ++j) {
ISD::InputArg MyFlags;
- MyFlags.VT = RegisterVT.getSimpleVT();
+ MyFlags.VT = RegisterVT;
MyFlags.Used = !CS.getInstruction()->use_empty();
- if (CS.paramHasAttr(0, Attributes::SExt))
+ if (CS.paramHasAttr(0, Attribute::SExt))
MyFlags.Flags.setSExt();
- if (CS.paramHasAttr(0, Attributes::ZExt))
+ if (CS.paramHasAttr(0, Attribute::ZExt))
MyFlags.Flags.setZExt();
- if (CS.paramHasAttr(0, Attributes::InReg))
+ if (CS.paramHasAttr(0, Attribute::InReg))
MyFlags.Flags.setInReg();
Ins.push_back(MyFlags);
}
diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp
index 16197e09e9..0585b43a46 100644
--- a/lib/Target/X86/X86FloatingPoint.cpp
+++ b/lib/Target/X86/X86FloatingPoint.cpp
@@ -36,7 +36,7 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
-#include "llvm/InlineAsm.h"
+#include "llvm/IR/InlineAsm.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index 18595a86ea..54cbd40274 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -23,8 +23,8 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/DataLayout.h"
-#include "llvm/Function.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/CommandLine.h"
@@ -50,13 +50,13 @@ bool X86FrameLowering::hasFP(const MachineFunction &MF) const {
return (MF.getTarget().Options.DisableFramePointerElim(MF) ||
RegInfo->needsStackRealignment(MF) ||
MFI->hasVarSizedObjects() ||
- MFI->isFrameAddressTaken() ||
+ MFI->isFrameAddressTaken() || MF.hasMSInlineAsm() ||
MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() ||
MMI.callsUnwindInit() || MMI.callsEHReturn());
}
-static unsigned getSUBriOpcode(unsigned is64Bit, int64_t Imm) {
- if (is64Bit) {
+static unsigned getSUBriOpcode(unsigned IsLP64, int64_t Imm) {
+ if (IsLP64) {
if (isInt<8>(Imm))
return X86::SUB64ri8;
return X86::SUB64ri32;
@@ -67,8 +67,8 @@ static unsigned getSUBriOpcode(unsigned is64Bit, int64_t Imm) {
}
}
-static unsigned getADDriOpcode(unsigned is64Bit, int64_t Imm) {
- if (is64Bit) {
+static unsigned getADDriOpcode(unsigned IsLP64, int64_t Imm) {
+ if (IsLP64) {
if (isInt<8>(Imm))
return X86::ADD64ri8;
return X86::ADD64ri32;
@@ -79,8 +79,8 @@ static unsigned getADDriOpcode(unsigned is64Bit, int64_t Imm) {
}
}
-static unsigned getLEArOpcode(unsigned is64Bit) {
- return is64Bit ? X86::LEA64r : X86::LEA32r;
+static unsigned getLEArOpcode(unsigned IsLP64) {
+ return IsLP64 ? X86::LEA64r : X86::LEA32r;
}
/// findDeadCallerSavedReg - Return a caller-saved register that isn't live
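The ri8 forms encode a sign-extended 8-bit immediate and are a few bytes shorter, which is why the helpers above switch on isInt<8>(Imm). A minimal sketch of that selection for the 64-bit case shown in the hunk (fitsInt8 and subOpcode64 are made-up stand-ins for illustration):

    #include <cstdint>
    #include <cstdio>

    static bool fitsInt8(int64_t Imm) {     // stand-in for llvm::isInt<8>
      return Imm >= -128 && Imm <= 127;
    }

    static const char *subOpcode64(int64_t Imm) {
      return fitsInt8(Imm) ? "SUB64ri8" : "SUB64ri32";
    }

    int main() {
      std::printf("%s %s\n", subOpcode64(24), subOpcode64(4096)); // SUB64ri8 SUB64ri32
      return 0;
    }

The is64Bit -> IsLP64 rename presumably lets ILP32-on-x86-64 (x32-style) configurations keep using the 32-bit forms for stack-pointer adjustment; that is an inference from the isTarget64BitLP64() name used below, not something the patch states.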
@@ -145,17 +145,17 @@ static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB,
static
void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
unsigned StackPtr, int64_t NumBytes,
- bool Is64Bit, bool UseLEA,
+ bool Is64Bit, bool IsLP64, bool UseLEA,
const TargetInstrInfo &TII, const TargetRegisterInfo &TRI) {
bool isSub = NumBytes < 0;
uint64_t Offset = isSub ? -NumBytes : NumBytes;
unsigned Opc;
if (UseLEA)
- Opc = getLEArOpcode(Is64Bit);
+ Opc = getLEArOpcode(IsLP64);
else
Opc = isSub
- ? getSUBriOpcode(Is64Bit, Offset)
- : getADDriOpcode(Is64Bit, Offset);
+ ? getSUBriOpcode(IsLP64, Offset)
+ : getADDriOpcode(IsLP64, Offset);
uint64_t Chunk = (1LL << 31) - 1;
DebugLoc DL = MBB.findDebugLoc(MBBI);
@@ -625,6 +625,22 @@ uint32_t X86FrameLowering::getCompactUnwindEncoding(MachineFunction &MF) const {
return CompactUnwindEncoding;
}
+/// usesTheStack - This function checks whether any user of EFLAGS copies it.
+/// We know that the code that lowers a COPY of EFLAGS has to use the stack,
+/// and if we don't adjust the stack we clobber the first frame index.
+/// See X86InstrInfo::copyPhysReg.
+static bool usesTheStack(MachineFunction &MF) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ for (MachineRegisterInfo::reg_iterator ri = MRI.reg_begin(X86::EFLAGS),
+ re = MRI.reg_end(); ri != re; ++ri)
+ if (ri->isCopy())
+ return true;
+
+ return false;
+}
+
/// emitPrologue - Push callee-saved registers onto the stack, which
/// automatically adjust the stack pointer. Adjust the stack pointer to allocate
/// space for local variables. Also emit labels used by the exception handler to
@@ -644,6 +660,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
uint64_t StackSize = MFI->getStackSize(); // Number of bytes to allocate.
bool HasFP = hasFP(MF);
bool Is64Bit = STI.is64Bit();
+ bool IsLP64 = STI.isTarget64BitLP64();
bool IsWin64 = STI.isTargetWin64();
bool UseLEA = STI.useLeaForSP();
unsigned StackAlign = getStackAlignment();
@@ -673,12 +690,15 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
// If this is x86-64 and the Red Zone is not disabled, if we are a leaf
// function, and use up to 128 bytes of stack space, don't have a frame
// pointer, calls, or dynamic alloca then we do not need to adjust the
- // stack pointer (we fit in the Red Zone).
- if (Is64Bit && !Fn->getFnAttributes().hasAttribute(Attributes::NoRedZone) &&
+ // stack pointer (we fit in the Red Zone). We also check that we don't
+ // push and pop from the stack.
+ if (Is64Bit && !Fn->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+ Attribute::NoRedZone) &&
!RegInfo->needsStackRealignment(MF) &&
!MFI->hasVarSizedObjects() && // No dynamic alloca.
!MFI->adjustsStack() && // No calls.
!IsWin64 && // Win64 has no Red Zone
+ !usesTheStack(MF) && // Don't push and pop.
!MF.getTarget().Options.EnableSegmentedStacks) { // Regular stack
uint64_t MinSize = X86FI->getCalleeSavedFrameSize();
if (HasFP) MinSize += SlotSize;
@@ -692,7 +712,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
if (TailCallReturnAddrDelta < 0) {
MachineInstr *MI =
BuildMI(MBB, MBBI, DL,
- TII.get(getSUBriOpcode(Is64Bit, -TailCallReturnAddrDelta)),
+ TII.get(getSUBriOpcode(IsLP64, -TailCallReturnAddrDelta)),
StackPtr)
.addReg(StackPtr)
.addImm(-TailCallReturnAddrDelta)
@@ -908,7 +928,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
// MSVC x64's __chkstk needs to adjust %rsp.
// FIXME: %rax preserves the offset and should be available.
if (isSPUpdateNeeded)
- emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit,
+ emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, IsLP64,
UseLEA, TII, *RegInfo);
if (isEAXAlive) {
@@ -920,7 +940,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
MBB.insert(MBBI, MI);
}
} else if (NumBytes)
- emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit,
+ emitSPUpdate(MBB, MBBI, StackPtr, -(int64_t)NumBytes, Is64Bit, IsLP64,
UseLEA, TII, *RegInfo);
// If we need a base pointer, set it up here. It's whatever the value
@@ -977,6 +997,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
unsigned RetOpcode = MBBI->getOpcode();
DebugLoc DL = MBBI->getDebugLoc();
bool Is64Bit = STI.is64Bit();
+ bool IsLP64 = STI.isTarget64BitLP64();
bool UseLEA = STI.useLeaForSP();
unsigned StackAlign = getStackAlignment();
unsigned SlotSize = RegInfo->getSlotSize();
@@ -1062,7 +1083,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
if (RegInfo->needsStackRealignment(MF))
MBBI = FirstCSPop;
if (CSSize != 0) {
- unsigned Opc = getLEArOpcode(Is64Bit);
+ unsigned Opc = getLEArOpcode(IsLP64);
addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr),
FramePtr, false, -CSSize);
} else {
@@ -1072,7 +1093,8 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
}
} else if (NumBytes) {
// Adjust stack pointer back: ESP += numbytes.
- emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, UseLEA, TII, *RegInfo);
+ emitSPUpdate(MBB, MBBI, StackPtr, NumBytes, Is64Bit, IsLP64, UseLEA,
+ TII, *RegInfo);
}
// We're returning from function via eh_return.
@@ -1107,7 +1129,8 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
if (Offset) {
// Check for possible merge with preceding ADD instruction.
Offset += mergeSPUpdates(MBB, MBBI, StackPtr, true);
- emitSPUpdate(MBB, MBBI, StackPtr, Offset, Is64Bit, UseLEA, TII, *RegInfo);
+ emitSPUpdate(MBB, MBBI, StackPtr, Offset, Is64Bit, IsLP64,
+ UseLEA, TII, *RegInfo);
}
// Jump to label or value in register.
@@ -1138,7 +1161,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
}
MachineInstr *NewMI = prior(MBBI);
- NewMI->copyImplicitOps(MBBI);
+ NewMI->copyImplicitOps(MF, MBBI);
// Delete the pseudo instruction TCRETURN.
MBB.erase(MBBI);
@@ -1150,7 +1173,8 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
// Check for possible merge with preceding ADD instruction.
delta += mergeSPUpdates(MBB, MBBI, StackPtr, true);
- emitSPUpdate(MBB, MBBI, StackPtr, delta, Is64Bit, UseLEA, TII, *RegInfo);
+ emitSPUpdate(MBB, MBBI, StackPtr, delta, Is64Bit, IsLP64, UseLEA, TII,
+ *RegInfo);
}
}
@@ -1362,17 +1386,25 @@ HasNestArgument(const MachineFunction *MF) {
return false;
}
-
-/// GetScratchRegister - Get a register for performing work in the segmented
-/// stack prologue. Depending on platform and the properties of the function
-/// either one or two registers will be needed. Set primary to true for
-/// the first register, false for the second.
+/// GetScratchRegister - Get a temp register for performing work in the
+/// segmented stack and the Erlang/HiPE stack prologue. Depending on platform
+/// and the properties of the function either one or two registers will be
+/// needed. Set primary to true for the first register, false for the second.
static unsigned
GetScratchRegister(bool Is64Bit, const MachineFunction &MF, bool Primary) {
+ CallingConv::ID CallingConvention = MF.getFunction()->getCallingConv();
+
+ // Erlang stuff.
+ if (CallingConvention == CallingConv::HiPE) {
+ if (Is64Bit)
+ return Primary ? X86::R14 : X86::R13;
+ else
+ return Primary ? X86::EBX : X86::EDI;
+ }
+
if (Is64Bit)
return Primary ? X86::R11 : X86::R12;
- CallingConv::ID CallingConvention = MF.getFunction()->getCallingConv();
bool IsNested = HasNestArgument(&MF);
if (CallingConvention == CallingConv::X86_FastCall ||
@@ -1400,7 +1432,6 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
bool Is64Bit = STI.is64Bit();
unsigned TlsReg, TlsOffset;
DebugLoc DL;
- const X86Subtarget *ST = &MF.getTarget().getSubtarget<X86Subtarget>();
unsigned ScratchReg = GetScratchRegister(Is64Bit, MF, true);
assert(!MF.getRegInfo().isLiveIn(ScratchReg) &&
@@ -1408,8 +1439,8 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
if (MF.getFunction()->isVarArg())
report_fatal_error("Segmented stacks do not support vararg functions.");
- if (!ST->isTargetLinux() && !ST->isTargetDarwin() &&
- !ST->isTargetWin32() && !ST->isTargetFreeBSD())
+ if (!STI.isTargetLinux() && !STI.isTargetDarwin() &&
+ !STI.isTargetWin32() && !STI.isTargetFreeBSD())
report_fatal_error("Segmented stacks not supported on this platform.");
MachineBasicBlock *allocMBB = MF.CreateMachineBasicBlock();
@@ -1447,13 +1478,13 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
// Read the limit off the current stacklet off the stack_guard location.
if (Is64Bit) {
- if (ST->isTargetLinux()) {
+ if (STI.isTargetLinux()) {
TlsReg = X86::FS;
TlsOffset = 0x70;
- } else if (ST->isTargetDarwin()) {
+ } else if (STI.isTargetDarwin()) {
TlsReg = X86::GS;
TlsOffset = 0x60 + 90*8; // See pthread_machdep.h. Steal TLS slot 90.
- } else if (ST->isTargetFreeBSD()) {
+ } else if (STI.isTargetFreeBSD()) {
TlsReg = X86::FS;
TlsOffset = 0x18;
} else {
@@ -1469,16 +1500,16 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
BuildMI(checkMBB, DL, TII.get(X86::CMP64rm)).addReg(ScratchReg)
.addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg);
} else {
- if (ST->isTargetLinux()) {
+ if (STI.isTargetLinux()) {
TlsReg = X86::GS;
TlsOffset = 0x30;
- } else if (ST->isTargetDarwin()) {
+ } else if (STI.isTargetDarwin()) {
TlsReg = X86::GS;
TlsOffset = 0x48 + 90*4;
- } else if (ST->isTargetWin32()) {
+ } else if (STI.isTargetWin32()) {
TlsReg = X86::FS;
TlsOffset = 0x14; // pvArbitrary, reserved for application use
- } else if (ST->isTargetFreeBSD()) {
+ } else if (STI.isTargetFreeBSD()) {
report_fatal_error("Segmented stacks not supported on FreeBSD i386.");
} else {
report_fatal_error("Segmented stacks not supported on this platform.");
@@ -1490,10 +1521,10 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
BuildMI(checkMBB, DL, TII.get(X86::LEA32r), ScratchReg).addReg(X86::ESP)
.addImm(1).addReg(0).addImm(-StackSize).addReg(0);
- if (ST->isTargetLinux() || ST->isTargetWin32()) {
+ if (STI.isTargetLinux() || STI.isTargetWin32()) {
BuildMI(checkMBB, DL, TII.get(X86::CMP32rm)).addReg(ScratchReg)
.addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg);
- } else if (ST->isTargetDarwin()) {
+ } else if (STI.isTargetDarwin()) {
// TlsOffset doesn't fit into a mod r/m byte so we need an extra register
unsigned ScratchReg2;
@@ -1579,3 +1610,228 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
MF.verify();
#endif
}
+
+/// Erlang programs may need a special prologue to handle the stack size they
+/// might need at runtime. That is because Erlang/OTP does not implement a C
+/// stack but uses a custom implementation of a hybrid stack/heap architecture.
+/// (for more information see Eric Stenman's Ph.D. thesis:
+/// http://publications.uu.se/uu/fulltext/nbn_se_uu_diva-2688.pdf)
+///
+/// CheckStack:
+/// temp0 = sp - MaxStack
+/// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
+/// OldStart:
+/// ...
+/// IncStack:
+/// call inc_stack # doubles the stack space
+/// temp0 = sp - MaxStack
+/// if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
+void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const {
+ const X86InstrInfo &TII = *TM.getInstrInfo();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ const unsigned SlotSize = TM.getRegisterInfo()->getSlotSize();
+ const bool Is64Bit = STI.is64Bit();
+ DebugLoc DL;
+ // HiPE-specific values
+ const unsigned HipeLeafWords = 24;
+ const unsigned CCRegisteredArgs = Is64Bit ? 6 : 5;
+ const unsigned Guaranteed = HipeLeafWords * SlotSize;
+ unsigned CallerStkArity = MF.getFunction()->arg_size() > CCRegisteredArgs ?
+ MF.getFunction()->arg_size() - CCRegisteredArgs : 0;
+ unsigned MaxStack = MFI->getStackSize() + CallerStkArity*SlotSize + SlotSize;
+
+ assert(STI.isTargetLinux() &&
+ "HiPE prologue is only supported on Linux operating systems.");
+
+ // Compute the largest caller's frame that is needed to fit the callees'
+ // frames. This 'MaxStack' is computed from:
+ //
+ // a) the fixed frame size, which is the space needed for all spilled temps,
+ // b) outgoing on-stack parameter areas, and
+ // c) the minimum stack space this function needs to make available for the
+ // functions it calls (a tunable ABI property).
+ if (MFI->hasCalls()) {
+ unsigned MoreStackForCalls = 0;
+
+ for (MachineFunction::iterator MBBI = MF.begin(), MBBE = MF.end();
+ MBBI != MBBE; ++MBBI)
+ for (MachineBasicBlock::iterator MI = MBBI->begin(), ME = MBBI->end();
+ MI != ME; ++MI) {
+ if (!MI->isCall())
+ continue;
+
+ // Get callee operand.
+ const MachineOperand &MO = MI->getOperand(0);
+
+ // Only take account of global function calls (no closures etc.).
+ if (!MO.isGlobal())
+ continue;
+
+ const Function *F = dyn_cast<Function>(MO.getGlobal());
+ if (!F)
+ continue;
+
+ // Do not update 'MaxStack' for primitive and built-in functions
+ // (encoded with names either starting with "erlang."/"bif_" or not
+ // having a ".", such as a simple <Module>.<Function>.<Arity>, or an
+ // "_", such as the BIF "suspend_0") as they are executed on another
+ // stack.
+ if (F->getName().find("erlang.") != StringRef::npos ||
+ F->getName().find("bif_") != StringRef::npos ||
+ F->getName().find_first_of("._") == StringRef::npos)
+ continue;
+
+ unsigned CalleeStkArity =
+ F->arg_size() > CCRegisteredArgs ? F->arg_size()-CCRegisteredArgs : 0;
+ if (HipeLeafWords - 1 > CalleeStkArity)
+ MoreStackForCalls = std::max(MoreStackForCalls,
+ (HipeLeafWords - 1 - CalleeStkArity) * SlotSize);
+ }
+ MaxStack += MoreStackForCalls;
+ }
+
+ // If the stack frame needed is larger than the guaranteed size, then runtime
+ // checks and calls to the "inc_stack_0" BIF should be inserted in the
+ // assembly prologue.
+ if (MaxStack > Guaranteed) {
+ MachineBasicBlock &prologueMBB = MF.front();
+ MachineBasicBlock *stackCheckMBB = MF.CreateMachineBasicBlock();
+ MachineBasicBlock *incStackMBB = MF.CreateMachineBasicBlock();
+
+ for (MachineBasicBlock::livein_iterator I = prologueMBB.livein_begin(),
+ E = prologueMBB.livein_end(); I != E; I++) {
+ stackCheckMBB->addLiveIn(*I);
+ incStackMBB->addLiveIn(*I);
+ }
+
+ MF.push_front(incStackMBB);
+ MF.push_front(stackCheckMBB);
+
+ unsigned ScratchReg, SPReg, PReg, SPLimitOffset;
+ unsigned LEAop, CMPop, CALLop;
+ if (Is64Bit) {
+ SPReg = X86::RSP;
+ PReg = X86::RBP;
+ LEAop = X86::LEA64r;
+ CMPop = X86::CMP64rm;
+ CALLop = X86::CALL64pcrel32;
+ SPLimitOffset = 0x90;
+ } else {
+ SPReg = X86::ESP;
+ PReg = X86::EBP;
+ LEAop = X86::LEA32r;
+ CMPop = X86::CMP32rm;
+ CALLop = X86::CALLpcrel32;
+ SPLimitOffset = 0x4c;
+ }
+
+ ScratchReg = GetScratchRegister(Is64Bit, MF, true);
+ assert(!MF.getRegInfo().isLiveIn(ScratchReg) &&
+ "HiPE prologue scratch register is live-in");
+
+ // Create new MBB for StackCheck:
+ addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(LEAop), ScratchReg),
+ SPReg, false, -MaxStack);
+ // SPLimitOffset is in a fixed heap location (pointed by BP).
+ addRegOffset(BuildMI(stackCheckMBB, DL, TII.get(CMPop))
+ .addReg(ScratchReg), PReg, false, SPLimitOffset);
+ BuildMI(stackCheckMBB, DL, TII.get(X86::JAE_4)).addMBB(&prologueMBB);
+
+ // Create new MBB for IncStack:
+ BuildMI(incStackMBB, DL, TII.get(CALLop)).
+ addExternalSymbol("inc_stack_0");
+ addRegOffset(BuildMI(incStackMBB, DL, TII.get(LEAop), ScratchReg),
+ SPReg, false, -MaxStack);
+ addRegOffset(BuildMI(incStackMBB, DL, TII.get(CMPop))
+ .addReg(ScratchReg), PReg, false, SPLimitOffset);
+ BuildMI(incStackMBB, DL, TII.get(X86::JLE_4)).addMBB(incStackMBB);
+
+ stackCheckMBB->addSuccessor(&prologueMBB, 99);
+ stackCheckMBB->addSuccessor(incStackMBB, 1);
+ incStackMBB->addSuccessor(&prologueMBB, 99);
+ incStackMBB->addSuccessor(incStackMBB, 1);
+ }
+#ifdef XDEBUG
+ MF.verify();
+#endif
+}
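A worked example of the arithmetic above, under hypothetical frame values (the per-call MoreStackForCalls adjustment is left out for brevity):

    #include <cstdio>

    int main() {
      const unsigned SlotSize = 8;             // x86-64
      const unsigned HipeLeafWords = 24;
      const unsigned CCRegisteredArgs = 6;     // 64-bit case
      const unsigned Guaranteed = HipeLeafWords * SlotSize;   // 192 bytes

      unsigned FrameSize   = 120;              // hypothetical MFI->getStackSize()
      unsigned CallerArity = 8;                // two arguments overflow to the stack
      unsigned CallerStkArity =
          CallerArity > CCRegisteredArgs ? CallerArity - CCRegisteredArgs : 0;
      unsigned MaxStack = FrameSize + CallerStkArity * SlotSize + SlotSize;  // 144

      // Only when MaxStack exceeds the guaranteed headroom does the prologue
      // insert the stack check and the call to the inc_stack_0 BIF.
      std::printf("MaxStack=%u Guaranteed=%u -> %s\n", MaxStack, Guaranteed,
                  MaxStack > Guaranteed ? "insert check" : "no check emitted");
      return 0;
    }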
+
+void X86FrameLowering::
+eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) const {
+ const X86InstrInfo &TII = *TM.getInstrInfo();
+ const X86RegisterInfo &RegInfo = *TM.getRegisterInfo();
+ unsigned StackPtr = RegInfo.getStackRegister();
+ bool reserveCallFrame = hasReservedCallFrame(MF);
+ int Opcode = I->getOpcode();
+ bool isDestroy = Opcode == TII.getCallFrameDestroyOpcode();
+ bool IsLP64 = STI.isTarget64BitLP64();
+ DebugLoc DL = I->getDebugLoc();
+ uint64_t Amount = !reserveCallFrame ? I->getOperand(0).getImm() : 0;
+ uint64_t CalleeAmt = isDestroy ? I->getOperand(1).getImm() : 0;
+ I = MBB.erase(I);
+
+ if (!reserveCallFrame) {
+ // If the stack pointer can be changed after the prologue, turn the
+ // adjcallstackdown instruction into a 'sub ESP, <amt>' and the
+ // adjcallstackup instruction into an 'add ESP, <amt>'.
+ // TODO: consider using push / pop instead of sub + store / add
+ if (Amount == 0)
+ return;
+
+ // We need to keep the stack aligned properly. To do this, we round the
+ // amount of space needed for the outgoing arguments up to the next
+ // alignment boundary.
+ unsigned StackAlign = TM.getFrameLowering()->getStackAlignment();
+ Amount = (Amount + StackAlign - 1) / StackAlign * StackAlign;
+
+ MachineInstr *New = 0;
+ if (Opcode == TII.getCallFrameSetupOpcode()) {
+ New = BuildMI(MF, DL, TII.get(getSUBriOpcode(IsLP64, Amount)),
+ StackPtr)
+ .addReg(StackPtr)
+ .addImm(Amount);
+ } else {
+ assert(Opcode == TII.getCallFrameDestroyOpcode());
+
+ // Factor out the amount the callee already popped.
+ Amount -= CalleeAmt;
+
+ if (Amount) {
+ unsigned Opc = getADDriOpcode(IsLP64, Amount);
+ New = BuildMI(MF, DL, TII.get(Opc), StackPtr)
+ .addReg(StackPtr).addImm(Amount);
+ }
+ }
+
+ if (New) {
+ // The EFLAGS implicit def is dead.
+ New->getOperand(3).setIsDead();
+
+ // Replace the pseudo instruction with a new instruction.
+ MBB.insert(I, New);
+ }
+
+ return;
+ }
+
+ if (Opcode == TII.getCallFrameDestroyOpcode() && CalleeAmt) {
+ // If we are performing frame pointer elimination and if the callee pops
+ // something off the stack pointer, add it back. We do this until we have
+ // more advanced stack pointer tracking ability.
+ unsigned Opc = getSUBriOpcode(IsLP64, CalleeAmt);
+ MachineInstr *New = BuildMI(MF, DL, TII.get(Opc), StackPtr)
+ .addReg(StackPtr).addImm(CalleeAmt);
+
+ // The EFLAGS implicit def is dead.
+ New->getOperand(3).setIsDead();
+
+ // We are not tracking the stack pointer adjustment by the callee, so make
+ // sure we restore the stack pointer immediately after the call, there may
+ // be spill code inserted between the CALL and ADJCALLSTACKUP instructions.
+ MachineBasicBlock::iterator B = MBB.begin();
+ while (I != B && !llvm::prior(I)->isCall())
+ --I;
+ MBB.insert(I, New);
+ }
+}
+
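The call-frame amount is rounded up to the stack alignment with the usual (x + A - 1) / A * A idiom before the sub/add is emitted; a quick standalone check of that arithmetic:

    #include <cstdint>
    #include <cstdio>

    static uint64_t roundUpToAlign(uint64_t Amount, unsigned StackAlign) {
      return (Amount + StackAlign - 1) / StackAlign * StackAlign;
    }

    int main() {
      // With a 16-byte stack alignment: 20 -> 32, 32 -> 32, 1 -> 16.
      std::printf("%llu %llu %llu\n",
                  (unsigned long long)roundUpToAlign(20, 16),
                  (unsigned long long)roundUpToAlign(32, 16),
                  (unsigned long long)roundUpToAlign(1, 16));
      return 0;
    }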
diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h
index dc515dc39c..3f08b9a2e8 100644
--- a/lib/Target/X86/X86FrameLowering.h
+++ b/lib/Target/X86/X86FrameLowering.h
@@ -43,6 +43,8 @@ public:
void adjustForSegmentedStacks(MachineFunction &MF) const;
+ void adjustForHiPEPrologue(MachineFunction &MF) const;
+
void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
RegScavenger *RS = NULL) const;
@@ -63,6 +65,10 @@ public:
int getFrameIndexReference(const MachineFunction &MF, int FI,
unsigned &FrameReg) const;
uint32_t getCompactUnwindEncoding(MachineFunction &MF) const;
+
+ void eliminateCallFramePseudoInstr(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const;
};
} // End llvm namespace
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 88e10d3af9..00fbe6924c 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -25,15 +25,15 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
-#include "llvm/Instructions.h"
-#include "llvm/Intrinsics.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
-#include "llvm/Type.h"
using namespace llvm;
STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");
@@ -280,13 +280,13 @@ namespace {
/// getTargetMachine - Return a reference to the TargetMachine, casted
/// to the target-specific type.
- const X86TargetMachine &getTargetMachine() {
+ const X86TargetMachine &getTargetMachine() const {
return static_cast<const X86TargetMachine &>(TM);
}
/// getInstrInfo - Return a reference to the TargetInstrInfo, casted
/// to the target-specific type.
- const X86InstrInfo *getInstrInfo() {
+ const X86InstrInfo *getInstrInfo() const {
return getTargetMachine().getInstrInfo();
}
};
@@ -420,6 +420,11 @@ static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
if (!Chain.getNumOperands())
return false;
+ // Since we are not checking for AA here, conservatively abort if the chain
+ // writes to memory. It's not safe to move the callee (a load) across a store.
+ if (isa<MemSDNode>(Chain.getNode()) &&
+ cast<MemSDNode>(Chain.getNode())->writeMem())
+ return false;
if (Chain.getOperand(0).getNode() == Callee.getNode())
return true;
if (Chain.getOperand(0).getOpcode() == ISD::TokenFactor &&
@@ -431,8 +436,8 @@ static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
void X86DAGToDAGISel::PreprocessISelDAG() {
// OptForSize is used in pattern predicates that isel is matching.
- OptForSize = MF->getFunction()->getFnAttributes().
- hasAttribute(Attributes::OptimizeForSize);
+ OptForSize = MF->getFunction()->getAttributes().
+ hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
E = CurDAG->allnodes_end(); I != E; ) {
@@ -441,7 +446,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
if (OptLevel != CodeGenOpt::None &&
(N->getOpcode() == X86ISD::CALL ||
(N->getOpcode() == X86ISD::TC_RETURN &&
- // Only does this if load can be foled into TC_RETURN.
+ // Only does this if load can be folded into TC_RETURN.
(Subtarget->is64Bit() ||
getTargetMachine().getRelocationModel() != Reloc::PIC_)))) {
/// Also try moving call address load from outside callseq_start to just
@@ -1037,8 +1042,8 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
AM.IndexReg = ShVal;
return false;
}
- break;
}
+ break;
case ISD::SRL: {
// Scale must not be used already.
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 90bee41e35..960870dc60 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -23,7 +23,6 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/VariadicFunction.h"
-#include "llvm/CallingConv.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -31,14 +30,15 @@
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Constants.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/Function.h"
-#include "llvm/GlobalAlias.h"
-#include "llvm/GlobalVariable.h"
-#include "llvm/Instructions.h"
-#include "llvm/Intrinsics.h"
-#include "llvm/LLVMContext.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
@@ -85,6 +85,11 @@ static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / 128)
* ElemsPerChunk);
+ // If the input is a buildvector just emit a smaller one.
+ if (Vec.getOpcode() == ISD::BUILD_VECTOR)
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
+ Vec->op_begin()+NormalizedIdxVal, ElemsPerChunk);
+
SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
VecIdx);
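The index normalization above works in whole 128-bit chunks; a worked example for a v8i32 source (32-bit elements, four per chunk), showing that any element index in the upper half extracts the chunk starting at element 4:

    #include <cstdio>

    int main() {
      const unsigned ElBits = 32;                   // i32 elements
      const unsigned ElemsPerChunk = 128 / ElBits;  // 4
      for (unsigned IdxVal = 0; IdxVal != 8; ++IdxVal) {
        unsigned Normalized = (IdxVal * ElBits) / 128 * ElemsPerChunk;
        std::printf("%u -> %u\n", IdxVal, Normalized); // 0..3 -> 0, 4..7 -> 4
      }
      return 0;
    }

The new early-out simply builds a narrower BUILD_VECTOR from the operands starting at that normalized index instead of emitting an EXTRACT_SUBVECTOR.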
@@ -181,9 +186,12 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setSchedulingPreference(Sched::RegPressure);
setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
- // Bypass i32 with i8 on Atom when compiling with O2
- if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default)
+ // Bypass expensive divides on Atom when compiling with O2
+ if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default) {
addBypassSlowDiv(32, 8);
+ if (Subtarget->is64Bit())
+ addBypassSlowDiv(64, 16);
+ }
if (Subtarget->isTargetWindows() && !Subtarget->isTargetCygMing()) {
// Setup Windows compiler runtime calls.
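addBypassSlowDiv(32, 8) (and now 64 -> 16 on 64-bit targets) enables the generic slow-division bypass, which guards a wide divide with a cheap narrow one when both operands happen to fit. Roughly, in source form (a sketch of the idea only, not the actual transform):

    #include <cstdint>
    #include <cstdio>

    static uint32_t div32(uint32_t a, uint32_t b) {
      // Both operands fit in 8 bits: the short divide is much cheaper on Atom.
      if (((a | b) & ~0xFFu) == 0)
        return (uint32_t)((uint8_t)a / (uint8_t)b);
      return a / b;                   // otherwise take the full 32-bit divide
    }

    int main() {
      std::printf("%u %u\n", div32(200, 7), div32(100000, 7)); // 28 14285
      return 0;
    }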
@@ -368,7 +376,13 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::BR_JT , MVT::Other, Expand);
setOperationAction(ISD::BRCOND , MVT::Other, Custom);
- setOperationAction(ISD::BR_CC , MVT::Other, Expand);
+ setOperationAction(ISD::BR_CC , MVT::f32, Expand);
+ setOperationAction(ISD::BR_CC , MVT::f64, Expand);
+ setOperationAction(ISD::BR_CC , MVT::f80, Expand);
+ setOperationAction(ISD::BR_CC , MVT::i8, Expand);
+ setOperationAction(ISD::BR_CC , MVT::i16, Expand);
+ setOperationAction(ISD::BR_CC , MVT::i32, Expand);
+ setOperationAction(ISD::BR_CC , MVT::i64, Expand);
setOperationAction(ISD::SELECT_CC , MVT::Other, Expand);
if (Subtarget->is64Bit())
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
@@ -605,10 +619,12 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
// We don't support sin/cos/fmod
- setOperationAction(ISD::FSIN , MVT::f64, Expand);
- setOperationAction(ISD::FCOS , MVT::f64, Expand);
- setOperationAction(ISD::FSIN , MVT::f32, Expand);
- setOperationAction(ISD::FCOS , MVT::f32, Expand);
+ setOperationAction(ISD::FSIN , MVT::f64, Expand);
+ setOperationAction(ISD::FCOS , MVT::f64, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
+ setOperationAction(ISD::FSIN , MVT::f32, Expand);
+ setOperationAction(ISD::FCOS , MVT::f32, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
// Expand FP immediates into loads from the stack, except for the special
// cases we handle.
@@ -633,8 +649,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
// We don't support sin/cos/fmod
- setOperationAction(ISD::FSIN , MVT::f32, Expand);
- setOperationAction(ISD::FCOS , MVT::f32, Expand);
+ setOperationAction(ISD::FSIN , MVT::f32, Expand);
+ setOperationAction(ISD::FCOS , MVT::f32, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
// Special cases we handle for FP constants.
addLegalFPImmediate(APFloat(+0.0f)); // xorps
@@ -644,8 +661,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
if (!TM.Options.UnsafeFPMath) {
- setOperationAction(ISD::FSIN , MVT::f64 , Expand);
- setOperationAction(ISD::FCOS , MVT::f64 , Expand);
+ setOperationAction(ISD::FSIN , MVT::f64, Expand);
+ setOperationAction(ISD::FCOS , MVT::f64, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
}
} else if (!TM.Options.UseSoftFloat) {
// f32 and f64 in x87.
@@ -659,10 +677,12 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
if (!TM.Options.UnsafeFPMath) {
- setOperationAction(ISD::FSIN , MVT::f32 , Expand);
- setOperationAction(ISD::FSIN , MVT::f64 , Expand);
- setOperationAction(ISD::FCOS , MVT::f32 , Expand);
- setOperationAction(ISD::FCOS , MVT::f64 , Expand);
+ setOperationAction(ISD::FSIN , MVT::f64, Expand);
+ setOperationAction(ISD::FSIN , MVT::f32, Expand);
+ setOperationAction(ISD::FCOS , MVT::f64, Expand);
+ setOperationAction(ISD::FCOS , MVT::f32, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
}
addLegalFPImmediate(APFloat(+0.0)); // FLD0
addLegalFPImmediate(APFloat(+1.0)); // FLD1
@@ -699,8 +719,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
}
if (!TM.Options.UnsafeFPMath) {
- setOperationAction(ISD::FSIN , MVT::f80 , Expand);
- setOperationAction(ISD::FCOS , MVT::f80 , Expand);
+ setOperationAction(ISD::FSIN , MVT::f80, Expand);
+ setOperationAction(ISD::FCOS , MVT::f80, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
}
setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
@@ -748,7 +769,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
setOperationAction(ISD::FABS, VT, Expand);
setOperationAction(ISD::FSIN, VT, Expand);
+ setOperationAction(ISD::FSINCOS, VT, Expand);
setOperationAction(ISD::FCOS, VT, Expand);
+ setOperationAction(ISD::FSINCOS, VT, Expand);
setOperationAction(ISD::FREM, VT, Expand);
setOperationAction(ISD::FMA, VT, Expand);
setOperationAction(ISD::FPOWI, VT, Expand);
@@ -870,6 +893,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::ADD, MVT::v8i16, Legal);
setOperationAction(ISD::ADD, MVT::v4i32, Legal);
setOperationAction(ISD::ADD, MVT::v2i64, Legal);
+ setOperationAction(ISD::MUL, MVT::v4i32, Custom);
setOperationAction(ISD::MUL, MVT::v2i64, Custom);
setOperationAction(ISD::SUB, MVT::v16i8, Legal);
setOperationAction(ISD::SUB, MVT::v8i16, Legal);
@@ -1046,6 +1070,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::SRA, MVT::v4i32, Custom);
}
+ setOperationAction(ISD::SDIV, MVT::v8i16, Custom);
+ setOperationAction(ISD::SDIV, MVT::v4i32, Custom);
}
if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
@@ -1087,6 +1113,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::FABS, MVT::v4f64, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);
@@ -1109,6 +1136,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::SRA, MVT::v16i16, Custom);
setOperationAction(ISD::SRA, MVT::v32i8, Custom);
+ setOperationAction(ISD::SDIV, MVT::v16i16, Custom);
+
setOperationAction(ISD::SETCC, MVT::v32i8, Custom);
setOperationAction(ISD::SETCC, MVT::v16i16, Custom);
setOperationAction(ISD::SETCC, MVT::v8i32, Custom);
@@ -1123,6 +1152,13 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::VSELECT, MVT::v8i32, Legal);
setOperationAction(ISD::VSELECT, MVT::v8f32, Legal);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v4i64, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v4i64, Custom);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v8i32, Custom);
+
if (Subtarget->hasFMA() || Subtarget->hasFMA4()) {
setOperationAction(ISD::FMA, MVT::v8f32, Legal);
setOperationAction(ISD::FMA, MVT::v4f64, Legal);
@@ -1157,6 +1193,8 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::SHL, MVT::v8i32, Legal);
setOperationAction(ISD::SRA, MVT::v8i32, Legal);
+
+ setOperationAction(ISD::SDIV, MVT::v8i32, Custom);
} else {
setOperationAction(ISD::ADD, MVT::v4i64, Custom);
setOperationAction(ISD::ADD, MVT::v8i32, Custom);
@@ -1238,7 +1276,6 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
-
// Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
// handle type legalization for these operations here.
//
@@ -1267,6 +1304,19 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setLibcallName(RTLIB::SRA_I128, 0);
}
+ // Combine sin / cos into one node or libcall if possible.
+ if (Subtarget->hasSinCos()) {
+ setLibcallName(RTLIB::SINCOS_F32, "sincosf");
+ setLibcallName(RTLIB::SINCOS_F64, "sincos");
+ if (Subtarget->isTargetDarwin()) {
+ // For MacOSX, we don't want the normal expansion of a libcall to
+ // sincos. We want to issue a libcall to __sincos_stret to avoid memory
+ // traffic.
+ setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
+ setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
+ }
+ }
+
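The FSINCOS lowering pays off for code that needs both results of the same argument; a trivial example of the source shape it targets (whether the two calls are actually merged depends on the target hooks set above and on fast-math/errno settings):

    #include <cmath>
    #include <cstdio>

    int main() {
      double x = 0.5;
      double s = std::sin(x);   // same argument as the cos below, so the
      double c = std::cos(x);   // backend may fuse them into one sincos call
      std::printf("%f %f\n", s, c);
      return 0;
    }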
// We have target-specific dag combine patterns for the following nodes:
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
@@ -1287,6 +1337,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setTargetDAGCombine(ISD::ZERO_EXTEND);
setTargetDAGCombine(ISD::ANY_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND);
+ setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
setTargetDAGCombine(ISD::TRUNCATE);
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::SETCC);
@@ -1298,28 +1349,26 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
// On Darwin, -Os means optimize for size without hurting performance,
// do not reduce the limit.
- maxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
- maxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
- maxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
- maxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
- maxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
- maxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
+ MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
+ MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8;
+ MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
+ MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
+ MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
+ MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4;
setPrefLoopAlignment(4); // 2^4 bytes.
- benefitFromCodePlacementOpt = true;
+ BenefitFromCodePlacementOpt = true;
// Predictable cmovs don't hurt on Atom because it's in-order.
- predictableSelectIsExpensive = !Subtarget->isAtom();
+ PredictableSelectIsExpensive = !Subtarget->isAtom();
setPrefFunctionAlignment(4); // 2^4 bytes.
}
-
EVT X86TargetLowering::getSetCCResultType(EVT VT) const {
if (!VT.isVector()) return MVT::i8;
return VT.changeVectorElementTypeToInteger();
}
-
/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
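For context on the MaxStoresPerMemset/Memcpy/Memmove thresholds set in the constructor above: they bound how many stores SelectionDAG may emit when expanding the corresponding intrinsic inline instead of calling the library routine. A hedged illustration follows; the buffer size and the exact store sequence are assumptions, not taken from this patch.

#include <cstring>

struct Packet { char payload[64]; };

// Zeroing 64 bytes stays within MaxStoresPerMemset (16), so instead of a
// memset call the backend may emit a short sequence of wide stores, e.g.
// four 16-byte SSE stores, subject to getOptimalMemOpType and the
// NoImplicitFloat function attribute checked below.
void clear(Packet &p) {
  std::memset(p.payload, 0, sizeof p.payload);
}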
@@ -1369,22 +1418,22 @@ unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const {
/// lowering. If DstAlign is zero that means the destination's alignment can
/// satisfy any constraint. Similarly, if SrcAlign is zero it
/// means there isn't a need to check it against the alignment requirement,
-/// probably because the source does not need to be loaded. If
-/// 'IsZeroVal' is true, that means it's safe to return a
-/// non-scalar-integer type, e.g. empty string source, constant, or loaded
-/// from memory. 'MemcpyStrSrc' indicates whether the memcpy source is
-/// constant so it does not need to be loaded.
+/// probably because the source does not need to be loaded. If 'IsMemset' is
+/// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
+/// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
+/// source is constant so it does not need to be loaded.
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
EVT
X86TargetLowering::getOptimalMemOpType(uint64_t Size,
unsigned DstAlign, unsigned SrcAlign,
- bool IsZeroVal,
+ bool IsMemset, bool ZeroMemset,
bool MemcpyStrSrc,
MachineFunction &MF) const {
const Function *F = MF.getFunction();
- if (IsZeroVal &&
- !F->getFnAttributes().hasAttribute(Attributes::NoImplicitFloat)) {
+ if ((!IsMemset || ZeroMemset) &&
+ !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+ Attribute::NoImplicitFloat)) {
if (Size >= 16 &&
(Subtarget->isUnalignedMemAccessFast() ||
((DstAlign == 0 || DstAlign >= 16) &&
@@ -1412,6 +1461,14 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size,
return MVT::i32;
}
+bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
+ if (VT == MVT::f32)
+ return X86ScalarSSEf32;
+ else if (VT == MVT::f64)
+ return X86ScalarSSEf64;
+ return true;
+}
+
bool
X86TargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const {
if (Fast)
@@ -1472,10 +1529,10 @@ getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
// FIXME: Why this routine is here? Move to RegInfo!
std::pair<const TargetRegisterClass*, uint8_t>
-X86TargetLowering::findRepresentativeClass(EVT VT) const{
+X86TargetLowering::findRepresentativeClass(MVT VT) const{
const TargetRegisterClass *RRC = 0;
uint8_t Cost = 1;
- switch (VT.getSimpleVT().SimpleTy) {
+ switch (VT.SimpleTy) {
default:
return TargetLowering::findRepresentativeClass(VT);
case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
@@ -1517,7 +1574,6 @@ bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace,
return true;
}
-
//===----------------------------------------------------------------------===//
// Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//
@@ -1549,14 +1605,7 @@ X86TargetLowering::LowerReturn(SDValue Chain,
RVLocs, *DAG.getContext());
CCInfo.AnalyzeReturn(Outs, RetCC_X86);
- // Add the regs to the liveout set for the function.
- MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
- for (unsigned i = 0; i != RVLocs.size(); ++i)
- if (RVLocs[i].isRegLoc() && !MRI.isLiveOut(RVLocs[i].getLocReg()))
- MRI.addLiveOut(RVLocs[i].getLocReg());
-
SDValue Flag;
-
SmallVector<SDValue, 6> RetOps;
RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
// Operand #1 = Bytes To Pop
@@ -1625,12 +1674,13 @@ X86TargetLowering::LowerReturn(SDValue Chain,
Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag);
Flag = Chain.getValue(1);
+ RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
}
- // The x86-64 ABI for returning structs by value requires that we copy
- // the sret argument into %rax for the return. We saved the argument into
- // a virtual register in the entry block, so now we copy the value out
- // and into %rax.
+ // The x86-64 ABIs require that for returning structs by value we copy
+ // the sret argument into %rax/%eax (depending on ABI) for the return.
+ // We saved the argument into a virtual register in the entry block,
+ // so now we copy the value out and into %rax/%eax.
if (Subtarget->is64Bit() &&
DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
MachineFunction &MF = DAG.getMachineFunction();
@@ -1640,11 +1690,12 @@ X86TargetLowering::LowerReturn(SDValue Chain,
"SRetReturnReg should have been set in LowerFormalArguments().");
SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy());
- Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag);
+ unsigned RetValReg = Subtarget->isTarget64BitILP32() ? X86::EAX : X86::RAX;
+ Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
Flag = Chain.getValue(1);
- // RAX now acts like a return value.
- MRI.addLiveOut(X86::RAX);
+ // RAX/EAX now acts like a return value.
+ RetOps.push_back(DAG.getRegister(RetValReg, MVT::i64));
}
RetOps[0] = Chain; // Update chain.
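To illustrate the sret handling above (the type is illustrative; only the register choice comes from this patch): a by-value struct return goes through a hidden sret pointer, and the x86-64 ABIs require that pointer to be handed back in %rax, or %eax under the ILP32/x32 variant covered by the new RetValReg selection.

// Illustrative only: Big is too large to return in registers, so the caller
// passes a hidden sret pointer and the callee returns it in RAX (or EAX).
struct Big { long v[8]; };

Big makeBig() {
  Big b{};
  b.v[0] = 42;
  return b;   // the epilogue copies the saved sret pointer into RAX/EAX here
}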
@@ -1689,8 +1740,8 @@ bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
return true;
}
-EVT
-X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
+MVT
+X86TargetLowering::getTypeForExtArgOrReturn(MVT VT,
ISD::NodeType ExtendKind) const {
MVT ReturnMVT;
// TODO: Is this also valid on 32-bit?
@@ -1699,7 +1750,7 @@ X86TargetLowering::getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
else
ReturnMVT = MVT::i32;
- EVT MinVT = getRegisterType(Context, ReturnMVT);
+ MVT MinVT = getRegisterType(ReturnMVT);
return VT.bitsLT(MinVT) ? MinVT : VT;
}
@@ -1721,7 +1772,7 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
// Copy all of the result registers out of their specified physreg.
- for (unsigned i = 0; i != RVLocs.size(); ++i) {
+ for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
CCValAssign &VA = RVLocs[i];
EVT CopyVT = VA.getValVT();
@@ -1765,7 +1816,6 @@ X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
return Chain;
}
-
//===----------------------------------------------------------------------===//
// C & StdCall & Fast Calling Convention implementation
//===----------------------------------------------------------------------===//
@@ -1979,10 +2029,9 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
if (VA.isExtInLoc()) {
// Handle MMX values passed in XMM regs.
- if (RegVT.isVector()) {
- ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(),
- ArgValue);
- } else
+ if (RegVT.isVector())
+ ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
+ else
ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
}
} else {
@@ -1998,14 +2047,16 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
InVals.push_back(ArgValue);
}
- // The x86-64 ABI for returning structs by value requires that we copy
- // the sret argument into %rax for the return. Save the argument into
- // a virtual register so that we can access it from the return points.
+ // The x86-64 ABIs require that for returning structs by value we copy
+ // the sret argument into %rax/%eax (depending on ABI) for the return.
+ // Save the argument into a virtual register so that we can access it
+ // from the return points.
if (Is64Bit && MF.getFunction()->hasStructRetAttr()) {
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
unsigned Reg = FuncInfo->getSRetReturnReg();
if (!Reg) {
- Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64));
+ MVT PtrTy = getPointerTy();
+ Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
FuncInfo->setSRetReturnReg(Reg);
}
SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]);
@@ -2058,8 +2109,8 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
TotalNumIntRegs);
- bool NoImplicitFloatOps = Fn->getFnAttributes().
- hasAttribute(Attributes::NoImplicitFloat);
+ bool NoImplicitFloatOps = Fn->getAttributes().
+ hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
"SSE register cannot be used when SSE is disabled!");
assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat &&
@@ -2537,8 +2588,9 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
OpFlags = X86II::MO_DARWIN_STUB;
} else if (Subtarget->isPICStyleRIPRel() &&
isa<Function>(GV) &&
- cast<Function>(GV)->getFnAttributes().
- hasAttribute(Attributes::NonLazyBind)) {
+ cast<Function>(GV)->getAttributes().
+ hasAttribute(AttributeSet::FunctionIndex,
+ Attribute::NonLazyBind)) {
// If the function is marked as non-lazy, generate an indirect call
// which loads from the GOT directly. This avoids runtime overhead
// at the cost of eager binding (and one extra byte of encoding).
@@ -2618,8 +2670,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// This isn't right, although it's probably harmless on x86; liveouts
// should be computed from returns not tail calls. Consider a void
// function making a tail call to a function returning int.
- return DAG.getNode(X86ISD::TC_RETURN, dl,
- NodeTys, &Ops[0], Ops.size());
+ return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size());
}
Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
@@ -2656,7 +2707,6 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Ins, dl, DAG, InVals);
}
-
//===----------------------------------------------------------------------===//
// Fast Calling Convention (tail call) implementation
//===----------------------------------------------------------------------===//
@@ -2778,7 +2828,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins,
- SelectionDAG& DAG) const {
+ SelectionDAG &DAG) const {
if (!IsTailCallConvention(CalleeCC) &&
CalleeCC != CallingConv::C)
return false;
@@ -2817,7 +2867,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
// An stdcall caller is expected to clean up its arguments; the callee
// isn't going to do that.
- if (!CCMatch && CallerCC==CallingConv::X86_StdCall)
+ if (!CCMatch && CallerCC == CallingConv::X86_StdCall)
return false;
// Do not sibcall optimize vararg calls unless all arguments are passed via
@@ -2937,9 +2987,15 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
// callee-saved registers are restored. These happen to be the same
// registers used to pass 'inreg' arguments so watch out for those.
if (!Subtarget->is64Bit() &&
- !isa<GlobalAddressSDNode>(Callee) &&
- !isa<ExternalSymbolSDNode>(Callee)) {
+ ((!isa<GlobalAddressSDNode>(Callee) &&
+ !isa<ExternalSymbolSDNode>(Callee)) ||
+ getTargetMachine().getRelocationModel() == Reloc::PIC_)) {
unsigned NumInRegs = 0;
+ // In PIC we need an extra register to formulate the address computation
+ // for the callee.
+ unsigned MaxInRegs =
+ (getTargetMachine().getRelocationModel() == Reloc::PIC_) ? 2 : 3;
+
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
if (!VA.isRegLoc())
@@ -2948,7 +3004,7 @@ X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
switch (Reg) {
default: break;
case X86::EAX: case X86::EDX: case X86::ECX:
- if (++NumInRegs == 3)
+ if (++NumInRegs == MaxInRegs)
return false;
break;
}
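A rough illustration of the MaxInRegs check above; the regparm attribute is a GCC/Clang extension used only for this sketch and is not part of the patch. On 32-bit x86 a regparm(3) call consumes EAX, EDX and ECX, and under PIC one more register is needed to materialize the callee address, so the sibcall check now tolerates only two register arguments in that mode.

// Hypothetical declaration and caller: with all three inreg registers taken,
// no register is left for the PIC address computation, so the check above
// rejects tail-call optimization for this call under PIC.
int __attribute__((regparm(3))) callee(int a, int b, int c);

int caller(int a, int b, int c) {
  return callee(a, b, c);
}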
@@ -2965,7 +3021,6 @@ X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
return X86::createFastISel(funcInfo, libInfo);
}
-
//===----------------------------------------------------------------------===//
// Other Lowering Hooks
//===----------------------------------------------------------------------===//
@@ -2985,7 +3040,7 @@ static bool isTargetShuffle(unsigned Opcode) {
case X86ISD::PSHUFHW:
case X86ISD::PSHUFLW:
case X86ISD::SHUFP:
- case X86ISD::PALIGN:
+ case X86ISD::PALIGNR:
case X86ISD::MOVLHPS:
case X86ISD::MOVLHPD:
case X86ISD::MOVHLPS:
@@ -3035,7 +3090,7 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT,
SelectionDAG &DAG) {
switch(Opc) {
default: llvm_unreachable("Unknown x86 shuffle node");
- case X86ISD::PALIGN:
+ case X86ISD::PALIGNR:
case X86ISD::SHUFP:
case X86ISD::VPERM2X128:
return DAG.getNode(Opc, dl, VT, V1, V2,
@@ -3076,7 +3131,6 @@ SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy());
}
-
bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
bool hasSymbolicDisplacement) {
// Offset should fit into 32 bit immediate field.
@@ -3346,8 +3400,8 @@ static bool isPSHUFLWMask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
/// is suitable for input to PALIGNR.
static bool isPALIGNRMask(ArrayRef<int> Mask, EVT VT,
const X86Subtarget *Subtarget) {
- if ((VT.getSizeInBits() == 128 && !Subtarget->hasSSSE3()) ||
- (VT.getSizeInBits() == 256 && !Subtarget->hasInt256()))
+ if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) ||
+ (VT.is256BitVector() && !Subtarget->hasInt256()))
return false;
unsigned NumElts = VT.getVectorNumElements();
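Since several hunks in this patch rename PALIGN to PALIGNR and add mask decoding for it, a byte-level model of one 128-bit PALIGNR lane may help. The operand naming here is an assumption; the mask checks above work on vector elements rather than raw bytes.

#include <cstdint>
#include <cstring>

// PALIGNR forms the 32-byte concatenation {high = first source, low = second
// source}, shifts it right by Imm bytes, and keeps the low 16 bytes; bytes
// shifted in from beyond the pair are zero.
void palignr128(const uint8_t Hi[16], const uint8_t Lo[16], unsigned Imm,
                uint8_t Out[16]) {
  uint8_t Concat[32];
  std::memcpy(Concat, Lo, 16);
  std::memcpy(Concat + 16, Hi, 16);
  for (unsigned i = 0; i != 16; ++i)
    Out[i] = (i + Imm < 32) ? Concat[i + Imm] : 0;
}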
@@ -3436,7 +3490,7 @@ static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask,
/// reverse of what x86 shuffles want.
static bool isSHUFPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256,
bool Commuted = false) {
- if (!HasFp256 && VT.getSizeInBits() == 256)
+ if (!HasFp256 && VT.is256BitVector())
return false;
unsigned NumElems = VT.getVectorNumElements();
@@ -3571,7 +3625,7 @@ static bool isMOVLHPSMask(ArrayRef<int> Mask, EVT VT) {
static
SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp,
SelectionDAG &DAG) {
- EVT VT = SVOp->getValueType(0);
+ MVT VT = SVOp->getValueType(0).getSimpleVT();
DebugLoc dl = SVOp->getDebugLoc();
if (VT != MVT::v8i32 && VT != MVT::v8f32)
@@ -3621,7 +3675,7 @@ static bool isUNPCKLMask(ArrayRef<int> Mask, EVT VT,
assert((VT.is128BitVector() || VT.is256BitVector()) &&
"Unsupported vector type for unpckh");
- if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 &&
+ if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
(!HasInt256 || (NumElts != 16 && NumElts != 32)))
return false;
@@ -3660,7 +3714,7 @@ static bool isUNPCKHMask(ArrayRef<int> Mask, EVT VT,
assert((VT.is128BitVector() || VT.is256BitVector()) &&
"Unsupported vector type for unpckh");
- if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 &&
+ if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
(!HasInt256 || (NumElts != 16 && NumElts != 32)))
return false;
@@ -3691,14 +3745,14 @@ static bool isUNPCKHMask(ArrayRef<int> Mask, EVT VT,
/// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
/// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef,
/// <0, 0, 1, 1>
-static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, EVT VT,
- bool HasInt256) {
+static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
unsigned NumElts = VT.getVectorNumElements();
+ bool Is256BitVec = VT.is256BitVector();
assert((VT.is128BitVector() || VT.is256BitVector()) &&
"Unsupported vector type for unpckh");
- if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 &&
+ if (Is256BitVec && NumElts != 4 && NumElts != 8 &&
(!HasInt256 || (NumElts != 16 && NumElts != 32)))
return false;
@@ -3706,7 +3760,7 @@ static bool isUNPCKL_v_undef_Mask(ArrayRef<int> Mask, EVT VT,
// FIXME: Need a better way to get rid of this; there's no latency difference
// between UNPCKLPD and MOVDDUP, the latter should always be checked first and
// the former later. We should also remove the "_undef" special mask.
- if (NumElts == 4 && VT.getSizeInBits() == 256)
+ if (NumElts == 4 && Is256BitVec)
return false;
// Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
@@ -3740,7 +3794,7 @@ static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, EVT VT, bool HasInt256) {
assert((VT.is128BitVector() || VT.is256BitVector()) &&
"Unsupported vector type for unpckh");
- if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 &&
+ if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 &&
(!HasInt256 || (NumElts != 16 && NumElts != 32)))
return false;
@@ -3822,7 +3876,7 @@ static bool isVPERM2X128Mask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
/// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_MASK mask with VPERM2F128/VPERM2I128 instructions.
static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) {
- EVT VT = SVOp->getValueType(0);
+ MVT VT = SVOp->getValueType(0).getSimpleVT();
unsigned HalfSize = VT.getVectorNumElements()/2;
@@ -3856,7 +3910,7 @@ static bool isVPERMILPMask(ArrayRef<int> Mask, EVT VT, bool HasFp256) {
unsigned NumElts = VT.getVectorNumElements();
// Only match 256-bit with 32/64-bit types
- if (VT.getSizeInBits() != 256 || (NumElts != 4 && NumElts != 8))
+ if (!VT.is256BitVector() || (NumElts != 4 && NumElts != 8))
return false;
unsigned NumLanes = VT.getSizeInBits()/128;
@@ -3912,8 +3966,8 @@ static bool isMOVSHDUPMask(ArrayRef<int> Mask, EVT VT,
unsigned NumElems = VT.getVectorNumElements();
- if ((VT.getSizeInBits() == 128 && NumElems != 4) ||
- (VT.getSizeInBits() == 256 && NumElems != 8))
+ if ((VT.is128BitVector() && NumElems != 4) ||
+ (VT.is256BitVector() && NumElems != 8))
return false;
// "i+1" is the value the indexed mask element must have
@@ -3935,8 +3989,8 @@ static bool isMOVSLDUPMask(ArrayRef<int> Mask, EVT VT,
unsigned NumElems = VT.getVectorNumElements();
- if ((VT.getSizeInBits() == 128 && NumElems != 4) ||
- (VT.getSizeInBits() == 256 && NumElems != 8))
+ if ((VT.is128BitVector() && NumElems != 4) ||
+ (VT.is256BitVector() && NumElems != 8))
return false;
// "i" is the value the indexed mask element must have
@@ -3996,9 +4050,8 @@ bool X86::isVEXTRACTF128Index(SDNode *N) {
uint64_t Index =
cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
- unsigned VL = N->getValueType(0).getVectorNumElements();
- unsigned VBits = N->getValueType(0).getSizeInBits();
- unsigned ElSize = VBits / VL;
+ MVT VT = N->getValueType(0).getSimpleVT();
+ unsigned ElSize = VT.getVectorElementType().getSizeInBits();
bool Result = (Index * ElSize) % 128 == 0;
return Result;
@@ -4015,9 +4068,8 @@ bool X86::isVINSERTF128Index(SDNode *N) {
uint64_t Index =
cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
- unsigned VL = N->getValueType(0).getVectorNumElements();
- unsigned VBits = N->getValueType(0).getSizeInBits();
- unsigned ElSize = VBits / VL;
+ MVT VT = N->getValueType(0).getSimpleVT();
+ unsigned ElSize = VT.getVectorElementType().getSizeInBits();
bool Result = (Index * ElSize) % 128 == 0;
return Result;
@@ -4027,7 +4079,7 @@ bool X86::isVINSERTF128Index(SDNode *N) {
/// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions.
/// Handles 128-bit and 256-bit.
static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
- EVT VT = N->getValueType(0);
+ MVT VT = N->getValueType(0).getSimpleVT();
assert((VT.is128BitVector() || VT.is256BitVector()) &&
"Unsupported vector type for PSHUF/SHUFP");
@@ -4057,7 +4109,7 @@ static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) {
/// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction.
static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
- EVT VT = N->getValueType(0);
+ MVT VT = N->getValueType(0).getSimpleVT();
assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
"Unsupported vector type for PSHUFHW");
@@ -4081,7 +4133,7 @@ static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) {
/// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction.
static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
- EVT VT = N->getValueType(0);
+ MVT VT = N->getValueType(0).getSimpleVT();
assert((VT == MVT::v8i16 || VT == MVT::v16i16) &&
"Unsupported vector type for PSHUFHW");
@@ -4105,7 +4157,7 @@ static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) {
/// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle
/// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction.
static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) {
- EVT VT = SVOp->getValueType(0);
+ MVT VT = SVOp->getValueType(0).getSimpleVT();
unsigned EltSize = VT.getVectorElementType().getSizeInBits() >> 3;
unsigned NumElts = VT.getVectorNumElements();
@@ -4136,8 +4188,8 @@ unsigned X86::getExtractVEXTRACTF128Immediate(SDNode *N) {
uint64_t Index =
cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
- EVT VecVT = N->getOperand(0).getValueType();
- EVT ElVT = VecVT.getVectorElementType();
+ MVT VecVT = N->getOperand(0).getValueType().getSimpleVT();
+ MVT ElVT = VecVT.getVectorElementType();
unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits();
return Index / NumElemsPerChunk;
@@ -4153,8 +4205,8 @@ unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) {
uint64_t Index =
cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
- EVT VecVT = N->getValueType(0);
- EVT ElVT = VecVT.getVectorElementType();
+ MVT VecVT = N->getValueType(0).getSimpleVT();
+ MVT ElVT = VecVT.getVectorElementType();
unsigned NumElemsPerChunk = 128 / ElVT.getSizeInBits();
return Index / NumElemsPerChunk;
@@ -4164,7 +4216,7 @@ unsigned X86::getInsertVINSERTF128Immediate(SDNode *N) {
/// the specified VECTOR_SHUFFLE mask with VPERMQ and VPERMPD instructions.
/// Handles 256-bit.
static unsigned getShuffleCLImmediate(ShuffleVectorSDNode *N) {
- EVT VT = N->getValueType(0);
+ MVT VT = N->getValueType(0).getSimpleVT();
unsigned NumElts = VT.getVectorNumElements();
@@ -4184,17 +4236,18 @@ static unsigned getShuffleCLImmediate(ShuffleVectorSDNode *N) {
/// isZeroNode - Returns true if Elt is a constant zero or a floating point
/// constant +0.0.
bool X86::isZeroNode(SDValue Elt) {
- return ((isa<ConstantSDNode>(Elt) &&
- cast<ConstantSDNode>(Elt)->isNullValue()) ||
- (isa<ConstantFPSDNode>(Elt) &&
- cast<ConstantFPSDNode>(Elt)->getValueAPF().isPosZero()));
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Elt))
+ return CN->isNullValue();
+ if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
+ return CFP->getValueAPF().isPosZero();
+ return false;
}
/// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in
/// their permute mask.
static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
SelectionDAG &DAG) {
- EVT VT = SVOp->getValueType(0);
+ MVT VT = SVOp->getValueType(0).getSimpleVT();
unsigned NumElems = VT.getVectorNumElements();
SmallVector<int, 8> MaskVec;
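The operand swap above goes hand in hand with rewriting the mask. A standalone sketch of that mask transformation follows; it models the effect of the commuting done here, not the routine itself.

#include <cstddef>
#include <vector>

// Swapping the two shuffle inputs means every defined mask index must move
// to the other half of the combined [0, 2*NumElems) index space; undef (-1)
// lanes stay undef.
std::vector<int> commuteMask(const std::vector<int> &Mask, unsigned NumElems) {
  std::vector<int> Out(Mask.size(), -1);
  for (std::size_t i = 0; i != Mask.size(); ++i) {
    int Idx = Mask[i];
    if (Idx < 0)
      continue;
    Out[i] = Idx < (int)NumElems ? Idx + NumElems : Idx - NumElems;
  }
  return Out;
}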
@@ -4343,12 +4396,11 @@ static bool isZeroShuffle(ShuffleVectorSDNode *N) {
static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
SelectionDAG &DAG, DebugLoc dl) {
assert(VT.isVector() && "Expected a vector type");
- unsigned Size = VT.getSizeInBits();
// Always build SSE zero vectors as <4 x i32> bitcasted
// to their dest type. This ensures they get CSE'd.
SDValue Vec;
- if (Size == 128) { // SSE
+ if (VT.is128BitVector()) { // SSE
if (Subtarget->hasSSE2()) { // SSE2
SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
@@ -4356,7 +4408,7 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst);
}
- } else if (Size == 256) { // AVX
+ } else if (VT.is256BitVector()) { // AVX
if (Subtarget->hasInt256()) { // AVX2
SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
@@ -4378,14 +4430,13 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
/// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
/// no AVX2 support, use two <4 x i32> inserted in a <8 x i32> appropriately.
/// Then bitcast to their original type, ensuring they get CSE'd.
-static SDValue getOnesVector(EVT VT, bool HasInt256, SelectionDAG &DAG,
+static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
DebugLoc dl) {
assert(VT.isVector() && "Expected a vector type");
- unsigned Size = VT.getSizeInBits();
SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32);
SDValue Vec;
- if (Size == 256) {
+ if (VT.is256BitVector()) {
if (HasInt256) { // AVX2
SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8);
@@ -4393,7 +4444,7 @@ static SDValue getOnesVector(EVT VT, bool HasInt256, SelectionDAG &DAG,
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
}
- } else if (Size == 128) {
+ } else if (VT.is128BitVector()) {
Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
} else
llvm_unreachable("Unexpected vector type");
@@ -4472,14 +4523,13 @@ static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) {
static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) {
EVT VT = V.getValueType();
DebugLoc dl = V.getDebugLoc();
- unsigned Size = VT.getSizeInBits();
- if (Size == 128) {
+ if (VT.is128BitVector()) {
V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V);
int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo };
V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32),
&SplatMask[0]);
- } else if (Size == 256) {
+ } else if (VT.is256BitVector()) {
// To use VPERMILPS to splat scalars, the second half of indices must
// refer to the higher part, which is a duplication of the lower one,
// because VPERMILPS can only handle in-lane permutations.
@@ -4503,14 +4553,14 @@ static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
int EltNo = SV->getSplatIndex();
int NumElems = SrcVT.getVectorNumElements();
- unsigned Size = SrcVT.getSizeInBits();
+ bool Is256BitVec = SrcVT.is256BitVector();
- assert(((Size == 128 && NumElems > 4) || Size == 256) &&
- "Unknown how to promote splat for type");
+ assert(((SrcVT.is128BitVector() && NumElems > 4) || Is256BitVec) &&
+ "Unknown how to promote splat for type");
// Extract the 128-bit part containing the splat element and update
// the splat element index when it refers to the higher register.
- if (Size == 256) {
+ if (Is256BitVec) {
V1 = Extract128BitVector(V1, EltNo, DAG, dl);
if (EltNo >= NumElems/2)
EltNo -= NumElems/2;
@@ -4527,7 +4577,7 @@ static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) {
// Recreate the 256-bit vector and place the same 128-bit vector
// into the low and high part. This is necessary because we want
// to use VPERM* to shuffle the vectors
- if (Size == 256) {
+ if (Is256BitVec) {
V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1);
}
@@ -4579,6 +4629,10 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT,
case X86ISD::MOVLHPS:
DecodeMOVLHPSMask(NumElems, Mask);
break;
+ case X86ISD::PALIGNR:
+ ImmN = N->getOperand(N->getNumOperands()-1);
+ DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ break;
case X86ISD::PSHUFD:
case X86ISD::VPERMILP:
ImmN = N->getOperand(N->getNumOperands()-1);
@@ -4622,7 +4676,6 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT,
case X86ISD::MOVLPS:
case X86ISD::MOVSHDUP:
case X86ISD::MOVSLDUP:
- case X86ISD::PALIGN:
// Not yet implemented
return false;
default: llvm_unreachable("unknown target shuffle node");
@@ -4917,7 +4970,7 @@ static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
return DAG.getNode(ISD::BITCAST, dl, VT,
DAG.getNode(Opc, dl, ShVT, SrcOp,
DAG.getConstant(NumBits,
- TLI.getShiftAmountTy(SrcOp.getValueType()))));
+ TLI.getScalarShiftAmountTy(SrcOp.getValueType()))));
}
SDValue
@@ -5090,7 +5143,7 @@ X86TargetLowering::LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const {
if (!Subtarget->hasFp256())
return SDValue();
- EVT VT = Op.getValueType();
+ MVT VT = Op.getValueType().getSimpleVT();
DebugLoc dl = Op.getDebugLoc();
assert((VT.is128BitVector() || VT.is256BitVector()) &&
@@ -5288,8 +5341,8 @@ SDValue
X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
DebugLoc dl = Op.getDebugLoc();
- EVT VT = Op.getValueType();
- EVT ExtVT = VT.getVectorElementType();
+ MVT VT = Op.getValueType().getSimpleVT();
+ MVT ExtVT = VT.getVectorElementType();
unsigned NumElems = Op.getNumOperands();
// Vectors containing all zeros can be matched by pxor and xorps later
@@ -5305,7 +5358,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// Vectors containing all ones can be matched by pcmpeqd on 128-bit width
// vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
// vpcmpeqd on 256-bit vectors.
- if (ISD::isBuildVectorAllOnes(Op.getNode())) {
+ if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
return Op;
@@ -5620,7 +5673,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// to create 256-bit vectors from two other 128-bit ones.
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
DebugLoc dl = Op.getDebugLoc();
- EVT ResVT = Op.getValueType();
+ MVT ResVT = Op.getValueType().getSimpleVT();
assert(ResVT.is256BitVector() && "Value type must be 256-bit wide");
@@ -5646,8 +5699,8 @@ LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
SDValue V1 = SVOp->getOperand(0);
SDValue V2 = SVOp->getOperand(1);
DebugLoc dl = SVOp->getDebugLoc();
- EVT VT = SVOp->getValueType(0);
- EVT EltVT = VT.getVectorElementType();
+ MVT VT = SVOp->getValueType(0).getSimpleVT();
+ MVT EltVT = VT.getVectorElementType();
unsigned NumElems = VT.getVectorNumElements();
if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
@@ -5658,41 +5711,40 @@ LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp,
// Check the mask for BLEND and build the value.
unsigned MaskValue = 0;
// There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
- unsigned NumLanes = (NumElems-1)/8 + 1;
+ unsigned NumLanes = (NumElems-1)/8 + 1;
unsigned NumElemsInLane = NumElems / NumLanes;
// Blend for v16i16 should be symmetric for both lanes.
for (unsigned i = 0; i < NumElemsInLane; ++i) {
- int SndLaneEltIdx = (NumLanes == 2) ?
+ int SndLaneEltIdx = (NumLanes == 2) ?
SVOp->getMaskElt(i + NumElemsInLane) : -1;
int EltIdx = SVOp->getMaskElt(i);
- if ((EltIdx == -1 || EltIdx == (int)i) &&
- (SndLaneEltIdx == -1 || SndLaneEltIdx == (int)(i + NumElemsInLane)))
+ if ((EltIdx < 0 || EltIdx == (int)i) &&
+ (SndLaneEltIdx < 0 || SndLaneEltIdx == (int)(i + NumElemsInLane)))
continue;
- if (((unsigned)EltIdx == (i + NumElems)) &&
- (SndLaneEltIdx == -1 ||
+ if (((unsigned)EltIdx == (i + NumElems)) &&
+ (SndLaneEltIdx < 0 ||
(unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane))
MaskValue |= (1<<i);
- else
+ else
return SDValue();
}
// Convert i32 vectors to floating point if it is not AVX2.
// AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
- EVT BlendVT = VT;
+ MVT BlendVT = VT;
if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
- BlendVT = EVT::getVectorVT(*DAG.getContext(),
- EVT::getFloatingPointVT(EltVT.getSizeInBits()),
- NumElems);
+ BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
+ NumElems);
V1 = DAG.getNode(ISD::BITCAST, dl, VT, V1);
V2 = DAG.getNode(ISD::BITCAST, dl, VT, V2);
}
-
- SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2,
- DAG.getConstant(MaskValue, MVT::i32));
+
+ SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2,
+ DAG.getConstant(MaskValue, MVT::i32));
return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
}
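A worked example of the MaskValue construction above, with the undef handling and the v16i16 two-lane symmetry stripped out (the sample mask is an assumption): for a v8i16 shuffle mask {0,9,2,11,4,13,6,15}, lanes 1, 3, 5 and 7 take their element from the second source, so MaskValue ends up as 0b10101010 = 0xAA.

#include <cstdint>

// Simplified single-lane model: set bit i whenever lane i is sourced from V2.
uint32_t blendMaskForV8(const int (&Mask)[8]) {
  uint32_t MaskValue = 0;
  for (unsigned i = 0; i != 8; ++i)
    if (Mask[i] >= 8)          // index into the second operand
      MaskValue |= 1u << i;
  return MaskValue;            // {0,9,2,11,4,13,6,15} -> 0xAA
}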
@@ -5827,6 +5879,11 @@ LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
}
}
+ // Promote splats to a larger type which usually leads to more efficient code.
+ // FIXME: Is this true if pshufb is available?
+ if (SVOp->isSplat())
+ return PromoteSplat(SVOp, DAG);
+
// If we have SSSE3, and all words of the result are from 1 input vector,
// case 2 is generated, otherwise case 3 is generated. If no SSSE3
// is present, fall back to case 4.
@@ -5842,7 +5899,7 @@ LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
int EltIdx = MaskVals[i] * 2;
int Idx0 = (TwoInputs && (EltIdx >= 16)) ? 0x80 : EltIdx;
int Idx1 = (TwoInputs && (EltIdx >= 16)) ? 0x80 : EltIdx+1;
- pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(Idx0, MVT::i8));
pshufbMask.push_back(DAG.getConstant(Idx1, MVT::i8));
}
V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V1);
@@ -5960,6 +6017,11 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
DebugLoc dl = SVOp->getDebugLoc();
ArrayRef<int> MaskVals = SVOp->getMask();
+ // Promote splats to a larger type which usually leads to more efficient code.
+ // FIXME: Is this true if pshufb is available?
+ if (SVOp->isSplat())
+ return PromoteSplat(SVOp, DAG);
+
// If we have SSSE3, case 1 is generated when all result bytes come from
// one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is
// present, fall back to case 3.
@@ -6078,7 +6140,7 @@ static
SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
- EVT VT = SVOp->getValueType(0);
+ MVT VT = SVOp->getValueType(0).getSimpleVT();
SDValue V1 = SVOp->getOperand(0);
SDValue V2 = SVOp->getOperand(1);
DebugLoc dl = SVOp->getDebugLoc();
@@ -6125,8 +6187,9 @@ SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
/// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15>
static
SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
- SelectionDAG &DAG, DebugLoc dl) {
+ SelectionDAG &DAG) {
MVT VT = SVOp->getValueType(0).getSimpleVT();
+ DebugLoc dl = SVOp->getDebugLoc();
unsigned NumElems = VT.getVectorNumElements();
MVT NewVT;
unsigned Scale;
@@ -6162,7 +6225,7 @@ SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
/// getVZextMovL - Return a zero-extending vector move low node.
///
-static SDValue getVZextMovL(EVT VT, EVT OpVT,
+static SDValue getVZextMovL(MVT VT, EVT OpVT,
SDValue SrcOp, SelectionDAG &DAG,
const X86Subtarget *Subtarget, DebugLoc dl) {
if (VT == MVT::v2f64 || VT == MVT::v4f32) {
@@ -6204,14 +6267,14 @@ LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
if (NewOp.getNode())
return NewOp;
- EVT VT = SVOp->getValueType(0);
+ MVT VT = SVOp->getValueType(0).getSimpleVT();
unsigned NumElems = VT.getVectorNumElements();
unsigned NumLaneElems = NumElems / 2;
DebugLoc dl = SVOp->getDebugLoc();
- MVT EltVT = VT.getVectorElementType().getSimpleVT();
- EVT NVT = MVT::getVectorVT(EltVT, NumLaneElems);
+ MVT EltVT = VT.getVectorElementType();
+ MVT NVT = MVT::getVectorVT(EltVT, NumLaneElems);
SDValue Output[2];
SmallVector<int, 16> Mask;
@@ -6316,7 +6379,7 @@ LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
SDValue V1 = SVOp->getOperand(0);
SDValue V2 = SVOp->getOperand(1);
DebugLoc dl = SVOp->getDebugLoc();
- EVT VT = SVOp->getValueType(0);
+ MVT VT = SVOp->getValueType(0).getSimpleVT();
assert(VT.is128BitVector() && "Unsupported vector size");
@@ -6570,7 +6633,7 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
// Reduce a vector shuffle to zext.
SDValue
-X86TargetLowering::lowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const {
+X86TargetLowering::LowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const {
// PMOVZX is only available from SSE41.
if (!Subtarget->hasSSE41())
return SDValue();
@@ -6614,9 +6677,10 @@ X86TargetLowering::lowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
}
+ LLVMContext *Context = DAG.getContext();
unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift;
- EVT NeVT = EVT::getIntegerVT(*DAG.getContext(), NBits);
- EVT NVT = EVT::getVectorVT(*DAG.getContext(), NeVT, NumElems >> Shift);
+ EVT NeVT = EVT::getIntegerVT(*Context, NBits);
+ EVT NVT = EVT::getVectorVT(*Context, NeVT, NumElems >> Shift);
if (!isTypeLegal(NVT))
return SDValue();
@@ -6635,8 +6699,21 @@ X86TargetLowering::lowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const {
// If it's foldable, i.e. normal load with single use, we will let code
// selection to fold it. Otherwise, we will short the conversion sequence.
if (CIdx && CIdx->getZExtValue() == 0 &&
- (!ISD::isNormalLoad(V.getNode()) || !V.hasOneUse()))
+ (!ISD::isNormalLoad(V.getNode()) || !V.hasOneUse())) {
+ if (V.getValueSizeInBits() > V1.getValueSizeInBits()) {
+ // The "ext_vec_elt" node is wider than the result node.
+ // In this case we should extract a subvector from V.
+ // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast (extract_subvector x)).
+ unsigned Ratio = V.getValueSizeInBits() / V1.getValueSizeInBits();
+ EVT FullVT = V.getValueType();
+ EVT SubVecVT = EVT::getVectorVT(*Context,
+ FullVT.getVectorElementType(),
+ FullVT.getVectorNumElements()/Ratio);
+ V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVecVT, V,
+ DAG.getIntPtrConstant(0));
+ }
V1 = DAG.getNode(ISD::BITCAST, DL, V1.getValueType(), V);
+ }
}
return DAG.getNode(ISD::BITCAST, DL, VT,
@@ -6646,7 +6723,7 @@ X86TargetLowering::lowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const {
SDValue
X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const {
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
- EVT VT = Op.getValueType();
+ MVT VT = Op.getValueType().getSimpleVT();
DebugLoc dl = Op.getDebugLoc();
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
@@ -6656,25 +6733,14 @@ X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const {
// Handle splat operations
if (SVOp->isSplat()) {
- unsigned NumElem = VT.getVectorNumElements();
- int Size = VT.getSizeInBits();
-
// Use vbroadcast whenever the splat comes from a foldable load
SDValue Broadcast = LowerVectorBroadcast(Op, DAG);
if (Broadcast.getNode())
return Broadcast;
-
- // Handle splats by matching through known shuffle masks
- if ((Size == 128 && NumElem <= 4) ||
- (Size == 256 && NumElem <= 8))
- return SDValue();
-
- // All remaning splats are promoted to target supported vector shuffles.
- return PromoteSplat(SVOp, DAG);
}
// Check integer expanding shuffles.
- SDValue NewOp = lowerVectorIntExtend(Op, DAG);
+ SDValue NewOp = LowerVectorIntExtend(Op, DAG);
if (NewOp.getNode())
return NewOp;
@@ -6682,7 +6748,7 @@ X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const {
// do it!
if (VT == MVT::v8i16 || VT == MVT::v16i8 ||
VT == MVT::v16i16 || VT == MVT::v32i8) {
- SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
+ SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
if (NewOp.getNode())
return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
} else if ((VT == MVT::v4i32 ||
@@ -6690,18 +6756,18 @@ X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const {
// FIXME: Figure out a cleaner way to do this.
// Try to make use of movq to zero out the top part.
if (ISD::isBuildVectorAllZeros(V2.getNode())) {
- SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
+ SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
if (NewOp.getNode()) {
- EVT NewVT = NewOp.getValueType();
+ MVT NewVT = NewOp.getValueType().getSimpleVT();
if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(),
NewVT, true, false))
return getVZextMovL(VT, NewVT, NewOp.getOperand(0),
DAG, Subtarget, dl);
}
} else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
- SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG, dl);
+ SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
if (NewOp.getNode()) {
- EVT NewVT = NewOp.getValueType();
+ MVT NewVT = NewOp.getValueType().getSimpleVT();
if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT))
return getVZextMovL(VT, NewVT, NewOp.getOperand(1),
DAG, Subtarget, dl);
@@ -6716,7 +6782,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
- EVT VT = Op.getValueType();
+ MVT VT = Op.getValueType().getSimpleVT();
DebugLoc dl = Op.getDebugLoc();
unsigned NumElems = VT.getVectorNumElements();
bool V1IsUndef = V1.getOpcode() == ISD::UNDEF;
@@ -6727,8 +6793,8 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
bool HasFp256 = Subtarget->hasFp256();
bool HasInt256 = Subtarget->hasInt256();
MachineFunction &MF = DAG.getMachineFunction();
- bool OptForSize = MF.getFunction()->getFnAttributes().
- hasAttribute(Attributes::OptimizeForSize);
+ bool OptForSize = MF.getFunction()->getAttributes().
+ hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize);
assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles");
@@ -6807,7 +6873,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
if (isShift && ShVal.hasOneUse()) {
// If the shifted value has multiple uses, it may be cheaper to use
// v_set0 + movlhps or movhlps, etc.
- EVT EltVT = VT.getVectorElementType();
+ MVT EltVT = VT.getVectorElementType();
ShAmt *= EltVT.getSizeInBits();
return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
}
@@ -6846,7 +6912,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
if (isShift) {
// No better options. Use a vshldq / vsrldq.
- EVT EltVT = VT.getVectorElementType();
+ MVT EltVT = VT.getVectorElementType();
ShAmt *= EltVT.getSizeInBits();
return getVShift(isLeft, VT, ShVal, ShAmt, DAG, *this, dl);
}
@@ -6917,7 +6983,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
// nodes, and remove one by one until they don't return Op anymore.
if (isPALIGNRMask(M, VT, Subtarget))
- return getTargetShuffleNode(X86ISD::PALIGN, dl, VT, V1, V2,
+ return getTargetShuffleNode(X86ISD::PALIGNR, dl, VT, V1, V2,
getShufflePALIGNRImmediate(SVOp),
DAG);
@@ -6989,7 +7055,6 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1,
getShuffleCLImmediate(SVOp), DAG);
-
//===--------------------------------------------------------------------===//
// Since no target specific shuffle was selected for this generic one,
// lower it into other known shuffles. FIXME: this isn't true yet, but
@@ -7027,13 +7092,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
}
-SDValue
-X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
- SelectionDAG &DAG) const {
- EVT VT = Op.getValueType();
+static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
+ MVT VT = Op.getValueType().getSimpleVT();
DebugLoc dl = Op.getDebugLoc();
- if (!Op.getOperand(0).getValueType().is128BitVector())
+ if (!Op.getOperand(0).getValueType().getSimpleVT().is128BitVector())
return SDValue();
if (VT.getSizeInBits() == 8) {
@@ -7091,7 +7154,6 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
return SDValue();
}
-
SDValue
X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
SelectionDAG &DAG) const {
@@ -7099,7 +7161,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
return SDValue();
SDValue Vec = Op.getOperand(0);
- EVT VecVT = Vec.getValueType();
+ MVT VecVT = Vec.getValueType().getSimpleVT();
// If this is a 256-bit vector result, first extract the 128-bit vector and
// then extract the element from the 128-bit vector.
@@ -7126,7 +7188,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
return Res;
}
- EVT VT = Op.getValueType();
+ MVT VT = Op.getValueType().getSimpleVT();
DebugLoc dl = Op.getDebugLoc();
// TODO: handle v16i8.
if (VT.getSizeInBits() == 16) {
@@ -7139,7 +7201,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
MVT::v4i32, Vec),
Op.getOperand(1)));
// Transform it so it match pextrw which produces a 32-bit result.
- EVT EltVT = MVT::i32;
+ MVT EltVT = MVT::i32;
SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
Op.getOperand(0), Op.getOperand(1));
SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
@@ -7154,7 +7216,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
// SHUFPS the element to the lowest double word, then movss.
int Mask[4] = { static_cast<int>(Idx), -1, -1, -1 };
- EVT VVT = Op.getOperand(0).getValueType();
+ MVT VVT = Op.getOperand(0).getValueType().getSimpleVT();
SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
DAG.getUNDEF(VVT), Mask);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
@@ -7173,7 +7235,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
// Note if the lower 64 bits of the result of the UNPCKHPD is then stored
// to a f64mem, the whole operation is folded into a single MOVHPDmr.
int Mask[2] = { 1, -1 };
- EVT VVT = Op.getOperand(0).getValueType();
+ MVT VVT = Op.getOperand(0).getValueType().getSimpleVT();
SDValue Vec = DAG.getVectorShuffle(VVT, dl, Op.getOperand(0),
DAG.getUNDEF(VVT), Mask);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
@@ -7183,11 +7245,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
return SDValue();
}
-SDValue
-X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op,
- SelectionDAG &DAG) const {
- EVT VT = Op.getValueType();
- EVT EltVT = VT.getVectorElementType();
+static SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
+ MVT VT = Op.getValueType().getSimpleVT();
+ MVT EltVT = VT.getVectorElementType();
DebugLoc dl = Op.getDebugLoc();
SDValue N0 = Op.getOperand(0);
@@ -7240,8 +7300,8 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT_SSE4(SDValue Op,
SDValue
X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
- EVT VT = Op.getValueType();
- EVT EltVT = VT.getVectorElementType();
+ MVT VT = Op.getValueType().getSimpleVT();
+ MVT EltVT = VT.getVectorElementType();
DebugLoc dl = Op.getDebugLoc();
SDValue N0 = Op.getOperand(0);
@@ -7289,7 +7349,7 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
LLVMContext *Context = DAG.getContext();
DebugLoc dl = Op.getDebugLoc();
- EVT OpVT = Op.getValueType();
+ MVT OpVT = Op.getValueType().getSimpleVT();
// If this is a 256-bit vector result, first insert into a 128-bit
// vector and then insert into the 256-bit vector.
@@ -7456,7 +7516,6 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
DebugLoc DL = Op.getDebugLoc();
Result = DAG.getNode(WrapperKind, DL, getPointerTy(), Result);
-
// With PIC, the address is actually $g + Offset.
if (getTargetMachine().getRelocationModel() == Reloc::PIC_ &&
!Subtarget->is64Bit()) {
@@ -7505,8 +7564,7 @@ X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
SDValue
X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl,
- int64_t Offset,
- SelectionDAG &DAG) const {
+ int64_t Offset, SelectionDAG &DAG) const {
// Create the TargetGlobalAddress node, folding in the constant
// offset if it is legal.
unsigned char OpFlags =
@@ -7726,7 +7784,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
case TLSModel::LocalExec:
return LowerToTLSExecModel(GA, DAG, getPointerTy(), model,
Subtarget->is64Bit(),
- getTargetMachine().getRelocationModel() == Reloc::PIC_);
+ getTargetMachine().getRelocationModel() == Reloc::PIC_);
}
llvm_unreachable("Unknown TLS model.");
}
@@ -7843,7 +7901,6 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
llvm_unreachable("TLS not implemented for this target.");
}
-
/// LowerShiftParts - Lower SRA_PARTS and friends, which return two i32 values
/// and take a 2 x i32 value to shift plus a shift amount.
SDValue X86TargetLowering::LowerShiftParts(SDValue Op, SelectionDAG &DAG) const{
@@ -8010,9 +8067,11 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
SmallVector<Constant*,2> CV1;
CV1.push_back(
- ConstantFP::get(*Context, APFloat(APInt(64, 0x4330000000000000ULL))));
+ ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
+ APInt(64, 0x4330000000000000ULL))));
CV1.push_back(
- ConstantFP::get(*Context, APFloat(APInt(64, 0x4530000000000000ULL))));
+ ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
+ APInt(64, 0x4530000000000000ULL))));
Constant *C1 = ConstantVector::get(CV1);
SDValue CPIdx1 = DAG.getConstantPool(C1, getPointerTy(), 16);
@@ -8106,7 +8165,8 @@ SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
SVT == MVT::v8i8 || SVT == MVT::v8i16) &&
"Custom UINT_TO_FP is not supported!");
- EVT NVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, SVT.getVectorNumElements());
+ EVT NVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+ SVT.getVectorNumElements());
return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
}
@@ -8199,8 +8259,9 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add, DAG.getIntPtrConstant(0));
}
-std::pair<SDValue,SDValue> X86TargetLowering::
-FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned, bool IsReplace) const {
+std::pair<SDValue,SDValue>
+X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
+ bool IsSigned, bool IsReplace) const {
DebugLoc DL = Op.getDebugLoc();
EVT DstTy = Op.getValueType();
@@ -8292,11 +8353,70 @@ FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned, bool IsReplace) co
}
}
-SDValue X86TargetLowering::lowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ MVT VT = Op->getValueType(0).getSimpleVT();
+ SDValue In = Op->getOperand(0);
+ MVT InVT = In.getValueType().getSimpleVT();
+ DebugLoc dl = Op->getDebugLoc();
+
+ // Optimize vectors in AVX mode:
+ //
+ // v8i16 -> v8i32
+ // Use vpunpcklwd for 4 lower elements v8i16 -> v4i32.
+ // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
+ // Concat upper and lower parts.
+ //
+ // v4i32 -> v4i64
+ // Use vpunpckldq for 2 lower elements v4i32 -> v2i64.
+ // Use vpunpckhdq for 2 upper elements v4i32 -> v2i64.
+ // Concat upper and lower parts.
+ //
+
+ if (((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
+ ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
+ return SDValue();
+
+ if (Subtarget->hasInt256())
+ return DAG.getNode(X86ISD::VZEXT_MOVL, dl, VT, In);
+
+ SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
+ SDValue Undef = DAG.getUNDEF(InVT);
+ bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
+ SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
+ SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
+
+ MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
+ VT.getVectorNumElements()/2);
+
+ OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo);
+ OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi);
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
+}
+
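A scalar model of the pre-AVX2 path in LowerAVXExtend above (illustrative; the real code builds SDNodes, and for ANY_EXTEND the unpack partner is undef rather than zero): zero-extending v8i16 to v8i32 interleaves the source with a zero vector via punpcklwd/punpckhwd, and each 128-bit half, read as v4i32, is already the extended low/high result.

#include <cstdint>

void zext_v8i16_to_v8i32(const uint16_t In[8], uint32_t Out[8]) {
  uint16_t Lo[8], Hi[8];
  // punpcklwd In, zero -> {In[0],0, In[1],0, In[2],0, In[3],0}
  // punpckhwd In, zero -> {In[4],0, In[5],0, In[6],0, In[7],0}
  for (unsigned i = 0; i != 4; ++i) {
    Lo[2*i] = In[i];     Lo[2*i+1] = 0;
    Hi[2*i] = In[i+4];   Hi[2*i+1] = 0;
  }
  // Each half, reinterpreted as little-endian v4i32 and concatenated, is the
  // zero-extended v8i32 result.
  for (unsigned i = 0; i != 4; ++i) {
    Out[i]     = uint32_t(Lo[2*i]) | (uint32_t(Lo[2*i+1]) << 16);
    Out[i + 4] = uint32_t(Hi[2*i]) | (uint32_t(Hi[2*i+1]) << 16);
  }
}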
+SDValue X86TargetLowering::LowerANY_EXTEND(SDValue Op,
+ SelectionDAG &DAG) const {
+ if (Subtarget->hasFp256()) {
+ SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
+ if (Res.getNode())
+ return Res;
+ }
+
+ return SDValue();
+}
+SDValue X86TargetLowering::LowerZERO_EXTEND(SDValue Op,
+ SelectionDAG &DAG) const {
DebugLoc DL = Op.getDebugLoc();
- EVT VT = Op.getValueType();
+ MVT VT = Op.getValueType().getSimpleVT();
SDValue In = Op.getOperand(0);
- EVT SVT = In.getValueType();
+ MVT SVT = In.getValueType().getSimpleVT();
+
+ if (Subtarget->hasFp256()) {
+ SDValue Res = LowerAVXExtend(Op, DAG, Subtarget);
+ if (Res.getNode())
+ return Res;
+ }
if (!VT.is256BitVector() || !SVT.is128BitVector() ||
VT.getVectorNumElements() != SVT.getVectorNumElements())
@@ -8311,27 +8431,119 @@ SDValue X86TargetLowering::lowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const
SDValue Lo = DAG.getNode(X86ISD::VZEXT, DL, MVT::v4i32, In);
static const int Mask[] = {4, 5, 6, 7, -1, -1, -1, -1};
SDValue Hi = DAG.getNode(X86ISD::VZEXT, DL, MVT::v4i32,
- DAG.getVectorShuffle(MVT::v8i16, DL, In, DAG.getUNDEF(MVT::v8i16), &Mask[0]));
+ DAG.getVectorShuffle(MVT::v8i16, DL, In,
+ DAG.getUNDEF(MVT::v8i16),
+ &Mask[0]));
return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i32, Lo, Hi);
}
-SDValue X86TargetLowering::lowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
DebugLoc DL = Op.getDebugLoc();
- EVT VT = Op.getValueType();
- EVT SVT = Op.getOperand(0).getValueType();
+ MVT VT = Op.getValueType().getSimpleVT();
+ SDValue In = Op.getOperand(0);
+ MVT SVT = In.getValueType().getSimpleVT();
- if (!VT.is128BitVector() || !SVT.is256BitVector() ||
- VT.getVectorNumElements() != SVT.getVectorNumElements())
+ if ((VT == MVT::v4i32) && (SVT == MVT::v4i64)) {
+ // On AVX2, v4i64 -> v4i32 becomes VPERMD.
+ if (Subtarget->hasInt256()) {
+ static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
+ In = DAG.getNode(ISD::BITCAST, DL, MVT::v8i32, In);
+ In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
+ ShufMask);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
+ DAG.getIntPtrConstant(0));
+ }
+
+ // On AVX, v4i64 -> v4i32 becomes a sequence that uses PSHUFD and MOVLHPS.
+ SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
+ DAG.getIntPtrConstant(0));
+ SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
+ DAG.getIntPtrConstant(2));
+
+ OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
+ OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
+
+ // The PSHUFD mask:
+ static const int ShufMask1[] = {0, 2, 0, 0};
+ SDValue Undef = DAG.getUNDEF(VT);
+ OpLo = DAG.getVectorShuffle(VT, DL, OpLo, Undef, ShufMask1);
+ OpHi = DAG.getVectorShuffle(VT, DL, OpHi, Undef, ShufMask1);
+
+ // The MOVLHPS mask:
+ static const int ShufMask2[] = {0, 1, 4, 5};
+ return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask2);
+ }
+
+ if ((VT == MVT::v8i16) && (SVT == MVT::v8i32)) {
+ // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
+ if (Subtarget->hasInt256()) {
+ In = DAG.getNode(ISD::BITCAST, DL, MVT::v32i8, In);
+
+ SmallVector<SDValue,32> pshufbMask;
+ for (unsigned i = 0; i < 2; ++i) {
+ pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0x1, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0x4, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0x5, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0x8, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0x9, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0xc, MVT::i8));
+ pshufbMask.push_back(DAG.getConstant(0xd, MVT::i8));
+ for (unsigned j = 0; j < 8; ++j)
+ pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
+ }
+ SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8,
+ &pshufbMask[0], 32);
+ In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
+ In = DAG.getNode(ISD::BITCAST, DL, MVT::v4i64, In);
+
+ static const int ShufMask[] = {0, 2, -1, -1};
+ In = DAG.getVectorShuffle(MVT::v4i64, DL, In, DAG.getUNDEF(MVT::v4i64),
+ &ShufMask[0]);
+ In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
+ DAG.getIntPtrConstant(0));
+ return DAG.getNode(ISD::BITCAST, DL, VT, In);
+ }
+
+ SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
+ DAG.getIntPtrConstant(0));
+
+ SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
+ DAG.getIntPtrConstant(4));
+
+ OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpLo);
+ OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, OpHi);
+
+ // The PSHUFB mask:
+ static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
+ -1, -1, -1, -1, -1, -1, -1, -1};
+
+ SDValue Undef = DAG.getUNDEF(MVT::v16i8);
+ OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1);
+ OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1);
+
+ OpLo = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpLo);
+ OpHi = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, OpHi);
+
+ // The MOVLHPS Mask:
+ static const int ShufMask2[] = {0, 1, 4, 5};
+ SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
+ return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, res);
+ }
+
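// A standalone model of the SSSE3 fallback above (<tmmintrin.h>), assuming
// the v8i32 source is already split into two v4i32 halves; the helper name
// is made up. The 0x80 bytes zero the upper lanes, where the original mask
// leaves them undefined -- either is fine for the final 64-bit combine.
#include <tmmintrin.h>
static inline __m128i TruncV8I32ToV8I16(__m128i Lo, __m128i Hi) {
  const __m128i Pick = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13,
                                     -128, -128, -128, -128,
                                     -128, -128, -128, -128);
  __m128i Lo16 = _mm_shuffle_epi8(Lo, Pick);   // low word of each dword
  __m128i Hi16 = _mm_shuffle_epi8(Hi, Pick);
  // Same effect as the {0, 1, 4, 5} dword shuffle: low 64 bits of each half.
  return _mm_unpacklo_epi64(Lo16, Hi16);
}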
+ // Handle truncation of V256 to V128 using shuffles.
+ if (!VT.is128BitVector() || !SVT.is256BitVector())
return SDValue();
- assert(Subtarget->hasFp256() && "256-bit vector is observed without AVX!");
+ assert(VT.getVectorNumElements() == SVT.getVectorNumElements() &&
+ "Invalid op");
+ assert(Subtarget->hasFp256() && "256-bit vector without AVX!");
unsigned NumElems = VT.getVectorNumElements();
EVT NVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
NumElems * 2);
- SDValue In = Op.getOperand(0);
SmallVector<int, 16> MaskVec(NumElems * 2, -1);
// Prepare truncation shuffle mask
for (unsigned i = 0; i != NumElems; ++i)
@@ -8345,9 +8557,10 @@ SDValue X86TargetLowering::lowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
SelectionDAG &DAG) const {
- if (Op.getValueType().isVector()) {
- if (Op.getValueType() == MVT::v8i16)
- return DAG.getNode(ISD::TRUNCATE, Op.getDebugLoc(), Op.getValueType(),
+ MVT VT = Op.getValueType().getSimpleVT();
+ if (VT.isVector()) {
+ if (VT == MVT::v8i16)
+ return DAG.getNode(ISD::TRUNCATE, Op.getDebugLoc(), VT,
DAG.getNode(ISD::FP_TO_SINT, Op.getDebugLoc(),
MVT::v8i32, Op.getOperand(0)));
return SDValue();
@@ -8386,12 +8599,11 @@ SDValue X86TargetLowering::LowerFP_TO_UINT(SDValue Op,
return FIST;
}
-SDValue X86TargetLowering::lowerFP_EXTEND(SDValue Op,
- SelectionDAG &DAG) const {
+static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
DebugLoc DL = Op.getDebugLoc();
- EVT VT = Op.getValueType();
+ MVT VT = Op.getValueType().getSimpleVT();
SDValue In = Op.getOperand(0);
- EVT SVT = In.getValueType();
+ MVT SVT = In.getValueType().getSimpleVT();
assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
@@ -8403,8 +8615,8 @@ SDValue X86TargetLowering::lowerFP_EXTEND(SDValue Op,
SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const {
LLVMContext *Context = DAG.getContext();
DebugLoc dl = Op.getDebugLoc();
- EVT VT = Op.getValueType();
- EVT EltVT = VT;
+ MVT VT = Op.getValueType().getSimpleVT();
+ MVT EltVT = VT;
unsigned NumElts = VT == MVT::f64 ? 2 : 4;
if (VT.isVector()) {
EltVT = VT.getVectorElementType();
@@ -8412,9 +8624,11 @@ SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const {
}
Constant *C;
if (EltVT == MVT::f64)
- C = ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63))));
+ C = ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
+ APInt(64, ~(1ULL << 63))));
else
- C = ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31))));
+ C = ConstantFP::get(*Context, APFloat(APFloat::IEEEsingle,
+ APInt(32, ~(1U << 31))));
C = ConstantVector::getSplat(NumElts, C);
SDValue CPIdx = DAG.getConstantPool(C, getPointerTy());
unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
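// A scalar model of the constant built above: fabs clears the sign bit with
// ~(1 << 63); the matching fneg lowering below flips it with (1 << 63). The
// vector forms simply splat the same mask and apply ANDPS/XORPS.
#include <cstdint>
#include <cstring>
static inline double FabsViaMask(double X) {
  uint64_t Bits;
  std::memcpy(&Bits, &X, sizeof Bits);
  Bits &= ~(1ULL << 63);             // clear the sign bit
  std::memcpy(&X, &Bits, sizeof X);
  return X;
}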
@@ -8435,8 +8649,8 @@ SDValue X86TargetLowering::LowerFABS(SDValue Op, SelectionDAG &DAG) const {
SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const {
LLVMContext *Context = DAG.getContext();
DebugLoc dl = Op.getDebugLoc();
- EVT VT = Op.getValueType();
- EVT EltVT = VT;
+ MVT VT = Op.getValueType().getSimpleVT();
+ MVT EltVT = VT;
unsigned NumElts = VT == MVT::f64 ? 2 : 4;
if (VT.isVector()) {
EltVT = VT.getVectorElementType();
@@ -8444,9 +8658,11 @@ SDValue X86TargetLowering::LowerFNEG(SDValue Op, SelectionDAG &DAG) const {
}
Constant *C;
if (EltVT == MVT::f64)
- C = ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63)));
+ C = ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble,
+ APInt(64, 1ULL << 63)));
else
- C = ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31)));
+ C = ConstantFP::get(*Context, APFloat(APFloat::IEEEsingle,
+ APInt(32, 1U << 31)));
C = ConstantVector::getSplat(NumElts, C);
SDValue CPIdx = DAG.getConstantPool(C, getPointerTy());
unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
@@ -8470,8 +8686,8 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
DebugLoc dl = Op.getDebugLoc();
- EVT VT = Op.getValueType();
- EVT SrcVT = Op1.getValueType();
+ MVT VT = Op.getValueType().getSimpleVT();
+ MVT SrcVT = Op1.getValueType().getSimpleVT();
// If second operand is smaller, extend it first.
if (SrcVT.bitsLT(VT)) {
@@ -8490,13 +8706,15 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
// First get the sign bit of second operand.
SmallVector<Constant*,4> CV;
if (SrcVT == MVT::f64) {
- CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 1ULL << 63))));
- CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0))));
+ const fltSemantics &Sem = APFloat::IEEEdouble;
+ CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 1ULL << 63))));
+ CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 0))));
} else {
- CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 1U << 31))));
- CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
- CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
- CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
+ const fltSemantics &Sem = APFloat::IEEEsingle;
+ CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 1U << 31))));
+ CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
+ CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
+ CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
}
Constant *C = ConstantVector::get(CV);
SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
@@ -8519,13 +8737,17 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
// Clear first operand sign bit.
CV.clear();
if (VT == MVT::f64) {
- CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, ~(1ULL << 63)))));
- CV.push_back(ConstantFP::get(*Context, APFloat(APInt(64, 0))));
+ const fltSemantics &Sem = APFloat::IEEEdouble;
+ CV.push_back(ConstantFP::get(*Context, APFloat(Sem,
+ APInt(64, ~(1ULL << 63)))));
+ CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(64, 0))));
} else {
- CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, ~(1U << 31)))));
- CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
- CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
- CV.push_back(ConstantFP::get(*Context, APFloat(APInt(32, 0))));
+ const fltSemantics &Sem = APFloat::IEEEsingle;
+ CV.push_back(ConstantFP::get(*Context, APFloat(Sem,
+ APInt(32, ~(1U << 31)))));
+ CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
+ CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
+ CV.push_back(ConstantFP::get(*Context, APFloat(Sem, APInt(32, 0))));
}
C = ConstantVector::get(CV);
CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
@@ -8541,7 +8763,7 @@ SDValue X86TargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
SDValue N0 = Op.getOperand(0);
DebugLoc dl = Op.getDebugLoc();
- EVT VT = Op.getValueType();
+ MVT VT = Op.getValueType().getSimpleVT();
// Lower ISD::FGETSIGN to (AND (X86ISD::FGETSIGNx86 ...) 1).
SDValue xFGETSIGN = DAG.getNode(X86ISD::FGETSIGNx86, dl, VT, N0,
@@ -8551,7 +8773,8 @@ static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
// LowerVectorAllZeroTest - Check whether an OR'd tree is PTEST-able.
//
-SDValue X86TargetLowering::LowerVectorAllZeroTest(SDValue Op, SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerVectorAllZeroTest(SDValue Op,
+ SelectionDAG &DAG) const {
assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
if (!Subtarget->hasSSE41())
@@ -8983,65 +9206,10 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
return SDValue();
}
-SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
-
- if (Op.getValueType().isVector()) return LowerVSETCC(Op, DAG);
-
- assert(Op.getValueType() == MVT::i8 && "SetCC type must be 8-bit integer");
- SDValue Op0 = Op.getOperand(0);
- SDValue Op1 = Op.getOperand(1);
- DebugLoc dl = Op.getDebugLoc();
- ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
-
- // Optimize to BT if possible.
- // Lower (X & (1 << N)) == 0 to BT(X, N).
- // Lower ((X >>u N) & 1) != 0 to BT(X, N).
- // Lower ((X >>s N) & 1) != 0 to BT(X, N).
- if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() &&
- Op1.getOpcode() == ISD::Constant &&
- cast<ConstantSDNode>(Op1)->isNullValue() &&
- (CC == ISD::SETEQ || CC == ISD::SETNE)) {
- SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG);
- if (NewSetCC.getNode())
- return NewSetCC;
- }
-
- // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
- // these.
- if (Op1.getOpcode() == ISD::Constant &&
- (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 ||
- cast<ConstantSDNode>(Op1)->isNullValue()) &&
- (CC == ISD::SETEQ || CC == ISD::SETNE)) {
-
- // If the input is a setcc, then reuse the input setcc or use a new one with
- // the inverted condition.
- if (Op0.getOpcode() == X86ISD::SETCC) {
- X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
- bool Invert = (CC == ISD::SETNE) ^
- cast<ConstantSDNode>(Op1)->isNullValue();
- if (!Invert) return Op0;
-
- CCode = X86::GetOppositeBranchCondition(CCode);
- return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
- DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1));
- }
- }
-
- bool isFP = Op1.getValueType().isFloatingPoint();
- unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
- if (X86CC == X86::COND_INVALID)
- return SDValue();
-
- SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG);
- EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
- return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
- DAG.getConstant(X86CC, MVT::i8), EFLAGS);
-}
-
// Lower256IntVSETCC - Break a VSETCC 256-bit integer VSETCC into two new 128
// ones, and then concatenate the result back.
static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
- EVT VT = Op.getValueType();
+ MVT VT = Op.getValueType().getSimpleVT();
assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
"Unsupported value type for operation");
@@ -9061,27 +9229,27 @@ static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
SDValue RHS2 = Extract128BitVector(RHS, NumElems/2, DAG, dl);
// Issue the operation on the smaller types and concatenate the result back
- MVT EltVT = VT.getVectorElementType().getSimpleVT();
- EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
+ MVT EltVT = VT.getVectorElementType();
+ MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
}
-
-SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
+static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
+ SelectionDAG &DAG) {
SDValue Cond;
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
SDValue CC = Op.getOperand(2);
- EVT VT = Op.getValueType();
+ MVT VT = Op.getValueType().getSimpleVT();
ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
- bool isFP = Op.getOperand(1).getValueType().isFloatingPoint();
+ bool isFP = Op.getOperand(1).getValueType().getSimpleVT().isFloatingPoint();
DebugLoc dl = Op.getDebugLoc();
if (isFP) {
#ifndef NDEBUG
- EVT EltVT = Op0.getValueType().getVectorElementType();
+ MVT EltVT = Op0.getValueType().getVectorElementType().getSimpleVT();
assert(EltVT == MVT::f32 || EltVT == MVT::f64);
#endif
@@ -9176,8 +9344,28 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
if (VT == MVT::v2i64) {
if (Opc == X86ISD::PCMPGT && !Subtarget->hasSSE42())
return SDValue();
- if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41())
- return SDValue();
+ if (Opc == X86ISD::PCMPEQ && !Subtarget->hasSSE41()) {
+ // If pcmpeqq is missing but pcmpeqd is available, synthesize pcmpeqq with
+ // pcmpeqd + pshufd + pand.
+ assert(Subtarget->hasSSE2() && !FlipSigns && "Don't know how to lower!");
+
+ // First cast everything to the right type,
+ Op0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op0);
+ Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, Op1);
+
+ // Do the compare.
+ SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
+
+ // Make sure the lower and upper halves are both all-ones.
+ const int Mask[] = { 1, 0, 3, 2 };
+ SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
+ Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
+
+ if (Invert)
+ Result = DAG.getNOT(dl, Result, MVT::v4i32);
+
+ return DAG.getNode(ISD::BITCAST, dl, VT, Result);
+ }
}
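// A standalone model of the pcmpeqd + pshufd + pand sequence above, using
// SSE2 intrinsics (<emmintrin.h>); the helper name is illustrative only.
#include <emmintrin.h>
static inline __m128i CmpEqQ_SSE2(__m128i A, __m128i B) {
  __m128i Eq32 = _mm_cmpeq_epi32(A, B);                  // per-dword equality
  // Swap the two dwords inside each qword (the {1, 0, 3, 2} mask).
  __m128i Swapped = _mm_shuffle_epi32(Eq32, _MM_SHUFFLE(2, 3, 0, 1));
  // A qword is equal only if both of its dwords compared equal.
  return _mm_and_si128(Eq32, Swapped);
}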
// Since SSE has no unsigned integer comparisons, we need to flip the sign
@@ -9202,6 +9390,63 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
return Result;
}
+SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
+
+ MVT VT = Op.getValueType().getSimpleVT();
+
+ if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
+
+ assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ DebugLoc dl = Op.getDebugLoc();
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+
+ // Optimize to BT if possible.
+ // Lower (X & (1 << N)) == 0 to BT(X, N).
+ // Lower ((X >>u N) & 1) != 0 to BT(X, N).
+ // Lower ((X >>s N) & 1) != 0 to BT(X, N).
+ if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() &&
+ Op1.getOpcode() == ISD::Constant &&
+ cast<ConstantSDNode>(Op1)->isNullValue() &&
+ (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+ SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG);
+ if (NewSetCC.getNode())
+ return NewSetCC;
+ }
+
+ // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
+ // these.
+ if (Op1.getOpcode() == ISD::Constant &&
+ (cast<ConstantSDNode>(Op1)->getZExtValue() == 1 ||
+ cast<ConstantSDNode>(Op1)->isNullValue()) &&
+ (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+
+ // If the input is a setcc, then reuse the input setcc or use a new one with
+ // the inverted condition.
+ if (Op0.getOpcode() == X86ISD::SETCC) {
+ X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
+ bool Invert = (CC == ISD::SETNE) ^
+ cast<ConstantSDNode>(Op1)->isNullValue();
+ if (!Invert) return Op0;
+
+ CCode = X86::GetOppositeBranchCondition(CCode);
+ return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(CCode, MVT::i8), Op0.getOperand(1));
+ }
+ }
+
+ bool isFP = Op1.getValueType().getSimpleVT().isFloatingPoint();
+ unsigned X86CC = TranslateX86CC(CC, isFP, Op0, Op1, DAG);
+ if (X86CC == X86::COND_INVALID)
+ return SDValue();
+
+ SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG);
+ EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
+ return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
+ DAG.getConstant(X86CC, MVT::i8), EFLAGS);
+}
+
// isX86LogicalCmp - Return true if opcode is a X86 logical comparison.
static bool isX86LogicalCmp(SDValue Op) {
unsigned Opc = Op.getNode()->getOpcode();
@@ -9324,7 +9569,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
SDValue Cmp = Cond.getOperand(1);
unsigned Opc = Cmp.getOpcode();
- EVT VT = Op.getValueType();
+ MVT VT = Op.getValueType().getSimpleVT();
bool IllegalFPCMov = false;
if (VT.isFloatingPoint() && !VT.isVector() &&
@@ -9433,6 +9678,53 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops));
}
+SDValue X86TargetLowering::LowerSIGN_EXTEND(SDValue Op,
+ SelectionDAG &DAG) const {
+ MVT VT = Op->getValueType(0).getSimpleVT();
+ SDValue In = Op->getOperand(0);
+ MVT InVT = In.getValueType().getSimpleVT();
+ DebugLoc dl = Op->getDebugLoc();
+
+ if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
+ (VT != MVT::v8i32 || InVT != MVT::v8i16))
+ return SDValue();
+
+ if (Subtarget->hasInt256())
+ return DAG.getNode(X86ISD::VSEXT_MOVL, dl, VT, In);
+
+ // Optimize vectors in AVX mode:
+ // sign extend v8i16 to v8i32 and v4i32 to v4i64.
+ //
+ // Divide the input vector into two parts; for v4i32 the shuffle masks
+ // will be {0, 1, -1, -1} and {2, 3, -1, -1}.
+ // Use the vpmovsx instruction to extend v4i32 -> v2i64 and v8i16 -> v4i32,
+ // then concat the vectors back to the original VT.
+
+ unsigned NumElems = InVT.getVectorNumElements();
+ SDValue Undef = DAG.getUNDEF(InVT);
+
+ SmallVector<int,8> ShufMask1(NumElems, -1);
+ for (unsigned i = 0; i != NumElems/2; ++i)
+ ShufMask1[i] = i;
+
+ SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask1[0]);
+
+ SmallVector<int,8> ShufMask2(NumElems, -1);
+ for (unsigned i = 0; i != NumElems/2; ++i)
+ ShufMask2[i] = i + NumElems/2;
+
+ SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, &ShufMask2[0]);
+
+ MVT HalfVT = MVT::getVectorVT(VT.getScalarType(),
+ VT.getVectorNumElements()/2);
+
+ OpLo = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpLo);
+ OpHi = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpHi);
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
+}
+
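// A standalone model of the AVX1 fallback above for v8i16 -> v8i32, using
// SSE4.1 intrinsics (<smmintrin.h>): sign-extend each half with PMOVSXWD and
// treat the two results as the low/high 128-bit halves of the 256-bit value.
// The helper name and the Out[2] convention are illustrative assumptions.
#include <smmintrin.h>
static inline void SextV8I16ToV8I32(__m128i V, __m128i Out[2]) {
  Out[0] = _mm_cvtepi16_epi32(V);                     // elements 0..3
  Out[1] = _mm_cvtepi16_epi32(_mm_srli_si128(V, 8));  // elements 4..7
}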
// isAndOrOfSingleUseSetCCs - Return true if node is an ISD::AND or
// ISD::OR of two X86ISD::SETCC nodes each of which has no other use apart
// from the AND / OR.
@@ -9721,7 +10013,6 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
Chain, Dest, CC, Cond);
}
-
// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
// Calls to _alloca is needed to probe the stack when allocating more than 4k
// bytes in one go. Touching the stack at 4K increments is necessary to ensure
@@ -9884,8 +10175,9 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
// Sanity Check: Make sure using fp_offset makes sense.
assert(!getTargetMachine().Options.UseSoftFloat &&
!(DAG.getMachineFunction()
- .getFunction()->getFnAttributes()
- .hasAttribute(Attributes::NoImplicitFloat)) &&
+ .getFunction()->getAttributes()
+ .hasAttribute(AttributeSet::FunctionIndex,
+ Attribute::NoImplicitFloat)) &&
Subtarget->hasSSE1());
}
@@ -9933,7 +10225,7 @@ static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget,
MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
}
-// getTargetVShiftNOde - Handle vector element shifts where the shift amount
+// getTargetVShiftNode - Handle vector element shifts where the shift amount
// may or may not be a constant. Takes immediate version of shift as input.
static SDValue getTargetVShiftNode(unsigned Opc, DebugLoc dl, EVT VT,
SDValue SrcOp, SDValue ShAmt,
@@ -10090,6 +10382,14 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(),
Op.getOperand(1), Op.getOperand(2));
+ // SSE2/AVX2 sub with unsigned saturation intrinsics
+ case Intrinsic::x86_sse2_psubus_b:
+ case Intrinsic::x86_sse2_psubus_w:
+ case Intrinsic::x86_avx2_psubus_b:
+ case Intrinsic::x86_avx2_psubus_w:
+ return DAG.getNode(X86ISD::SUBUS, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+
// SSE3/AVX horizontal add/sub intrinsics
case Intrinsic::x86_sse3_hadd_ps:
case Intrinsic::x86_sse3_hadd_pd:
@@ -10139,6 +10439,100 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
Op.getOperand(1), Op.getOperand(2));
}
+ // SSE2/SSE41/AVX2 integer max/min intrinsics.
+ case Intrinsic::x86_sse2_pmaxu_b:
+ case Intrinsic::x86_sse41_pmaxuw:
+ case Intrinsic::x86_sse41_pmaxud:
+ case Intrinsic::x86_avx2_pmaxu_b:
+ case Intrinsic::x86_avx2_pmaxu_w:
+ case Intrinsic::x86_avx2_pmaxu_d:
+ case Intrinsic::x86_sse2_pminu_b:
+ case Intrinsic::x86_sse41_pminuw:
+ case Intrinsic::x86_sse41_pminud:
+ case Intrinsic::x86_avx2_pminu_b:
+ case Intrinsic::x86_avx2_pminu_w:
+ case Intrinsic::x86_avx2_pminu_d:
+ case Intrinsic::x86_sse41_pmaxsb:
+ case Intrinsic::x86_sse2_pmaxs_w:
+ case Intrinsic::x86_sse41_pmaxsd:
+ case Intrinsic::x86_avx2_pmaxs_b:
+ case Intrinsic::x86_avx2_pmaxs_w:
+ case Intrinsic::x86_avx2_pmaxs_d:
+ case Intrinsic::x86_sse41_pminsb:
+ case Intrinsic::x86_sse2_pmins_w:
+ case Intrinsic::x86_sse41_pminsd:
+ case Intrinsic::x86_avx2_pmins_b:
+ case Intrinsic::x86_avx2_pmins_w:
+ case Intrinsic::x86_avx2_pmins_d: {
+ unsigned Opcode;
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
+ case Intrinsic::x86_sse2_pmaxu_b:
+ case Intrinsic::x86_sse41_pmaxuw:
+ case Intrinsic::x86_sse41_pmaxud:
+ case Intrinsic::x86_avx2_pmaxu_b:
+ case Intrinsic::x86_avx2_pmaxu_w:
+ case Intrinsic::x86_avx2_pmaxu_d:
+ Opcode = X86ISD::UMAX;
+ break;
+ case Intrinsic::x86_sse2_pminu_b:
+ case Intrinsic::x86_sse41_pminuw:
+ case Intrinsic::x86_sse41_pminud:
+ case Intrinsic::x86_avx2_pminu_b:
+ case Intrinsic::x86_avx2_pminu_w:
+ case Intrinsic::x86_avx2_pminu_d:
+ Opcode = X86ISD::UMIN;
+ break;
+ case Intrinsic::x86_sse41_pmaxsb:
+ case Intrinsic::x86_sse2_pmaxs_w:
+ case Intrinsic::x86_sse41_pmaxsd:
+ case Intrinsic::x86_avx2_pmaxs_b:
+ case Intrinsic::x86_avx2_pmaxs_w:
+ case Intrinsic::x86_avx2_pmaxs_d:
+ Opcode = X86ISD::SMAX;
+ break;
+ case Intrinsic::x86_sse41_pminsb:
+ case Intrinsic::x86_sse2_pmins_w:
+ case Intrinsic::x86_sse41_pminsd:
+ case Intrinsic::x86_avx2_pmins_b:
+ case Intrinsic::x86_avx2_pmins_w:
+ case Intrinsic::x86_avx2_pmins_d:
+ Opcode = X86ISD::SMIN;
+ break;
+ }
+ return DAG.getNode(Opcode, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ }
+
+ // SSE/SSE2/AVX floating point max/min intrinsics.
+ case Intrinsic::x86_sse_max_ps:
+ case Intrinsic::x86_sse2_max_pd:
+ case Intrinsic::x86_avx_max_ps_256:
+ case Intrinsic::x86_avx_max_pd_256:
+ case Intrinsic::x86_sse_min_ps:
+ case Intrinsic::x86_sse2_min_pd:
+ case Intrinsic::x86_avx_min_ps_256:
+ case Intrinsic::x86_avx_min_pd_256: {
+ unsigned Opcode;
+ switch (IntNo) {
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
+ case Intrinsic::x86_sse_max_ps:
+ case Intrinsic::x86_sse2_max_pd:
+ case Intrinsic::x86_avx_max_ps_256:
+ case Intrinsic::x86_avx_max_pd_256:
+ Opcode = X86ISD::FMAX;
+ break;
+ case Intrinsic::x86_sse_min_ps:
+ case Intrinsic::x86_sse2_min_pd:
+ case Intrinsic::x86_avx_min_ps_256:
+ case Intrinsic::x86_avx_min_pd_256:
+ Opcode = X86ISD::FMIN;
+ break;
+ }
+ return DAG.getNode(Opcode, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ }
+
// AVX2 variable shift intrinsics
case Intrinsic::x86_avx2_psllv_d:
case Intrinsic::x86_avx2_psllv_q:
@@ -10206,6 +10600,12 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
Op.getOperand(2), Op.getOperand(1));
+ case Intrinsic::x86_sse_sqrt_ps:
+ case Intrinsic::x86_sse2_sqrt_pd:
+ case Intrinsic::x86_avx_sqrt_ps_256:
+ case Intrinsic::x86_avx_sqrt_pd_256:
+ return DAG.getNode(ISD::FSQRT, dl, Op.getValueType(), Op.getOperand(1));
+
// ptest and testp intrinsics. The intrinsic these come from are designed to
// return an integer value, not just an instruction so lower it to the ptest
// or testp pattern and a setcc for the result.
@@ -10726,7 +11126,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
for (FunctionType::param_iterator I = FTy->param_begin(),
E = FTy->param_end(); I != E; ++I, ++Idx)
- if (Attrs.getParamAttributes(Idx).hasAttribute(Attributes::InReg))
+ if (Attrs.hasAttribute(Idx, Attribute::InReg))
// FIXME: should only count parameters that are lowered to integers.
InRegCount += (TD->getTypeSizeInBits(*I) + 31) / 32;
@@ -10816,7 +11216,6 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
int SSFI = MF.getFrameInfo()->CreateStackObject(2, StackAlignment, false);
SDValue StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
-
MachineMemOperand *MMO =
MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
MachineMemOperand::MOStore, 2, 2);
@@ -10849,7 +11248,6 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
DAG.getConstant(1, MVT::i16)),
DAG.getConstant(3, MVT::i16));
-
return DAG.getNode((VT.getSizeInBits() < 16 ?
ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
}
@@ -10978,17 +11376,43 @@ static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
+ DebugLoc dl = Op.getDebugLoc();
EVT VT = Op.getValueType();
// Decompose 256-bit ops into smaller 128-bit ops.
if (VT.is256BitVector() && !Subtarget->hasInt256())
return Lower256IntArith(Op, DAG);
+ SDValue A = Op.getOperand(0);
+ SDValue B = Op.getOperand(1);
+
+ // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
+ if (VT == MVT::v4i32) {
+ assert(Subtarget->hasSSE2() && !Subtarget->hasSSE41() &&
+ "Should not custom lower when pmuldq is available!");
+
+ // Extract the odd parts.
+ const int UnpackMask[] = { 1, -1, 3, -1 };
+ SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
+ SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
+
+ // Multiply the even parts.
+ SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
+ // Now multiply odd parts.
+ SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
+
+ Evens = DAG.getNode(ISD::BITCAST, dl, VT, Evens);
+ Odds = DAG.getNode(ISD::BITCAST, dl, VT, Odds);
+
+ // Merge the two vectors back together with a shuffle. This expands into 2
+ // shuffles.
+ const int ShufMask[] = { 0, 4, 2, 6 };
+ return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
+ }
+
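// A standalone model of the shuffle + pmuludq + shuffle sequence above, with
// SSE2 intrinsics (<emmintrin.h>); the helper name is made up, and the odd
// lanes are brought down with a 64-bit shift instead of the {1,-1,3,-1}
// shuffle -- the low 32 bits of each product come out the same either way.
#include <emmintrin.h>
static inline __m128i MulLoU32_SSE2(__m128i A, __m128i B) {
  __m128i Even = _mm_mul_epu32(A, B);                  // 64-bit products of lanes 0, 2
  __m128i Odd  = _mm_mul_epu32(_mm_srli_epi64(A, 32),  // 64-bit products of lanes 1, 3
                               _mm_srli_epi64(B, 32));
  // Keep the low 32 bits of each product and interleave back in lane order.
  __m128i Lo02 = _mm_shuffle_epi32(Even, _MM_SHUFFLE(0, 0, 2, 0));
  __m128i Lo13 = _mm_shuffle_epi32(Odd,  _MM_SHUFFLE(0, 0, 2, 0));
  return _mm_unpacklo_epi32(Lo02, Lo13);
}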
assert((VT == MVT::v2i64 || VT == MVT::v4i64) &&
"Only know how to lower V2I64/V4I64 multiply");
- DebugLoc dl = Op.getDebugLoc();
-
// Ahi = psrlqi(a, 32);
// Bhi = psrlqi(b, 32);
//
@@ -11000,9 +11424,6 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
// AhiBlo = psllqi(AhiBlo, 32);
// return AloBlo + AloBhi + AhiBlo;
- SDValue A = Op.getOperand(0);
- SDValue B = Op.getOperand(1);
-
SDValue ShAmt = DAG.getConstant(32, MVT::i32);
SDValue Ahi = DAG.getNode(X86ISD::VSRLI, dl, VT, A, ShAmt);
@@ -11026,13 +11447,55 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
}
+SDValue X86TargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ EVT EltTy = VT.getVectorElementType();
+ unsigned NumElts = VT.getVectorNumElements();
+ SDValue N0 = Op.getOperand(0);
+ DebugLoc dl = Op.getDebugLoc();
+
+ // Lower sdiv X, pow2-const.
+ BuildVectorSDNode *C = dyn_cast<BuildVectorSDNode>(Op.getOperand(1));
+ if (!C)
+ return SDValue();
+
+ APInt SplatValue, SplatUndef;
+ unsigned MinSplatBits;
+ bool HasAnyUndefs;
+ if (!C->isConstantSplat(SplatValue, SplatUndef, MinSplatBits, HasAnyUndefs))
+ return SDValue();
+
+ if ((SplatValue != 0) &&
+ (SplatValue.isPowerOf2() || (-SplatValue).isPowerOf2())) {
+ unsigned lg2 = SplatValue.countTrailingZeros();
+ // Splat the sign bit.
+ SDValue Sz = DAG.getConstant(EltTy.getSizeInBits()-1, MVT::i32);
+ SDValue SGN = getTargetVShiftNode(X86ISD::VSRAI, dl, VT, N0, Sz, DAG);
+ // Add (N0 < 0) ? abs2 - 1 : 0;
+ SDValue Amt = DAG.getConstant(EltTy.getSizeInBits() - lg2, MVT::i32);
+ SDValue SRL = getTargetVShiftNode(X86ISD::VSRLI, dl, VT, SGN, Amt, DAG);
+ SDValue ADD = DAG.getNode(ISD::ADD, dl, VT, N0, SRL);
+ SDValue Lg2Amt = DAG.getConstant(lg2, MVT::i32);
+ SDValue SRA = getTargetVShiftNode(X86ISD::VSRAI, dl, VT, ADD, Lg2Amt, DAG);
+
+ // If we're dividing by a positive value, we're done. Otherwise, we must
+ // negate the result.
+ if (SplatValue.isNonNegative())
+ return SRA;
+
+ SmallVector<SDValue, 16> V(NumElts, DAG.getConstant(0, EltTy));
+ SDValue Zero = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], NumElts);
+ return DAG.getNode(ISD::SUB, dl, VT, Zero, SRA);
+ }
+ return SDValue();
+}
+
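// A scalar model of the shift sequence above for a positive power-of-two
// divisor 2^lg2, 1 <= lg2 <= 31; the vector code performs the same steps per
// lane with VSRAI/VSRLI and, for a negative divisor, negates the result.
#include <cstdint>
static inline int32_t SDivPow2(int32_t X, unsigned Lg2) {
  int32_t Sign = X >> 31;                                   // splat the sign bit
  int32_t Bias = (int32_t)((uint32_t)Sign >> (32 - Lg2));   // X < 0 ? 2^lg2 - 1 : 0
  return (X + Bias) >> Lg2;                                 // now rounds toward zero
}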
SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
DebugLoc dl = Op.getDebugLoc();
SDValue R = Op.getOperand(0);
SDValue Amt = Op.getOperand(1);
- LLVMContext *Context = DAG.getContext();
if (!Subtarget->hasSSE2())
return SDValue();
@@ -11149,17 +11612,9 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
// Lower SHL with variable shift amount.
if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
- Op = DAG.getNode(X86ISD::VSHLI, dl, VT, Op.getOperand(1),
- DAG.getConstant(23, MVT::i32));
-
- const uint32_t CV[] = { 0x3f800000U, 0x3f800000U, 0x3f800000U, 0x3f800000U};
- Constant *C = ConstantDataVector::get(*Context, CV);
- SDValue CPIdx = DAG.getConstantPool(C, getPointerTy(), 16);
- SDValue Addend = DAG.getLoad(VT, dl, DAG.getEntryNode(), CPIdx,
- MachinePointerInfo::getConstantPool(),
- false, false, false, 16);
+ Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, VT));
- Op = DAG.getNode(ISD::ADD, dl, VT, Op, Addend);
+ Op = DAG.getNode(ISD::ADD, dl, VT, Op, DAG.getConstant(0x3f800000U, VT));
Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, Op);
Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
return DAG.getNode(ISD::MUL, dl, VT, Op, R);
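// A scalar model of the constant-free trick above: for 0 <= Amt <= 30,
// (Amt << 23) + 0x3f800000 is the IEEE-754 single-precision encoding of
// 2^Amt, so bitcasting to float and converting back to integer recovers the
// power of two, and X << Amt becomes X * 2^Amt.
#include <cstdint>
#include <cstring>
static inline uint32_t ShlViaFloatExponent(uint32_t X, uint32_t Amt) {
  uint32_t Bits = (Amt << 23) + 0x3f800000U;  // place Amt in the exponent field
  float F;
  std::memcpy(&F, &Bits, sizeof F);           // the ISD::BITCAST step
  uint32_t Pow2 = (uint32_t)(int32_t)F;       // the ISD::FP_TO_SINT step
  return X * Pow2;                            // the final ISD::MUL
}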
@@ -11168,8 +11623,7 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const {
assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq.");
// a = a << 5;
- Op = DAG.getNode(X86ISD::VSHLI, dl, MVT::v8i16, Op.getOperand(1),
- DAG.getConstant(5, MVT::i32));
+ Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(5, VT));
Op = DAG.getNode(ISD::BITCAST, dl, VT, Op);
// Turn 'a' into a mask suitable for VSELECT
@@ -11379,7 +11833,6 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
}
}
-
static SDValue LowerMEMBARRIER(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
DebugLoc dl = Op.getDebugLoc();
@@ -11464,7 +11917,6 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget *Subtarget,
return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
}
-
static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
SelectionDAG &DAG) {
EVT T = Op.getValueType();
@@ -11603,6 +12055,43 @@ static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
Op.getOperand(1), Op.getOperand(2));
}
+SDValue X86TargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
+ assert(Subtarget->isTargetDarwin() && Subtarget->is64Bit());
+
+ // For MacOSX, we want to call an alternative entry point: __sincos_stret,
+ // which returns the values in two XMM registers.
+ DebugLoc dl = Op.getDebugLoc();
+ SDValue Arg = Op.getOperand(0);
+ EVT ArgVT = Arg.getValueType();
+ Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+
+ ArgListTy Args;
+ ArgListEntry Entry;
+
+ Entry.Node = Arg;
+ Entry.Ty = ArgTy;
+ Entry.isSExt = false;
+ Entry.isZExt = false;
+ Args.push_back(Entry);
+
+ // Only optimize x86_64 for now. i386 is a bit messy. For f32,
+ // the small struct {f32, f32} is returned in (eax, edx). For f64,
+ // the results are returned via SRet in memory.
+ const char *LibcallName = (ArgVT == MVT::f64)
+ ? "__sincos_stret" : "__sincosf_stret";
+ SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy());
+
+ StructType *RetTy = StructType::get(ArgTy, ArgTy, NULL);
+ TargetLowering::
+ CallLoweringInfo CLI(DAG.getEntryNode(), RetTy,
+ false, false, false, false, 0,
+ CallingConv::C, /*isTailCall=*/false,
+ /*doesNotRet=*/false, /*isReturnValueUsed=*/true,
+ Callee, Args, DAG, dl);
+ std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
+ return CallResult.first;
+}
+
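// A behavioral model of the combined call: one argument in, both results out.
// __sincos_stret itself returns the pair in XMM registers as described above;
// this portable stand-in (hypothetical name) only shows the values computed.
#include <cmath>
struct SinCosPair { double Sin, Cos; };
static inline SinCosPair SinCosModel(double X) {
  return SinCosPair{std::sin(X), std::cos(X)};
}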
/// LowerOperation - Provide custom lowering hooks for some operations.
///
SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
@@ -11632,11 +12121,13 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
- case ISD::TRUNCATE: return lowerTRUNCATE(Op, DAG);
- case ISD::ZERO_EXTEND: return lowerZERO_EXTEND(Op, DAG);
+ case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
+ case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, DAG);
+ case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, DAG);
+ case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, DAG);
case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG);
case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG);
- case ISD::FP_EXTEND: return lowerFP_EXTEND(Op, DAG);
+ case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
case ISD::FABS: return LowerFABS(Op, DAG);
case ISD::FNEG: return LowerFNEG(Op, DAG);
case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
@@ -11682,6 +12173,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
case ISD::ADD: return LowerADD(Op, DAG);
case ISD::SUB: return LowerSUB(Op, DAG);
+ case ISD::SDIV: return LowerSDIV(Op, DAG);
+ case ISD::FSINCOS: return LowerFSINCOS(Op, DAG);
}
}
@@ -11735,6 +12228,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
SmallVectorImpl<SDValue>&Results,
SelectionDAG &DAG) const {
DebugLoc dl = N->getDebugLoc();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
switch (N->getOpcode()) {
default:
llvm_unreachable("Do not know how to custom type legalize this operation!");
@@ -11784,6 +12278,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
return;
}
case ISD::FP_ROUND: {
+ if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
+ return;
SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
Results.push_back(V);
return;
@@ -11951,10 +12447,15 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::PSIGN: return "X86ISD::PSIGN";
case X86ISD::BLENDV: return "X86ISD::BLENDV";
case X86ISD::BLENDI: return "X86ISD::BLENDI";
+ case X86ISD::SUBUS: return "X86ISD::SUBUS";
case X86ISD::HADD: return "X86ISD::HADD";
case X86ISD::HSUB: return "X86ISD::HSUB";
case X86ISD::FHADD: return "X86ISD::FHADD";
case X86ISD::FHSUB: return "X86ISD::FHSUB";
+ case X86ISD::UMAX: return "X86ISD::UMAX";
+ case X86ISD::UMIN: return "X86ISD::UMIN";
+ case X86ISD::SMAX: return "X86ISD::SMAX";
+ case X86ISD::SMIN: return "X86ISD::SMIN";
case X86ISD::FMAX: return "X86ISD::FMAX";
case X86ISD::FMIN: return "X86ISD::FMIN";
case X86ISD::FMAXC: return "X86ISD::FMAXC";
@@ -12007,14 +12508,13 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::OR: return "X86ISD::OR";
case X86ISD::XOR: return "X86ISD::XOR";
case X86ISD::AND: return "X86ISD::AND";
- case X86ISD::ANDN: return "X86ISD::ANDN";
case X86ISD::BLSI: return "X86ISD::BLSI";
case X86ISD::BLSMSK: return "X86ISD::BLSMSK";
case X86ISD::BLSR: return "X86ISD::BLSR";
case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
case X86ISD::PTEST: return "X86ISD::PTEST";
case X86ISD::TESTP: return "X86ISD::TESTP";
- case X86ISD::PALIGN: return "X86ISD::PALIGN";
+ case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
@@ -12110,24 +12610,21 @@ bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
return true;
}
-
bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
return false;
unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
- if (NumBits1 <= NumBits2)
- return false;
- return true;
+ return NumBits1 > NumBits2;
}
bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
- return Imm == (int32_t)Imm;
+ return isInt<32>(Imm);
}
bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
// Can also use sub to handle negated immediates.
- return Imm == (int32_t)Imm;
+ return isInt<32>(Imm);
}
bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
@@ -12135,9 +12632,7 @@ bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
return false;
unsigned NumBits1 = VT1.getSizeInBits();
unsigned NumBits2 = VT2.getSizeInBits();
- if (NumBits1 <= NumBits2)
- return false;
- return true;
+ return NumBits1 > NumBits2;
}
bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
@@ -12409,13 +12904,16 @@ static unsigned getPseudoCMOVOpc(EVT VT) {
// to
//
// ...
-// EAX = LOAD MI.addr
+// t1 = LOAD MI.addr
// loop:
-// t1 = OP MI.val, EAX
-// LCMPXCHG [MI.addr], t1, [EAX is implicitly used & defined]
+// t4 = phi(t1, t3 / loop)
+// t2 = OP MI.val, t4
+// EAX = t4
+// LCMPXCHG [MI.addr], t2, [EAX is implicitly used & defined]
+// t3 = EAX
// JNE loop
// sink:
-// dst = EAX
+// dst = t3
// ...
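// A C++-level sketch of the loop shape documented above, written against
// std::atomic purely for illustration -- the actual expansion below builds
// MachineInstrs, PHI nodes and LCMPXCHG directly.
#include <atomic>
static inline int AtomicFetchMax32(std::atomic<int> &A, int Val) {
  int Old = A.load();                              // t1 = LOAD [addr]
  int Next;
  do {
    Next = Val > Old ? Val : Old;                  // t2 = OP val, t4
  } while (!A.compare_exchange_weak(Old, Next));   // LCMPXCHG; failure reloads Old (t3)
  return Old;                                      // dst = value seen before the update
}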
MachineBasicBlock *
X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI,
@@ -12430,7 +12928,7 @@ X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI,
MachineFunction::iterator I = MBB;
++I;
- assert(MI->getNumOperands() <= X86::AddrNumOperands + 2 &&
+ assert(MI->getNumOperands() <= X86::AddrNumOperands + 4 &&
"Unexpected number of operands");
assert(MI->hasOneMemOperand() &&
@@ -12452,7 +12950,11 @@ X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI,
const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
MVT::SimpleValueType VT = *RC->vt_begin();
- unsigned AccPhyReg = getX86SubSuperRegister(X86::EAX, VT);
+ unsigned t1 = MRI.createVirtualRegister(RC);
+ unsigned t2 = MRI.createVirtualRegister(RC);
+ unsigned t3 = MRI.createVirtualRegister(RC);
+ unsigned t4 = MRI.createVirtualRegister(RC);
+ unsigned PhyReg = getX86SubSuperRegister(X86::EAX, VT);
unsigned LCMPXCHGOpc = getCmpXChgOpcode(VT);
unsigned LOADOpc = getLoadOpcode(VT);
@@ -12460,12 +12962,16 @@ X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI,
// For the atomic load-arith operator, we generate
//
// thisMBB:
- // EAX = LOAD [MI.addr]
+ // t1 = LOAD [MI.addr]
// mainMBB:
+ // t4 = phi(t1 / thisMBB, t3 / mainMBB)
// t1 = OP MI.val, EAX
+ // EAX = t4
// LCMPXCHG [MI.addr], t1, [EAX is implicitly used & defined]
+ // t3 = EAX
// JNE mainMBB
// sinkMBB:
+ // dst = t3
MachineBasicBlock *thisMBB = MBB;
MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
@@ -12481,23 +12987,34 @@ X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI,
sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
// thisMBB:
- MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), AccPhyReg);
- for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
- MIB.addOperand(MI->getOperand(MemOpndSlot + i));
- MIB.setMemRefs(MMOBegin, MMOEnd);
+ MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1);
+ for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+ MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
+ if (NewMO.isReg())
+ NewMO.setIsKill(false);
+ MIB.addOperand(NewMO);
+ }
+ for (MachineInstr::mmo_iterator MMOI = MMOBegin; MMOI != MMOEnd; ++MMOI) {
+ unsigned flags = (*MMOI)->getFlags();
+ flags = (flags & ~MachineMemOperand::MOStore) | MachineMemOperand::MOLoad;
+ MachineMemOperand *MMO =
+ MF->getMachineMemOperand((*MMOI)->getPointerInfo(), flags,
+ (*MMOI)->getSize(),
+ (*MMOI)->getBaseAlignment(),
+ (*MMOI)->getTBAAInfo(),
+ (*MMOI)->getRanges());
+ MIB.addMemOperand(MMO);
+ }
thisMBB->addSuccessor(mainMBB);
// mainMBB:
MachineBasicBlock *origMainMBB = mainMBB;
- mainMBB->addLiveIn(AccPhyReg);
- // Copy AccPhyReg as it is used more than once.
- unsigned AccReg = MRI.createVirtualRegister(RC);
- BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), AccReg)
- .addReg(AccPhyReg);
+ // Add a PHI.
+ MachineInstr *Phi = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4)
+ .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(mainMBB);
- unsigned t1 = MRI.createVirtualRegister(RC);
unsigned Opc = MI->getOpcode();
switch (Opc) {
default:
@@ -12515,20 +13032,20 @@ X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI,
case X86::ATOMXOR32:
case X86::ATOMXOR64: {
unsigned ARITHOpc = getNonAtomicOpcode(Opc);
- BuildMI(mainMBB, DL, TII->get(ARITHOpc), t1).addReg(SrcReg)
- .addReg(AccReg);
+ BuildMI(mainMBB, DL, TII->get(ARITHOpc), t2).addReg(SrcReg)
+ .addReg(t4);
break;
}
case X86::ATOMNAND8:
case X86::ATOMNAND16:
case X86::ATOMNAND32:
case X86::ATOMNAND64: {
- unsigned t2 = MRI.createVirtualRegister(RC);
+ unsigned Tmp = MRI.createVirtualRegister(RC);
unsigned NOTOpc;
unsigned ANDOpc = getNonAtomicOpcodeWithExtraOpc(Opc, NOTOpc);
- BuildMI(mainMBB, DL, TII->get(ANDOpc), t2).addReg(SrcReg)
- .addReg(AccReg);
- BuildMI(mainMBB, DL, TII->get(NOTOpc), t1).addReg(t2);
+ BuildMI(mainMBB, DL, TII->get(ANDOpc), Tmp).addReg(SrcReg)
+ .addReg(t4);
+ BuildMI(mainMBB, DL, TII->get(NOTOpc), t2).addReg(Tmp);
break;
}
case X86::ATOMMAX8:
@@ -12552,20 +13069,22 @@ X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI,
BuildMI(mainMBB, DL, TII->get(CMPOpc))
.addReg(SrcReg)
- .addReg(AccReg);
+ .addReg(t4);
if (Subtarget->hasCMov()) {
if (VT != MVT::i8) {
// Native support
- BuildMI(mainMBB, DL, TII->get(CMOVOpc), t1)
+ BuildMI(mainMBB, DL, TII->get(CMOVOpc), t2)
.addReg(SrcReg)
- .addReg(AccReg);
+ .addReg(t4);
} else {
// Promote i8 to i32 to use CMOV32
- const TargetRegisterClass *RC32 = getRegClassFor(MVT::i32);
+ const TargetRegisterInfo* TRI = getTargetMachine().getRegisterInfo();
+ const TargetRegisterClass *RC32 =
+ TRI->getSubClassWithSubReg(getRegClassFor(MVT::i32), X86::sub_8bit);
unsigned SrcReg32 = MRI.createVirtualRegister(RC32);
unsigned AccReg32 = MRI.createVirtualRegister(RC32);
- unsigned t2 = MRI.createVirtualRegister(RC32);
+ unsigned Tmp = MRI.createVirtualRegister(RC32);
unsigned Undef = MRI.createVirtualRegister(RC32);
BuildMI(mainMBB, DL, TII->get(TargetOpcode::IMPLICIT_DEF), Undef);
@@ -12576,15 +13095,15 @@ X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI,
.addImm(X86::sub_8bit);
BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), AccReg32)
.addReg(Undef)
- .addReg(AccReg)
+ .addReg(t4)
.addImm(X86::sub_8bit);
- BuildMI(mainMBB, DL, TII->get(CMOVOpc), t2)
+ BuildMI(mainMBB, DL, TII->get(CMOVOpc), Tmp)
.addReg(SrcReg32)
.addReg(AccReg32);
- BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t1)
- .addReg(t2, 0, X86::sub_8bit);
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t2)
+ .addReg(Tmp, 0, X86::sub_8bit);
}
} else {
// Use pseudo select and lower them.
@@ -12593,36 +13112,47 @@ X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI,
unsigned SelOpc = getPseudoCMOVOpc(VT);
X86::CondCode CC = X86::getCondFromCMovOpc(CMOVOpc);
assert(CC != X86::COND_INVALID && "Invalid atomic-load-op transformation!");
- MIB = BuildMI(mainMBB, DL, TII->get(SelOpc), t1)
- .addReg(SrcReg).addReg(AccReg)
+ MIB = BuildMI(mainMBB, DL, TII->get(SelOpc), t2)
+ .addReg(SrcReg).addReg(t4)
.addImm(CC);
mainMBB = EmitLoweredSelect(MIB, mainMBB);
+ // Replace the original PHI node as mainMBB is changed after CMOV
+ // lowering.
+ BuildMI(*origMainMBB, Phi, DL, TII->get(X86::PHI), t4)
+ .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(mainMBB);
+ Phi->eraseFromParent();
}
break;
}
}
- // Copy AccPhyReg back from virtual register.
- BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), AccPhyReg)
- .addReg(AccReg);
+ // Copy PhyReg back from virtual register.
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), PhyReg)
+ .addReg(t4);
MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc));
- for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
- MIB.addOperand(MI->getOperand(MemOpndSlot + i));
- MIB.addReg(t1);
+ for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+ MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
+ if (NewMO.isReg())
+ NewMO.setIsKill(false);
+ MIB.addOperand(NewMO);
+ }
+ MIB.addReg(t2);
MIB.setMemRefs(MMOBegin, MMOEnd);
+ // Copy PhyReg back to virtual register.
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3)
+ .addReg(PhyReg);
+
BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB);
mainMBB->addSuccessor(origMainMBB);
mainMBB->addSuccessor(sinkMBB);
// sinkMBB:
- sinkMBB->addLiveIn(AccPhyReg);
-
BuildMI(*sinkMBB, sinkMBB->begin(), DL,
TII->get(TargetOpcode::COPY), DstReg)
- .addReg(AccPhyReg);
+ .addReg(t3);
MI->eraseFromParent();
return sinkMBB;
@@ -12639,15 +13169,24 @@ X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI,
// to
//
// ...
-// EAX = LOAD [MI.addr + 0]
-// EDX = LOAD [MI.addr + 4]
+// t1L = LOAD [MI.addr + 0]
+// t1H = LOAD [MI.addr + 4]
// loop:
-// EBX = OP MI.val.lo, EAX
-// ECX = OP MI.val.hi, EDX
+// t4L = phi(t1L, t3L / loop)
+// t4H = phi(t1H, t3H / loop)
+// t2L = OP MI.val.lo, t4L
+// t2H = OP MI.val.hi, t4H
+// EAX = t4L
+// EDX = t4H
+// EBX = t2L
+// ECX = t2H
// LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined]
+// t3L = EAX
+// t3H = EDX
// JNE loop
// sink:
-// dst = EDX:EAX
+// dstL = t3L
+// dstH = t3H
// ...
MachineBasicBlock *
X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI,
@@ -12662,7 +13201,7 @@ X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI,
MachineFunction::iterator I = MBB;
++I;
- assert(MI->getNumOperands() <= X86::AddrNumOperands + 4 &&
+ assert(MI->getNumOperands() <= X86::AddrNumOperands + 7 &&
"Unexpected number of operands");
assert(MI->hasOneMemOperand() &&
@@ -12688,20 +13227,37 @@ X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI,
const TargetRegisterClass *RC = &X86::GR32RegClass;
const TargetRegisterClass *RC8 = &X86::GR8RegClass;
+ unsigned t1L = MRI.createVirtualRegister(RC);
+ unsigned t1H = MRI.createVirtualRegister(RC);
+ unsigned t2L = MRI.createVirtualRegister(RC);
+ unsigned t2H = MRI.createVirtualRegister(RC);
+ unsigned t3L = MRI.createVirtualRegister(RC);
+ unsigned t3H = MRI.createVirtualRegister(RC);
+ unsigned t4L = MRI.createVirtualRegister(RC);
+ unsigned t4H = MRI.createVirtualRegister(RC);
+
unsigned LCMPXCHGOpc = X86::LCMPXCHG8B;
unsigned LOADOpc = X86::MOV32rm;
// For the atomic load-arith operator, we generate
//
// thisMBB:
- // EAX = LOAD [MI.addr + 0]
- // EDX = LOAD [MI.addr + 4]
+ // t1L = LOAD [MI.addr + 0]
+ // t1H = LOAD [MI.addr + 4]
// mainMBB:
- // EBX = OP MI.vallo, EAX
- // ECX = OP MI.valhi, EDX
+ // t4L = phi(t1L / thisMBB, t3L / mainMBB)
+ // t4H = phi(t1H / thisMBB, t3H / mainMBB)
+ // t2L = OP MI.val.lo, t4L
+ // t2H = OP MI.val.hi, t4H
+ // EBX = t2L
+ // ECX = t2H
// LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined]
- // JNE mainMBB
+ // t3L = EAX
+ // t3H = EDX
+ // JNE loop
// sinkMBB:
+ // dstL = t3L
+ // dstH = t3H
MachineBasicBlock *thisMBB = MBB;
MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
@@ -12718,35 +13274,50 @@ X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI,
// thisMBB:
// Lo
- MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), X86::EAX);
- for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
- MIB.addOperand(MI->getOperand(MemOpndSlot + i));
- MIB.setMemRefs(MMOBegin, MMOEnd);
+ MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1L);
+ for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+ MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
+ if (NewMO.isReg())
+ NewMO.setIsKill(false);
+ MIB.addOperand(NewMO);
+ }
+ for (MachineInstr::mmo_iterator MMOI = MMOBegin; MMOI != MMOEnd; ++MMOI) {
+ unsigned flags = (*MMOI)->getFlags();
+ flags = (flags & ~MachineMemOperand::MOStore) | MachineMemOperand::MOLoad;
+ MachineMemOperand *MMO =
+ MF->getMachineMemOperand((*MMOI)->getPointerInfo(), flags,
+ (*MMOI)->getSize(),
+ (*MMOI)->getBaseAlignment(),
+ (*MMOI)->getTBAAInfo(),
+ (*MMOI)->getRanges());
+ MIB.addMemOperand(MMO);
+ }
+ MachineInstr *LowMI = MIB;
+
// Hi
- MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), X86::EDX);
+ MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), t1H);
for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
- if (i == X86::AddrDisp)
+ if (i == X86::AddrDisp) {
MIB.addDisp(MI->getOperand(MemOpndSlot + i), 4); // 4 == sizeof(i32)
- else
- MIB.addOperand(MI->getOperand(MemOpndSlot + i));
+ } else {
+ MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
+ if (NewMO.isReg())
+ NewMO.setIsKill(false);
+ MIB.addOperand(NewMO);
+ }
}
- MIB.setMemRefs(MMOBegin, MMOEnd);
+ MIB.setMemRefs(LowMI->memoperands_begin(), LowMI->memoperands_end());
thisMBB->addSuccessor(mainMBB);
// mainMBB:
MachineBasicBlock *origMainMBB = mainMBB;
- mainMBB->addLiveIn(X86::EAX);
- mainMBB->addLiveIn(X86::EDX);
- // Copy EDX:EAX as they are used more than once.
- unsigned LoReg = MRI.createVirtualRegister(RC);
- unsigned HiReg = MRI.createVirtualRegister(RC);
- BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), LoReg).addReg(X86::EAX);
- BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), HiReg).addReg(X86::EDX);
-
- unsigned t1L = MRI.createVirtualRegister(RC);
- unsigned t1H = MRI.createVirtualRegister(RC);
+ // Add PHIs.
+ MachineInstr *PhiL = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4L)
+ .addReg(t1L).addMBB(thisMBB).addReg(t3L).addMBB(mainMBB);
+ MachineInstr *PhiH = BuildMI(mainMBB, DL, TII->get(X86::PHI), t4H)
+ .addReg(t1H).addMBB(thisMBB).addReg(t3H).addMBB(mainMBB);
unsigned Opc = MI->getOpcode();
switch (Opc) {
@@ -12759,19 +13330,23 @@ X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI,
case X86::ATOMSUB6432: {
unsigned HiOpc;
unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc);
- BuildMI(mainMBB, DL, TII->get(LoOpc), t1L).addReg(LoReg).addReg(SrcLoReg);
- BuildMI(mainMBB, DL, TII->get(HiOpc), t1H).addReg(HiReg).addReg(SrcHiReg);
+ BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(t4L)
+ .addReg(SrcLoReg);
+ BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(t4H)
+ .addReg(SrcHiReg);
break;
}
case X86::ATOMNAND6432: {
unsigned HiOpc, NOTOpc;
unsigned LoOpc = getNonAtomic6432OpcodeWithExtraOpc(Opc, HiOpc, NOTOpc);
- unsigned t2L = MRI.createVirtualRegister(RC);
- unsigned t2H = MRI.createVirtualRegister(RC);
- BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(SrcLoReg).addReg(LoReg);
- BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(SrcHiReg).addReg(HiReg);
- BuildMI(mainMBB, DL, TII->get(NOTOpc), t1L).addReg(t2L);
- BuildMI(mainMBB, DL, TII->get(NOTOpc), t1H).addReg(t2H);
+ unsigned TmpL = MRI.createVirtualRegister(RC);
+ unsigned TmpH = MRI.createVirtualRegister(RC);
+ BuildMI(mainMBB, DL, TII->get(LoOpc), TmpL).addReg(SrcLoReg)
+ .addReg(t4L);
+ BuildMI(mainMBB, DL, TII->get(HiOpc), TmpH).addReg(SrcHiReg)
+ .addReg(t4H);
+ BuildMI(mainMBB, DL, TII->get(NOTOpc), t2L).addReg(TmpL);
+ BuildMI(mainMBB, DL, TII->get(NOTOpc), t2H).addReg(TmpH);
break;
}
case X86::ATOMMAX6432:
@@ -12787,12 +13362,12 @@ X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI,
unsigned cc = MRI.createVirtualRegister(RC);
// cl := cmp src_lo, lo
BuildMI(mainMBB, DL, TII->get(X86::CMP32rr))
- .addReg(SrcLoReg).addReg(LoReg);
+ .addReg(SrcLoReg).addReg(t4L);
BuildMI(mainMBB, DL, TII->get(LoOpc), cL);
BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cL32).addReg(cL);
// ch := cmp src_hi, hi
BuildMI(mainMBB, DL, TII->get(X86::CMP32rr))
- .addReg(SrcHiReg).addReg(HiReg);
+ .addReg(SrcHiReg).addReg(t4H);
BuildMI(mainMBB, DL, TII->get(HiOpc), cH);
BuildMI(mainMBB, DL, TII->get(X86::MOVZX32rr8), cH32).addReg(cH);
// cc := if (src_hi == hi) ? cl : ch;
@@ -12807,58 +13382,74 @@ X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI,
}
BuildMI(mainMBB, DL, TII->get(X86::TEST32rr)).addReg(cc).addReg(cc);
if (Subtarget->hasCMov()) {
- BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t1L)
- .addReg(SrcLoReg).addReg(LoReg);
- BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t1H)
- .addReg(SrcHiReg).addReg(HiReg);
+ BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t2L)
+ .addReg(SrcLoReg).addReg(t4L);
+ BuildMI(mainMBB, DL, TII->get(X86::CMOVNE32rr), t2H)
+ .addReg(SrcHiReg).addReg(t4H);
} else {
- MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t1L)
- .addReg(SrcLoReg).addReg(LoReg)
+ MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t2L)
+ .addReg(SrcLoReg).addReg(t4L)
.addImm(X86::COND_NE);
mainMBB = EmitLoweredSelect(MIB, mainMBB);
- MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t1H)
- .addReg(SrcHiReg).addReg(HiReg)
+ // As the lowered CMOV won't clobber EFLAGS, we could reuse it for the
+ // 2nd CMOV lowering.
+ mainMBB->addLiveIn(X86::EFLAGS);
+ MIB = BuildMI(mainMBB, DL, TII->get(X86::CMOV_GR32), t2H)
+ .addReg(SrcHiReg).addReg(t4H)
.addImm(X86::COND_NE);
mainMBB = EmitLoweredSelect(MIB, mainMBB);
+ // Replace the original PHI node as mainMBB is changed after CMOV
+ // lowering.
+ BuildMI(*origMainMBB, PhiL, DL, TII->get(X86::PHI), t4L)
+ .addReg(t1L).addMBB(thisMBB).addReg(t3L).addMBB(mainMBB);
+ BuildMI(*origMainMBB, PhiH, DL, TII->get(X86::PHI), t4H)
+ .addReg(t1H).addMBB(thisMBB).addReg(t3H).addMBB(mainMBB);
+ PhiL->eraseFromParent();
+ PhiH->eraseFromParent();
}
break;
}
case X86::ATOMSWAP6432: {
unsigned HiOpc;
unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc);
- BuildMI(mainMBB, DL, TII->get(LoOpc), t1L).addReg(SrcLoReg);
- BuildMI(mainMBB, DL, TII->get(HiOpc), t1H).addReg(SrcHiReg);
+ BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(SrcLoReg);
+ BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(SrcHiReg);
break;
}
}
// Copy EDX:EAX back from HiReg:LoReg
- BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EAX).addReg(LoReg);
- BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EDX).addReg(HiReg);
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EAX).addReg(t4L);
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EDX).addReg(t4H);
// Copy ECX:EBX from t1H:t1L
- BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EBX).addReg(t1L);
- BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::ECX).addReg(t1H);
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EBX).addReg(t2L);
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::ECX).addReg(t2H);
MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc));
- for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
- MIB.addOperand(MI->getOperand(MemOpndSlot + i));
+ for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+ MachineOperand NewMO = MI->getOperand(MemOpndSlot + i);
+ if (NewMO.isReg())
+ NewMO.setIsKill(false);
+ MIB.addOperand(NewMO);
+ }
MIB.setMemRefs(MMOBegin, MMOEnd);
+ // Copy EDX:EAX back to t3H:t3L
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3L).addReg(X86::EAX);
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t3H).addReg(X86::EDX);
+
BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB);
mainMBB->addSuccessor(origMainMBB);
mainMBB->addSuccessor(sinkMBB);
// sinkMBB:
- sinkMBB->addLiveIn(X86::EAX);
- sinkMBB->addLiveIn(X86::EDX);
-
BuildMI(*sinkMBB, sinkMBB->begin(), DL,
TII->get(TargetOpcode::COPY), DstLoReg)
- .addReg(X86::EAX);
+ .addReg(t3L);
BuildMI(*sinkMBB, sinkMBB->begin(), DL,
TII->get(TargetOpcode::COPY), DstHiReg)
- .addReg(X86::EDX);
+ .addReg(t3H);
MI->eraseFromParent();
return sinkMBB;
@@ -14308,127 +14899,12 @@ static SDValue PerformShuffleCombine(SDNode *N, SelectionDAG &DAG,
return EltsFromConsecutiveLoads(VT, Elts, dl, DAG);
}
-
/// PerformTruncateCombine - Converts truncate operation to
/// a sequence of vector shuffle operations.
/// It is possible when we truncate 256-bit vector to 128-bit vector
static SDValue PerformTruncateCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
- if (!DCI.isBeforeLegalizeOps())
- return SDValue();
-
- if (!Subtarget->hasFp256())
- return SDValue();
-
- EVT VT = N->getValueType(0);
- SDValue Op = N->getOperand(0);
- EVT OpVT = Op.getValueType();
- DebugLoc dl = N->getDebugLoc();
-
- if ((VT == MVT::v4i32) && (OpVT == MVT::v4i64)) {
-
- if (Subtarget->hasInt256()) {
- // AVX2: v4i64 -> v4i32
-
- // VPERMD
- static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
-
- Op = DAG.getNode(ISD::BITCAST, dl, MVT::v8i32, Op);
- Op = DAG.getVectorShuffle(MVT::v8i32, dl, Op, DAG.getUNDEF(MVT::v8i32),
- ShufMask);
-
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Op,
- DAG.getIntPtrConstant(0));
- }
-
- // AVX: v4i64 -> v4i32
- SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
- DAG.getIntPtrConstant(0));
-
- SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
- DAG.getIntPtrConstant(2));
-
- OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo);
- OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi);
-
- // PSHUFD
- static const int ShufMask1[] = {0, 2, 0, 0};
-
- SDValue Undef = DAG.getUNDEF(VT);
- OpLo = DAG.getVectorShuffle(VT, dl, OpLo, Undef, ShufMask1);
- OpHi = DAG.getVectorShuffle(VT, dl, OpHi, Undef, ShufMask1);
-
- // MOVLHPS
- static const int ShufMask2[] = {0, 1, 4, 5};
-
- return DAG.getVectorShuffle(VT, dl, OpLo, OpHi, ShufMask2);
- }
-
- if ((VT == MVT::v8i16) && (OpVT == MVT::v8i32)) {
-
- if (Subtarget->hasInt256()) {
- // AVX2: v8i32 -> v8i16
-
- Op = DAG.getNode(ISD::BITCAST, dl, MVT::v32i8, Op);
-
- // PSHUFB
- SmallVector<SDValue,32> pshufbMask;
- for (unsigned i = 0; i < 2; ++i) {
- pshufbMask.push_back(DAG.getConstant(0x0, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(0x1, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(0x4, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(0x5, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(0x8, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(0x9, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(0xc, MVT::i8));
- pshufbMask.push_back(DAG.getConstant(0xd, MVT::i8));
- for (unsigned j = 0; j < 8; ++j)
- pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
- }
- SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v32i8,
- &pshufbMask[0], 32);
- Op = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v32i8, Op, BV);
-
- Op = DAG.getNode(ISD::BITCAST, dl, MVT::v4i64, Op);
-
- static const int ShufMask[] = {0, 2, -1, -1};
- Op = DAG.getVectorShuffle(MVT::v4i64, dl, Op, DAG.getUNDEF(MVT::v4i64),
- &ShufMask[0]);
-
- Op = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Op,
- DAG.getIntPtrConstant(0));
-
- return DAG.getNode(ISD::BITCAST, dl, VT, Op);
- }
-
- SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32, Op,
- DAG.getIntPtrConstant(0));
-
- SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i32, Op,
- DAG.getIntPtrConstant(4));
-
- OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpLo);
- OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, OpHi);
-
- // PSHUFB
- static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
- -1, -1, -1, -1, -1, -1, -1, -1};
-
- SDValue Undef = DAG.getUNDEF(MVT::v16i8);
- OpLo = DAG.getVectorShuffle(MVT::v16i8, dl, OpLo, Undef, ShufMask1);
- OpHi = DAG.getVectorShuffle(MVT::v16i8, dl, OpHi, Undef, ShufMask1);
-
- OpLo = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpLo);
- OpHi = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, OpHi);
-
- // MOVLHPS
- static const int ShufMask2[] = {0, 1, 4, 5};
-
- SDValue res = DAG.getVectorShuffle(MVT::v4i32, dl, OpLo, OpHi, ShufMask2);
- return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, res);
- }
-
return SDValue();
}
@@ -14623,6 +15099,76 @@ static SDValue PerformEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+/// \brief Matches a VSELECT onto min/max or returns 0 if the node doesn't match.
+static unsigned matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS,
+ SDValue RHS, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ if (!VT.isVector())
+ return 0;
+
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: return 0;
+ case MVT::v32i8:
+ case MVT::v16i16:
+ case MVT::v8i32:
+ if (!Subtarget->hasAVX2())
+ return 0;
+ case MVT::v16i8:
+ case MVT::v8i16:
+ case MVT::v4i32:
+ if (!Subtarget->hasSSE2())
+ return 0;
+ }
+
+ // SSE2 has only a small subset of the operations.
+ bool hasUnsigned = Subtarget->hasSSE41() ||
+ (Subtarget->hasSSE2() && VT == MVT::v16i8);
+ bool hasSigned = Subtarget->hasSSE41() ||
+ (Subtarget->hasSSE2() && VT == MVT::v8i16);
+
+ ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+
+ // Check for x CC y ? x : y.
+ if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
+ DAG.isEqualTo(RHS, Cond.getOperand(1))) {
+ switch (CC) {
+ default: break;
+ case ISD::SETULT:
+ case ISD::SETULE:
+ return hasUnsigned ? X86ISD::UMIN : 0;
+ case ISD::SETUGT:
+ case ISD::SETUGE:
+ return hasUnsigned ? X86ISD::UMAX : 0;
+ case ISD::SETLT:
+ case ISD::SETLE:
+ return hasSigned ? X86ISD::SMIN : 0;
+ case ISD::SETGT:
+ case ISD::SETGE:
+ return hasSigned ? X86ISD::SMAX : 0;
+ }
+ // Check for x CC y ? y : x -- a min/max with reversed arms.
+ } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
+ DAG.isEqualTo(RHS, Cond.getOperand(0))) {
+ switch (CC) {
+ default: break;
+ case ISD::SETULT:
+ case ISD::SETULE:
+ return hasUnsigned ? X86ISD::UMAX : 0;
+ case ISD::SETUGT:
+ case ISD::SETUGE:
+ return hasUnsigned ? X86ISD::UMIN : 0;
+ case ISD::SETLT:
+ case ISD::SETLE:
+ return hasSigned ? X86ISD::SMAX : 0;
+ case ISD::SETGT:
+ case ISD::SETGE:
+ return hasSigned ? X86ISD::SMIN : 0;
+ }
+ }
+
+ return 0;
+}
+
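As a minimal scalar sketch (assuming 32-bit lanes, not taken from the patch itself), the idiom matchIntegerMINMAX recognizes is a select whose condition compares the same two values that form its arms:

#include <cstdint>

// VSELECT(SETULT(x, y), x, y) corresponds to X86ISD::UMIN; swapping the
// select arms gives UMAX.
static inline uint32_t umin_idiom(uint32_t x, uint32_t y) {
  return x < y ? x : y;   // unsigned compare feeding the same operands
}

// VSELECT(SETGT(x, y), x, y) corresponds to X86ISD::SMAX for signed lanes.
static inline int32_t smax_idiom(int32_t x, int32_t y) {
  return x > y ? x : y;
}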
/// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
/// nodes.
static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
@@ -14903,6 +15449,67 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
}
}
+ // Match VSELECTs into subs with unsigned saturation.
+ if (!DCI.isBeforeLegalize() &&
+ N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
+ // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
+ ((Subtarget->hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
+ (Subtarget->hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
+ ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+
+ // Check if one of the arms of the VSELECT is a zero vector. If it's on the
+ // left side invert the predicate to simplify logic below.
+ SDValue Other;
+ if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
+ Other = RHS;
+ CC = ISD::getSetCCInverse(CC, true);
+ } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
+ Other = LHS;
+ }
+
+ if (Other.getNode() && Other->getNumOperands() == 2 &&
+ DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
+ SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
+ SDValue CondRHS = Cond->getOperand(1);
+
+ // Look for a general sub with unsigned saturation first.
+ // x >= y ? x-y : 0 --> subus x, y
+ // x > y ? x-y : 0 --> subus x, y
+ if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
+ Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
+ return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
+
+ // If the RHS is a constant we have to reverse the const canonicalization.
+ // x > C-1 ? x+-C : 0 --> subus x, C
+ if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
+ isSplatVector(CondRHS.getNode()) && isSplatVector(OpRHS.getNode())) {
+ APInt A = cast<ConstantSDNode>(OpRHS.getOperand(0))->getAPIntValue();
+ if (CondRHS.getConstantOperandVal(0) == -A-1)
+ return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS,
+ DAG.getConstant(-A, VT));
+ }
+
+ // Another special case: If C was a sign bit, the sub has been
+ // canonicalized into a xor.
+ // FIXME: Would it be better to use ComputeMaskedBits to determine whether
+ // it's safe to decanonicalize the xor?
+ // x s< 0 ? x^C : 0 --> subus x, C
+ if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
+ ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
+ isSplatVector(OpRHS.getNode())) {
+ APInt A = cast<ConstantSDNode>(OpRHS.getOperand(0))->getAPIntValue();
+ if (A.isSignBit())
+ return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
+ }
+ }
+ }
+
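A scalar model of the saturating-subtract pattern matched above, assuming 8-bit lanes; per element this is exactly what psubus (PSUBUSB) computes, and x >= y and x > y give the same result since x == y yields 0 either way:

#include <cstdint>

// x > y ? x - y : 0 -- subtraction clamped at zero, i.e. unsigned saturation.
static inline uint8_t subus8(uint8_t x, uint8_t y) {
  return x > y ? static_cast<uint8_t>(x - y) : 0;
}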
+ // Try to match a min/max vector operation.
+ if (!DCI.isBeforeLegalize() &&
+ N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC)
+ if (unsigned Op = matchIntegerMINMAX(Cond, VT, LHS, RHS, DAG, Subtarget))
+ return DAG.getNode(Op, DL, N->getValueType(0), LHS, RHS);
+
// If we know that this node is legal then we know that it is going to be
// matched by one of the SSE/AVX BLEND instructions. These instructions only
// depend on the highest bit in each word. Try to use SimplifyDemandedBits
@@ -15179,7 +15786,7 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
ConstantSDNode *CmpAgainst = 0;
if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
(CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
- dyn_cast<ConstantSDNode>(Cond.getOperand(0)) == 0) {
+ !isa<ConstantSDNode>(Cond.getOperand(0))) {
if (CC == X86::COND_NE &&
CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
@@ -15200,7 +15807,6 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-
/// PerformMulCombine - Optimize a single multiply with constant into two
/// in order to implement it with two cheaper instructions, e.g.
/// LEA + SHL, LEA + LEA.
@@ -15289,7 +15895,6 @@ static SDValue PerformSHLCombine(SDNode *N, SelectionDAG &DAG) {
}
}
-
// Hardware support for vector shifts is sparse which makes us scalarize the
// vector operations in many cases. Also, on sandybridge ADD is faster than
// shl.
@@ -15433,7 +16038,6 @@ static SDValue PerformShiftCombine(SDNode* N, SelectionDAG &DAG,
}
}
-
// CMPEQCombine - Recognize the distinctive (AND (setcc ...) (setcc ..))
// where both setccs reference the same FP CMP, and rewrite for CMPEQSS
// and friends. Likewise for OR -> CMPNEQSS.
@@ -15462,8 +16066,7 @@ static SDValue CMPEQCombine(SDNode *N, SelectionDAG &DAG,
if (VT == MVT::f32 || VT == MVT::f64) {
bool ExpectingFlags = false;
// Check for any users that want flags:
- for (SDNode::use_iterator UI = N->use_begin(),
- UE = N->use_end();
+ for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
!ExpectingFlags && UI != UE; ++UI)
switch (UI->getOpcode()) {
default:
@@ -15542,9 +16145,92 @@ static bool CanFoldXORWithAllOnes(const SDNode *N) {
return false;
}
+// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM-sized
+// register. In most cases we actually compare or select YMM-sized registers
+// and mixing the two types creates horrible code. This method optimizes
+// some of the transition sequences.
+static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget *Subtarget) {
+ EVT VT = N->getValueType(0);
+ if (!VT.is256BitVector())
+ return SDValue();
+
+ assert((N->getOpcode() == ISD::ANY_EXTEND ||
+ N->getOpcode() == ISD::ZERO_EXTEND ||
+ N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
+
+ SDValue Narrow = N->getOperand(0);
+ EVT NarrowVT = Narrow->getValueType(0);
+ if (!NarrowVT.is128BitVector())
+ return SDValue();
+
+ if (Narrow->getOpcode() != ISD::XOR &&
+ Narrow->getOpcode() != ISD::AND &&
+ Narrow->getOpcode() != ISD::OR)
+ return SDValue();
+
+ SDValue N0 = Narrow->getOperand(0);
+ SDValue N1 = Narrow->getOperand(1);
+ DebugLoc DL = Narrow->getDebugLoc();
+
+ // The left side has to be a trunc.
+ if (N0.getOpcode() != ISD::TRUNCATE)
+ return SDValue();
+
+ // The type of the truncated inputs.
+ EVT WideVT = N0->getOperand(0)->getValueType(0);
+ if (WideVT != VT)
+ return SDValue();
+
+ // The right side has to be a 'trunc' or a constant vector.
+ bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
+ bool RHSConst = (isSplatVector(N1.getNode()) &&
+ isa<ConstantSDNode>(N1->getOperand(0)));
+ if (!RHSTrunc && !RHSConst)
+ return SDValue();
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
+ return SDValue();
+
+ // Set N0 and N1 to hold the inputs to the new wide operation.
+ N0 = N0->getOperand(0);
+ if (RHSConst) {
+ N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(),
+ N1->getOperand(0));
+ SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1);
+ N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, &C[0], C.size());
+ } else if (RHSTrunc) {
+ N1 = N1->getOperand(0);
+ }
+
+ // Generate the wide operation.
+ SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
+ unsigned Opcode = N->getOpcode();
+ switch (Opcode) {
+ case ISD::ANY_EXTEND:
+ return Op;
+ case ISD::ZERO_EXTEND: {
+ unsigned InBits = NarrowVT.getScalarType().getSizeInBits();
+ APInt Mask = APInt::getAllOnesValue(InBits);
+ Mask = Mask.zext(VT.getScalarType().getSizeInBits());
+ return DAG.getNode(ISD::AND, DL, VT,
+ Op, DAG.getConstant(Mask, VT));
+ }
+ case ISD::SIGN_EXTEND:
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
+ Op, DAG.getValueType(NarrowVT));
+ default:
+ llvm_unreachable("Unexpected opcode");
+ }
+}
+
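The rewrite relies on bitwise operations commuting with truncation, so the AND/OR/XOR can be performed in the wide type and the extend of the narrow result folded away. A minimal scalar check of that identity, assuming a 64-to-32-bit narrowing:

#include <cassert>
#include <cstdint>

// AND performed on the truncated values equals the truncation of the AND
// performed on the wide values; the same holds for OR and XOR.
static void widen_identity(uint64_t a, uint64_t b) {
  uint32_t narrow = static_cast<uint32_t>(a) & static_cast<uint32_t>(b);
  uint64_t wide = a & b;
  assert(narrow == static_cast<uint32_t>(wide));
}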
static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
+ EVT VT = N->getValueType(0);
if (DCI.isBeforeLegalizeOps())
return SDValue();
@@ -15552,9 +16238,7 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
if (R.getNode())
return R;
- EVT VT = N->getValueType(0);
-
- // Create ANDN, BLSI, and BLSR instructions
+ // Create BLSI and BLSR instructions
// BLSI is X & (-X)
// BLSR is X & (X-1)
if (Subtarget->hasBMI() && (VT == MVT::i32 || VT == MVT::i64)) {
@@ -15562,13 +16246,6 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
SDValue N1 = N->getOperand(1);
DebugLoc DL = N->getDebugLoc();
- // Check LHS for not
- if (N0.getOpcode() == ISD::XOR && isAllOnes(N0.getOperand(1)))
- return DAG.getNode(X86ISD::ANDN, DL, VT, N0.getOperand(0), N1);
- // Check RHS for not
- if (N1.getOpcode() == ISD::XOR && isAllOnes(N1.getOperand(1)))
- return DAG.getNode(X86ISD::ANDN, DL, VT, N1.getOperand(0), N0);
-
// Check LHS for neg
if (N0.getOpcode() == ISD::SUB && N0.getOperand(1) == N1 &&
isZero(N0.getOperand(0)))
@@ -15621,6 +16298,7 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
+ EVT VT = N->getValueType(0);
if (DCI.isBeforeLegalizeOps())
return SDValue();
@@ -15628,8 +16306,6 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
if (R.getNode())
return R;
- EVT VT = N->getValueType(0);
-
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
@@ -15682,11 +16358,6 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
DebugLoc DL = N->getDebugLoc();
- // We are going to replace the AND, OR, NAND with either BLEND
- // or PSIGN, which only look at the MSB. The VSRAI instruction
- // does not affect the highest bit, so we can get rid of it.
- Mask = Mask.getOperand(0);
-
// Now we know we at least have a plendvb with the mask val. See if
// we can form a psignb/w/d.
// psign = x.type == y.type == mask.type && y = sub(0, x);
@@ -15695,7 +16366,7 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG,
X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
"Unsupported VT for PSIGN");
- Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask);
+ Mask = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X, Mask.getOperand(0));
return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
}
// PBLENDVB only available on SSE 4.1
@@ -15809,6 +16480,7 @@ static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
+ EVT VT = N->getValueType(0);
if (DCI.isBeforeLegalizeOps())
return SDValue();
@@ -15822,8 +16494,6 @@ static SDValue PerformXorCombine(SDNode *N, SelectionDAG &DAG,
if (!Subtarget->hasBMI())
return SDValue();
- EVT VT = N->getValueType(0);
-
if (VT != MVT::i32 && VT != MVT::i64)
return SDValue();
@@ -15854,23 +16524,62 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
EVT MemVT = Ld->getMemoryVT();
DebugLoc dl = Ld->getDebugLoc();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ unsigned RegSz = RegVT.getSizeInBits();
ISD::LoadExtType Ext = Ld->getExtensionType();
+ unsigned Alignment = Ld->getAlignment();
+ bool IsAligned = Alignment == 0 || Alignment == MemVT.getSizeInBits()/8;
+
+ // On Sandy Bridge, unaligned 256-bit loads are inefficient.
+ if (RegVT.is256BitVector() && !Subtarget->hasInt256() &&
+ !DCI.isBeforeLegalizeOps() && !IsAligned && Ext == ISD::NON_EXTLOAD) {
+ unsigned NumElems = RegVT.getVectorNumElements();
+ if (NumElems < 2)
+ return SDValue();
+
+ SDValue Ptr = Ld->getBasePtr();
+ SDValue Increment = DAG.getConstant(16, TLI.getPointerTy());
+
+ EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
+ NumElems/2);
+ SDValue Load1 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
+ Ld->getPointerInfo(), Ld->isVolatile(),
+ Ld->isNonTemporal(), Ld->isInvariant(),
+ Alignment);
+ Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
+ SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr,
+ Ld->getPointerInfo(), Ld->isVolatile(),
+ Ld->isNonTemporal(), Ld->isInvariant(),
+ std::max(Alignment/2U, 1U));
+ SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ Load1.getValue(1),
+ Load2.getValue(1));
+
+ SDValue NewVec = DAG.getUNDEF(RegVT);
+ NewVec = Insert128BitVector(NewVec, Load1, 0, DAG, dl);
+ NewVec = Insert128BitVector(NewVec, Load2, NumElems/2, DAG, dl);
+ return DCI.CombineTo(N, NewVec, TF, true);
+ }
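Roughly, the split produced above corresponds to the following intrinsics-level sketch, assuming a 32-byte unaligned source and the standard SSE2/AVX intrinsics:

#include <immintrin.h>

// Two 16-byte unaligned loads plus an insert, instead of one 32-byte vmovdqu.
static __m256i load32_split(const void *p) {
  __m128i lo = _mm_loadu_si128(reinterpret_cast<const __m128i *>(p));
  __m128i hi = _mm_loadu_si128(
      reinterpret_cast<const __m128i *>(static_cast<const char *>(p) + 16));
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
}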
// If this is a vector EXT Load then attempt to optimize it using a
- // shuffle. We need SSSE3 shuffles.
+ // shuffle. If SSSE3 is not available we may emit an illegal shuffle but the
+ // expansion is still better than scalar code.
+ // We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise we'll
+ // emit a shuffle and an arithmetic shift.
// TODO: It is possible to support ZExt by zeroing the undef values
// during the shuffle phase or after the shuffle.
- if (RegVT.isVector() && RegVT.isInteger() &&
- Ext == ISD::EXTLOAD && Subtarget->hasSSSE3()) {
+ if (RegVT.isVector() && RegVT.isInteger() && Subtarget->hasSSE2() &&
+ (Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)) {
assert(MemVT != RegVT && "Cannot extend to the same type");
assert(MemVT.isVector() && "Must load a vector from memory");
unsigned NumElems = RegVT.getVectorNumElements();
- unsigned RegSz = RegVT.getSizeInBits();
unsigned MemSz = MemVT.getSizeInBits();
assert(RegSz > MemSz && "Register size must be greater than the mem size");
+ if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget->hasInt256())
+ return SDValue();
+
// All sizes must be a power of two.
if (!isPowerOf2_32(RegSz * MemSz * NumElems))
return SDValue();
@@ -15894,16 +16603,23 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
// Calculate the number of scalar loads that we need to perform
// in order to load our vector from memory.
unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
+ if (Ext == ISD::SEXTLOAD && NumLoads > 1)
+ return SDValue();
+
+ unsigned loadRegZize = RegSz;
+ if (Ext == ISD::SEXTLOAD && RegSz == 256)
+ loadRegZize /= 2;
// Represent our vector as a sequence of elements which are the
// largest scalar that we can load.
EVT LoadUnitVecVT = EVT::getVectorVT(*DAG.getContext(), SclrLoadTy,
- RegSz/SclrLoadTy.getSizeInBits());
+ loadRegZize/SclrLoadTy.getSizeInBits());
// Represent the data using the same element type that is stored in
// memory. In practice, we ''widen'' MemVT.
- EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
- RegSz/MemVT.getScalarType().getSizeInBits());
+ EVT WideVecVT =
+ EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
+ loadRegZize/MemVT.getScalarType().getSizeInBits());
assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
"Invalid vector type");
@@ -15944,6 +16660,39 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
SDValue SlicedVec = DAG.getNode(ISD::BITCAST, dl, WideVecVT, Res);
unsigned SizeRatio = RegSz/MemSz;
+ if (Ext == ISD::SEXTLOAD) {
+ // If we have SSE4.1 we can directly emit a VSEXT node.
+ if (Subtarget->hasSSE41()) {
+ SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
+ return DCI.CombineTo(N, Sext, TF, true);
+ }
+
+ // Otherwise we'll shuffle the small elements into the high bits of the
+ // larger type and perform an arithmetic shift. If the shift is not legal
+ // it's better to scalarize.
+ if (!TLI.isOperationLegalOrCustom(ISD::SRA, RegVT))
+ return SDValue();
+
+ // Redistribute the loaded elements into the different locations.
+ SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
+ for (unsigned i = 0; i != NumElems; ++i)
+ ShuffleVec[i*SizeRatio + SizeRatio-1] = i;
+
+ SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
+ DAG.getUNDEF(WideVecVT),
+ &ShuffleVec[0]);
+
+ Shuff = DAG.getNode(ISD::BITCAST, dl, RegVT, Shuff);
+
+ // Build the arithmetic shift.
+ unsigned Amt = RegVT.getVectorElementType().getSizeInBits() -
+ MemVT.getVectorElementType().getSizeInBits();
+ Shuff = DAG.getNode(ISD::SRA, dl, RegVT, Shuff,
+ DAG.getConstant(Amt, RegVT));
+
+ return DCI.CombineTo(N, Shuff, TF, true);
+ }
+
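The shuffle-plus-SRA fallback is the vector form of the usual scalar sign-extension trick; a minimal sketch, assuming an 8-bit element widened to 32 bits and the arithmetic right-shift behaviour of x86 compilers:

#include <cstdint>

// Place the narrow value in the high bits of the wide lane, then use an
// arithmetic right shift to replicate its sign bit across the top.
static inline int32_t sext8to32(uint8_t byte) {
  int32_t wide = static_cast<int32_t>(static_cast<uint32_t>(byte) << 24);
  return wide >> 24;  // arithmetic shift on signed types
}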
// Redistribute the loaded elements into the different locations.
SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
for (unsigned i = 0; i != NumElems; ++i)
@@ -15972,16 +16721,21 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
DebugLoc dl = St->getDebugLoc();
SDValue StoredVal = St->getOperand(1);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ unsigned Alignment = St->getAlignment();
+ bool IsAligned = Alignment == 0 || Alignment == VT.getSizeInBits()/8;
// If we are saving a concatenation of two XMM registers, perform two stores.
// On Sandy Bridge, 256-bit memory operations are executed by two
// 128-bit ports. However, on Haswell it is better to issue a single 256-bit
// memory operation.
if (VT.is256BitVector() && !Subtarget->hasInt256() &&
- StoredVal.getNode()->getOpcode() == ISD::CONCAT_VECTORS &&
- StoredVal.getNumOperands() == 2) {
- SDValue Value0 = StoredVal.getOperand(0);
- SDValue Value1 = StoredVal.getOperand(1);
+ StVT == VT && !IsAligned) {
+ unsigned NumElems = VT.getVectorNumElements();
+ if (NumElems < 2)
+ return SDValue();
+
+ SDValue Value0 = Extract128BitVector(StoredVal, 0, DAG, dl);
+ SDValue Value1 = Extract128BitVector(StoredVal, NumElems/2, DAG, dl);
SDValue Stride = DAG.getConstant(16, TLI.getPointerTy());
SDValue Ptr0 = St->getBasePtr();
@@ -15989,10 +16743,11 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
SDValue Ch0 = DAG.getStore(St->getChain(), dl, Value0, Ptr0,
St->getPointerInfo(), St->isVolatile(),
- St->isNonTemporal(), St->getAlignment());
+ St->isNonTemporal(), Alignment);
SDValue Ch1 = DAG.getStore(St->getChain(), dl, Value1, Ptr1,
St->getPointerInfo(), St->isVolatile(),
- St->isNonTemporal(), St->getAlignment());
+ St->isNonTemporal(),
+ std::max(Alignment/2U, 1U));
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
}
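The store side mirrors the load split; an intrinsics-level sketch, again assuming a 32-byte unaligned destination and standard AVX intrinsic names:

#include <immintrin.h>

// Extract the two 16-byte halves and issue two unaligned 16-byte stores.
static void store32_split(void *p, __m256i v) {
  _mm_storeu_si128(reinterpret_cast<__m128i *>(p),
                   _mm256_castsi256_si128(v));
  _mm_storeu_si128(reinterpret_cast<__m128i *>(static_cast<char *>(p) + 16),
                   _mm256_extractf128_si256(v, 1));
}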
@@ -16077,7 +16832,6 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
Chains.size());
}
-
// Turn load->store of MMX types into GPR load/stores. This avoids clobbering
// the FP state in cases where an emms may be missing.
// A preferable solution to the general problem is to figure out the right
@@ -16088,8 +16842,8 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
const Function *F = DAG.getMachineFunction().getFunction();
- bool NoImplicitFloatOps = F->getFnAttributes().
- hasAttribute(Attributes::NoImplicitFloat);
+ bool NoImplicitFloatOps = F->getAttributes().
+ hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
bool F64IsLegal = !DAG.getTarget().Options.UseSoftFloat && !NoImplicitFloatOps
&& Subtarget->hasSSE2();
if ((VT.isVector() ||
@@ -16383,7 +17137,6 @@ static SDValue PerformFMinFMaxCombine(SDNode *N, SelectionDAG &DAG) {
N->getOperand(0), N->getOperand(1));
}
-
/// PerformFANDCombine - Do target-specific dag combines on X86ISD::FAND nodes.
static SDValue PerformFANDCombine(SDNode *N, SelectionDAG &DAG) {
// FAND(0.0, x) -> 0.0
@@ -16429,6 +17182,41 @@ static SDValue PerformVZEXT_MOVLCombine(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
+static SDValue PerformSIGN_EXTEND_INREGCombine(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget *Subtarget) {
+ EVT VT = N->getValueType(0);
+ if (!VT.isVector())
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
+ DebugLoc dl = N->getDebugLoc();
+
+ // The SIGN_EXTEND_INREG to v4i64 is an expensive operation on both SSE and
+ // AVX2 since there is no sign-extended shift-right operation on a vector
+ // with 64-bit elements.
+ // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
+ // (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
+ if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
+ N0.getOpcode() == ISD::SIGN_EXTEND)) {
+ SDValue N00 = N0.getOperand(0);
+
+ // EXTLOAD has a better solution on AVX2: it may be replaced with an
+ // X86ISD::VSEXT node.
+ if (N00.getOpcode() == ISD::LOAD && Subtarget->hasInt256())
+ if (!ISD::isNormalLoad(N00.getNode()))
+ return SDValue();
+
+ if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
+ SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
+ N00, N1);
+ return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
+ }
+ }
+ return SDValue();
+}
+
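A scalar check of the equivalence this rewrite depends on, assuming ExtraVT is i16: sign-extending the 16-bit field at 32 bits and then widening to 64 bits matches sign-extending it directly to 64 bits.

#include <cassert>
#include <cstdint>

static void sext_inreg_equivalence(uint32_t x) {
  int64_t direct   = static_cast<int64_t>(static_cast<int16_t>(x));
  int64_t two_step = static_cast<int64_t>(
      static_cast<int32_t>(static_cast<int16_t>(x)));
  assert(direct == two_step);
}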
static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget *Subtarget) {
@@ -16439,48 +17227,12 @@ static SDValue PerformSExtCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
EVT VT = N->getValueType(0);
- SDValue Op = N->getOperand(0);
- EVT OpVT = Op.getValueType();
- DebugLoc dl = N->getDebugLoc();
-
- if ((VT == MVT::v4i64 && OpVT == MVT::v4i32) ||
- (VT == MVT::v8i32 && OpVT == MVT::v8i16)) {
-
- if (Subtarget->hasInt256())
- return DAG.getNode(X86ISD::VSEXT_MOVL, dl, VT, Op);
-
- // Optimize vectors in AVX mode
- // Sign extend v8i16 to v8i32 and
- // v4i32 to v4i64
- //
- // Divide input vector into two parts
- // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
- // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
- // concat the vectors to original VT
-
- unsigned NumElems = OpVT.getVectorNumElements();
- SDValue Undef = DAG.getUNDEF(OpVT);
-
- SmallVector<int,8> ShufMask1(NumElems, -1);
- for (unsigned i = 0; i != NumElems/2; ++i)
- ShufMask1[i] = i;
-
- SDValue OpLo = DAG.getVectorShuffle(OpVT, dl, Op, Undef, &ShufMask1[0]);
-
- SmallVector<int,8> ShufMask2(NumElems, -1);
- for (unsigned i = 0; i != NumElems/2; ++i)
- ShufMask2[i] = i + NumElems/2;
-
- SDValue OpHi = DAG.getVectorShuffle(OpVT, dl, Op, Undef, &ShufMask2[0]);
-
- EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), VT.getScalarType(),
- VT.getVectorNumElements()/2);
-
- OpLo = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpLo);
- OpHi = DAG.getNode(X86ISD::VSEXT_MOVL, dl, HalfVT, OpHi);
-
- return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
+ if (VT.isVector() && VT.getSizeInBits() == 256) {
+ SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
+ if (R.getNode())
+ return R;
}
+
return SDValue();
}
@@ -16534,58 +17286,26 @@ static SDValue PerformZExtCombine(SDNode *N, SelectionDAG &DAG,
DebugLoc dl = N->getDebugLoc();
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
- EVT OpVT = N0.getValueType();
if (N0.getOpcode() == ISD::AND &&
N0.hasOneUse() &&
N0.getOperand(0).hasOneUse()) {
SDValue N00 = N0.getOperand(0);
- if (N00.getOpcode() != X86ISD::SETCC_CARRY)
- return SDValue();
- ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
- if (!C || C->getZExtValue() != 1)
- return SDValue();
- return DAG.getNode(ISD::AND, dl, VT,
- DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
- N00.getOperand(0), N00.getOperand(1)),
- DAG.getConstant(1, VT));
+ if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+ if (!C || C->getZExtValue() != 1)
+ return SDValue();
+ return DAG.getNode(ISD::AND, dl, VT,
+ DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
+ N00.getOperand(0), N00.getOperand(1)),
+ DAG.getConstant(1, VT));
+ }
}
- // Optimize vectors in AVX mode:
- //
- // v8i16 -> v8i32
- // Use vpunpcklwd for 4 lower elements v8i16 -> v4i32.
- // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
- // Concat upper and lower parts.
- //
- // v4i32 -> v4i64
- // Use vpunpckldq for 4 lower elements v4i32 -> v2i64.
- // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
- // Concat upper and lower parts.
- //
- if (!DCI.isBeforeLegalizeOps())
- return SDValue();
-
- if (!Subtarget->hasFp256())
- return SDValue();
-
- if (((VT == MVT::v8i32) && (OpVT == MVT::v8i16)) ||
- ((VT == MVT::v4i64) && (OpVT == MVT::v4i32))) {
-
- if (Subtarget->hasInt256())
- return DAG.getNode(X86ISD::VZEXT_MOVL, dl, VT, N0);
-
- SDValue ZeroVec = getZeroVector(OpVT, Subtarget, DAG, dl);
- SDValue OpLo = getUnpackl(DAG, dl, OpVT, N0, ZeroVec);
- SDValue OpHi = getUnpackh(DAG, dl, OpVT, N0, ZeroVec);
-
- EVT HVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
- VT.getVectorNumElements()/2);
-
- OpLo = DAG.getNode(ISD::BITCAST, dl, HVT, OpLo);
- OpHi = DAG.getNode(ISD::BITCAST, dl, HVT, OpHi);
-
- return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
+ if (VT.is256BitVector()) {
+ SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget);
+ if (R.getNode())
+ return R;
}
return SDValue();
@@ -16617,8 +17337,8 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
-// Helper function of PerformSETCCCombine. It is to materialize "setb reg"
-// as "sbb reg,reg", since it can be extended without zext and produces
+// Helper function of PerformSETCCCombine. It is to materialize "setb reg"
+// as "sbb reg,reg", since it can be extended without zext and produces
// an all-ones bit which is more useful than 0/1 in some cases.
static SDValue MaterializeSETB(DebugLoc DL, SDValue EFLAGS, SelectionDAG &DAG) {
return DAG.getNode(ISD::AND, DL, MVT::i8,
@@ -16636,13 +17356,13 @@ static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
SDValue EFLAGS = N->getOperand(1);
if (CC == X86::COND_A) {
- // Try to convert COND_A into COND_B in an attempt to facilitate
+ // Try to convert COND_A into COND_B in an attempt to facilitate
// materializing "setb reg".
//
// Do not flip "e > c", where "c" is a constant, because Cmp instruction
// cannot take an immediate as its first operand.
//
- if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
+ if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
EFLAGS.getValueType().isInteger() &&
!isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
SDValue NewSub = DAG.getNode(X86ISD::SUB, EFLAGS.getDebugLoc(),
@@ -16850,7 +17570,8 @@ static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG,
if (In.getOpcode() != X86ISD::VZEXT)
return SDValue();
- return DAG.getNode(X86ISD::VZEXT, N->getDebugLoc(), N->getValueType(0), In.getOperand(0));
+ return DAG.getNode(X86ISD::VZEXT, N->getDebugLoc(), N->getValueType(0),
+ In.getOperand(0));
}
SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
@@ -16888,13 +17609,14 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::ANY_EXTEND:
case ISD::ZERO_EXTEND: return PerformZExtCombine(N, DAG, DCI, Subtarget);
case ISD::SIGN_EXTEND: return PerformSExtCombine(N, DAG, DCI, Subtarget);
+ case ISD::SIGN_EXTEND_INREG: return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
case ISD::TRUNCATE: return PerformTruncateCombine(N, DAG,DCI,Subtarget);
case ISD::SETCC: return PerformISDSETCCCombine(N, DAG);
case X86ISD::SETCC: return PerformSETCCCombine(N, DAG, DCI, Subtarget);
case X86ISD::BRCOND: return PerformBrCondCombine(N, DAG, DCI, Subtarget);
case X86ISD::VZEXT: return performVZEXTCombine(N, DAG, DCI, Subtarget);
case X86ISD::SHUFP: // Handle all target specific shuffles
- case X86ISD::PALIGN:
+ case X86ISD::PALIGNR:
case X86ISD::UNPCKH:
case X86ISD::UNPCKL:
case X86ISD::MOVHLPS:
@@ -17077,7 +17799,7 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
AsmPieces.clear();
const std::string &ConstraintsStr = IA->getConstraintString();
SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
- std::sort(AsmPieces.begin(), AsmPieces.end());
+ array_pod_sort(AsmPieces.begin(), AsmPieces.end());
if (AsmPieces.size() == 4 &&
AsmPieces[0] == "~{cc}" &&
AsmPieces[1] == "~{dirflag}" &&
@@ -17095,7 +17817,7 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
AsmPieces.clear();
const std::string &ConstraintsStr = IA->getConstraintString();
SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
- std::sort(AsmPieces.begin(), AsmPieces.end());
+ array_pod_sort(AsmPieces.begin(), AsmPieces.end());
if (AsmPieces.size() == 4 &&
AsmPieces[0] == "~{cc}" &&
AsmPieces[1] == "~{dirflag}" &&
@@ -17121,8 +17843,6 @@ bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
return false;
}
-
-
/// getConstraintType - Given a constraint letter, return the type of
/// constraint it is for this target.
X86TargetLowering::ConstraintType
@@ -17199,13 +17919,13 @@ TargetLowering::ConstraintWeight
case 'f':
case 't':
case 'u':
- if (type->isFloatingPointTy())
- weight = CW_SpecificReg;
- break;
+ if (type->isFloatingPointTy())
+ weight = CW_SpecificReg;
+ break;
case 'y':
- if (type->isX86_MMXTy() && Subtarget->hasMMX())
- weight = CW_SpecificReg;
- break;
+ if (type->isX86_MMXTy() && Subtarget->hasMMX())
+ weight = CW_SpecificReg;
+ break;
case 'x':
case 'Y':
if (((type->getPrimitiveSizeInBits() == 128) && Subtarget->hasSSE1()) ||
@@ -17577,7 +18297,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
// really want an 8-bit or 32-bit register, map to the appropriate register
// class and return the appropriate register.
if (Res.second == &X86::GR16RegClass) {
- if (VT == MVT::i8) {
+ if (VT == MVT::i8 || VT == MVT::i1) {
unsigned DestReg = 0;
switch (Res.first) {
default: break;
@@ -17590,7 +18310,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
Res.first = DestReg;
Res.second = &X86::GR8RegClass;
}
- } else if (VT == MVT::i32) {
+ } else if (VT == MVT::i32 || VT == MVT::f32) {
unsigned DestReg = 0;
switch (Res.first) {
default: break;
@@ -17607,7 +18327,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
Res.first = DestReg;
Res.second = &X86::GR32RegClass;
}
- } else if (VT == MVT::i64) {
+ } else if (VT == MVT::i64 || VT == MVT::f64) {
unsigned DestReg = 0;
switch (Res.first) {
default: break;
@@ -17645,218 +18365,3 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
return Res;
}
-
-//===----------------------------------------------------------------------===//
-//
-// X86 cost model.
-//
-//===----------------------------------------------------------------------===//
-
-struct X86CostTblEntry {
- int ISD;
- MVT Type;
- unsigned Cost;
-};
-
-static int
-FindInTable(const X86CostTblEntry *Tbl, unsigned len, int ISD, MVT Ty) {
- for (unsigned int i = 0; i < len; ++i)
- if (Tbl[i].ISD == ISD && Tbl[i].Type == Ty)
- return i;
-
- // Could not find an entry.
- return -1;
-}
-
-struct X86TypeConversionCostTblEntry {
- int ISD;
- MVT Dst;
- MVT Src;
- unsigned Cost;
-};
-
-static int
-FindInConvertTable(const X86TypeConversionCostTblEntry *Tbl, unsigned len,
- int ISD, MVT Dst, MVT Src) {
- for (unsigned int i = 0; i < len; ++i)
- if (Tbl[i].ISD == ISD && Tbl[i].Src == Src && Tbl[i].Dst == Dst)
- return i;
-
- // Could not find an entry.
- return -1;
-}
-
-ScalarTargetTransformInfo::PopcntHwSupport
-X86ScalarTargetTransformImpl::getPopcntHwSupport(unsigned TyWidth) const {
- assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
- const X86Subtarget &ST = TLI->getTargetMachine().getSubtarget<X86Subtarget>();
-
- // TODO: Currently the __builtin_popcount() implementation using SSE3
- // instructions is inefficient. Once the problem is fixed, we should
- // call ST.hasSSE3() instead of ST.hasSSE4().
- return ST.hasSSE41() ? Fast : None;
-}
-
-unsigned
-X86VectorTargetTransformInfo::getArithmeticInstrCost(unsigned Opcode,
- Type *Ty) const {
- // Legalize the type.
- std::pair<unsigned, MVT> LT = getTypeLegalizationCost(Ty);
-
- int ISD = InstructionOpcodeToISD(Opcode);
- assert(ISD && "Invalid opcode");
-
- const X86Subtarget &ST = TLI->getTargetMachine().getSubtarget<X86Subtarget>();
-
- static const X86CostTblEntry AVX1CostTable[] = {
- // We don't have to scalarize unsupported ops. We can issue two half-sized
- // operations and we only need to extract the upper YMM half.
- // Two ops + 1 extract + 1 insert = 4.
- { ISD::MUL, MVT::v8i32, 4 },
- { ISD::SUB, MVT::v8i32, 4 },
- { ISD::ADD, MVT::v8i32, 4 },
- { ISD::MUL, MVT::v4i64, 4 },
- { ISD::SUB, MVT::v4i64, 4 },
- { ISD::ADD, MVT::v4i64, 4 },
- };
-
- // Look for AVX1 lowering tricks.
- if (ST.hasAVX()) {
- int Idx = FindInTable(AVX1CostTable, array_lengthof(AVX1CostTable), ISD,
- LT.second);
- if (Idx != -1)
- return LT.first * AVX1CostTable[Idx].Cost;
- }
- // Fallback to the default implementation.
- return VectorTargetTransformImpl::getArithmeticInstrCost(Opcode, Ty);
-}
-
-unsigned
-X86VectorTargetTransformInfo::getVectorInstrCost(unsigned Opcode, Type *Val,
- unsigned Index) const {
- assert(Val->isVectorTy() && "This must be a vector type");
-
- if (Index != -1U) {
- // Legalize the type.
- std::pair<unsigned, MVT> LT = getTypeLegalizationCost(Val);
-
- // This type is legalized to a scalar type.
- if (!LT.second.isVector())
- return 0;
-
- // The type may be split. Normalize the index to the new type.
- unsigned Width = LT.second.getVectorNumElements();
- Index = Index % Width;
-
- // Floating point scalars are already located in index #0.
- if (Val->getScalarType()->isFloatingPointTy() && Index == 0)
- return 0;
- }
-
- return VectorTargetTransformImpl::getVectorInstrCost(Opcode, Val, Index);
-}
-
-unsigned X86VectorTargetTransformInfo::getCmpSelInstrCost(unsigned Opcode,
- Type *ValTy,
- Type *CondTy) const {
- // Legalize the type.
- std::pair<unsigned, MVT> LT = getTypeLegalizationCost(ValTy);
-
- MVT MTy = LT.second;
-
- int ISD = InstructionOpcodeToISD(Opcode);
- assert(ISD && "Invalid opcode");
-
- const X86Subtarget &ST =
- TLI->getTargetMachine().getSubtarget<X86Subtarget>();
-
- static const X86CostTblEntry SSE42CostTbl[] = {
- { ISD::SETCC, MVT::v2f64, 1 },
- { ISD::SETCC, MVT::v4f32, 1 },
- { ISD::SETCC, MVT::v2i64, 1 },
- { ISD::SETCC, MVT::v4i32, 1 },
- { ISD::SETCC, MVT::v8i16, 1 },
- { ISD::SETCC, MVT::v16i8, 1 },
- };
-
- static const X86CostTblEntry AVX1CostTbl[] = {
- { ISD::SETCC, MVT::v4f64, 1 },
- { ISD::SETCC, MVT::v8f32, 1 },
- // AVX1 does not support 8-wide integer compare.
- { ISD::SETCC, MVT::v4i64, 4 },
- { ISD::SETCC, MVT::v8i32, 4 },
- { ISD::SETCC, MVT::v16i16, 4 },
- { ISD::SETCC, MVT::v32i8, 4 },
- };
-
- static const X86CostTblEntry AVX2CostTbl[] = {
- { ISD::SETCC, MVT::v4i64, 1 },
- { ISD::SETCC, MVT::v8i32, 1 },
- { ISD::SETCC, MVT::v16i16, 1 },
- { ISD::SETCC, MVT::v32i8, 1 },
- };
-
- if (ST.hasSSE42()) {
- int Idx = FindInTable(SSE42CostTbl, array_lengthof(SSE42CostTbl), ISD, MTy);
- if (Idx != -1)
- return LT.first * SSE42CostTbl[Idx].Cost;
- }
-
- if (ST.hasAVX()) {
- int Idx = FindInTable(AVX1CostTbl, array_lengthof(AVX1CostTbl), ISD, MTy);
- if (Idx != -1)
- return LT.first * AVX1CostTbl[Idx].Cost;
- }
-
- if (ST.hasAVX2()) {
- int Idx = FindInTable(AVX2CostTbl, array_lengthof(AVX2CostTbl), ISD, MTy);
- if (Idx != -1)
- return LT.first * AVX2CostTbl[Idx].Cost;
- }
-
- return VectorTargetTransformImpl::getCmpSelInstrCost(Opcode, ValTy, CondTy);
-}
-
-unsigned X86VectorTargetTransformInfo::getCastInstrCost(unsigned Opcode,
- Type *Dst,
- Type *Src) const {
- int ISD = InstructionOpcodeToISD(Opcode);
- assert(ISD && "Invalid opcode");
-
- EVT SrcTy = TLI->getValueType(Src);
- EVT DstTy = TLI->getValueType(Dst);
-
- if (!SrcTy.isSimple() || !DstTy.isSimple())
- return VectorTargetTransformImpl::getCastInstrCost(Opcode, Dst, Src);
-
- const X86Subtarget &ST = TLI->getTargetMachine().getSubtarget<X86Subtarget>();
-
- static const X86TypeConversionCostTblEntry AVXConversionTbl[] = {
- { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
- { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
- { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
- { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
- { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 },
- { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1 },
- { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 1 },
- { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 1 },
- { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 1 },
- { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 1 },
- { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 1 },
- { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 1 },
- { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 6 },
- { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 9 },
- { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 3 },
- };
-
- if (ST.hasAVX()) {
- int Idx = FindInConvertTable(AVXConversionTbl,
- array_lengthof(AVXConversionTbl),
- ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT());
- if (Idx != -1)
- return AVXConversionTbl[Idx].Cost;
- }
-
- return VectorTargetTransformImpl::getCastInstrCost(Opcode, Dst, Src);
-}
-
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index a515be23ef..da1dad0f40 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -23,7 +23,6 @@
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetOptions.h"
-#include "llvm/Target/TargetTransformImpl.h"
namespace llvm {
namespace X86ISD {
@@ -182,6 +181,9 @@ namespace llvm {
/// BLENDI - Blend where the selector is an immediate.
BLENDI,
+ // SUBUS - Integer sub with unsigned saturation.
+ SUBUS,
+
/// HADD - Integer horizontal add.
HADD,
@@ -194,6 +196,12 @@ namespace llvm {
/// FHSUB - Floating point horizontal sub.
FHSUB,
+ /// UMAX, UMIN - Unsigned integer max and min.
+ UMAX, UMIN,
+
+ /// SMAX, SMIN - Signed integer max and min.
+ SMAX, SMIN,
+
/// FMAX, FMIN - Floating point max and min.
///
FMAX, FMIN,
@@ -226,11 +234,8 @@ namespace llvm {
// EH_SJLJ_LONGJMP - SjLj exception handling longjmp.
EH_SJLJ_LONGJMP,
- /// TC_RETURN - Tail call return.
- /// operand #0 chain
- /// operand #1 callee (register or absolute)
- /// operand #2 stack adjustment
- /// operand #3 optional in flag
+ /// TC_RETURN - Tail call return. See X86TargetLowering::LowerCall for
+ /// the list of operands.
TC_RETURN,
// VZEXT_MOVL - Vector move low and zero extend.
@@ -270,8 +275,6 @@ namespace llvm {
ADD, SUB, ADC, SBB, SMUL,
INC, DEC, OR, XOR, AND,
- ANDN, // ANDN - Bitwise AND NOT with FLAGS results.
-
BLSI, // BLSI - Extract lowest set isolated bit
BLSMSK, // BLSMSK - Get mask up to lowest set bit
BLSR, // BLSR - Reset lowest set bit
@@ -288,7 +291,7 @@ namespace llvm {
TESTP,
// Several flavors of instructions with vector shuffle behaviors.
- PALIGN,
+ PALIGNR,
PSHUFD,
PSHUFHW,
PSHUFLW,
@@ -468,7 +471,7 @@ namespace llvm {
virtual unsigned getJumpTableEncoding() const;
- virtual MVT getShiftAmountTy(EVT LHSTy) const { return MVT::i8; }
+ virtual MVT getScalarShiftAmountTy(EVT LHSTy) const { return MVT::i8; }
virtual const MCExpr *
LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
@@ -494,18 +497,25 @@ namespace llvm {
/// lowering. If DstAlign is zero, any destination alignment can satisfy the
/// constraint. Similarly, if SrcAlign is zero there is no need to check it
/// against the alignment requirement,
- /// probably because the source does not need to be loaded. If
- /// 'IsZeroVal' is true, that means it's safe to return a
- /// non-scalar-integer type, e.g. empty string source, constant, or loaded
- /// from memory. 'MemcpyStrSrc' indicates whether the memcpy source is
- /// constant so it does not need to be loaded.
+ /// probably because the source does not need to be loaded. If 'IsMemset' is
+ /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
+ /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
+ /// source is constant so it does not need to be loaded.
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
virtual EVT
- getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
- bool IsZeroVal, bool MemcpyStrSrc,
+ getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
+ bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
MachineFunction &MF) const;
+ /// isSafeMemOpType - Returns true if it's safe to use load / store of the
+ /// specified type to expand memcpy / memset inline. This is mostly true
+ /// for all types except for some special cases. For example, on X86
+ /// targets without SSE2 f64 load / store are done with fldl / fstpl which
+ /// also does type conversion. Note the specified type doesn't have to be
+ /// legal as the hook is used before type legalization.
+ virtual bool isSafeMemOpType(MVT VT) const;
+
/// allowsUnalignedMemoryAccesses - Returns true if the target allows
/// unaligned memory accesses of the specified type. Returns whether it
/// is "fast" by reference in the second argument.
@@ -708,7 +718,7 @@ namespace llvm {
protected:
std::pair<const TargetRegisterClass*, uint8_t>
- findRepresentativeClass(EVT VT) const;
+ findRepresentativeClass(MVT VT) const;
private:
/// Subtarget - Keep a pointer to the X86Subtarget around so that we can
@@ -781,9 +791,7 @@ namespace llvm {
SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGlobalAddress(const GlobalValue *GV, DebugLoc dl,
@@ -798,18 +806,18 @@ namespace llvm {
SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG) const;
- SDValue lowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
- SDValue lowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSIGN_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerANY_EXTEND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const;
- SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFABS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFNEG(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerToBT(SDValue And, ISD::CondCode CC,
DebugLoc dl, SelectionDAG &DAG) const;
SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerMEMSET(SDValue Op, SelectionDAG &DAG) const;
@@ -826,8 +834,9 @@ namespace llvm {
SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const;
-
+ SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
// Utility functions to help LowerVECTOR_SHUFFLE & LowerBUILD_VECTOR
SDValue LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const;
@@ -836,7 +845,7 @@ namespace llvm {
SDValue LowerVectorAllZeroTest(SDValue Op, SelectionDAG &DAG) const;
- SDValue lowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const;
virtual SDValue
LowerFormalArguments(SDValue Chain,
@@ -859,9 +868,8 @@ namespace llvm {
virtual bool mayBeEmittedAsTailCall(CallInst *CI) const;
- virtual EVT
- getTypeForExtArgOrReturn(LLVMContext &Context, EVT VT,
- ISD::NodeType ExtendKind) const;
+ virtual MVT
+ getTypeForExtArgOrReturn(MVT VT, ISD::NodeType ExtendKind) const;
virtual bool
CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
@@ -930,31 +938,6 @@ namespace llvm {
FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
const TargetLibraryInfo *libInfo);
}
-
- class X86ScalarTargetTransformImpl : public ScalarTargetTransformImpl {
- public:
- explicit X86ScalarTargetTransformImpl(const TargetLowering *TL) :
- ScalarTargetTransformImpl(TL) {};
-
- virtual PopcntHwSupport getPopcntHwSupport(unsigned TyWidth) const;
- };
-
- class X86VectorTargetTransformInfo : public VectorTargetTransformImpl {
- public:
- explicit X86VectorTargetTransformInfo(const TargetLowering *TL) :
- VectorTargetTransformImpl(TL) {}
-
- virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty) const;
-
- virtual unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
- unsigned Index) const;
-
- unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
- Type *CondTy) const;
-
- virtual unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
- Type *Src) const;
- };
}
#endif // X86ISELLOWERING_H
diff --git a/lib/Target/X86/X86Instr3DNow.td b/lib/Target/X86/X86Instr3DNow.td
index 54b91c3edb..bb362f5c7b 100644
--- a/lib/Target/X86/X86Instr3DNow.td
+++ b/lib/Target/X86/X86Instr3DNow.td
@@ -87,12 +87,10 @@ defm PMULHRW : I3DNow_binop_rm_int<0xB7, "pmulhrw">;
def FEMMS : I3DNow<0x0E, RawFrm, (outs), (ins), "femms", [(int_x86_mmx_femms)]>;
def PREFETCH : I3DNow<0x0D, MRM0m, (outs), (ins i32mem:$addr),
- "prefetch $addr", []>;
+ "prefetch\t$addr", []>;
-// FIXME: Diassembler gets a bogus decode conflict.
-let isAsmParserOnly = 1 in
def PREFETCHW : I3DNow<0x0D, MRM1m, (outs), (ins i16mem:$addr),
- "prefetchw $addr", []>;
+ "prefetchw\t$addr", []>;
// "3DNowA" instructions
defm PF2IW : I3DNow_conv_rm_int<0x1C, "pf2iw", "a">;
diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td
index f790611b8f..d86a4065a7 100644
--- a/lib/Target/X86/X86InstrArithmetic.td
+++ b/lib/Target/X86/X86InstrArithmetic.td
@@ -29,11 +29,11 @@ def LEA32r : I<0x8D, MRMSrcMem,
def LEA64_32r : I<0x8D, MRMSrcMem,
(outs GR32:$dst), (ins lea64_32mem:$src),
"lea{l}\t{$src|$dst}, {$dst|$src}",
- [(set GR32:$dst, lea32addr:$src)], IIC_LEA>,
+ [(set GR32:$dst, lea64_32addr:$src)], IIC_LEA>,
Requires<[In64BitMode]>;
let isReMaterializable = 1 in
-def LEA64r : RI<0x8D, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+def LEA64r : RI<0x8D, MRMSrcMem, (outs GR64:$dst), (ins lea64mem:$src),
"lea{q}\t{$src|$dst}, {$dst|$src}",
[(set GR64:$dst, lea64addr:$src)], IIC_LEA>;
@@ -57,7 +57,7 @@ def MUL8r : I<0xF6, MRM4r, (outs), (ins GR8:$src), "mul{b}\t$src",
let Defs = [AX,DX,EFLAGS], Uses = [AX], neverHasSideEffects = 1 in
def MUL16r : I<0xF7, MRM4r, (outs), (ins GR16:$src),
- "mul{w}\t$src",
+ "mul{w}\t$src",
[], IIC_MUL16_REG>, OpSize; // AX,DX = AX*GR16
let Defs = [EAX,EDX,EFLAGS], Uses = [EAX], neverHasSideEffects = 1 in
@@ -158,7 +158,7 @@ def IMUL16rm : I<0xAF, MRMSrcMem, (outs GR16:$dst),
(X86smul_flag GR16:$src1, (load addr:$src2)))],
IIC_IMUL16_RM>,
TB, OpSize;
-def IMUL32rm : I<0xAF, MRMSrcMem, (outs GR32:$dst),
+def IMUL32rm : I<0xAF, MRMSrcMem, (outs GR32:$dst),
(ins GR32:$src1, i32mem:$src2),
"imul{l}\t{$src2, $dst|$dst, $src2}",
[(set GR32:$dst, EFLAGS,
@@ -182,8 +182,8 @@ let Defs = [EFLAGS] in {
def IMUL16rri : Ii16<0x69, MRMSrcReg, // GR16 = GR16*I16
(outs GR16:$dst), (ins GR16:$src1, i16imm:$src2),
"imul{w}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set GR16:$dst, EFLAGS,
- (X86smul_flag GR16:$src1, imm:$src2))],
+ [(set GR16:$dst, EFLAGS,
+ (X86smul_flag GR16:$src1, imm:$src2))],
IIC_IMUL16_RRI>, OpSize;
def IMUL16rri8 : Ii8<0x6B, MRMSrcReg, // GR16 = GR16*I8
(outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
@@ -266,6 +266,7 @@ def IMUL64rmi8 : RIi8<0x6B, MRMSrcMem, // GR64 = [mem64]*I8
// unsigned division/remainder
+let hasSideEffects = 1 in { // so that we don't speculatively execute
let Defs = [AL,EFLAGS,AX], Uses = [AX] in
def DIV8r : I<0xF6, MRM6r, (outs), (ins GR8:$src), // AX/r8 = AL,AH
"div{b}\t$src", [], IIC_DIV8_REG>;
@@ -319,12 +320,13 @@ let Defs = [AX,DX,EFLAGS], Uses = [AX,DX] in
def IDIV16m: I<0xF7, MRM7m, (outs), (ins i16mem:$src), // DX:AX/[mem16] = AX,DX
"idiv{w}\t$src", [], IIC_IDIV16>, OpSize;
let Defs = [EAX,EDX,EFLAGS], Uses = [EAX,EDX] in // EDX:EAX/[mem32] = EAX,EDX
-def IDIV32m: I<0xF7, MRM7m, (outs), (ins i32mem:$src),
+def IDIV32m: I<0xF7, MRM7m, (outs), (ins i32mem:$src),
"idiv{l}\t$src", [], IIC_IDIV32>;
let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in // RDX:RAX/[mem64] = RAX,RDX
def IDIV64m: RI<0xF7, MRM7m, (outs), (ins i64mem:$src),
"idiv{q}\t$src", [], IIC_IDIV64>;
}
+} // hasSideEffects = 1
//===----------------------------------------------------------------------===//
// Two address Instructions.
@@ -412,11 +414,11 @@ def INC8r : I<0xFE, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1),
IIC_UNARY_REG>;
let isConvertibleToThreeAddress = 1, CodeSize = 1 in { // Can xform into LEA.
-def INC16r : I<0x40, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1),
+def INC16r : I<0x40, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1),
"inc{w}\t$dst",
[(set GR16:$dst, EFLAGS, (X86inc_flag GR16:$src1))], IIC_UNARY_REG>,
OpSize, Requires<[In32BitMode]>;
-def INC32r : I<0x40, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1),
+def INC32r : I<0x40, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1),
"inc{l}\t$dst",
[(set GR32:$dst, EFLAGS, (X86inc_flag GR32:$src1))],
IIC_UNARY_REG>,
@@ -430,22 +432,22 @@ def INC64r : RI<0xFF, MRM0r, (outs GR64:$dst), (ins GR64:$src1), "inc{q}\t$dst",
// In 64-bit mode, single byte INC and DEC cannot be encoded.
let isConvertibleToThreeAddress = 1, CodeSize = 2 in {
// Can transform into LEA.
-def INC64_16r : I<0xFF, MRM0r, (outs GR16:$dst), (ins GR16:$src1),
+def INC64_16r : I<0xFF, MRM0r, (outs GR16:$dst), (ins GR16:$src1),
"inc{w}\t$dst",
[(set GR16:$dst, EFLAGS, (X86inc_flag GR16:$src1))],
IIC_UNARY_REG>,
OpSize, Requires<[In64BitMode]>;
-def INC64_32r : I<0xFF, MRM0r, (outs GR32:$dst), (ins GR32:$src1),
+def INC64_32r : I<0xFF, MRM0r, (outs GR32:$dst), (ins GR32:$src1),
"inc{l}\t$dst",
[(set GR32:$dst, EFLAGS, (X86inc_flag GR32:$src1))],
IIC_UNARY_REG>,
Requires<[In64BitMode]>;
-def DEC64_16r : I<0xFF, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
+def DEC64_16r : I<0xFF, MRM1r, (outs GR16:$dst), (ins GR16:$src1),
"dec{w}\t$dst",
[(set GR16:$dst, EFLAGS, (X86dec_flag GR16:$src1))],
IIC_UNARY_REG>,
OpSize, Requires<[In64BitMode]>;
-def DEC64_32r : I<0xFF, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
+def DEC64_32r : I<0xFF, MRM1r, (outs GR32:$dst), (ins GR32:$src1),
"dec{l}\t$dst",
[(set GR32:$dst, EFLAGS, (X86dec_flag GR32:$src1))],
IIC_UNARY_REG>,
@@ -469,7 +471,7 @@ let CodeSize = 2 in {
def INC64m : RI<0xFF, MRM0m, (outs), (ins i64mem:$dst), "inc{q}\t$dst",
[(store (add (loadi64 addr:$dst), 1), addr:$dst),
(implicit EFLAGS)], IIC_UNARY_MEM>;
-
+
// These are duplicates of their 32-bit counterparts. Only needed so X86 knows
// how to unfold them.
// FIXME: What is this for??
@@ -498,12 +500,12 @@ def DEC8r : I<0xFE, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
[(set GR8:$dst, EFLAGS, (X86dec_flag GR8:$src1))],
IIC_UNARY_REG>;
let isConvertibleToThreeAddress = 1, CodeSize = 1 in { // Can xform into LEA.
-def DEC16r : I<0x48, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1),
+def DEC16r : I<0x48, AddRegFrm, (outs GR16:$dst), (ins GR16:$src1),
"dec{w}\t$dst",
[(set GR16:$dst, EFLAGS, (X86dec_flag GR16:$src1))],
IIC_UNARY_REG>,
OpSize, Requires<[In32BitMode]>;
-def DEC32r : I<0x48, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1),
+def DEC32r : I<0x48, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1),
"dec{l}\t$dst",
[(set GR32:$dst, EFLAGS, (X86dec_flag GR32:$src1))],
IIC_UNARY_REG>,
@@ -544,57 +546,57 @@ class X86TypeInfo<ValueType vt, string instrsuffix, RegisterClass regclass,
bit hasOddOpcode, bit hasOpSizePrefix, bit hasREX_WPrefix> {
/// VT - This is the value type itself.
ValueType VT = vt;
-
+
/// InstrSuffix - This is the suffix used on instructions with this type. For
/// example, i8 -> "b", i16 -> "w", i32 -> "l", i64 -> "q".
string InstrSuffix = instrsuffix;
-
+
/// RegClass - This is the register class associated with this type. For
/// example, i8 -> GR8, i16 -> GR16, i32 -> GR32, i64 -> GR64.
RegisterClass RegClass = regclass;
-
+
/// LoadNode - This is the load node associated with this type. For
/// example, i8 -> loadi8, i16 -> loadi16, i32 -> loadi32, i64 -> loadi64.
PatFrag LoadNode = loadnode;
-
+
/// MemOperand - This is the memory operand associated with this type. For
/// example, i8 -> i8mem, i16 -> i16mem, i32 -> i32mem, i64 -> i64mem.
X86MemOperand MemOperand = memoperand;
-
+
/// ImmEncoding - This is the encoding of an immediate of this type. For
/// example, i8 -> Imm8, i16 -> Imm16, i32 -> Imm32. Note that i64 -> Imm32
/// since the immediate fields of i64 instructions is a 32-bit sign extended
/// value.
ImmType ImmEncoding = immkind;
-
+
/// ImmOperand - This is the operand kind of an immediate of this type. For
/// example, i8 -> i8imm, i16 -> i16imm, i32 -> i32imm. Note that i64 ->
/// i64i32imm since the immediate fields of i64 instructions is a 32-bit sign
/// extended value.
Operand ImmOperand = immoperand;
-
+
/// ImmOperator - This is the operator that should be used to match an
/// immediate of this kind in a pattern (e.g. imm, or i64immSExt32).
SDPatternOperator ImmOperator = immoperator;
-
+
/// Imm8Operand - This is the operand kind to use for an imm8 of this type.
/// For example, i8 -> <invalid>, i16 -> i16i8imm, i32 -> i32i8imm. This is
/// only used for instructions that have a sign-extended imm8 field form.
Operand Imm8Operand = imm8operand;
-
+
/// Imm8Operator - This is the operator that should be used to match an 8-bit
/// sign extended immediate of this kind in a pattern (e.g. imm16immSExt8).
SDPatternOperator Imm8Operator = imm8operator;
-
+
/// HasOddOpcode - This bit is true if the instruction should have an odd (as
/// opposed to even) opcode. Operations on i8 are usually even, operations on
/// other datatypes are odd.
bit HasOddOpcode = hasOddOpcode;
-
+
/// HasOpSizePrefix - This bit is set to true if the instruction should have
/// the 0x66 operand size prefix. This is set for i16 types.
bit HasOpSizePrefix = hasOpSizePrefix;
-
+
/// HasREX_WPrefix - This bit is set to true if the instruction should have
/// the 0x40 REX prefix. This is set for i64 types.
bit HasREX_WPrefix = hasREX_WPrefix;
@@ -624,12 +626,12 @@ def Xi64 : X86TypeInfo<i64, "q", GR64, loadi64, i64mem,
/// 3. Infers whether the instruction should have a 0x40 REX_W prefix.
/// 4. Infers whether the low bit of the opcode should be 0 (for i8 operations)
/// or 1 (for i16,i32,i64 operations).
-class ITy<bits<8> opcode, Format f, X86TypeInfo typeinfo, dag outs, dag ins,
+class ITy<bits<8> opcode, Format f, X86TypeInfo typeinfo, dag outs, dag ins,
string mnemonic, string args, list<dag> pattern,
InstrItinClass itin = IIC_BIN_NONMEM>
: I<{opcode{7}, opcode{6}, opcode{5}, opcode{4},
opcode{3}, opcode{2}, opcode{1}, typeinfo.HasOddOpcode },
- f, outs, ins,
+ f, outs, ins,
!strconcat(mnemonic, "{", typeinfo.InstrSuffix, "}\t", args), pattern,
itin> {
@@ -690,6 +692,7 @@ class BinOpRR_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo>
mnemonic, "{$src2, $dst|$dst, $src2}", [], IIC_BIN_NONMEM> {
// The disassembler should know about this, but not the asmparser.
let isCodeGenOnly = 1;
+ let hasSideEffects = 0;
}
// BinOpRR_F_Rev - Instructions like "cmp reg, reg" (reversed encoding).
@@ -699,6 +702,7 @@ class BinOpRR_F_Rev<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo>
mnemonic, "{$src2, $src1|$src1, $src2}", [], IIC_BIN_NONMEM> {
// The disassembler should know about this, but not the asmparser.
let isCodeGenOnly = 1;
+ let hasSideEffects = 0;
}
// BinOpRM - Instructions like "add reg, reg, [mem]".
@@ -764,13 +768,13 @@ class BinOpRI_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
class BinOpRI_RF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
SDNode opnode, Format f>
: BinOpRI<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
- [(set typeinfo.RegClass:$dst, EFLAGS,
+ [(set typeinfo.RegClass:$dst, EFLAGS,
(opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2))]>;
// BinOpRI_RFF - Instructions like "adc reg, reg, imm".
class BinOpRI_RFF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
SDNode opnode, Format f>
: BinOpRI<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
- [(set typeinfo.RegClass:$dst, EFLAGS,
+ [(set typeinfo.RegClass:$dst, EFLAGS,
(opnode typeinfo.RegClass:$src1, typeinfo.ImmOperator:$src2,
EFLAGS))]>;
@@ -789,7 +793,7 @@ class BinOpRI8_R<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
: BinOpRI8<opcode, mnemonic, typeinfo, f, (outs typeinfo.RegClass:$dst),
[(set typeinfo.RegClass:$dst,
(opnode typeinfo.RegClass:$src1, typeinfo.Imm8Operator:$src2))]>;
-
+
// BinOpRI8_F - Instructions like "cmp reg, imm8".
class BinOpRI8_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
SDNode opnode, Format f>
@@ -852,14 +856,14 @@ class BinOpMI<string mnemonic, X86TypeInfo typeinfo,
// BinOpMI_RMW - Instructions like "add [mem], imm".
class BinOpMI_RMW<string mnemonic, X86TypeInfo typeinfo,
SDNode opnode, Format f>
- : BinOpMI<mnemonic, typeinfo, f,
+ : BinOpMI<mnemonic, typeinfo, f,
[(store (opnode (typeinfo.VT (load addr:$dst)),
typeinfo.ImmOperator:$src), addr:$dst),
(implicit EFLAGS)]>;
// BinOpMI_RMW_FF - Instructions like "adc [mem], imm".
class BinOpMI_RMW_FF<string mnemonic, X86TypeInfo typeinfo,
SDNode opnode, Format f>
- : BinOpMI<mnemonic, typeinfo, f,
+ : BinOpMI<mnemonic, typeinfo, f,
[(store (opnode (typeinfo.VT (load addr:$dst)),
typeinfo.ImmOperator:$src, EFLAGS), addr:$dst),
(implicit EFLAGS)]>;
@@ -867,7 +871,7 @@ class BinOpMI_RMW_FF<string mnemonic, X86TypeInfo typeinfo,
// BinOpMI_F - Instructions like "cmp [mem], imm".
class BinOpMI_F<string mnemonic, X86TypeInfo typeinfo,
SDPatternOperator opnode, Format f, bits<8> opcode = 0x80>
- : BinOpMI<mnemonic, typeinfo, f,
+ : BinOpMI<mnemonic, typeinfo, f,
[(set EFLAGS, (opnode (typeinfo.VT (load addr:$dst)),
typeinfo.ImmOperator:$src))],
opcode>;
@@ -913,6 +917,7 @@ class BinOpAI<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
let ImmT = typeinfo.ImmEncoding;
let Uses = [areg];
let Defs = [areg];
+ let hasSideEffects = 0;
}
/// ArithBinOp_RF - This is an arithmetic binary operator where the pattern is
@@ -928,61 +933,61 @@ multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
let Constraints = "$src1 = $dst" in {
let isCommutable = CommutableRR,
isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
- def #NAME#8rr : BinOpRR_RF<BaseOpc, mnemonic, Xi8 , opnodeflag>;
- def #NAME#16rr : BinOpRR_RF<BaseOpc, mnemonic, Xi16, opnodeflag>;
- def #NAME#32rr : BinOpRR_RF<BaseOpc, mnemonic, Xi32, opnodeflag>;
- def #NAME#64rr : BinOpRR_RF<BaseOpc, mnemonic, Xi64, opnodeflag>;
+ def NAME#8rr : BinOpRR_RF<BaseOpc, mnemonic, Xi8 , opnodeflag>;
+ def NAME#16rr : BinOpRR_RF<BaseOpc, mnemonic, Xi16, opnodeflag>;
+ def NAME#32rr : BinOpRR_RF<BaseOpc, mnemonic, Xi32, opnodeflag>;
+ def NAME#64rr : BinOpRR_RF<BaseOpc, mnemonic, Xi64, opnodeflag>;
} // isCommutable
- def #NAME#8rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi8>;
- def #NAME#16rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi16>;
- def #NAME#32rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi32>;
- def #NAME#64rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi64>;
+ def NAME#8rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi8>;
+ def NAME#16rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi16>;
+ def NAME#32rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi32>;
+ def NAME#64rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi64>;
- def #NAME#8rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi8 , opnodeflag>;
- def #NAME#16rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, opnodeflag>;
- def #NAME#32rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, opnodeflag>;
- def #NAME#64rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, opnodeflag>;
+ def NAME#8rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi8 , opnodeflag>;
+ def NAME#16rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi16, opnodeflag>;
+ def NAME#32rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi32, opnodeflag>;
+ def NAME#64rm : BinOpRM_RF<BaseOpc2, mnemonic, Xi64, opnodeflag>;
let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
// NOTE: These are order specific, we want the ri8 forms to be listed
// first so that they are slightly preferred to the ri forms.
- def #NAME#16ri8 : BinOpRI8_RF<0x82, mnemonic, Xi16, opnodeflag, RegMRM>;
- def #NAME#32ri8 : BinOpRI8_RF<0x82, mnemonic, Xi32, opnodeflag, RegMRM>;
- def #NAME#64ri8 : BinOpRI8_RF<0x82, mnemonic, Xi64, opnodeflag, RegMRM>;
-
- def #NAME#8ri : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM>;
- def #NAME#16ri : BinOpRI_RF<0x80, mnemonic, Xi16, opnodeflag, RegMRM>;
- def #NAME#32ri : BinOpRI_RF<0x80, mnemonic, Xi32, opnodeflag, RegMRM>;
- def #NAME#64ri32: BinOpRI_RF<0x80, mnemonic, Xi64, opnodeflag, RegMRM>;
+ def NAME#16ri8 : BinOpRI8_RF<0x82, mnemonic, Xi16, opnodeflag, RegMRM>;
+ def NAME#32ri8 : BinOpRI8_RF<0x82, mnemonic, Xi32, opnodeflag, RegMRM>;
+ def NAME#64ri8 : BinOpRI8_RF<0x82, mnemonic, Xi64, opnodeflag, RegMRM>;
+
+ def NAME#8ri : BinOpRI_RF<0x80, mnemonic, Xi8 , opnodeflag, RegMRM>;
+ def NAME#16ri : BinOpRI_RF<0x80, mnemonic, Xi16, opnodeflag, RegMRM>;
+ def NAME#32ri : BinOpRI_RF<0x80, mnemonic, Xi32, opnodeflag, RegMRM>;
+ def NAME#64ri32: BinOpRI_RF<0x80, mnemonic, Xi64, opnodeflag, RegMRM>;
}
} // Constraints = "$src1 = $dst"
- def #NAME#8mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi8 , opnode>;
- def #NAME#16mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi16, opnode>;
- def #NAME#32mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi32, opnode>;
- def #NAME#64mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi64, opnode>;
+ def NAME#8mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi8 , opnode>;
+ def NAME#16mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi16, opnode>;
+ def NAME#32mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi32, opnode>;
+ def NAME#64mr : BinOpMR_RMW<BaseOpc, mnemonic, Xi64, opnode>;
// NOTE: These are order specific, we want the mi8 forms to be listed
// first so that they are slightly preferred to the mi forms.
- def #NAME#16mi8 : BinOpMI8_RMW<mnemonic, Xi16, opnode, MemMRM>;
- def #NAME#32mi8 : BinOpMI8_RMW<mnemonic, Xi32, opnode, MemMRM>;
- def #NAME#64mi8 : BinOpMI8_RMW<mnemonic, Xi64, opnode, MemMRM>;
-
- def #NAME#8mi : BinOpMI_RMW<mnemonic, Xi8 , opnode, MemMRM>;
- def #NAME#16mi : BinOpMI_RMW<mnemonic, Xi16, opnode, MemMRM>;
- def #NAME#32mi : BinOpMI_RMW<mnemonic, Xi32, opnode, MemMRM>;
- def #NAME#64mi32 : BinOpMI_RMW<mnemonic, Xi64, opnode, MemMRM>;
-
- def #NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL,
- "{$src, %al|AL, $src}">;
- def #NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX,
- "{$src, %ax|AX, $src}">;
- def #NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX,
- "{$src, %eax|EAX, $src}">;
- def #NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX,
- "{$src, %rax|RAX, $src}">;
- }
+ def NAME#16mi8 : BinOpMI8_RMW<mnemonic, Xi16, opnode, MemMRM>;
+ def NAME#32mi8 : BinOpMI8_RMW<mnemonic, Xi32, opnode, MemMRM>;
+ def NAME#64mi8 : BinOpMI8_RMW<mnemonic, Xi64, opnode, MemMRM>;
+
+ def NAME#8mi : BinOpMI_RMW<mnemonic, Xi8 , opnode, MemMRM>;
+ def NAME#16mi : BinOpMI_RMW<mnemonic, Xi16, opnode, MemMRM>;
+ def NAME#32mi : BinOpMI_RMW<mnemonic, Xi32, opnode, MemMRM>;
+ def NAME#64mi32 : BinOpMI_RMW<mnemonic, Xi64, opnode, MemMRM>;
+
+ def NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL,
+ "{$src, %al|AL, $src}">;
+ def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX,
+ "{$src, %ax|AX, $src}">;
+ def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX,
+ "{$src, %eax|EAX, $src}">;
+ def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX,
+ "{$src, %rax|RAX, $src}">;
+ }
}
/// ArithBinOp_RFF - This is an arithmetic binary operator where the pattern is
@@ -999,61 +1004,61 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
let Constraints = "$src1 = $dst" in {
let isCommutable = CommutableRR,
isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
- def #NAME#8rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi8 , opnode>;
- def #NAME#16rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi16, opnode>;
- def #NAME#32rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi32, opnode>;
- def #NAME#64rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi64, opnode>;
+ def NAME#8rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi8 , opnode>;
+ def NAME#16rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi16, opnode>;
+ def NAME#32rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi32, opnode>;
+ def NAME#64rr : BinOpRR_RFF<BaseOpc, mnemonic, Xi64, opnode>;
} // isCommutable
- def #NAME#8rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi8>;
- def #NAME#16rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi16>;
- def #NAME#32rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi32>;
- def #NAME#64rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi64>;
+ def NAME#8rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi8>;
+ def NAME#16rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi16>;
+ def NAME#32rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi32>;
+ def NAME#64rr_REV : BinOpRR_Rev<BaseOpc2, mnemonic, Xi64>;
- def #NAME#8rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi8 , opnode>;
- def #NAME#16rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi16, opnode>;
- def #NAME#32rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi32, opnode>;
- def #NAME#64rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi64, opnode>;
+ def NAME#8rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi8 , opnode>;
+ def NAME#16rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi16, opnode>;
+ def NAME#32rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi32, opnode>;
+ def NAME#64rm : BinOpRM_RFF<BaseOpc2, mnemonic, Xi64, opnode>;
let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
// NOTE: These are order specific, we want the ri8 forms to be listed
// first so that they are slightly preferred to the ri forms.
- def #NAME#16ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi16, opnode, RegMRM>;
- def #NAME#32ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi32, opnode, RegMRM>;
- def #NAME#64ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi64, opnode, RegMRM>;
-
- def #NAME#8ri : BinOpRI_RFF<0x80, mnemonic, Xi8 , opnode, RegMRM>;
- def #NAME#16ri : BinOpRI_RFF<0x80, mnemonic, Xi16, opnode, RegMRM>;
- def #NAME#32ri : BinOpRI_RFF<0x80, mnemonic, Xi32, opnode, RegMRM>;
- def #NAME#64ri32: BinOpRI_RFF<0x80, mnemonic, Xi64, opnode, RegMRM>;
+ def NAME#16ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi16, opnode, RegMRM>;
+ def NAME#32ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi32, opnode, RegMRM>;
+ def NAME#64ri8 : BinOpRI8_RFF<0x82, mnemonic, Xi64, opnode, RegMRM>;
+
+ def NAME#8ri : BinOpRI_RFF<0x80, mnemonic, Xi8 , opnode, RegMRM>;
+ def NAME#16ri : BinOpRI_RFF<0x80, mnemonic, Xi16, opnode, RegMRM>;
+ def NAME#32ri : BinOpRI_RFF<0x80, mnemonic, Xi32, opnode, RegMRM>;
+ def NAME#64ri32: BinOpRI_RFF<0x80, mnemonic, Xi64, opnode, RegMRM>;
}
} // Constraints = "$src1 = $dst"
- def #NAME#8mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi8 , opnode>;
- def #NAME#16mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi16, opnode>;
- def #NAME#32mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi32, opnode>;
- def #NAME#64mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi64, opnode>;
+ def NAME#8mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi8 , opnode>;
+ def NAME#16mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi16, opnode>;
+ def NAME#32mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi32, opnode>;
+ def NAME#64mr : BinOpMR_RMW_FF<BaseOpc, mnemonic, Xi64, opnode>;
// NOTE: These are order specific, we want the mi8 forms to be listed
// first so that they are slightly preferred to the mi forms.
- def #NAME#16mi8 : BinOpMI8_RMW_FF<mnemonic, Xi16, opnode, MemMRM>;
- def #NAME#32mi8 : BinOpMI8_RMW_FF<mnemonic, Xi32, opnode, MemMRM>;
- def #NAME#64mi8 : BinOpMI8_RMW_FF<mnemonic, Xi64, opnode, MemMRM>;
-
- def #NAME#8mi : BinOpMI_RMW_FF<mnemonic, Xi8 , opnode, MemMRM>;
- def #NAME#16mi : BinOpMI_RMW_FF<mnemonic, Xi16, opnode, MemMRM>;
- def #NAME#32mi : BinOpMI_RMW_FF<mnemonic, Xi32, opnode, MemMRM>;
- def #NAME#64mi32 : BinOpMI_RMW_FF<mnemonic, Xi64, opnode, MemMRM>;
-
- def #NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL,
- "{$src, %al|AL, $src}">;
- def #NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX,
- "{$src, %ax|AX, $src}">;
- def #NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX,
- "{$src, %eax|EAX, $src}">;
- def #NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX,
- "{$src, %rax|RAX, $src}">;
- }
+ def NAME#16mi8 : BinOpMI8_RMW_FF<mnemonic, Xi16, opnode, MemMRM>;
+ def NAME#32mi8 : BinOpMI8_RMW_FF<mnemonic, Xi32, opnode, MemMRM>;
+ def NAME#64mi8 : BinOpMI8_RMW_FF<mnemonic, Xi64, opnode, MemMRM>;
+
+ def NAME#8mi : BinOpMI_RMW_FF<mnemonic, Xi8 , opnode, MemMRM>;
+ def NAME#16mi : BinOpMI_RMW_FF<mnemonic, Xi16, opnode, MemMRM>;
+ def NAME#32mi : BinOpMI_RMW_FF<mnemonic, Xi32, opnode, MemMRM>;
+ def NAME#64mi32 : BinOpMI_RMW_FF<mnemonic, Xi64, opnode, MemMRM>;
+
+ def NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL,
+ "{$src, %al|AL, $src}">;
+ def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX,
+ "{$src, %ax|AX, $src}">;
+ def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX,
+ "{$src, %eax|EAX, $src}">;
+ def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX,
+ "{$src, %rax|RAX, $src}">;
+ }
}
/// ArithBinOp_F - This is an arithmetic binary operator where the pattern is
@@ -1067,60 +1072,60 @@ multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
let Defs = [EFLAGS] in {
let isCommutable = CommutableRR,
isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
- def #NAME#8rr : BinOpRR_F<BaseOpc, mnemonic, Xi8 , opnode>;
- def #NAME#16rr : BinOpRR_F<BaseOpc, mnemonic, Xi16, opnode>;
- def #NAME#32rr : BinOpRR_F<BaseOpc, mnemonic, Xi32, opnode>;
- def #NAME#64rr : BinOpRR_F<BaseOpc, mnemonic, Xi64, opnode>;
+ def NAME#8rr : BinOpRR_F<BaseOpc, mnemonic, Xi8 , opnode>;
+ def NAME#16rr : BinOpRR_F<BaseOpc, mnemonic, Xi16, opnode>;
+ def NAME#32rr : BinOpRR_F<BaseOpc, mnemonic, Xi32, opnode>;
+ def NAME#64rr : BinOpRR_F<BaseOpc, mnemonic, Xi64, opnode>;
} // isCommutable
- def #NAME#8rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi8>;
- def #NAME#16rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi16>;
- def #NAME#32rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi32>;
- def #NAME#64rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi64>;
+ def NAME#8rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi8>;
+ def NAME#16rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi16>;
+ def NAME#32rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi32>;
+ def NAME#64rr_REV : BinOpRR_F_Rev<BaseOpc2, mnemonic, Xi64>;
- def #NAME#8rm : BinOpRM_F<BaseOpc2, mnemonic, Xi8 , opnode>;
- def #NAME#16rm : BinOpRM_F<BaseOpc2, mnemonic, Xi16, opnode>;
- def #NAME#32rm : BinOpRM_F<BaseOpc2, mnemonic, Xi32, opnode>;
- def #NAME#64rm : BinOpRM_F<BaseOpc2, mnemonic, Xi64, opnode>;
+ def NAME#8rm : BinOpRM_F<BaseOpc2, mnemonic, Xi8 , opnode>;
+ def NAME#16rm : BinOpRM_F<BaseOpc2, mnemonic, Xi16, opnode>;
+ def NAME#32rm : BinOpRM_F<BaseOpc2, mnemonic, Xi32, opnode>;
+ def NAME#64rm : BinOpRM_F<BaseOpc2, mnemonic, Xi64, opnode>;
let isConvertibleToThreeAddress = ConvertibleToThreeAddress in {
// NOTE: These are order specific, we want the ri8 forms to be listed
// first so that they are slightly preferred to the ri forms.
- def #NAME#16ri8 : BinOpRI8_F<0x82, mnemonic, Xi16, opnode, RegMRM>;
- def #NAME#32ri8 : BinOpRI8_F<0x82, mnemonic, Xi32, opnode, RegMRM>;
- def #NAME#64ri8 : BinOpRI8_F<0x82, mnemonic, Xi64, opnode, RegMRM>;
-
- def #NAME#8ri : BinOpRI_F<0x80, mnemonic, Xi8 , opnode, RegMRM>;
- def #NAME#16ri : BinOpRI_F<0x80, mnemonic, Xi16, opnode, RegMRM>;
- def #NAME#32ri : BinOpRI_F<0x80, mnemonic, Xi32, opnode, RegMRM>;
- def #NAME#64ri32: BinOpRI_F<0x80, mnemonic, Xi64, opnode, RegMRM>;
+ def NAME#16ri8 : BinOpRI8_F<0x82, mnemonic, Xi16, opnode, RegMRM>;
+ def NAME#32ri8 : BinOpRI8_F<0x82, mnemonic, Xi32, opnode, RegMRM>;
+ def NAME#64ri8 : BinOpRI8_F<0x82, mnemonic, Xi64, opnode, RegMRM>;
+
+ def NAME#8ri : BinOpRI_F<0x80, mnemonic, Xi8 , opnode, RegMRM>;
+ def NAME#16ri : BinOpRI_F<0x80, mnemonic, Xi16, opnode, RegMRM>;
+ def NAME#32ri : BinOpRI_F<0x80, mnemonic, Xi32, opnode, RegMRM>;
+ def NAME#64ri32: BinOpRI_F<0x80, mnemonic, Xi64, opnode, RegMRM>;
}
- def #NAME#8mr : BinOpMR_F<BaseOpc, mnemonic, Xi8 , opnode>;
- def #NAME#16mr : BinOpMR_F<BaseOpc, mnemonic, Xi16, opnode>;
- def #NAME#32mr : BinOpMR_F<BaseOpc, mnemonic, Xi32, opnode>;
- def #NAME#64mr : BinOpMR_F<BaseOpc, mnemonic, Xi64, opnode>;
+ def NAME#8mr : BinOpMR_F<BaseOpc, mnemonic, Xi8 , opnode>;
+ def NAME#16mr : BinOpMR_F<BaseOpc, mnemonic, Xi16, opnode>;
+ def NAME#32mr : BinOpMR_F<BaseOpc, mnemonic, Xi32, opnode>;
+ def NAME#64mr : BinOpMR_F<BaseOpc, mnemonic, Xi64, opnode>;
// NOTE: These are order specific, we want the mi8 forms to be listed
// first so that they are slightly preferred to the mi forms.
- def #NAME#16mi8 : BinOpMI8_F<mnemonic, Xi16, opnode, MemMRM>;
- def #NAME#32mi8 : BinOpMI8_F<mnemonic, Xi32, opnode, MemMRM>;
- def #NAME#64mi8 : BinOpMI8_F<mnemonic, Xi64, opnode, MemMRM>;
-
- def #NAME#8mi : BinOpMI_F<mnemonic, Xi8 , opnode, MemMRM>;
- def #NAME#16mi : BinOpMI_F<mnemonic, Xi16, opnode, MemMRM>;
- def #NAME#32mi : BinOpMI_F<mnemonic, Xi32, opnode, MemMRM>;
- def #NAME#64mi32 : BinOpMI_F<mnemonic, Xi64, opnode, MemMRM>;
-
- def #NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL,
- "{$src, %al|AL, $src}">;
- def #NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX,
- "{$src, %ax|AX, $src}">;
- def #NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX,
- "{$src, %eax|EAX, $src}">;
- def #NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX,
- "{$src, %rax|RAX, $src}">;
- }
+ def NAME#16mi8 : BinOpMI8_F<mnemonic, Xi16, opnode, MemMRM>;
+ def NAME#32mi8 : BinOpMI8_F<mnemonic, Xi32, opnode, MemMRM>;
+ def NAME#64mi8 : BinOpMI8_F<mnemonic, Xi64, opnode, MemMRM>;
+
+ def NAME#8mi : BinOpMI_F<mnemonic, Xi8 , opnode, MemMRM>;
+ def NAME#16mi : BinOpMI_F<mnemonic, Xi16, opnode, MemMRM>;
+ def NAME#32mi : BinOpMI_F<mnemonic, Xi32, opnode, MemMRM>;
+ def NAME#64mi32 : BinOpMI_F<mnemonic, Xi64, opnode, MemMRM>;
+
+ def NAME#8i8 : BinOpAI<BaseOpc4, mnemonic, Xi8 , AL,
+ "{$src, %al|AL, $src}">;
+ def NAME#16i16 : BinOpAI<BaseOpc4, mnemonic, Xi16, AX,
+ "{$src, %ax|AX, $src}">;
+ def NAME#32i32 : BinOpAI<BaseOpc4, mnemonic, Xi32, EAX,
+ "{$src, %eax|EAX, $src}">;
+ def NAME#64i32 : BinOpAI<BaseOpc4, mnemonic, Xi64, RAX,
+ "{$src, %rax|RAX, $src}">;
+ }
}
@@ -1180,7 +1185,7 @@ let isCompare = 1, Defs = [EFLAGS] in {
def TEST16mi : BinOpMI_F<"test", Xi16, X86testpat, MRM0m, 0xF6>;
def TEST32mi : BinOpMI_F<"test", Xi32, X86testpat, MRM0m, 0xF6>;
def TEST64mi32 : BinOpMI_F<"test", Xi64, X86testpat, MRM0m, 0xF6>;
-
+
def TEST8i8 : BinOpAI<0xA8, "test", Xi8 , AL,
"{$src, %al|AL, $src}">;
def TEST16i16 : BinOpAI<0xA8, "test", Xi16, AX,
@@ -1204,12 +1209,12 @@ multiclass bmi_andn<string mnemonic, RegisterClass RC, X86MemOperand x86memop,
PatFrag ld_frag> {
def rr : I<0xF2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst, EFLAGS, (X86andn_flag RC:$src1, RC:$src2))],
+ [(set RC:$dst, EFLAGS, (X86and_flag (not RC:$src1), RC:$src2))],
IIC_BIN_NONMEM>;
def rm : I<0xF2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, EFLAGS,
- (X86andn_flag RC:$src1, (ld_frag addr:$src2)))], IIC_BIN_MEM>;
+ (X86and_flag (not RC:$src1), (ld_frag addr:$src2)))], IIC_BIN_MEM>;
}
let Predicates = [HasBMI], Defs = [EFLAGS] in {
@@ -1217,6 +1222,17 @@ let Predicates = [HasBMI], Defs = [EFLAGS] in {
defm ANDN64 : bmi_andn<"andn{q}", GR64, i64mem, loadi64>, T8, VEX_4V, VEX_W;
}
+let Predicates = [HasBMI] in {
+ def : Pat<(and (not GR32:$src1), GR32:$src2),
+ (ANDN32rr GR32:$src1, GR32:$src2)>;
+ def : Pat<(and (not GR64:$src1), GR64:$src2),
+ (ANDN64rr GR64:$src1, GR64:$src2)>;
+ def : Pat<(and (not GR32:$src1), (loadi32 addr:$src2)),
+ (ANDN32rm GR32:$src1, addr:$src2)>;
+ def : Pat<(and (not GR64:$src1), (loadi64 addr:$src2)),
+ (ANDN64rm GR64:$src1, addr:$src2)>;
+}
+
//===----------------------------------------------------------------------===//
// MULX Instruction
//
@@ -1240,3 +1256,49 @@ let Predicates = [HasBMI2] in {
let Uses = [RDX] in
defm MULX64 : bmi_mulx<"mulx{q}", GR64, i64mem>, VEX_W;
}
+
+//===----------------------------------------------------------------------===//
+// ADCX Instruction
+//
+let hasSideEffects = 0, Predicates = [HasADX], Defs = [EFLAGS] in {
+ def ADCX32rr : I<0xF6, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "adcx{l}\t{$src, $dst|$dst, $src}",
+ [], IIC_BIN_NONMEM>, T8, OpSize;
+
+ def ADCX64rr : I<0xF6, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+ "adcx{q}\t{$src, $dst|$dst, $src}",
+ [], IIC_BIN_NONMEM>, T8, OpSize, REX_W, Requires<[In64BitMode]>;
+
+ let mayLoad = 1 in {
+ def ADCX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "adcx{l}\t{$src, $dst|$dst, $src}",
+ [], IIC_BIN_MEM>, T8, OpSize;
+
+ def ADCX64rm : I<0xF6, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "adcx{q}\t{$src, $dst|$dst, $src}",
+ [], IIC_BIN_MEM>, T8, OpSize, REX_W, Requires<[In64BitMode]>;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// ADOX Instruction
+//
+let hasSideEffects = 0, Predicates = [HasADX], Defs = [EFLAGS] in {
+ def ADOX32rr : I<0xF6, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
+ "adox{l}\t{$src, $dst|$dst, $src}",
+ [], IIC_BIN_NONMEM>, T8XS;
+
+ def ADOX64rr : I<0xF6, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
+ "adox{q}\t{$src, $dst|$dst, $src}",
+ [], IIC_BIN_NONMEM>, T8XS, REX_W, Requires<[In64BitMode]>;
+
+ let mayLoad = 1 in {
+ def ADOX32rm : I<0xF6, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
+ "adox{l}\t{$src, $dst|$dst, $src}",
+ [], IIC_BIN_MEM>, T8XS;
+
+ def ADOX64rm : I<0xF6, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
+ "adox{q}\t{$src, $dst|$dst, $src}",
+ [], IIC_BIN_MEM>, T8XS, REX_W, Requires<[In64BitMode]>;
+ }
+}
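
Editorial note on the X86InstrArithmetic.td changes above: most of the churn is mechanical — the deprecated #NAME# paste spelling becomes the bare NAME# concatenation operator, trailing whitespace is stripped, and the disassembler-only and accumulator forms gain hasSideEffects = 0 — while the substantive additions are the ANDN selection patterns and the new ADCX/ADOX definitions. The toy multiclass below (an editorial sketch, not part of the patch; all names are hypothetical) shows how the NAME# operator expands when a multiclass is instantiated.

// Sketch only: NAME is replaced by the defm name at instantiation time.
multiclass ToyBinOp<string mnemonic> {
  def NAME#32rr : I<0x01, MRMDestReg, (outs GR32:$dst),
                    (ins GR32:$src1, GR32:$src2),
                    !strconcat(mnemonic, "{l}\t{$src2, $dst|$dst, $src2}"),
                    [], IIC_BIN_NONMEM>;
}
// defm TOY : ToyBinOp<"toy">;   // creates a record named TOY32rr
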
diff --git a/lib/Target/X86/X86InstrCMovSetCC.td b/lib/Target/X86/X86InstrCMovSetCC.td
index adeaf5410d..8f2d0a1aae 100644
--- a/lib/Target/X86/X86InstrCMovSetCC.td
+++ b/lib/Target/X86/X86InstrCMovSetCC.td
@@ -17,19 +17,19 @@
multiclass CMOV<bits<8> opc, string Mnemonic, PatLeaf CondNode> {
let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst",
isCommutable = 1 in {
- def #NAME#16rr
+ def NAME#16rr
: I<opc, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
!strconcat(Mnemonic, "{w}\t{$src2, $dst|$dst, $src2}"),
[(set GR16:$dst,
(X86cmov GR16:$src1, GR16:$src2, CondNode, EFLAGS))],
IIC_CMOV16_RR>,TB,OpSize;
- def #NAME#32rr
+ def NAME#32rr
: I<opc, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
!strconcat(Mnemonic, "{l}\t{$src2, $dst|$dst, $src2}"),
[(set GR32:$dst,
(X86cmov GR32:$src1, GR32:$src2, CondNode, EFLAGS))],
IIC_CMOV32_RR>, TB;
- def #NAME#64rr
+ def NAME#64rr
:RI<opc, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
!strconcat(Mnemonic, "{q}\t{$src2, $dst|$dst, $src2}"),
[(set GR64:$dst,
@@ -38,18 +38,18 @@ multiclass CMOV<bits<8> opc, string Mnemonic, PatLeaf CondNode> {
}
let Uses = [EFLAGS], Predicates = [HasCMov], Constraints = "$src1 = $dst" in {
- def #NAME#16rm
+ def NAME#16rm
: I<opc, MRMSrcMem, (outs GR16:$dst), (ins GR16:$src1, i16mem:$src2),
!strconcat(Mnemonic, "{w}\t{$src2, $dst|$dst, $src2}"),
[(set GR16:$dst, (X86cmov GR16:$src1, (loadi16 addr:$src2),
CondNode, EFLAGS))], IIC_CMOV16_RM>,
TB, OpSize;
- def #NAME#32rm
+ def NAME#32rm
: I<opc, MRMSrcMem, (outs GR32:$dst), (ins GR32:$src1, i32mem:$src2),
!strconcat(Mnemonic, "{l}\t{$src2, $dst|$dst, $src2}"),
[(set GR32:$dst, (X86cmov GR32:$src1, (loadi32 addr:$src2),
CondNode, EFLAGS))], IIC_CMOV32_RM>, TB;
- def #NAME#64rm
+ def NAME#64rm
:RI<opc, MRMSrcMem, (outs GR64:$dst), (ins GR64:$src1, i64mem:$src2),
!strconcat(Mnemonic, "{q}\t{$src2, $dst|$dst, $src2}"),
[(set GR64:$dst, (X86cmov GR64:$src1, (loadi64 addr:$src2),
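
Editorial note: the CMOV multiclass edits above are the same #NAME# to NAME# migration with no behavioral change. As a hedged illustration (the opcode and condition leaf are indicative, not quoted from the file), an instantiation expands into the per-width records like this:

// Sketch only: NAME = "CMOVE", so the multiclass yields CMOVE16rr, CMOVE32rr,
// CMOVE64rr and the corresponding rm forms.
defm CMOVE : CMOV<0x44, "cmove", X86_COND_E>;
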
diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td
index 9e6f27988f..734e5982b2 100644
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td
@@ -513,18 +513,22 @@ def CMOV_RFP80 : I<0, Pseudo,
multiclass PSEUDO_ATOMIC_LOAD_BINOP<string mnemonic> {
let usesCustomInserter = 1, mayLoad = 1, mayStore = 1 in {
- def #NAME#8 : I<0, Pseudo, (outs GR8:$dst),
- (ins i8mem:$ptr, GR8:$val),
- !strconcat(mnemonic, "8 PSEUDO!"), []>;
- def #NAME#16 : I<0, Pseudo,(outs GR16:$dst),
- (ins i16mem:$ptr, GR16:$val),
- !strconcat(mnemonic, "16 PSEUDO!"), []>;
- def #NAME#32 : I<0, Pseudo, (outs GR32:$dst),
- (ins i32mem:$ptr, GR32:$val),
- !strconcat(mnemonic, "32 PSEUDO!"), []>;
- def #NAME#64 : I<0, Pseudo, (outs GR64:$dst),
- (ins i64mem:$ptr, GR64:$val),
- !strconcat(mnemonic, "64 PSEUDO!"), []>;
+ let Defs = [EFLAGS, AL] in
+ def NAME#8 : I<0, Pseudo, (outs GR8:$dst),
+ (ins i8mem:$ptr, GR8:$val),
+ !strconcat(mnemonic, "8 PSEUDO!"), []>;
+ let Defs = [EFLAGS, AX] in
+ def NAME#16 : I<0, Pseudo,(outs GR16:$dst),
+ (ins i16mem:$ptr, GR16:$val),
+ !strconcat(mnemonic, "16 PSEUDO!"), []>;
+ let Defs = [EFLAGS, EAX] in
+ def NAME#32 : I<0, Pseudo, (outs GR32:$dst),
+ (ins i32mem:$ptr, GR32:$val),
+ !strconcat(mnemonic, "32 PSEUDO!"), []>;
+ let Defs = [EFLAGS, RAX] in
+ def NAME#64 : I<0, Pseudo, (outs GR64:$dst),
+ (ins i64mem:$ptr, GR64:$val),
+ !strconcat(mnemonic, "64 PSEUDO!"), []>;
}
}
@@ -559,10 +563,11 @@ defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMUMAX", "atomic_load_umax">;
defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMUMIN", "atomic_load_umin">;
multiclass PSEUDO_ATOMIC_LOAD_BINOP6432<string mnemonic> {
- let usesCustomInserter = 1, mayLoad = 1, mayStore = 1 in
- def #NAME#6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
- (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
- !strconcat(mnemonic, "6432 PSEUDO!"), []>;
+ let usesCustomInserter = 1, Defs = [EFLAGS, EAX, EDX],
+ mayLoad = 1, mayStore = 1, hasSideEffects = 0 in
+ def NAME#6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
+ (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
+ !strconcat(mnemonic, "6432 PSEUDO!"), []>;
}
defm ATOMAND : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMAND">;
@@ -604,77 +609,77 @@ multiclass LOCK_ArithBinOp<bits<8> RegOpc, bits<8> ImmOpc, bits<8> ImmOpc8,
Format ImmMod, string mnemonic> {
let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1 in {
-def #NAME#8mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
- RegOpc{3}, RegOpc{2}, RegOpc{1}, 0 },
- MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2),
- !strconcat(mnemonic, "{b}\t",
+def NAME#8mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
+ RegOpc{3}, RegOpc{2}, RegOpc{1}, 0 },
+ MRMDestMem, (outs), (ins i8mem:$dst, GR8:$src2),
+ !strconcat(mnemonic, "{b}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [], IIC_ALU_NONMEM>, LOCK;
+def NAME#16mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
+ RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
+ MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
+ !strconcat(mnemonic, "{w}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [], IIC_ALU_NONMEM>, OpSize, LOCK;
+def NAME#32mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
+ RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
+ MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
+ !strconcat(mnemonic, "{l}\t",
"{$src2, $dst|$dst, $src2}"),
[], IIC_ALU_NONMEM>, LOCK;
-def #NAME#16mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
- RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
- MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
- !strconcat(mnemonic, "{w}\t",
- "{$src2, $dst|$dst, $src2}"),
- [], IIC_ALU_NONMEM>, OpSize, LOCK;
-def #NAME#32mr : I<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
+def NAME#64mr : RI<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
- MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
- !strconcat(mnemonic, "{l}\t",
+ MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
+ !strconcat(mnemonic, "{q}\t",
"{$src2, $dst|$dst, $src2}"),
[], IIC_ALU_NONMEM>, LOCK;
-def #NAME#64mr : RI<{RegOpc{7}, RegOpc{6}, RegOpc{5}, RegOpc{4},
- RegOpc{3}, RegOpc{2}, RegOpc{1}, 1 },
- MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
- !strconcat(mnemonic, "{q}\t",
- "{$src2, $dst|$dst, $src2}"),
- [], IIC_ALU_NONMEM>, LOCK;
-
-def #NAME#8mi : Ii8<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
- ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 0 },
- ImmMod, (outs), (ins i8mem :$dst, i8imm :$src2),
- !strconcat(mnemonic, "{b}\t",
- "{$src2, $dst|$dst, $src2}"),
- [], IIC_ALU_MEM>, LOCK;
-
-def #NAME#16mi : Ii16<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
- ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
- ImmMod, (outs), (ins i16mem :$dst, i16imm :$src2),
- !strconcat(mnemonic, "{w}\t",
- "{$src2, $dst|$dst, $src2}"),
- [], IIC_ALU_MEM>, OpSize, LOCK;
-
-def #NAME#32mi : Ii32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
- ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
- ImmMod, (outs), (ins i32mem :$dst, i32imm :$src2),
- !strconcat(mnemonic, "{l}\t",
- "{$src2, $dst|$dst, $src2}"),
- [], IIC_ALU_MEM>, LOCK;
-def #NAME#64mi32 : RIi32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
- ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
- ImmMod, (outs), (ins i64mem :$dst, i64i32imm :$src2),
- !strconcat(mnemonic, "{q}\t",
- "{$src2, $dst|$dst, $src2}"),
- [], IIC_ALU_MEM>, LOCK;
-
-def #NAME#16mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
- ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
- ImmMod, (outs), (ins i16mem :$dst, i16i8imm :$src2),
- !strconcat(mnemonic, "{w}\t",
- "{$src2, $dst|$dst, $src2}"),
- [], IIC_ALU_MEM>, OpSize, LOCK;
-def #NAME#32mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
+def NAME#8mi : Ii8<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
+ ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 0 },
+ ImmMod, (outs), (ins i8mem :$dst, i8imm :$src2),
+ !strconcat(mnemonic, "{b}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [], IIC_ALU_MEM>, LOCK;
+
+def NAME#16mi : Ii16<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
+ ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
+ ImmMod, (outs), (ins i16mem :$dst, i16imm :$src2),
+ !strconcat(mnemonic, "{w}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [], IIC_ALU_MEM>, OpSize, LOCK;
+
+def NAME#32mi : Ii32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
+ ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
+ ImmMod, (outs), (ins i32mem :$dst, i32imm :$src2),
+ !strconcat(mnemonic, "{l}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [], IIC_ALU_MEM>, LOCK;
+
+def NAME#64mi32 : RIi32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
+ ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
+ ImmMod, (outs), (ins i64mem :$dst, i64i32imm :$src2),
+ !strconcat(mnemonic, "{q}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [], IIC_ALU_MEM>, LOCK;
+
+def NAME#16mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
+ ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
+ ImmMod, (outs), (ins i16mem :$dst, i16i8imm :$src2),
+ !strconcat(mnemonic, "{w}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [], IIC_ALU_MEM>, OpSize, LOCK;
+def NAME#32mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
+ ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
+ ImmMod, (outs), (ins i32mem :$dst, i32i8imm :$src2),
+ !strconcat(mnemonic, "{l}\t",
+ "{$src2, $dst|$dst, $src2}"),
+ [], IIC_ALU_MEM>, LOCK;
+def NAME#64mi8 : RIi8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
- ImmMod, (outs), (ins i32mem :$dst, i32i8imm :$src2),
- !strconcat(mnemonic, "{l}\t",
+ ImmMod, (outs), (ins i64mem :$dst, i64i8imm :$src2),
+ !strconcat(mnemonic, "{q}\t",
"{$src2, $dst|$dst, $src2}"),
[], IIC_ALU_MEM>, LOCK;
-def #NAME#64mi8 : RIi8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
- ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
- ImmMod, (outs), (ins i64mem :$dst, i64i8imm :$src2),
- !strconcat(mnemonic, "{q}\t",
- "{$src2, $dst|$dst, $src2}"),
- [], IIC_ALU_MEM>, LOCK;
}
@@ -691,18 +696,18 @@ multiclass LOCK_ArithUnOp<bits<8> Opc8, bits<8> Opc, Format Form,
string mnemonic> {
let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1 in {
-def #NAME#8m : I<Opc8, Form, (outs), (ins i8mem :$dst),
- !strconcat(mnemonic, "{b}\t$dst"),
+def NAME#8m : I<Opc8, Form, (outs), (ins i8mem :$dst),
+ !strconcat(mnemonic, "{b}\t$dst"),
+ [], IIC_UNARY_MEM>, LOCK;
+def NAME#16m : I<Opc, Form, (outs), (ins i16mem:$dst),
+ !strconcat(mnemonic, "{w}\t$dst"),
+ [], IIC_UNARY_MEM>, OpSize, LOCK;
+def NAME#32m : I<Opc, Form, (outs), (ins i32mem:$dst),
+ !strconcat(mnemonic, "{l}\t$dst"),
+ [], IIC_UNARY_MEM>, LOCK;
+def NAME#64m : RI<Opc, Form, (outs), (ins i64mem:$dst),
+ !strconcat(mnemonic, "{q}\t$dst"),
[], IIC_UNARY_MEM>, LOCK;
-def #NAME#16m : I<Opc, Form, (outs), (ins i16mem:$dst),
- !strconcat(mnemonic, "{w}\t$dst"),
- [], IIC_UNARY_MEM>, OpSize, LOCK;
-def #NAME#32m : I<Opc, Form, (outs), (ins i32mem:$dst),
- !strconcat(mnemonic, "{l}\t$dst"),
- [], IIC_UNARY_MEM>, LOCK;
-def #NAME#64m : RI<Opc, Form, (outs), (ins i64mem:$dst),
- !strconcat(mnemonic, "{q}\t$dst"),
- [], IIC_UNARY_MEM>, LOCK;
}
}
@@ -714,9 +719,9 @@ multiclass LCMPXCHG_UnOp<bits<8> Opc, Format Form, string mnemonic,
SDPatternOperator frag, X86MemOperand x86memop,
InstrItinClass itin> {
let isCodeGenOnly = 1 in {
- def #NAME# : I<Opc, Form, (outs), (ins x86memop:$ptr),
- !strconcat(mnemonic, "\t$ptr"),
- [(frag addr:$ptr)], itin>, TB, LOCK;
+ def NAME : I<Opc, Form, (outs), (ins x86memop:$ptr),
+ !strconcat(mnemonic, "\t$ptr"),
+ [(frag addr:$ptr)], itin>, TB, LOCK;
}
}
@@ -725,21 +730,21 @@ multiclass LCMPXCHG_BinOp<bits<8> Opc8, bits<8> Opc, Format Form,
InstrItinClass itin8, InstrItinClass itin> {
let isCodeGenOnly = 1 in {
let Defs = [AL, EFLAGS], Uses = [AL] in
- def #NAME#8 : I<Opc8, Form, (outs), (ins i8mem:$ptr, GR8:$swap),
- !strconcat(mnemonic, "{b}\t{$swap, $ptr|$ptr, $swap}"),
- [(frag addr:$ptr, GR8:$swap, 1)], itin8>, TB, LOCK;
+ def NAME#8 : I<Opc8, Form, (outs), (ins i8mem:$ptr, GR8:$swap),
+ !strconcat(mnemonic, "{b}\t{$swap, $ptr|$ptr, $swap}"),
+ [(frag addr:$ptr, GR8:$swap, 1)], itin8>, TB, LOCK;
let Defs = [AX, EFLAGS], Uses = [AX] in
- def #NAME#16 : I<Opc, Form, (outs), (ins i16mem:$ptr, GR16:$swap),
- !strconcat(mnemonic, "{w}\t{$swap, $ptr|$ptr, $swap}"),
- [(frag addr:$ptr, GR16:$swap, 2)], itin>, TB, OpSize, LOCK;
+ def NAME#16 : I<Opc, Form, (outs), (ins i16mem:$ptr, GR16:$swap),
+ !strconcat(mnemonic, "{w}\t{$swap, $ptr|$ptr, $swap}"),
+ [(frag addr:$ptr, GR16:$swap, 2)], itin>, TB, OpSize, LOCK;
let Defs = [EAX, EFLAGS], Uses = [EAX] in
- def #NAME#32 : I<Opc, Form, (outs), (ins i32mem:$ptr, GR32:$swap),
- !strconcat(mnemonic, "{l}\t{$swap, $ptr|$ptr, $swap}"),
- [(frag addr:$ptr, GR32:$swap, 4)], itin>, TB, LOCK;
+ def NAME#32 : I<Opc, Form, (outs), (ins i32mem:$ptr, GR32:$swap),
+ !strconcat(mnemonic, "{l}\t{$swap, $ptr|$ptr, $swap}"),
+ [(frag addr:$ptr, GR32:$swap, 4)], itin>, TB, LOCK;
let Defs = [RAX, EFLAGS], Uses = [RAX] in
- def #NAME#64 : RI<Opc, Form, (outs), (ins i64mem:$ptr, GR64:$swap),
- !strconcat(mnemonic, "{q}\t{$swap, $ptr|$ptr, $swap}"),
- [(frag addr:$ptr, GR64:$swap, 8)], itin>, TB, LOCK;
+ def NAME#64 : RI<Opc, Form, (outs), (ins i64mem:$ptr, GR64:$swap),
+ !strconcat(mnemonic, "{q}\t{$swap, $ptr|$ptr, $swap}"),
+ [(frag addr:$ptr, GR64:$swap, 8)], itin>, TB, LOCK;
}
}
@@ -764,33 +769,33 @@ multiclass ATOMIC_LOAD_BINOP<bits<8> opc8, bits<8> opc, string mnemonic,
string frag,
InstrItinClass itin8, InstrItinClass itin> {
let Constraints = "$val = $dst", Defs = [EFLAGS], isCodeGenOnly = 1 in {
- def #NAME#8 : I<opc8, MRMSrcMem, (outs GR8:$dst),
- (ins GR8:$val, i8mem:$ptr),
- !strconcat(mnemonic, "{b}\t{$val, $ptr|$ptr, $val}"),
- [(set GR8:$dst,
- (!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val))],
- itin8>;
- def #NAME#16 : I<opc, MRMSrcMem, (outs GR16:$dst),
- (ins GR16:$val, i16mem:$ptr),
- !strconcat(mnemonic, "{w}\t{$val, $ptr|$ptr, $val}"),
- [(set
- GR16:$dst,
- (!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))],
- itin>, OpSize;
- def #NAME#32 : I<opc, MRMSrcMem, (outs GR32:$dst),
- (ins GR32:$val, i32mem:$ptr),
- !strconcat(mnemonic, "{l}\t{$val, $ptr|$ptr, $val}"),
+ def NAME#8 : I<opc8, MRMSrcMem, (outs GR8:$dst),
+ (ins GR8:$val, i8mem:$ptr),
+ !strconcat(mnemonic, "{b}\t{$val, $ptr|$ptr, $val}"),
+ [(set GR8:$dst,
+ (!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val))],
+ itin8>;
+ def NAME#16 : I<opc, MRMSrcMem, (outs GR16:$dst),
+ (ins GR16:$val, i16mem:$ptr),
+ !strconcat(mnemonic, "{w}\t{$val, $ptr|$ptr, $val}"),
+ [(set
+ GR16:$dst,
+ (!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))],
+ itin>, OpSize;
+ def NAME#32 : I<opc, MRMSrcMem, (outs GR32:$dst),
+ (ins GR32:$val, i32mem:$ptr),
+ !strconcat(mnemonic, "{l}\t{$val, $ptr|$ptr, $val}"),
+ [(set
+ GR32:$dst,
+ (!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))],
+ itin>;
+ def NAME#64 : RI<opc, MRMSrcMem, (outs GR64:$dst),
+ (ins GR64:$val, i64mem:$ptr),
+ !strconcat(mnemonic, "{q}\t{$val, $ptr|$ptr, $val}"),
[(set
- GR32:$dst,
- (!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))],
+ GR64:$dst,
+ (!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val))],
itin>;
- def #NAME#64 : RI<opc, MRMSrcMem, (outs GR64:$dst),
- (ins GR64:$val, i64mem:$ptr),
- !strconcat(mnemonic, "{q}\t{$val, $ptr|$ptr, $val}"),
- [(set
- GR64:$dst,
- (!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val))],
- itin>;
}
}
@@ -1076,12 +1081,14 @@ def : Pat<(X86cmp GR64:$src1, 0),
// inverted.
multiclass CMOVmr<PatLeaf InvertedCond, Instruction Inst16, Instruction Inst32,
Instruction Inst64> {
- def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, InvertedCond, EFLAGS),
- (Inst16 GR16:$src2, addr:$src1)>;
- def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, InvertedCond, EFLAGS),
- (Inst32 GR32:$src2, addr:$src1)>;
- def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, InvertedCond, EFLAGS),
- (Inst64 GR64:$src2, addr:$src1)>;
+ let Predicates = [HasCMov] in {
+ def : Pat<(X86cmov (loadi16 addr:$src1), GR16:$src2, InvertedCond, EFLAGS),
+ (Inst16 GR16:$src2, addr:$src1)>;
+ def : Pat<(X86cmov (loadi32 addr:$src1), GR32:$src2, InvertedCond, EFLAGS),
+ (Inst32 GR32:$src2, addr:$src1)>;
+ def : Pat<(X86cmov (loadi64 addr:$src1), GR64:$src2, InvertedCond, EFLAGS),
+ (Inst64 GR64:$src2, addr:$src1)>;
+ }
}
defm : CMOVmr<X86_COND_B , CMOVAE16rm, CMOVAE32rm, CMOVAE64rm>;
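
Editorial note on X86InstrCompiler.td: besides the NAME# migration and re-indentation, the pseudo-atomic multiclasses now declare their physical-register clobbers explicitly (EFLAGS plus the accumulator of the matching width, and EAX/EDX for the 6432 form), and the CMOVmr load-folding patterns are guarded by HasCMov. A sketch of how the new per-width "let Defs" blocks surface at instantiation (the defm line is illustrative, along the lines of the ones elsewhere in this file):

// Sketch only: each expanded pseudo records its clobbers.
defm ATOMAND : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMAND">;
// ATOMAND8  -> Defs = [EFLAGS, AL]    ATOMAND16 -> Defs = [EFLAGS, AX]
// ATOMAND32 -> Defs = [EFLAGS, EAX]   ATOMAND64 -> Defs = [EFLAGS, RAX]
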
diff --git a/lib/Target/X86/X86InstrFMA.td b/lib/Target/X86/X86InstrFMA.td
index d360a73b34..7759a8a2da 100644
--- a/lib/Target/X86/X86InstrFMA.td
+++ b/lib/Target/X86/X86InstrFMA.td
@@ -60,14 +60,14 @@ multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
PatFrag MemFrag128, PatFrag MemFrag256,
SDNode Op, ValueType OpTy128, ValueType OpTy256> {
defm r213 : fma3p_rm<opc213,
- !strconcat(OpcodeStr, !strconcat("213", PackTy)),
+ !strconcat(OpcodeStr, "213", PackTy),
MemFrag128, MemFrag256, OpTy128, OpTy256, Op>;
let neverHasSideEffects = 1 in {
defm r132 : fma3p_rm<opc132,
- !strconcat(OpcodeStr, !strconcat("132", PackTy)),
+ !strconcat(OpcodeStr, "132", PackTy),
MemFrag128, MemFrag256, OpTy128, OpTy256>;
defm r231 : fma3p_rm<opc231,
- !strconcat(OpcodeStr, !strconcat("231", PackTy)),
+ !strconcat(OpcodeStr, "231", PackTy),
MemFrag128, MemFrag256, OpTy128, OpTy256>;
} // neverHasSideEffects = 1
}
@@ -160,15 +160,15 @@ multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
X86MemOperand x86memop, Operand memop, PatFrag mem_frag,
ComplexPattern mem_cpat> {
let neverHasSideEffects = 1 in {
- defm r132 : fma3s_rm<opc132, !strconcat(OpStr, !strconcat("132", PackTy)),
+ defm r132 : fma3s_rm<opc132, !strconcat(OpStr, "132", PackTy),
x86memop, RC, OpVT, mem_frag>;
- defm r231 : fma3s_rm<opc231, !strconcat(OpStr, !strconcat("231", PackTy)),
+ defm r231 : fma3s_rm<opc231, !strconcat(OpStr, "231", PackTy),
x86memop, RC, OpVT, mem_frag>;
}
-defm r213 : fma3s_rm<opc213, !strconcat(OpStr, !strconcat("213", PackTy)),
+defm r213 : fma3s_rm<opc213, !strconcat(OpStr, "213", PackTy),
x86memop, RC, OpVT, mem_frag, OpNode>,
- fma3s_rm_int<opc213, !strconcat(OpStr, !strconcat("213", PackTy)),
+ fma3s_rm_int<opc213, !strconcat(OpStr, "213", PackTy),
memop, mem_cpat, Int, RC>;
}
@@ -220,7 +220,7 @@ multiclass fma4s<bits<8> opc, string OpcodeStr, RegisterClass RC,
[(set RC:$dst,
(OpNode RC:$src1, (mem_frag addr:$src2), RC:$src3))]>;
// For disassembler
-let isCodeGenOnly = 1 in
+let isCodeGenOnly = 1, hasSideEffects = 0 in
def rr_REV : FMA4<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
@@ -294,7 +294,7 @@ multiclass fma4p<bits<8> opc, string OpcodeStr, SDNode OpNode,
[(set VR256:$dst, (OpNode VR256:$src1,
(ld_frag256 addr:$src2), VR256:$src3))]>, VEX_L;
// For disassembler
-let isCodeGenOnly = 1 in {
+let isCodeGenOnly = 1, hasSideEffects = 0 in {
def rr_REV : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr,
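
Editorial note on X86InstrFMA.td: the nested !strconcat calls are replaced by the variadic form, and the disassembler-only _REV forms gain hasSideEffects = 0. The hypothetical class below (an editorial sketch, not in the tree) shows that the two spellings evaluate to the same string, so the change is purely cosmetic:

class StrConcatDemo<string OpStr, string PackTy> {
  string Nested   = !strconcat(OpStr, !strconcat("213", PackTy)); // "vfmadd213ps"
  string Variadic = !strconcat(OpStr, "213", PackTy);             // same result
}
def fmadd_ps_demo : StrConcatDemo<"vfmadd", "ps">;
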
diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td
index 6151d5cce2..44e574d246 100644
--- a/lib/Target/X86/X86InstrFormats.td
+++ b/lib/Target/X86/X86InstrFormats.td
@@ -570,7 +570,7 @@ class FMA3<bits<8> o, Format F, dag outs, dag ins, string asm,
// FMA4 Instruction Templates
class FMA4<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern, InstrItinClass itin = IIC_DEFAULT>
- : I<o, F, outs, ins, asm, pattern, itin>, TA,
+ : Ii8<o, F, outs, ins, asm, pattern, itin>, TA,
OpSize, VEX_4V, VEX_I8IMM, Requires<[HasFMA4]>;
// XOP 2, 3 and 4 Operand Instruction Template
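
Editorial note on X86InstrFormats.td: basing the FMA4 template on Ii8 instead of I makes the 8-bit immediate that VEX_I8IMM uses to encode the extra register operand part of the instruction format (ImmT = Imm8), so size and disassembly bookkeeping match the actual encoding. A definition built on the template is otherwise unchanged; the sketch below is hypothetical (opcode and operands illustrative, not quoted from the tree):

def VFMADDSS4rr_demo : FMA4<0x6A, MRMSrcReg, (outs VR128:$dst),
                            (ins VR128:$src1, VR128:$src2, VR128:$src3),
                            "vfmaddss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
                            []>;
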
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 09ab995166..2a72fb6f7b 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -27,6 +27,11 @@ def SDTX86FPShiftOp : SDTypeProfile<1, 2, [ SDTCisSameAs<0, 1>,
def SDTX86VFCMP : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>,
SDTCisFP<1>, SDTCisVT<3, i8>]>;
+def X86umin : SDNode<"X86ISD::UMIN", SDTIntBinOp>;
+def X86umax : SDNode<"X86ISD::UMAX", SDTIntBinOp>;
+def X86smin : SDNode<"X86ISD::SMIN", SDTIntBinOp>;
+def X86smax : SDNode<"X86ISD::SMAX", SDTIntBinOp>;
+
def X86fmin : SDNode<"X86ISD::FMIN", SDTFPBinOp>;
def X86fmax : SDNode<"X86ISD::FMAX", SDTFPBinOp>;
@@ -128,6 +133,7 @@ def X86vsrai : SDNode<"X86ISD::VSRAI", SDTIntShiftOp>;
def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
SDTCisVec<1>,
SDTCisSameAs<2, 1>]>;
+def X86subus : SDNode<"X86ISD::SUBUS", SDTIntBinOp>;
def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>;
def X86testp : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>;
@@ -154,7 +160,7 @@ def SDTBlend : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
def SDTFma : SDTypeProfile<1, 3, [SDTCisSameAs<0,1>,
SDTCisSameAs<1,2>, SDTCisSameAs<1,3>]>;
-def X86PAlign : SDNode<"X86ISD::PALIGN", SDTShuff3OpI>;
+def X86PAlignr : SDNode<"X86ISD::PALIGNR", SDTShuff3OpI>;
def X86PShufd : SDNode<"X86ISD::PSHUFD", SDTShuff2OpI>;
def X86PShufhw : SDNode<"X86ISD::PSHUFHW", SDTShuff2OpI>;
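
Editorial note on X86InstrFragmentsSIMD.td: the hunks above introduce target-specific DAG nodes for integer min/max (X86umin/X86umax/X86smin/X86smax) and unsigned saturating subtract (X86subus), and rename X86PAlign to X86PAlignr. As a hedged illustration of how such a node is consumed (the pattern below is not taken from the patch; PMINUBrr is the existing SSE2 byte-minimum instruction):

// Sketch only: select a byte-wise unsigned minimum through the new node.
def : Pat<(v16i8 (X86umin VR128:$src1, VR128:$src2)),
          (PMINUBrr VR128:$src1, VR128:$src2)>;
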
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 4c61b32cac..17714acd86 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -24,8 +24,8 @@
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/LLVMContext.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/LLVMContext.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCInst.h"
#include "llvm/Support/CommandLine.h"
@@ -297,7 +297,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::DIV32r, X86::DIV32m, TB_FOLDED_LOAD },
{ X86::DIV64r, X86::DIV64m, TB_FOLDED_LOAD },
{ X86::DIV8r, X86::DIV8m, TB_FOLDED_LOAD },
- { X86::EXTRACTPSrr, X86::EXTRACTPSmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::EXTRACTPSrr, X86::EXTRACTPSmr, TB_FOLDED_STORE },
{ X86::FsMOVAPDrr, X86::MOVSDmr, TB_FOLDED_STORE | TB_NO_REVERSE },
{ X86::FsMOVAPSrr, X86::MOVSSmr, TB_FOLDED_STORE | TB_NO_REVERSE },
{ X86::IDIV16r, X86::IDIV16m, TB_FOLDED_LOAD },
@@ -355,7 +355,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::TEST64ri32, X86::TEST64mi32, TB_FOLDED_LOAD },
{ X86::TEST8ri, X86::TEST8mi, TB_FOLDED_LOAD },
// AVX 128-bit versions of foldable instructions
- { X86::VEXTRACTPSrr,X86::VEXTRACTPSmr, TB_FOLDED_STORE | TB_ALIGN_16 },
+ { X86::VEXTRACTPSrr,X86::VEXTRACTPSmr, TB_FOLDED_STORE },
{ X86::FsVMOVAPDrr, X86::VMOVSDmr, TB_FOLDED_STORE | TB_NO_REVERSE },
{ X86::FsVMOVAPSrr, X86::VMOVSSmr, TB_FOLDED_STORE | TB_NO_REVERSE },
{ X86::VEXTRACTF128rr, X86::VEXTRACTF128mr, TB_FOLDED_STORE | TB_ALIGN_16 },
@@ -467,9 +467,7 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::RSQRTSSr, X86::RSQRTSSm, 0 },
{ X86::RSQRTSSr_Int, X86::RSQRTSSm_Int, 0 },
{ X86::SQRTPDr, X86::SQRTPDm, TB_ALIGN_16 },
- { X86::SQRTPDr_Int, X86::SQRTPDm_Int, TB_ALIGN_16 },
{ X86::SQRTPSr, X86::SQRTPSm, TB_ALIGN_16 },
- { X86::SQRTPSr_Int, X86::SQRTPSm_Int, TB_ALIGN_16 },
{ X86::SQRTSDr, X86::SQRTSDm, 0 },
{ X86::SQRTSDr_Int, X86::SQRTSDm_Int, 0 },
{ X86::SQRTSSr, X86::SQRTSSm, 0 },
@@ -510,27 +508,25 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::VMOVDQArr, X86::VMOVDQArm, TB_ALIGN_16 },
{ X86::VMOVSLDUPrr, X86::VMOVSLDUPrm, TB_ALIGN_16 },
{ X86::VMOVSHDUPrr, X86::VMOVSHDUPrm, TB_ALIGN_16 },
- { X86::VMOVUPDrr, X86::VMOVUPDrm, TB_ALIGN_16 },
+ { X86::VMOVUPDrr, X86::VMOVUPDrm, 0 },
{ X86::VMOVUPSrr, X86::VMOVUPSrm, 0 },
{ X86::VMOVZDI2PDIrr, X86::VMOVZDI2PDIrm, 0 },
{ X86::VMOVZQI2PQIrr, X86::VMOVZQI2PQIrm, 0 },
{ X86::VMOVZPQILo2PQIrr,X86::VMOVZPQILo2PQIrm, TB_ALIGN_16 },
- { X86::VPABSBrr128, X86::VPABSBrm128, TB_ALIGN_16 },
- { X86::VPABSDrr128, X86::VPABSDrm128, TB_ALIGN_16 },
- { X86::VPABSWrr128, X86::VPABSWrm128, TB_ALIGN_16 },
- { X86::VPERMILPDri, X86::VPERMILPDmi, TB_ALIGN_16 },
- { X86::VPERMILPSri, X86::VPERMILPSmi, TB_ALIGN_16 },
- { X86::VPSHUFDri, X86::VPSHUFDmi, TB_ALIGN_16 },
- { X86::VPSHUFHWri, X86::VPSHUFHWmi, TB_ALIGN_16 },
- { X86::VPSHUFLWri, X86::VPSHUFLWmi, TB_ALIGN_16 },
- { X86::VRCPPSr, X86::VRCPPSm, TB_ALIGN_16 },
- { X86::VRCPPSr_Int, X86::VRCPPSm_Int, TB_ALIGN_16 },
- { X86::VRSQRTPSr, X86::VRSQRTPSm, TB_ALIGN_16 },
- { X86::VRSQRTPSr_Int, X86::VRSQRTPSm_Int, TB_ALIGN_16 },
- { X86::VSQRTPDr, X86::VSQRTPDm, TB_ALIGN_16 },
- { X86::VSQRTPDr_Int, X86::VSQRTPDm_Int, TB_ALIGN_16 },
- { X86::VSQRTPSr, X86::VSQRTPSm, TB_ALIGN_16 },
- { X86::VSQRTPSr_Int, X86::VSQRTPSm_Int, TB_ALIGN_16 },
+ { X86::VPABSBrr128, X86::VPABSBrm128, 0 },
+ { X86::VPABSDrr128, X86::VPABSDrm128, 0 },
+ { X86::VPABSWrr128, X86::VPABSWrm128, 0 },
+ { X86::VPERMILPDri, X86::VPERMILPDmi, 0 },
+ { X86::VPERMILPSri, X86::VPERMILPSmi, 0 },
+ { X86::VPSHUFDri, X86::VPSHUFDmi, 0 },
+ { X86::VPSHUFHWri, X86::VPSHUFHWmi, 0 },
+ { X86::VPSHUFLWri, X86::VPSHUFLWmi, 0 },
+ { X86::VRCPPSr, X86::VRCPPSm, 0 },
+ { X86::VRCPPSr_Int, X86::VRCPPSm_Int, 0 },
+ { X86::VRSQRTPSr, X86::VRSQRTPSm, 0 },
+ { X86::VRSQRTPSr_Int, X86::VRSQRTPSm_Int, 0 },
+ { X86::VSQRTPDr, X86::VSQRTPDm, 0 },
+ { X86::VSQRTPSr, X86::VSQRTPSm, 0 },
{ X86::VUCOMISDrr, X86::VUCOMISDrm, 0 },
{ X86::VUCOMISSrr, X86::VUCOMISSrm, 0 },
{ X86::VBROADCASTSSrr, X86::VBROADCASTSSrm, TB_NO_REVERSE },
@@ -541,28 +537,41 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::VMOVDQAYrr, X86::VMOVDQAYrm, TB_ALIGN_32 },
{ X86::VMOVUPDYrr, X86::VMOVUPDYrm, 0 },
{ X86::VMOVUPSYrr, X86::VMOVUPSYrm, 0 },
- { X86::VPERMILPDYri, X86::VPERMILPDYmi, TB_ALIGN_32 },
- { X86::VPERMILPSYri, X86::VPERMILPSYmi, TB_ALIGN_32 },
+ { X86::VPERMILPDYri, X86::VPERMILPDYmi, 0 },
+ { X86::VPERMILPSYri, X86::VPERMILPSYmi, 0 },
// AVX2 foldable instructions
- { X86::VPABSBrr256, X86::VPABSBrm256, TB_ALIGN_32 },
- { X86::VPABSDrr256, X86::VPABSDrm256, TB_ALIGN_32 },
- { X86::VPABSWrr256, X86::VPABSWrm256, TB_ALIGN_32 },
- { X86::VPSHUFDYri, X86::VPSHUFDYmi, TB_ALIGN_32 },
- { X86::VPSHUFHWYri, X86::VPSHUFHWYmi, TB_ALIGN_32 },
- { X86::VPSHUFLWYri, X86::VPSHUFLWYmi, TB_ALIGN_32 },
- { X86::VRCPPSYr, X86::VRCPPSYm, TB_ALIGN_32 },
- { X86::VRCPPSYr_Int, X86::VRCPPSYm_Int, TB_ALIGN_32 },
- { X86::VRSQRTPSYr, X86::VRSQRTPSYm, TB_ALIGN_32 },
- { X86::VRSQRTPSYr_Int, X86::VRSQRTPSYm_Int, TB_ALIGN_32 },
- { X86::VSQRTPDYr, X86::VSQRTPDYm, TB_ALIGN_32 },
- { X86::VSQRTPDYr_Int, X86::VSQRTPDYm_Int, TB_ALIGN_32 },
- { X86::VSQRTPSYr, X86::VSQRTPSYm, TB_ALIGN_32 },
- { X86::VSQRTPSYr_Int, X86::VSQRTPSYm_Int, TB_ALIGN_32 },
+ { X86::VPABSBrr256, X86::VPABSBrm256, 0 },
+ { X86::VPABSDrr256, X86::VPABSDrm256, 0 },
+ { X86::VPABSWrr256, X86::VPABSWrm256, 0 },
+ { X86::VPSHUFDYri, X86::VPSHUFDYmi, 0 },
+ { X86::VPSHUFHWYri, X86::VPSHUFHWYmi, 0 },
+ { X86::VPSHUFLWYri, X86::VPSHUFLWYmi, 0 },
+ { X86::VRCPPSYr, X86::VRCPPSYm, 0 },
+ { X86::VRCPPSYr_Int, X86::VRCPPSYm_Int, 0 },
+ { X86::VRSQRTPSYr, X86::VRSQRTPSYm, 0 },
+ { X86::VSQRTPDYr, X86::VSQRTPDYm, 0 },
+ { X86::VSQRTPSYr, X86::VSQRTPSYm, 0 },
{ X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm, TB_NO_REVERSE },
{ X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm, TB_NO_REVERSE },
- // BMI/BMI2 foldable instructions
+ // BMI/BMI2/LZCNT/POPCNT foldable instructions
+ { X86::BEXTR32rr, X86::BEXTR32rm, 0 },
+ { X86::BEXTR64rr, X86::BEXTR64rm, 0 },
+ { X86::BLSI32rr, X86::BLSI32rm, 0 },
+ { X86::BLSI64rr, X86::BLSI64rm, 0 },
+ { X86::BLSMSK32rr, X86::BLSMSK32rm, 0 },
+ { X86::BLSMSK64rr, X86::BLSMSK64rm, 0 },
+ { X86::BLSR32rr, X86::BLSR32rm, 0 },
+ { X86::BLSR64rr, X86::BLSR64rm, 0 },
+ { X86::BZHI32rr, X86::BZHI32rm, 0 },
+ { X86::BZHI64rr, X86::BZHI64rm, 0 },
+ { X86::LZCNT16rr, X86::LZCNT16rm, 0 },
+ { X86::LZCNT32rr, X86::LZCNT32rm, 0 },
+ { X86::LZCNT64rr, X86::LZCNT64rm, 0 },
+ { X86::POPCNT16rr, X86::POPCNT16rm, 0 },
+ { X86::POPCNT32rr, X86::POPCNT32rm, 0 },
+ { X86::POPCNT64rr, X86::POPCNT64rm, 0 },
{ X86::RORX32ri, X86::RORX32mi, 0 },
{ X86::RORX64ri, X86::RORX64mi, 0 },
{ X86::SARX32rr, X86::SARX32rm, 0 },
@@ -571,6 +580,9 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::SHRX64rr, X86::SHRX64rm, 0 },
{ X86::SHLX32rr, X86::SHLX32rm, 0 },
{ X86::SHLX64rr, X86::SHLX64rm, 0 },
+ { X86::TZCNT16rr, X86::TZCNT16rm, 0 },
+ { X86::TZCNT32rr, X86::TZCNT32rm, 0 },
+ { X86::TZCNT64rr, X86::TZCNT64rm, 0 },
};
for (unsigned i = 0, e = array_lengthof(OpTbl1); i != e; ++i) {
@@ -691,21 +703,13 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::Int_CVTSI2SSrr, X86::Int_CVTSI2SSrm, 0 },
{ X86::Int_CVTSS2SDrr, X86::Int_CVTSS2SDrm, 0 },
{ X86::MAXPDrr, X86::MAXPDrm, TB_ALIGN_16 },
- { X86::MAXPDrr_Int, X86::MAXPDrm_Int, TB_ALIGN_16 },
{ X86::MAXPSrr, X86::MAXPSrm, TB_ALIGN_16 },
- { X86::MAXPSrr_Int, X86::MAXPSrm_Int, TB_ALIGN_16 },
{ X86::MAXSDrr, X86::MAXSDrm, 0 },
- { X86::MAXSDrr_Int, X86::MAXSDrm_Int, 0 },
{ X86::MAXSSrr, X86::MAXSSrm, 0 },
- { X86::MAXSSrr_Int, X86::MAXSSrm_Int, 0 },
{ X86::MINPDrr, X86::MINPDrm, TB_ALIGN_16 },
- { X86::MINPDrr_Int, X86::MINPDrm_Int, TB_ALIGN_16 },
{ X86::MINPSrr, X86::MINPSrm, TB_ALIGN_16 },
- { X86::MINPSrr_Int, X86::MINPSrm_Int, TB_ALIGN_16 },
{ X86::MINSDrr, X86::MINSDrm, 0 },
- { X86::MINSDrr_Int, X86::MINSDrm_Int, 0 },
{ X86::MINSSrr, X86::MINSSrm, 0 },
- { X86::MINSSrr_Int, X86::MINSSrm_Int, 0 },
{ X86::MPSADBWrri, X86::MPSADBWrmi, TB_ALIGN_16 },
{ X86::MULPDrr, X86::MULPDrm, TB_ALIGN_16 },
{ X86::MULPSrr, X86::MULPSrm, TB_ALIGN_16 },
@@ -756,6 +760,14 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::PMAXUBrr, X86::PMAXUBrm, TB_ALIGN_16 },
{ X86::PMINSWrr, X86::PMINSWrm, TB_ALIGN_16 },
{ X86::PMINUBrr, X86::PMINUBrm, TB_ALIGN_16 },
+ { X86::PMINSBrr, X86::PMINSBrm, TB_ALIGN_16 },
+ { X86::PMINSDrr, X86::PMINSDrm, TB_ALIGN_16 },
+ { X86::PMINUDrr, X86::PMINUDrm, TB_ALIGN_16 },
+ { X86::PMINUWrr, X86::PMINUWrm, TB_ALIGN_16 },
+ { X86::PMAXSBrr, X86::PMAXSBrm, TB_ALIGN_16 },
+ { X86::PMAXSDrr, X86::PMAXSDrm, TB_ALIGN_16 },
+ { X86::PMAXUDrr, X86::PMAXUDrm, TB_ALIGN_16 },
+ { X86::PMAXUWrr, X86::PMAXUWrm, TB_ALIGN_16 },
{ X86::PMULDQrr, X86::PMULDQrm, TB_ALIGN_16 },
{ X86::PMULHRSWrr128, X86::PMULHRSWrm128, TB_ALIGN_16 },
{ X86::PMULHUWrr, X86::PMULHUWrm, TB_ALIGN_16 },
@@ -827,31 +839,31 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::Int_VCVTSI2SSrr, X86::Int_VCVTSI2SSrm, 0 },
{ X86::VCVTSS2SDrr, X86::VCVTSS2SDrm, 0 },
{ X86::Int_VCVTSS2SDrr, X86::Int_VCVTSS2SDrm, 0 },
- { X86::VCVTTPD2DQrr, X86::VCVTTPD2DQXrm, TB_ALIGN_16 },
- { X86::VCVTTPS2DQrr, X86::VCVTTPS2DQrm, TB_ALIGN_16 },
+ { X86::VCVTTPD2DQrr, X86::VCVTTPD2DQXrm, 0 },
+ { X86::VCVTTPS2DQrr, X86::VCVTTPS2DQrm, 0 },
{ X86::VRSQRTSSr, X86::VRSQRTSSm, 0 },
{ X86::VSQRTSDr, X86::VSQRTSDm, 0 },
{ X86::VSQRTSSr, X86::VSQRTSSm, 0 },
- { X86::VADDPDrr, X86::VADDPDrm, TB_ALIGN_16 },
- { X86::VADDPSrr, X86::VADDPSrm, TB_ALIGN_16 },
+ { X86::VADDPDrr, X86::VADDPDrm, 0 },
+ { X86::VADDPSrr, X86::VADDPSrm, 0 },
{ X86::VADDSDrr, X86::VADDSDrm, 0 },
{ X86::VADDSSrr, X86::VADDSSrm, 0 },
- { X86::VADDSUBPDrr, X86::VADDSUBPDrm, TB_ALIGN_16 },
- { X86::VADDSUBPSrr, X86::VADDSUBPSrm, TB_ALIGN_16 },
- { X86::VANDNPDrr, X86::VANDNPDrm, TB_ALIGN_16 },
- { X86::VANDNPSrr, X86::VANDNPSrm, TB_ALIGN_16 },
- { X86::VANDPDrr, X86::VANDPDrm, TB_ALIGN_16 },
- { X86::VANDPSrr, X86::VANDPSrm, TB_ALIGN_16 },
- { X86::VBLENDPDrri, X86::VBLENDPDrmi, TB_ALIGN_16 },
- { X86::VBLENDPSrri, X86::VBLENDPSrmi, TB_ALIGN_16 },
- { X86::VBLENDVPDrr, X86::VBLENDVPDrm, TB_ALIGN_16 },
- { X86::VBLENDVPSrr, X86::VBLENDVPSrm, TB_ALIGN_16 },
- { X86::VCMPPDrri, X86::VCMPPDrmi, TB_ALIGN_16 },
- { X86::VCMPPSrri, X86::VCMPPSrmi, TB_ALIGN_16 },
+ { X86::VADDSUBPDrr, X86::VADDSUBPDrm, 0 },
+ { X86::VADDSUBPSrr, X86::VADDSUBPSrm, 0 },
+ { X86::VANDNPDrr, X86::VANDNPDrm, 0 },
+ { X86::VANDNPSrr, X86::VANDNPSrm, 0 },
+ { X86::VANDPDrr, X86::VANDPDrm, 0 },
+ { X86::VANDPSrr, X86::VANDPSrm, 0 },
+ { X86::VBLENDPDrri, X86::VBLENDPDrmi, 0 },
+ { X86::VBLENDPSrri, X86::VBLENDPSrmi, 0 },
+ { X86::VBLENDVPDrr, X86::VBLENDVPDrm, 0 },
+ { X86::VBLENDVPSrr, X86::VBLENDVPSrm, 0 },
+ { X86::VCMPPDrri, X86::VCMPPDrmi, 0 },
+ { X86::VCMPPSrri, X86::VCMPPSrmi, 0 },
{ X86::VCMPSDrr, X86::VCMPSDrm, 0 },
{ X86::VCMPSSrr, X86::VCMPSSrm, 0 },
- { X86::VDIVPDrr, X86::VDIVPDrm, TB_ALIGN_16 },
- { X86::VDIVPSrr, X86::VDIVPSrm, TB_ALIGN_16 },
+ { X86::VDIVPDrr, X86::VDIVPDrm, 0 },
+ { X86::VDIVPSrr, X86::VDIVPSrm, 0 },
{ X86::VDIVSDrr, X86::VDIVSDrm, 0 },
{ X86::VDIVSSrr, X86::VDIVSSrm, 0 },
{ X86::VFsANDNPDrr, X86::VFsANDNPDrm, TB_ALIGN_16 },
@@ -862,263 +874,267 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::VFsORPSrr, X86::VFsORPSrm, TB_ALIGN_16 },
{ X86::VFsXORPDrr, X86::VFsXORPDrm, TB_ALIGN_16 },
{ X86::VFsXORPSrr, X86::VFsXORPSrm, TB_ALIGN_16 },
- { X86::VHADDPDrr, X86::VHADDPDrm, TB_ALIGN_16 },
- { X86::VHADDPSrr, X86::VHADDPSrm, TB_ALIGN_16 },
- { X86::VHSUBPDrr, X86::VHSUBPDrm, TB_ALIGN_16 },
- { X86::VHSUBPSrr, X86::VHSUBPSrm, TB_ALIGN_16 },
+ { X86::VHADDPDrr, X86::VHADDPDrm, 0 },
+ { X86::VHADDPSrr, X86::VHADDPSrm, 0 },
+ { X86::VHSUBPDrr, X86::VHSUBPDrm, 0 },
+ { X86::VHSUBPSrr, X86::VHSUBPSrm, 0 },
{ X86::Int_VCMPSDrr, X86::Int_VCMPSDrm, 0 },
{ X86::Int_VCMPSSrr, X86::Int_VCMPSSrm, 0 },
- { X86::VMAXPDrr, X86::VMAXPDrm, TB_ALIGN_16 },
- { X86::VMAXPDrr_Int, X86::VMAXPDrm_Int, TB_ALIGN_16 },
- { X86::VMAXPSrr, X86::VMAXPSrm, TB_ALIGN_16 },
- { X86::VMAXPSrr_Int, X86::VMAXPSrm_Int, TB_ALIGN_16 },
+ { X86::VMAXPDrr, X86::VMAXPDrm, 0 },
+ { X86::VMAXPSrr, X86::VMAXPSrm, 0 },
{ X86::VMAXSDrr, X86::VMAXSDrm, 0 },
- { X86::VMAXSDrr_Int, X86::VMAXSDrm_Int, 0 },
{ X86::VMAXSSrr, X86::VMAXSSrm, 0 },
- { X86::VMAXSSrr_Int, X86::VMAXSSrm_Int, 0 },
- { X86::VMINPDrr, X86::VMINPDrm, TB_ALIGN_16 },
- { X86::VMINPDrr_Int, X86::VMINPDrm_Int, TB_ALIGN_16 },
- { X86::VMINPSrr, X86::VMINPSrm, TB_ALIGN_16 },
- { X86::VMINPSrr_Int, X86::VMINPSrm_Int, TB_ALIGN_16 },
+ { X86::VMINPDrr, X86::VMINPDrm, 0 },
+ { X86::VMINPSrr, X86::VMINPSrm, 0 },
{ X86::VMINSDrr, X86::VMINSDrm, 0 },
- { X86::VMINSDrr_Int, X86::VMINSDrm_Int, 0 },
{ X86::VMINSSrr, X86::VMINSSrm, 0 },
- { X86::VMINSSrr_Int, X86::VMINSSrm_Int, 0 },
- { X86::VMPSADBWrri, X86::VMPSADBWrmi, TB_ALIGN_16 },
- { X86::VMULPDrr, X86::VMULPDrm, TB_ALIGN_16 },
- { X86::VMULPSrr, X86::VMULPSrm, TB_ALIGN_16 },
+ { X86::VMPSADBWrri, X86::VMPSADBWrmi, 0 },
+ { X86::VMULPDrr, X86::VMULPDrm, 0 },
+ { X86::VMULPSrr, X86::VMULPSrm, 0 },
{ X86::VMULSDrr, X86::VMULSDrm, 0 },
{ X86::VMULSSrr, X86::VMULSSrm, 0 },
- { X86::VORPDrr, X86::VORPDrm, TB_ALIGN_16 },
- { X86::VORPSrr, X86::VORPSrm, TB_ALIGN_16 },
- { X86::VPACKSSDWrr, X86::VPACKSSDWrm, TB_ALIGN_16 },
- { X86::VPACKSSWBrr, X86::VPACKSSWBrm, TB_ALIGN_16 },
- { X86::VPACKUSDWrr, X86::VPACKUSDWrm, TB_ALIGN_16 },
- { X86::VPACKUSWBrr, X86::VPACKUSWBrm, TB_ALIGN_16 },
- { X86::VPADDBrr, X86::VPADDBrm, TB_ALIGN_16 },
- { X86::VPADDDrr, X86::VPADDDrm, TB_ALIGN_16 },
- { X86::VPADDQrr, X86::VPADDQrm, TB_ALIGN_16 },
- { X86::VPADDSBrr, X86::VPADDSBrm, TB_ALIGN_16 },
- { X86::VPADDSWrr, X86::VPADDSWrm, TB_ALIGN_16 },
- { X86::VPADDUSBrr, X86::VPADDUSBrm, TB_ALIGN_16 },
- { X86::VPADDUSWrr, X86::VPADDUSWrm, TB_ALIGN_16 },
- { X86::VPADDWrr, X86::VPADDWrm, TB_ALIGN_16 },
- { X86::VPALIGNR128rr, X86::VPALIGNR128rm, TB_ALIGN_16 },
- { X86::VPANDNrr, X86::VPANDNrm, TB_ALIGN_16 },
- { X86::VPANDrr, X86::VPANDrm, TB_ALIGN_16 },
- { X86::VPAVGBrr, X86::VPAVGBrm, TB_ALIGN_16 },
- { X86::VPAVGWrr, X86::VPAVGWrm, TB_ALIGN_16 },
- { X86::VPBLENDWrri, X86::VPBLENDWrmi, TB_ALIGN_16 },
- { X86::VPCMPEQBrr, X86::VPCMPEQBrm, TB_ALIGN_16 },
- { X86::VPCMPEQDrr, X86::VPCMPEQDrm, TB_ALIGN_16 },
- { X86::VPCMPEQQrr, X86::VPCMPEQQrm, TB_ALIGN_16 },
- { X86::VPCMPEQWrr, X86::VPCMPEQWrm, TB_ALIGN_16 },
- { X86::VPCMPGTBrr, X86::VPCMPGTBrm, TB_ALIGN_16 },
- { X86::VPCMPGTDrr, X86::VPCMPGTDrm, TB_ALIGN_16 },
- { X86::VPCMPGTQrr, X86::VPCMPGTQrm, TB_ALIGN_16 },
- { X86::VPCMPGTWrr, X86::VPCMPGTWrm, TB_ALIGN_16 },
- { X86::VPHADDDrr, X86::VPHADDDrm, TB_ALIGN_16 },
- { X86::VPHADDSWrr128, X86::VPHADDSWrm128, TB_ALIGN_16 },
- { X86::VPHADDWrr, X86::VPHADDWrm, TB_ALIGN_16 },
- { X86::VPHSUBDrr, X86::VPHSUBDrm, TB_ALIGN_16 },
- { X86::VPHSUBSWrr128, X86::VPHSUBSWrm128, TB_ALIGN_16 },
- { X86::VPHSUBWrr, X86::VPHSUBWrm, TB_ALIGN_16 },
- { X86::VPERMILPDrr, X86::VPERMILPDrm, TB_ALIGN_16 },
- { X86::VPERMILPSrr, X86::VPERMILPSrm, TB_ALIGN_16 },
- { X86::VPINSRWrri, X86::VPINSRWrmi, TB_ALIGN_16 },
- { X86::VPMADDUBSWrr128, X86::VPMADDUBSWrm128, TB_ALIGN_16 },
- { X86::VPMADDWDrr, X86::VPMADDWDrm, TB_ALIGN_16 },
- { X86::VPMAXSWrr, X86::VPMAXSWrm, TB_ALIGN_16 },
- { X86::VPMAXUBrr, X86::VPMAXUBrm, TB_ALIGN_16 },
- { X86::VPMINSWrr, X86::VPMINSWrm, TB_ALIGN_16 },
- { X86::VPMINUBrr, X86::VPMINUBrm, TB_ALIGN_16 },
- { X86::VPMULDQrr, X86::VPMULDQrm, TB_ALIGN_16 },
- { X86::VPMULHRSWrr128, X86::VPMULHRSWrm128, TB_ALIGN_16 },
- { X86::VPMULHUWrr, X86::VPMULHUWrm, TB_ALIGN_16 },
- { X86::VPMULHWrr, X86::VPMULHWrm, TB_ALIGN_16 },
- { X86::VPMULLDrr, X86::VPMULLDrm, TB_ALIGN_16 },
- { X86::VPMULLWrr, X86::VPMULLWrm, TB_ALIGN_16 },
- { X86::VPMULUDQrr, X86::VPMULUDQrm, TB_ALIGN_16 },
- { X86::VPORrr, X86::VPORrm, TB_ALIGN_16 },
- { X86::VPSADBWrr, X86::VPSADBWrm, TB_ALIGN_16 },
- { X86::VPSHUFBrr, X86::VPSHUFBrm, TB_ALIGN_16 },
- { X86::VPSIGNBrr, X86::VPSIGNBrm, TB_ALIGN_16 },
- { X86::VPSIGNWrr, X86::VPSIGNWrm, TB_ALIGN_16 },
- { X86::VPSIGNDrr, X86::VPSIGNDrm, TB_ALIGN_16 },
- { X86::VPSLLDrr, X86::VPSLLDrm, TB_ALIGN_16 },
- { X86::VPSLLQrr, X86::VPSLLQrm, TB_ALIGN_16 },
- { X86::VPSLLWrr, X86::VPSLLWrm, TB_ALIGN_16 },
- { X86::VPSRADrr, X86::VPSRADrm, TB_ALIGN_16 },
- { X86::VPSRAWrr, X86::VPSRAWrm, TB_ALIGN_16 },
- { X86::VPSRLDrr, X86::VPSRLDrm, TB_ALIGN_16 },
- { X86::VPSRLQrr, X86::VPSRLQrm, TB_ALIGN_16 },
- { X86::VPSRLWrr, X86::VPSRLWrm, TB_ALIGN_16 },
- { X86::VPSUBBrr, X86::VPSUBBrm, TB_ALIGN_16 },
- { X86::VPSUBDrr, X86::VPSUBDrm, TB_ALIGN_16 },
- { X86::VPSUBSBrr, X86::VPSUBSBrm, TB_ALIGN_16 },
- { X86::VPSUBSWrr, X86::VPSUBSWrm, TB_ALIGN_16 },
- { X86::VPSUBWrr, X86::VPSUBWrm, TB_ALIGN_16 },
- { X86::VPUNPCKHBWrr, X86::VPUNPCKHBWrm, TB_ALIGN_16 },
- { X86::VPUNPCKHDQrr, X86::VPUNPCKHDQrm, TB_ALIGN_16 },
- { X86::VPUNPCKHQDQrr, X86::VPUNPCKHQDQrm, TB_ALIGN_16 },
- { X86::VPUNPCKHWDrr, X86::VPUNPCKHWDrm, TB_ALIGN_16 },
- { X86::VPUNPCKLBWrr, X86::VPUNPCKLBWrm, TB_ALIGN_16 },
- { X86::VPUNPCKLDQrr, X86::VPUNPCKLDQrm, TB_ALIGN_16 },
- { X86::VPUNPCKLQDQrr, X86::VPUNPCKLQDQrm, TB_ALIGN_16 },
- { X86::VPUNPCKLWDrr, X86::VPUNPCKLWDrm, TB_ALIGN_16 },
- { X86::VPXORrr, X86::VPXORrm, TB_ALIGN_16 },
- { X86::VSHUFPDrri, X86::VSHUFPDrmi, TB_ALIGN_16 },
- { X86::VSHUFPSrri, X86::VSHUFPSrmi, TB_ALIGN_16 },
- { X86::VSUBPDrr, X86::VSUBPDrm, TB_ALIGN_16 },
- { X86::VSUBPSrr, X86::VSUBPSrm, TB_ALIGN_16 },
+ { X86::VORPDrr, X86::VORPDrm, 0 },
+ { X86::VORPSrr, X86::VORPSrm, 0 },
+ { X86::VPACKSSDWrr, X86::VPACKSSDWrm, 0 },
+ { X86::VPACKSSWBrr, X86::VPACKSSWBrm, 0 },
+ { X86::VPACKUSDWrr, X86::VPACKUSDWrm, 0 },
+ { X86::VPACKUSWBrr, X86::VPACKUSWBrm, 0 },
+ { X86::VPADDBrr, X86::VPADDBrm, 0 },
+ { X86::VPADDDrr, X86::VPADDDrm, 0 },
+ { X86::VPADDQrr, X86::VPADDQrm, 0 },
+ { X86::VPADDSBrr, X86::VPADDSBrm, 0 },
+ { X86::VPADDSWrr, X86::VPADDSWrm, 0 },
+ { X86::VPADDUSBrr, X86::VPADDUSBrm, 0 },
+ { X86::VPADDUSWrr, X86::VPADDUSWrm, 0 },
+ { X86::VPADDWrr, X86::VPADDWrm, 0 },
+ { X86::VPALIGNR128rr, X86::VPALIGNR128rm, 0 },
+ { X86::VPANDNrr, X86::VPANDNrm, 0 },
+ { X86::VPANDrr, X86::VPANDrm, 0 },
+ { X86::VPAVGBrr, X86::VPAVGBrm, 0 },
+ { X86::VPAVGWrr, X86::VPAVGWrm, 0 },
+ { X86::VPBLENDWrri, X86::VPBLENDWrmi, 0 },
+ { X86::VPCMPEQBrr, X86::VPCMPEQBrm, 0 },
+ { X86::VPCMPEQDrr, X86::VPCMPEQDrm, 0 },
+ { X86::VPCMPEQQrr, X86::VPCMPEQQrm, 0 },
+ { X86::VPCMPEQWrr, X86::VPCMPEQWrm, 0 },
+ { X86::VPCMPGTBrr, X86::VPCMPGTBrm, 0 },
+ { X86::VPCMPGTDrr, X86::VPCMPGTDrm, 0 },
+ { X86::VPCMPGTQrr, X86::VPCMPGTQrm, 0 },
+ { X86::VPCMPGTWrr, X86::VPCMPGTWrm, 0 },
+ { X86::VPHADDDrr, X86::VPHADDDrm, 0 },
+ { X86::VPHADDSWrr128, X86::VPHADDSWrm128, 0 },
+ { X86::VPHADDWrr, X86::VPHADDWrm, 0 },
+ { X86::VPHSUBDrr, X86::VPHSUBDrm, 0 },
+ { X86::VPHSUBSWrr128, X86::VPHSUBSWrm128, 0 },
+ { X86::VPHSUBWrr, X86::VPHSUBWrm, 0 },
+ { X86::VPERMILPDrr, X86::VPERMILPDrm, 0 },
+ { X86::VPERMILPSrr, X86::VPERMILPSrm, 0 },
+ { X86::VPINSRWrri, X86::VPINSRWrmi, 0 },
+ { X86::VPMADDUBSWrr128, X86::VPMADDUBSWrm128, 0 },
+ { X86::VPMADDWDrr, X86::VPMADDWDrm, 0 },
+ { X86::VPMAXSWrr, X86::VPMAXSWrm, 0 },
+ { X86::VPMAXUBrr, X86::VPMAXUBrm, 0 },
+ { X86::VPMINSWrr, X86::VPMINSWrm, 0 },
+ { X86::VPMINUBrr, X86::VPMINUBrm, 0 },
+ { X86::VPMINSBrr, X86::VPMINSBrm, 0 },
+ { X86::VPMINSDrr, X86::VPMINSDrm, 0 },
+ { X86::VPMINUDrr, X86::VPMINUDrm, 0 },
+ { X86::VPMINUWrr, X86::VPMINUWrm, 0 },
+ { X86::VPMAXSBrr, X86::VPMAXSBrm, 0 },
+ { X86::VPMAXSDrr, X86::VPMAXSDrm, 0 },
+ { X86::VPMAXUDrr, X86::VPMAXUDrm, 0 },
+ { X86::VPMAXUWrr, X86::VPMAXUWrm, 0 },
+ { X86::VPMULDQrr, X86::VPMULDQrm, 0 },
+ { X86::VPMULHRSWrr128, X86::VPMULHRSWrm128, 0 },
+ { X86::VPMULHUWrr, X86::VPMULHUWrm, 0 },
+ { X86::VPMULHWrr, X86::VPMULHWrm, 0 },
+ { X86::VPMULLDrr, X86::VPMULLDrm, 0 },
+ { X86::VPMULLWrr, X86::VPMULLWrm, 0 },
+ { X86::VPMULUDQrr, X86::VPMULUDQrm, 0 },
+ { X86::VPORrr, X86::VPORrm, 0 },
+ { X86::VPSADBWrr, X86::VPSADBWrm, 0 },
+ { X86::VPSHUFBrr, X86::VPSHUFBrm, 0 },
+ { X86::VPSIGNBrr, X86::VPSIGNBrm, 0 },
+ { X86::VPSIGNWrr, X86::VPSIGNWrm, 0 },
+ { X86::VPSIGNDrr, X86::VPSIGNDrm, 0 },
+ { X86::VPSLLDrr, X86::VPSLLDrm, 0 },
+ { X86::VPSLLQrr, X86::VPSLLQrm, 0 },
+ { X86::VPSLLWrr, X86::VPSLLWrm, 0 },
+ { X86::VPSRADrr, X86::VPSRADrm, 0 },
+ { X86::VPSRAWrr, X86::VPSRAWrm, 0 },
+ { X86::VPSRLDrr, X86::VPSRLDrm, 0 },
+ { X86::VPSRLQrr, X86::VPSRLQrm, 0 },
+ { X86::VPSRLWrr, X86::VPSRLWrm, 0 },
+ { X86::VPSUBBrr, X86::VPSUBBrm, 0 },
+ { X86::VPSUBDrr, X86::VPSUBDrm, 0 },
+ { X86::VPSUBSBrr, X86::VPSUBSBrm, 0 },
+ { X86::VPSUBSWrr, X86::VPSUBSWrm, 0 },
+ { X86::VPSUBWrr, X86::VPSUBWrm, 0 },
+ { X86::VPUNPCKHBWrr, X86::VPUNPCKHBWrm, 0 },
+ { X86::VPUNPCKHDQrr, X86::VPUNPCKHDQrm, 0 },
+ { X86::VPUNPCKHQDQrr, X86::VPUNPCKHQDQrm, 0 },
+ { X86::VPUNPCKHWDrr, X86::VPUNPCKHWDrm, 0 },
+ { X86::VPUNPCKLBWrr, X86::VPUNPCKLBWrm, 0 },
+ { X86::VPUNPCKLDQrr, X86::VPUNPCKLDQrm, 0 },
+ { X86::VPUNPCKLQDQrr, X86::VPUNPCKLQDQrm, 0 },
+ { X86::VPUNPCKLWDrr, X86::VPUNPCKLWDrm, 0 },
+ { X86::VPXORrr, X86::VPXORrm, 0 },
+ { X86::VSHUFPDrri, X86::VSHUFPDrmi, 0 },
+ { X86::VSHUFPSrri, X86::VSHUFPSrmi, 0 },
+ { X86::VSUBPDrr, X86::VSUBPDrm, 0 },
+ { X86::VSUBPSrr, X86::VSUBPSrm, 0 },
{ X86::VSUBSDrr, X86::VSUBSDrm, 0 },
{ X86::VSUBSSrr, X86::VSUBSSrm, 0 },
- { X86::VUNPCKHPDrr, X86::VUNPCKHPDrm, TB_ALIGN_16 },
- { X86::VUNPCKHPSrr, X86::VUNPCKHPSrm, TB_ALIGN_16 },
- { X86::VUNPCKLPDrr, X86::VUNPCKLPDrm, TB_ALIGN_16 },
- { X86::VUNPCKLPSrr, X86::VUNPCKLPSrm, TB_ALIGN_16 },
- { X86::VXORPDrr, X86::VXORPDrm, TB_ALIGN_16 },
- { X86::VXORPSrr, X86::VXORPSrm, TB_ALIGN_16 },
+ { X86::VUNPCKHPDrr, X86::VUNPCKHPDrm, 0 },
+ { X86::VUNPCKHPSrr, X86::VUNPCKHPSrm, 0 },
+ { X86::VUNPCKLPDrr, X86::VUNPCKLPDrm, 0 },
+ { X86::VUNPCKLPSrr, X86::VUNPCKLPSrm, 0 },
+ { X86::VXORPDrr, X86::VXORPDrm, 0 },
+ { X86::VXORPSrr, X86::VXORPSrm, 0 },
// AVX 256-bit foldable instructions
- { X86::VADDPDYrr, X86::VADDPDYrm, TB_ALIGN_32 },
- { X86::VADDPSYrr, X86::VADDPSYrm, TB_ALIGN_32 },
- { X86::VADDSUBPDYrr, X86::VADDSUBPDYrm, TB_ALIGN_32 },
- { X86::VADDSUBPSYrr, X86::VADDSUBPSYrm, TB_ALIGN_32 },
- { X86::VANDNPDYrr, X86::VANDNPDYrm, TB_ALIGN_32 },
- { X86::VANDNPSYrr, X86::VANDNPSYrm, TB_ALIGN_32 },
- { X86::VANDPDYrr, X86::VANDPDYrm, TB_ALIGN_32 },
- { X86::VANDPSYrr, X86::VANDPSYrm, TB_ALIGN_32 },
- { X86::VBLENDPDYrri, X86::VBLENDPDYrmi, TB_ALIGN_32 },
- { X86::VBLENDPSYrri, X86::VBLENDPSYrmi, TB_ALIGN_32 },
- { X86::VBLENDVPDYrr, X86::VBLENDVPDYrm, TB_ALIGN_32 },
- { X86::VBLENDVPSYrr, X86::VBLENDVPSYrm, TB_ALIGN_32 },
- { X86::VCMPPDYrri, X86::VCMPPDYrmi, TB_ALIGN_32 },
- { X86::VCMPPSYrri, X86::VCMPPSYrmi, TB_ALIGN_32 },
- { X86::VDIVPDYrr, X86::VDIVPDYrm, TB_ALIGN_32 },
- { X86::VDIVPSYrr, X86::VDIVPSYrm, TB_ALIGN_32 },
- { X86::VHADDPDYrr, X86::VHADDPDYrm, TB_ALIGN_32 },
- { X86::VHADDPSYrr, X86::VHADDPSYrm, TB_ALIGN_32 },
- { X86::VHSUBPDYrr, X86::VHSUBPDYrm, TB_ALIGN_32 },
- { X86::VHSUBPSYrr, X86::VHSUBPSYrm, TB_ALIGN_32 },
- { X86::VINSERTF128rr, X86::VINSERTF128rm, TB_ALIGN_32 },
- { X86::VMAXPDYrr, X86::VMAXPDYrm, TB_ALIGN_32 },
- { X86::VMAXPDYrr_Int, X86::VMAXPDYrm_Int, TB_ALIGN_32 },
- { X86::VMAXPSYrr, X86::VMAXPSYrm, TB_ALIGN_32 },
- { X86::VMAXPSYrr_Int, X86::VMAXPSYrm_Int, TB_ALIGN_32 },
- { X86::VMINPDYrr, X86::VMINPDYrm, TB_ALIGN_32 },
- { X86::VMINPDYrr_Int, X86::VMINPDYrm_Int, TB_ALIGN_32 },
- { X86::VMINPSYrr, X86::VMINPSYrm, TB_ALIGN_32 },
- { X86::VMINPSYrr_Int, X86::VMINPSYrm_Int, TB_ALIGN_32 },
- { X86::VMULPDYrr, X86::VMULPDYrm, TB_ALIGN_32 },
- { X86::VMULPSYrr, X86::VMULPSYrm, TB_ALIGN_32 },
- { X86::VORPDYrr, X86::VORPDYrm, TB_ALIGN_32 },
- { X86::VORPSYrr, X86::VORPSYrm, TB_ALIGN_32 },
- { X86::VPERM2F128rr, X86::VPERM2F128rm, TB_ALIGN_32 },
- { X86::VPERMILPDYrr, X86::VPERMILPDYrm, TB_ALIGN_32 },
- { X86::VPERMILPSYrr, X86::VPERMILPSYrm, TB_ALIGN_32 },
- { X86::VSHUFPDYrri, X86::VSHUFPDYrmi, TB_ALIGN_32 },
- { X86::VSHUFPSYrri, X86::VSHUFPSYrmi, TB_ALIGN_32 },
- { X86::VSUBPDYrr, X86::VSUBPDYrm, TB_ALIGN_32 },
- { X86::VSUBPSYrr, X86::VSUBPSYrm, TB_ALIGN_32 },
- { X86::VUNPCKHPDYrr, X86::VUNPCKHPDYrm, TB_ALIGN_32 },
- { X86::VUNPCKHPSYrr, X86::VUNPCKHPSYrm, TB_ALIGN_32 },
- { X86::VUNPCKLPDYrr, X86::VUNPCKLPDYrm, TB_ALIGN_32 },
- { X86::VUNPCKLPSYrr, X86::VUNPCKLPSYrm, TB_ALIGN_32 },
- { X86::VXORPDYrr, X86::VXORPDYrm, TB_ALIGN_32 },
- { X86::VXORPSYrr, X86::VXORPSYrm, TB_ALIGN_32 },
+ { X86::VADDPDYrr, X86::VADDPDYrm, 0 },
+ { X86::VADDPSYrr, X86::VADDPSYrm, 0 },
+ { X86::VADDSUBPDYrr, X86::VADDSUBPDYrm, 0 },
+ { X86::VADDSUBPSYrr, X86::VADDSUBPSYrm, 0 },
+ { X86::VANDNPDYrr, X86::VANDNPDYrm, 0 },
+ { X86::VANDNPSYrr, X86::VANDNPSYrm, 0 },
+ { X86::VANDPDYrr, X86::VANDPDYrm, 0 },
+ { X86::VANDPSYrr, X86::VANDPSYrm, 0 },
+ { X86::VBLENDPDYrri, X86::VBLENDPDYrmi, 0 },
+ { X86::VBLENDPSYrri, X86::VBLENDPSYrmi, 0 },
+ { X86::VBLENDVPDYrr, X86::VBLENDVPDYrm, 0 },
+ { X86::VBLENDVPSYrr, X86::VBLENDVPSYrm, 0 },
+ { X86::VCMPPDYrri, X86::VCMPPDYrmi, 0 },
+ { X86::VCMPPSYrri, X86::VCMPPSYrmi, 0 },
+ { X86::VDIVPDYrr, X86::VDIVPDYrm, 0 },
+ { X86::VDIVPSYrr, X86::VDIVPSYrm, 0 },
+ { X86::VHADDPDYrr, X86::VHADDPDYrm, 0 },
+ { X86::VHADDPSYrr, X86::VHADDPSYrm, 0 },
+ { X86::VHSUBPDYrr, X86::VHSUBPDYrm, 0 },
+ { X86::VHSUBPSYrr, X86::VHSUBPSYrm, 0 },
+ { X86::VINSERTF128rr, X86::VINSERTF128rm, 0 },
+ { X86::VMAXPDYrr, X86::VMAXPDYrm, 0 },
+ { X86::VMAXPSYrr, X86::VMAXPSYrm, 0 },
+ { X86::VMINPDYrr, X86::VMINPDYrm, 0 },
+ { X86::VMINPSYrr, X86::VMINPSYrm, 0 },
+ { X86::VMULPDYrr, X86::VMULPDYrm, 0 },
+ { X86::VMULPSYrr, X86::VMULPSYrm, 0 },
+ { X86::VORPDYrr, X86::VORPDYrm, 0 },
+ { X86::VORPSYrr, X86::VORPSYrm, 0 },
+ { X86::VPERM2F128rr, X86::VPERM2F128rm, 0 },
+ { X86::VPERMILPDYrr, X86::VPERMILPDYrm, 0 },
+ { X86::VPERMILPSYrr, X86::VPERMILPSYrm, 0 },
+ { X86::VSHUFPDYrri, X86::VSHUFPDYrmi, 0 },
+ { X86::VSHUFPSYrri, X86::VSHUFPSYrmi, 0 },
+ { X86::VSUBPDYrr, X86::VSUBPDYrm, 0 },
+ { X86::VSUBPSYrr, X86::VSUBPSYrm, 0 },
+ { X86::VUNPCKHPDYrr, X86::VUNPCKHPDYrm, 0 },
+ { X86::VUNPCKHPSYrr, X86::VUNPCKHPSYrm, 0 },
+ { X86::VUNPCKLPDYrr, X86::VUNPCKLPDYrm, 0 },
+ { X86::VUNPCKLPSYrr, X86::VUNPCKLPSYrm, 0 },
+ { X86::VXORPDYrr, X86::VXORPDYrm, 0 },
+ { X86::VXORPSYrr, X86::VXORPSYrm, 0 },
// AVX2 foldable instructions
- { X86::VINSERTI128rr, X86::VINSERTI128rm, TB_ALIGN_16 },
- { X86::VPACKSSDWYrr, X86::VPACKSSDWYrm, TB_ALIGN_32 },
- { X86::VPACKSSWBYrr, X86::VPACKSSWBYrm, TB_ALIGN_32 },
- { X86::VPACKUSDWYrr, X86::VPACKUSDWYrm, TB_ALIGN_32 },
- { X86::VPACKUSWBYrr, X86::VPACKUSWBYrm, TB_ALIGN_32 },
- { X86::VPADDBYrr, X86::VPADDBYrm, TB_ALIGN_32 },
- { X86::VPADDDYrr, X86::VPADDDYrm, TB_ALIGN_32 },
- { X86::VPADDQYrr, X86::VPADDQYrm, TB_ALIGN_32 },
- { X86::VPADDSBYrr, X86::VPADDSBYrm, TB_ALIGN_32 },
- { X86::VPADDSWYrr, X86::VPADDSWYrm, TB_ALIGN_32 },
- { X86::VPADDUSBYrr, X86::VPADDUSBYrm, TB_ALIGN_32 },
- { X86::VPADDUSWYrr, X86::VPADDUSWYrm, TB_ALIGN_32 },
- { X86::VPADDWYrr, X86::VPADDWYrm, TB_ALIGN_32 },
- { X86::VPALIGNR256rr, X86::VPALIGNR256rm, TB_ALIGN_32 },
- { X86::VPANDNYrr, X86::VPANDNYrm, TB_ALIGN_32 },
- { X86::VPANDYrr, X86::VPANDYrm, TB_ALIGN_32 },
- { X86::VPAVGBYrr, X86::VPAVGBYrm, TB_ALIGN_32 },
- { X86::VPAVGWYrr, X86::VPAVGWYrm, TB_ALIGN_32 },
- { X86::VPBLENDDrri, X86::VPBLENDDrmi, TB_ALIGN_32 },
- { X86::VPBLENDDYrri, X86::VPBLENDDYrmi, TB_ALIGN_32 },
- { X86::VPBLENDWYrri, X86::VPBLENDWYrmi, TB_ALIGN_32 },
- { X86::VPCMPEQBYrr, X86::VPCMPEQBYrm, TB_ALIGN_32 },
- { X86::VPCMPEQDYrr, X86::VPCMPEQDYrm, TB_ALIGN_32 },
- { X86::VPCMPEQQYrr, X86::VPCMPEQQYrm, TB_ALIGN_32 },
- { X86::VPCMPEQWYrr, X86::VPCMPEQWYrm, TB_ALIGN_32 },
- { X86::VPCMPGTBYrr, X86::VPCMPGTBYrm, TB_ALIGN_32 },
- { X86::VPCMPGTDYrr, X86::VPCMPGTDYrm, TB_ALIGN_32 },
- { X86::VPCMPGTQYrr, X86::VPCMPGTQYrm, TB_ALIGN_32 },
- { X86::VPCMPGTWYrr, X86::VPCMPGTWYrm, TB_ALIGN_32 },
- { X86::VPERM2I128rr, X86::VPERM2I128rm, TB_ALIGN_32 },
- { X86::VPERMDYrr, X86::VPERMDYrm, TB_ALIGN_32 },
- { X86::VPERMPDYri, X86::VPERMPDYmi, TB_ALIGN_32 },
- { X86::VPERMPSYrr, X86::VPERMPSYrm, TB_ALIGN_32 },
- { X86::VPERMQYri, X86::VPERMQYmi, TB_ALIGN_32 },
- { X86::VPHADDDYrr, X86::VPHADDDYrm, TB_ALIGN_32 },
- { X86::VPHADDSWrr256, X86::VPHADDSWrm256, TB_ALIGN_32 },
- { X86::VPHADDWYrr, X86::VPHADDWYrm, TB_ALIGN_32 },
- { X86::VPHSUBDYrr, X86::VPHSUBDYrm, TB_ALIGN_32 },
- { X86::VPHSUBSWrr256, X86::VPHSUBSWrm256, TB_ALIGN_32 },
- { X86::VPHSUBWYrr, X86::VPHSUBWYrm, TB_ALIGN_32 },
- { X86::VPMADDUBSWrr256, X86::VPMADDUBSWrm256, TB_ALIGN_32 },
- { X86::VPMADDWDYrr, X86::VPMADDWDYrm, TB_ALIGN_32 },
- { X86::VPMAXSWYrr, X86::VPMAXSWYrm, TB_ALIGN_32 },
- { X86::VPMAXUBYrr, X86::VPMAXUBYrm, TB_ALIGN_32 },
- { X86::VPMINSWYrr, X86::VPMINSWYrm, TB_ALIGN_32 },
- { X86::VPMINUBYrr, X86::VPMINUBYrm, TB_ALIGN_32 },
- { X86::VMPSADBWYrri, X86::VMPSADBWYrmi, TB_ALIGN_32 },
- { X86::VPMULDQYrr, X86::VPMULDQYrm, TB_ALIGN_32 },
- { X86::VPMULHRSWrr256, X86::VPMULHRSWrm256, TB_ALIGN_32 },
- { X86::VPMULHUWYrr, X86::VPMULHUWYrm, TB_ALIGN_32 },
- { X86::VPMULHWYrr, X86::VPMULHWYrm, TB_ALIGN_32 },
- { X86::VPMULLDYrr, X86::VPMULLDYrm, TB_ALIGN_32 },
- { X86::VPMULLWYrr, X86::VPMULLWYrm, TB_ALIGN_32 },
- { X86::VPMULUDQYrr, X86::VPMULUDQYrm, TB_ALIGN_32 },
- { X86::VPORYrr, X86::VPORYrm, TB_ALIGN_32 },
- { X86::VPSADBWYrr, X86::VPSADBWYrm, TB_ALIGN_32 },
- { X86::VPSHUFBYrr, X86::VPSHUFBYrm, TB_ALIGN_32 },
- { X86::VPSIGNBYrr, X86::VPSIGNBYrm, TB_ALIGN_32 },
- { X86::VPSIGNWYrr, X86::VPSIGNWYrm, TB_ALIGN_32 },
- { X86::VPSIGNDYrr, X86::VPSIGNDYrm, TB_ALIGN_32 },
- { X86::VPSLLDYrr, X86::VPSLLDYrm, TB_ALIGN_16 },
- { X86::VPSLLQYrr, X86::VPSLLQYrm, TB_ALIGN_16 },
- { X86::VPSLLWYrr, X86::VPSLLWYrm, TB_ALIGN_16 },
- { X86::VPSLLVDrr, X86::VPSLLVDrm, TB_ALIGN_16 },
- { X86::VPSLLVDYrr, X86::VPSLLVDYrm, TB_ALIGN_32 },
- { X86::VPSLLVQrr, X86::VPSLLVQrm, TB_ALIGN_16 },
- { X86::VPSLLVQYrr, X86::VPSLLVQYrm, TB_ALIGN_32 },
- { X86::VPSRADYrr, X86::VPSRADYrm, TB_ALIGN_16 },
- { X86::VPSRAWYrr, X86::VPSRAWYrm, TB_ALIGN_16 },
- { X86::VPSRAVDrr, X86::VPSRAVDrm, TB_ALIGN_16 },
- { X86::VPSRAVDYrr, X86::VPSRAVDYrm, TB_ALIGN_32 },
- { X86::VPSRLDYrr, X86::VPSRLDYrm, TB_ALIGN_16 },
- { X86::VPSRLQYrr, X86::VPSRLQYrm, TB_ALIGN_16 },
- { X86::VPSRLWYrr, X86::VPSRLWYrm, TB_ALIGN_16 },
- { X86::VPSRLVDrr, X86::VPSRLVDrm, TB_ALIGN_16 },
- { X86::VPSRLVDYrr, X86::VPSRLVDYrm, TB_ALIGN_32 },
- { X86::VPSRLVQrr, X86::VPSRLVQrm, TB_ALIGN_16 },
- { X86::VPSRLVQYrr, X86::VPSRLVQYrm, TB_ALIGN_32 },
- { X86::VPSUBBYrr, X86::VPSUBBYrm, TB_ALIGN_32 },
- { X86::VPSUBDYrr, X86::VPSUBDYrm, TB_ALIGN_32 },
- { X86::VPSUBSBYrr, X86::VPSUBSBYrm, TB_ALIGN_32 },
- { X86::VPSUBSWYrr, X86::VPSUBSWYrm, TB_ALIGN_32 },
- { X86::VPSUBWYrr, X86::VPSUBWYrm, TB_ALIGN_32 },
- { X86::VPUNPCKHBWYrr, X86::VPUNPCKHBWYrm, TB_ALIGN_32 },
- { X86::VPUNPCKHDQYrr, X86::VPUNPCKHDQYrm, TB_ALIGN_32 },
- { X86::VPUNPCKHQDQYrr, X86::VPUNPCKHQDQYrm, TB_ALIGN_16 },
- { X86::VPUNPCKHWDYrr, X86::VPUNPCKHWDYrm, TB_ALIGN_32 },
- { X86::VPUNPCKLBWYrr, X86::VPUNPCKLBWYrm, TB_ALIGN_32 },
- { X86::VPUNPCKLDQYrr, X86::VPUNPCKLDQYrm, TB_ALIGN_32 },
- { X86::VPUNPCKLQDQYrr, X86::VPUNPCKLQDQYrm, TB_ALIGN_32 },
- { X86::VPUNPCKLWDYrr, X86::VPUNPCKLWDYrm, TB_ALIGN_32 },
- { X86::VPXORYrr, X86::VPXORYrm, TB_ALIGN_32 },
+ { X86::VINSERTI128rr, X86::VINSERTI128rm, 0 },
+ { X86::VPACKSSDWYrr, X86::VPACKSSDWYrm, 0 },
+ { X86::VPACKSSWBYrr, X86::VPACKSSWBYrm, 0 },
+ { X86::VPACKUSDWYrr, X86::VPACKUSDWYrm, 0 },
+ { X86::VPACKUSWBYrr, X86::VPACKUSWBYrm, 0 },
+ { X86::VPADDBYrr, X86::VPADDBYrm, 0 },
+ { X86::VPADDDYrr, X86::VPADDDYrm, 0 },
+ { X86::VPADDQYrr, X86::VPADDQYrm, 0 },
+ { X86::VPADDSBYrr, X86::VPADDSBYrm, 0 },
+ { X86::VPADDSWYrr, X86::VPADDSWYrm, 0 },
+ { X86::VPADDUSBYrr, X86::VPADDUSBYrm, 0 },
+ { X86::VPADDUSWYrr, X86::VPADDUSWYrm, 0 },
+ { X86::VPADDWYrr, X86::VPADDWYrm, 0 },
+ { X86::VPALIGNR256rr, X86::VPALIGNR256rm, 0 },
+ { X86::VPANDNYrr, X86::VPANDNYrm, 0 },
+ { X86::VPANDYrr, X86::VPANDYrm, 0 },
+ { X86::VPAVGBYrr, X86::VPAVGBYrm, 0 },
+ { X86::VPAVGWYrr, X86::VPAVGWYrm, 0 },
+ { X86::VPBLENDDrri, X86::VPBLENDDrmi, 0 },
+ { X86::VPBLENDDYrri, X86::VPBLENDDYrmi, 0 },
+ { X86::VPBLENDWYrri, X86::VPBLENDWYrmi, 0 },
+ { X86::VPCMPEQBYrr, X86::VPCMPEQBYrm, 0 },
+ { X86::VPCMPEQDYrr, X86::VPCMPEQDYrm, 0 },
+ { X86::VPCMPEQQYrr, X86::VPCMPEQQYrm, 0 },
+ { X86::VPCMPEQWYrr, X86::VPCMPEQWYrm, 0 },
+ { X86::VPCMPGTBYrr, X86::VPCMPGTBYrm, 0 },
+ { X86::VPCMPGTDYrr, X86::VPCMPGTDYrm, 0 },
+ { X86::VPCMPGTQYrr, X86::VPCMPGTQYrm, 0 },
+ { X86::VPCMPGTWYrr, X86::VPCMPGTWYrm, 0 },
+ { X86::VPERM2I128rr, X86::VPERM2I128rm, 0 },
+ { X86::VPERMDYrr, X86::VPERMDYrm, 0 },
+ { X86::VPERMPDYri, X86::VPERMPDYmi, 0 },
+ { X86::VPERMPSYrr, X86::VPERMPSYrm, 0 },
+ { X86::VPERMQYri, X86::VPERMQYmi, 0 },
+ { X86::VPHADDDYrr, X86::VPHADDDYrm, 0 },
+ { X86::VPHADDSWrr256, X86::VPHADDSWrm256, 0 },
+ { X86::VPHADDWYrr, X86::VPHADDWYrm, 0 },
+ { X86::VPHSUBDYrr, X86::VPHSUBDYrm, 0 },
+ { X86::VPHSUBSWrr256, X86::VPHSUBSWrm256, 0 },
+ { X86::VPHSUBWYrr, X86::VPHSUBWYrm, 0 },
+ { X86::VPMADDUBSWrr256, X86::VPMADDUBSWrm256, 0 },
+ { X86::VPMADDWDYrr, X86::VPMADDWDYrm, 0 },
+ { X86::VPMAXSWYrr, X86::VPMAXSWYrm, 0 },
+ { X86::VPMAXUBYrr, X86::VPMAXUBYrm, 0 },
+ { X86::VPMINSWYrr, X86::VPMINSWYrm, 0 },
+ { X86::VPMINUBYrr, X86::VPMINUBYrm, 0 },
+ { X86::VPMINSBYrr, X86::VPMINSBYrm, 0 },
+ { X86::VPMINSDYrr, X86::VPMINSDYrm, 0 },
+ { X86::VPMINUDYrr, X86::VPMINUDYrm, 0 },
+ { X86::VPMINUWYrr, X86::VPMINUWYrm, 0 },
+ { X86::VPMAXSBYrr, X86::VPMAXSBYrm, 0 },
+ { X86::VPMAXSDYrr, X86::VPMAXSDYrm, 0 },
+ { X86::VPMAXUDYrr, X86::VPMAXUDYrm, 0 },
+ { X86::VPMAXUWYrr, X86::VPMAXUWYrm, 0 },
+ { X86::VMPSADBWYrri, X86::VMPSADBWYrmi, 0 },
+ { X86::VPMULDQYrr, X86::VPMULDQYrm, 0 },
+ { X86::VPMULHRSWrr256, X86::VPMULHRSWrm256, 0 },
+ { X86::VPMULHUWYrr, X86::VPMULHUWYrm, 0 },
+ { X86::VPMULHWYrr, X86::VPMULHWYrm, 0 },
+ { X86::VPMULLDYrr, X86::VPMULLDYrm, 0 },
+ { X86::VPMULLWYrr, X86::VPMULLWYrm, 0 },
+ { X86::VPMULUDQYrr, X86::VPMULUDQYrm, 0 },
+ { X86::VPORYrr, X86::VPORYrm, 0 },
+ { X86::VPSADBWYrr, X86::VPSADBWYrm, 0 },
+ { X86::VPSHUFBYrr, X86::VPSHUFBYrm, 0 },
+ { X86::VPSIGNBYrr, X86::VPSIGNBYrm, 0 },
+ { X86::VPSIGNWYrr, X86::VPSIGNWYrm, 0 },
+ { X86::VPSIGNDYrr, X86::VPSIGNDYrm, 0 },
+ { X86::VPSLLDYrr, X86::VPSLLDYrm, 0 },
+ { X86::VPSLLQYrr, X86::VPSLLQYrm, 0 },
+ { X86::VPSLLWYrr, X86::VPSLLWYrm, 0 },
+ { X86::VPSLLVDrr, X86::VPSLLVDrm, 0 },
+ { X86::VPSLLVDYrr, X86::VPSLLVDYrm, 0 },
+ { X86::VPSLLVQrr, X86::VPSLLVQrm, 0 },
+ { X86::VPSLLVQYrr, X86::VPSLLVQYrm, 0 },
+ { X86::VPSRADYrr, X86::VPSRADYrm, 0 },
+ { X86::VPSRAWYrr, X86::VPSRAWYrm, 0 },
+ { X86::VPSRAVDrr, X86::VPSRAVDrm, 0 },
+ { X86::VPSRAVDYrr, X86::VPSRAVDYrm, 0 },
+ { X86::VPSRLDYrr, X86::VPSRLDYrm, 0 },
+ { X86::VPSRLQYrr, X86::VPSRLQYrm, 0 },
+ { X86::VPSRLWYrr, X86::VPSRLWYrm, 0 },
+ { X86::VPSRLVDrr, X86::VPSRLVDrm, 0 },
+ { X86::VPSRLVDYrr, X86::VPSRLVDYrm, 0 },
+ { X86::VPSRLVQrr, X86::VPSRLVQrm, 0 },
+ { X86::VPSRLVQYrr, X86::VPSRLVQYrm, 0 },
+ { X86::VPSUBBYrr, X86::VPSUBBYrm, 0 },
+ { X86::VPSUBDYrr, X86::VPSUBDYrm, 0 },
+ { X86::VPSUBSBYrr, X86::VPSUBSBYrm, 0 },
+ { X86::VPSUBSWYrr, X86::VPSUBSWYrm, 0 },
+ { X86::VPSUBWYrr, X86::VPSUBWYrm, 0 },
+ { X86::VPUNPCKHBWYrr, X86::VPUNPCKHBWYrm, 0 },
+ { X86::VPUNPCKHDQYrr, X86::VPUNPCKHDQYrm, 0 },
+ { X86::VPUNPCKHQDQYrr, X86::VPUNPCKHQDQYrm, 0 },
+ { X86::VPUNPCKHWDYrr, X86::VPUNPCKHWDYrm, 0 },
+ { X86::VPUNPCKLBWYrr, X86::VPUNPCKLBWYrm, 0 },
+ { X86::VPUNPCKLDQYrr, X86::VPUNPCKLDQYrm, 0 },
+ { X86::VPUNPCKLQDQYrr, X86::VPUNPCKLQDQYrm, 0 },
+ { X86::VPUNPCKLWDYrr, X86::VPUNPCKLWDYrm, 0 },
+ { X86::VPXORYrr, X86::VPXORYrm, 0 },
// FIXME: add AVX 256-bit foldable instructions
// FMA4 foldable patterns
@@ -1156,8 +1172,14 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm)
{ X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4mrY, TB_ALIGN_32 },
// BMI/BMI2 foldable instructions
+ { X86::ANDN32rr, X86::ANDN32rm, 0 },
+ { X86::ANDN64rr, X86::ANDN64rm, 0 },
{ X86::MULX32rr, X86::MULX32rm, 0 },
{ X86::MULX64rr, X86::MULX64rm, 0 },
+ { X86::PDEP32rr, X86::PDEP32rm, 0 },
+ { X86::PDEP64rr, X86::PDEP64rm, 0 },
+ { X86::PEXT32rr, X86::PEXT32rm, 0 },
+ { X86::PEXT64rr, X86::PEXT64rm, 0 },
};
for (unsigned i = 0, e = array_lengthof(OpTbl2); i != e; ++i) {
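Each row in these folding tables pairs a register-form opcode with its memory-form counterpart plus a flag word: TB_ALIGN_16 and TB_ALIGN_32 require the folded memory operand to be 16- or 32-byte aligned, while 0 imposes no alignment requirement. That is why this hunk drops the alignment flags for the VEX-encoded AVX entries, whose memory operands need not be aligned. The sketch below shows, in a minimal and self-contained way, how such a table can be consulted; the names MemoryFoldEntry and foldToMemForm are illustrative only, not the actual X86InstrInfo data structures (the real code builds DenseMaps from these arrays and packs more state into the flag word).

#include <cstdint>
#include <optional>
#include <unordered_map>

// Illustrative only: one row of a reg->mem folding table.
struct MemoryFoldEntry {
  unsigned MemOpcode;  // memory-form opcode to rewrite to
  unsigned MinAlign;   // 0 = no alignment requirement, else required byte alignment
};

using FoldTable = std::unordered_map<unsigned, MemoryFoldEntry>;

// Return the memory-form opcode if a fold is registered and the load's
// alignment satisfies the entry; otherwise return std::nullopt.
std::optional<unsigned> foldToMemForm(const FoldTable &Table, unsigned RegOpcode,
                                      unsigned LoadAlign) {
  auto It = Table.find(RegOpcode);
  if (It == Table.end())
    return std::nullopt;
  if (It->second.MinAlign != 0 && LoadAlign < It->second.MinAlign)
    return std::nullopt;
  return It->second.MemOpcode;
}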
@@ -2843,6 +2865,8 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
}
// Moving EFLAGS to / from another register requires a push and a pop.
+ // Notice that we have to adjust the stack if we don't want to clobber the
+ // first frame index. See X86FrameLowering.cpp - clobbersTheStack.
if (SrcReg == X86::EFLAGS) {
if (X86::GR64RegClass.contains(DestReg)) {
BuildMI(MBB, MI, DL, get(X86::PUSHF64));
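The comment added in this hunk points out that copying EFLAGS goes through the stack: PUSHF/POPF temporarily move the stack pointer, which frame lowering has to account for so that objects addressed relative to the incoming stack pointer are not overwritten. A hedged sketch of the 64-bit branch follows; the PUSHF64 line is taken from the hunk above, while the matching POP64r into the destination register is the expected counterpart rather than text from this patch.

// Sketch of the EFLAGS -> GR64 path in copyPhysReg (POP64r line assumed).
if (SrcReg == X86::EFLAGS) {
  if (X86::GR64RegClass.contains(DestReg)) {
    BuildMI(MBB, MI, DL, get(X86::PUSHF64));          // push RFLAGS; moves RSP by 8
    BuildMI(MBB, MI, DL, get(X86::POP64r), DestReg);  // pop it into the destination GPR
    return;
  }
  // The 32-bit case uses PUSHF32 / POP32r analogously.
}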
@@ -3152,19 +3176,15 @@ inline static bool isDefConvertible(MachineInstr *MI) {
case X86::SUB8ri: case X86::SUB64rr: case X86::SUB32rr:
case X86::SUB16rr: case X86::SUB8rr: case X86::SUB64rm:
case X86::SUB32rm: case X86::SUB16rm: case X86::SUB8rm:
- case X86::DEC64r: case X86::DEC32r: case X86::DEC16r: case X86::DEC8r:
- case X86::DEC64m: case X86::DEC32m: case X86::DEC16m: case X86::DEC8m:
+ case X86::DEC64r: case X86::DEC32r: case X86::DEC16r: case X86::DEC8r:
case X86::DEC64_32r: case X86::DEC64_16r:
- case X86::DEC64_32m: case X86::DEC64_16m:
case X86::ADD64ri32: case X86::ADD64ri8: case X86::ADD32ri:
case X86::ADD32ri8: case X86::ADD16ri: case X86::ADD16ri8:
case X86::ADD8ri: case X86::ADD64rr: case X86::ADD32rr:
case X86::ADD16rr: case X86::ADD8rr: case X86::ADD64rm:
case X86::ADD32rm: case X86::ADD16rm: case X86::ADD8rm:
- case X86::INC64r: case X86::INC32r: case X86::INC16r: case X86::INC8r:
- case X86::INC64m: case X86::INC32m: case X86::INC16m: case X86::INC8m:
+ case X86::INC64r: case X86::INC32r: case X86::INC16r: case X86::INC8r:
case X86::INC64_32r: case X86::INC64_16r:
- case X86::INC64_32m: case X86::INC64_16m:
case X86::AND64ri32: case X86::AND64ri8: case X86::AND32ri:
case X86::AND32ri8: case X86::AND16ri: case X86::AND16ri8:
case X86::AND8ri: case X86::AND64rr: case X86::AND32rr:
@@ -3180,6 +3200,8 @@ inline static bool isDefConvertible(MachineInstr *MI) {
case X86::OR8ri: case X86::OR64rr: case X86::OR32rr:
case X86::OR16rr: case X86::OR8rr: case X86::OR64rm:
case X86::OR32rm: case X86::OR16rm: case X86::OR8rm:
+ case X86::ANDN32rr: case X86::ANDN32rm:
+ case X86::ANDN64rr: case X86::ANDN64rm:
return true;
}
}
@@ -3502,43 +3524,44 @@ optimizeLoadInstr(MachineInstr *MI, const MachineRegisterInfo *MRI,
/// to:
/// %xmm4 = PXORrr %xmm4<undef>, %xmm4<undef>
///
-static bool Expand2AddrUndef(MachineInstr *MI, const MCInstrDesc &Desc) {
+static bool Expand2AddrUndef(MachineInstrBuilder &MIB,
+ const MCInstrDesc &Desc) {
assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
- unsigned Reg = MI->getOperand(0).getReg();
- MI->setDesc(Desc);
+ unsigned Reg = MIB->getOperand(0).getReg();
+ MIB->setDesc(Desc);
// MachineInstr::addOperand() will insert explicit operands before any
// implicit operands.
- MachineInstrBuilder(MI).addReg(Reg, RegState::Undef)
- .addReg(Reg, RegState::Undef);
+ MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef);
// But we don't trust that.
- assert(MI->getOperand(1).getReg() == Reg &&
- MI->getOperand(2).getReg() == Reg && "Misplaced operand");
+ assert(MIB->getOperand(1).getReg() == Reg &&
+ MIB->getOperand(2).getReg() == Reg && "Misplaced operand");
return true;
}
bool X86InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
+ MachineInstrBuilder MIB(*MI->getParent()->getParent(), MI);
switch (MI->getOpcode()) {
case X86::SETB_C8r:
- return Expand2AddrUndef(MI, get(X86::SBB8rr));
+ return Expand2AddrUndef(MIB, get(X86::SBB8rr));
case X86::SETB_C16r:
- return Expand2AddrUndef(MI, get(X86::SBB16rr));
+ return Expand2AddrUndef(MIB, get(X86::SBB16rr));
case X86::SETB_C32r:
- return Expand2AddrUndef(MI, get(X86::SBB32rr));
+ return Expand2AddrUndef(MIB, get(X86::SBB32rr));
case X86::SETB_C64r:
- return Expand2AddrUndef(MI, get(X86::SBB64rr));
+ return Expand2AddrUndef(MIB, get(X86::SBB64rr));
case X86::V_SET0:
case X86::FsFLD0SS:
case X86::FsFLD0SD:
- return Expand2AddrUndef(MI, get(HasAVX ? X86::VXORPSrr : X86::XORPSrr));
+ return Expand2AddrUndef(MIB, get(HasAVX ? X86::VXORPSrr : X86::XORPSrr));
case X86::AVX_SET0:
assert(HasAVX && "AVX not supported");
- return Expand2AddrUndef(MI, get(X86::VXORPSYrr));
+ return Expand2AddrUndef(MIB, get(X86::VXORPSYrr));
case X86::V_SETALLONES:
- return Expand2AddrUndef(MI, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
+ return Expand2AddrUndef(MIB, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
case X86::AVX2_SETALLONES:
- return Expand2AddrUndef(MI, get(X86::VPCMPEQDYrr));
+ return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr));
case X86::TEST8ri_NOREX:
MI->setDesc(get(X86::TEST8ri));
return true;
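With this hunk, expandPostRAPseudo hands Expand2AddrUndef a MachineInstrBuilder constructed with the owning MachineFunction, so the helper can rewrite the pseudo in place and append the two undef source operands through the builder. A hypothetical extra case is sketched below purely to show the calling pattern; MY_ALLONES_EXAMPLE is not a real opcode.

case X86::MY_ALLONES_EXAMPLE:   // hypothetical pseudo, for illustration only
  // Expands to "%xmmN = PCMPEQDrr %xmmN<undef>, %xmmN<undef>": an all-ones
  // idiom with no real dependency on the previous value of %xmmN.
  return Expand2AddrUndef(MIB, get(X86::PCMPEQDrr));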
@@ -3564,9 +3587,10 @@ static MachineInstr *FuseTwoAddrInst(MachineFunction &MF, unsigned Opcode,
MachineInstr *MI,
const TargetInstrInfo &TII) {
// Create the base instruction with the memory operand as the first part.
+ // Omit the implicit operands, something BuildMI can't do.
MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode),
MI->getDebugLoc(), true);
- MachineInstrBuilder MIB(NewMI);
+ MachineInstrBuilder MIB(MF, NewMI);
unsigned NumAddrOps = MOs.size();
for (unsigned i = 0; i != NumAddrOps; ++i)
MIB.addOperand(MOs[i]);
@@ -3590,9 +3614,10 @@ static MachineInstr *FuseInst(MachineFunction &MF,
unsigned Opcode, unsigned OpNo,
const SmallVectorImpl<MachineOperand> &MOs,
MachineInstr *MI, const TargetInstrInfo &TII) {
+ // Omit the implicit operands, something BuildMI can't do.
MachineInstr *NewMI = MF.CreateMachineInstr(TII.get(Opcode),
MI->getDebugLoc(), true);
- MachineInstrBuilder MIB(NewMI);
+ MachineInstrBuilder MIB(MF, NewMI);
for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
MachineOperand &MO = MI->getOperand(i);
@@ -3839,8 +3864,8 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
// Unless optimizing for size, don't fold to avoid partial
// register update stalls
- if (!MF.getFunction()->getFnAttributes().
- hasAttribute(Attributes::OptimizeForSize) &&
+ if (!MF.getFunction()->getAttributes().
+ hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize) &&
hasPartialRegUpdate(MI->getOpcode()))
return 0;
@@ -3881,8 +3906,8 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
// Unless optimizing for size, don't fold to avoid partial
// register update stalls
- if (!MF.getFunction()->getFnAttributes().
- hasAttribute(Attributes::OptimizeForSize) &&
+ if (!MF.getFunction()->getAttributes().
+ hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize) &&
hasPartialRegUpdate(MI->getOpcode()))
return 0;
@@ -4132,7 +4157,7 @@ bool X86InstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
// Emit the data processing instruction.
MachineInstr *DataMI = MF.CreateMachineInstr(MCID, MI->getDebugLoc(), true);
- MachineInstrBuilder MIB(DataMI);
+ MachineInstrBuilder MIB(MF, DataMI);
if (FoldedStore)
MIB.addReg(Reg, RegState::Define);
@@ -4638,13 +4663,9 @@ bool X86InstrInfo::isHighLatencyDef(int opc) const {
case X86::DIVSSrr:
case X86::DIVSSrr_Int:
case X86::SQRTPDm:
- case X86::SQRTPDm_Int:
case X86::SQRTPDr:
- case X86::SQRTPDr_Int:
case X86::SQRTPSm:
- case X86::SQRTPSm_Int:
case X86::SQRTPSr:
- case X86::SQRTPSr_Int:
case X86::SQRTSDm:
case X86::SQRTSDm_Int:
case X86::SQRTSDr:
@@ -4663,13 +4684,9 @@ bool X86InstrInfo::isHighLatencyDef(int opc) const {
case X86::VDIVSSrr:
case X86::VDIVSSrr_Int:
case X86::VSQRTPDm:
- case X86::VSQRTPDm_Int:
case X86::VSQRTPDr:
- case X86::VSQRTPDr_Int:
case X86::VSQRTPSm:
- case X86::VSQRTPSm_Int:
case X86::VSQRTPSr:
- case X86::VSQRTPSr_Int:
case X86::VSQRTSDm:
case X86::VSQRTSDm_Int:
case X86::VSQRTSDr:
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index cdf1c8935f..d989ec7bb0 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -247,9 +247,9 @@ def X86and_flag : SDNode<"X86ISD::AND", SDTBinaryArithWithFlags,
[SDNPCommutative]>;
def X86andn_flag : SDNode<"X86ISD::ANDN", SDTBinaryArithWithFlags>;
-def X86blsi_flag : SDNode<"X86ISD::BLSI", SDTUnaryArithWithFlags>;
-def X86blsmsk_flag : SDNode<"X86ISD::BLSMSK", SDTUnaryArithWithFlags>;
-def X86blsr_flag : SDNode<"X86ISD::BLSR", SDTUnaryArithWithFlags>;
+def X86blsi : SDNode<"X86ISD::BLSI", SDTIntUnaryOp>;
+def X86blsmsk : SDNode<"X86ISD::BLSMSK", SDTIntUnaryOp>;
+def X86blsr : SDNode<"X86ISD::BLSR", SDTIntUnaryOp>;
def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>;
@@ -525,6 +525,13 @@ def lea64_32mem : Operand<i32> {
let ParserMatchClass = X86MemAsmOperand;
}
+// Memory operands that use 64-bit pointers in both ILP32 and LP64.
+def lea64mem : Operand<i64> {
+ let PrintMethod = "printi64mem";
+ let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm, i8imm);
+ let ParserMatchClass = X86MemAsmOperand;
+}
+
//===----------------------------------------------------------------------===//
// X86 Complex Pattern Definitions.
@@ -535,6 +542,12 @@ def addr : ComplexPattern<iPTR, 5, "SelectAddr", [], [SDNPWantParent]>;
def lea32addr : ComplexPattern<i32, 5, "SelectLEAAddr",
[add, sub, mul, X86mul_imm, shl, or, frameindex],
[]>;
+// In 64-bit mode 32-bit LEAs can use RIP-relative addressing.
+def lea64_32addr : ComplexPattern<i32, 5, "SelectLEAAddr",
+ [add, sub, mul, X86mul_imm, shl, or,
+ frameindex, X86WrapperRIP],
+ []>;
+
def tls32addr : ComplexPattern<i32, 5, "SelectTLSADDRAddr",
[tglobaltlsaddr], []>;
@@ -590,6 +603,7 @@ def HasLZCNT : Predicate<"Subtarget->hasLZCNT()">;
def HasBMI : Predicate<"Subtarget->hasBMI()">;
def HasBMI2 : Predicate<"Subtarget->hasBMI2()">;
def HasRTM : Predicate<"Subtarget->hasRTM()">;
+def HasADX : Predicate<"Subtarget->hasADX()">;
def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">;
def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">;
def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">;
@@ -856,16 +870,14 @@ let Defs = [RSP], Uses = [RSP, EFLAGS], mayStore = 1, neverHasSideEffects=1 in
def PUSHF64 : I<0x9C, RawFrm, (outs), (ins), "pushfq", [], IIC_PUSH_F>,
Requires<[In64BitMode]>;
-
-
let Defs = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP], Uses = [ESP],
mayLoad=1, neverHasSideEffects=1 in {
-def POPA32 : I<0x61, RawFrm, (outs), (ins), "popa{l}", [], IIC_POP_A>,
+def POPA32 : I<0x61, RawFrm, (outs), (ins), "popa{l|d}", [], IIC_POP_A>,
Requires<[In32BitMode]>;
}
let Defs = [ESP], Uses = [EDI, ESI, EBP, EBX, EDX, ECX, EAX, ESP],
mayStore=1, neverHasSideEffects=1 in {
-def PUSHA32 : I<0x60, RawFrm, (outs), (ins), "pusha{l}", [], IIC_PUSH_A>,
+def PUSHA32 : I<0x60, RawFrm, (outs), (ins), "pusha{l|d}", [], IIC_PUSH_A>,
Requires<[In32BitMode]>;
}
@@ -1043,7 +1055,7 @@ def MOV64ao64 : RIi32<0xA3, RawFrm, (outs offset64:$dst), (ins),
*/
-let isCodeGenOnly = 1 in {
+let isCodeGenOnly = 1, hasSideEffects = 0 in {
def MOV8rr_REV : I<0x8A, MRMSrcReg, (outs GR8:$dst), (ins GR8:$src),
"mov{b}\t{$src, $dst|$dst, $src}", [], IIC_MOV>;
def MOV16rr_REV : I<0x8B, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
@@ -1133,24 +1145,26 @@ def BT64rr : RI<0xA3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
// perspective, this is pretty bizarre. Make these instructions disassembly
// only for now.
-def BT16mr : I<0xA3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
- "bt{w}\t{$src2, $src1|$src1, $src2}",
-// [(X86bt (loadi16 addr:$src1), GR16:$src2),
-// (implicit EFLAGS)]
- [], IIC_BT_MR
- >, OpSize, TB, Requires<[FastBTMem]>;
-def BT32mr : I<0xA3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
- "bt{l}\t{$src2, $src1|$src1, $src2}",
-// [(X86bt (loadi32 addr:$src1), GR32:$src2),
-// (implicit EFLAGS)]
- [], IIC_BT_MR
- >, TB, Requires<[FastBTMem]>;
-def BT64mr : RI<0xA3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
- "bt{q}\t{$src2, $src1|$src1, $src2}",
-// [(X86bt (loadi64 addr:$src1), GR64:$src2),
-// (implicit EFLAGS)]
- [], IIC_BT_MR
- >, TB;
+let mayLoad = 1, hasSideEffects = 0 in {
+ def BT16mr : I<0xA3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
+ "bt{w}\t{$src2, $src1|$src1, $src2}",
+ // [(X86bt (loadi16 addr:$src1), GR16:$src2),
+ // (implicit EFLAGS)]
+ [], IIC_BT_MR
+ >, OpSize, TB, Requires<[FastBTMem]>;
+ def BT32mr : I<0xA3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
+ "bt{l}\t{$src2, $src1|$src1, $src2}",
+ // [(X86bt (loadi32 addr:$src1), GR32:$src2),
+ // (implicit EFLAGS)]
+ [], IIC_BT_MR
+ >, TB, Requires<[FastBTMem]>;
+ def BT64mr : RI<0xA3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
+ "bt{q}\t{$src2, $src1|$src1, $src2}",
+ // [(X86bt (loadi64 addr:$src1), GR64:$src2),
+ // (implicit EFLAGS)]
+ [], IIC_BT_MR
+ >, TB;
+}
def BT16ri8 : Ii8<0xBA, MRM4r, (outs), (ins GR16:$src1, i16i8imm:$src2),
"bt{w}\t{$src2, $src1|$src1, $src2}",
@@ -1181,7 +1195,7 @@ def BT64mi8 : RIi8<0xBA, MRM4m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
[(set EFLAGS, (X86bt (loadi64 addr:$src1),
i64immSExt8:$src2))], IIC_BT_MI>, TB;
-
+let hasSideEffects = 0 in {
def BTC16rr : I<0xBB, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
"btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
OpSize, TB;
@@ -1189,6 +1203,8 @@ def BTC32rr : I<0xBB, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
"btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB;
def BTC64rr : RI<0xBB, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
"btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB;
+
+let mayLoad = 1, mayStore = 1 in {
def BTC16mr : I<0xBB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
"btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
OpSize, TB;
@@ -1196,6 +1212,8 @@ def BTC32mr : I<0xBB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
"btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB;
def BTC64mr : RI<0xBB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
"btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB;
+}
+
def BTC16ri8 : Ii8<0xBA, MRM7r, (outs), (ins GR16:$src1, i16i8imm:$src2),
"btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
OpSize, TB;
@@ -1203,6 +1221,8 @@ def BTC32ri8 : Ii8<0xBA, MRM7r, (outs), (ins GR32:$src1, i32i8imm:$src2),
"btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
def BTC64ri8 : RIi8<0xBA, MRM7r, (outs), (ins GR64:$src1, i64i8imm:$src2),
"btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
+
+let mayLoad = 1, mayStore = 1 in {
def BTC16mi8 : Ii8<0xBA, MRM7m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
"btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
OpSize, TB;
@@ -1210,6 +1230,7 @@ def BTC32mi8 : Ii8<0xBA, MRM7m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
"btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB;
def BTC64mi8 : RIi8<0xBA, MRM7m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
"btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB;
+}
def BTR16rr : I<0xB3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
"btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
@@ -1218,6 +1239,8 @@ def BTR32rr : I<0xB3, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
"btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB;
def BTR64rr : RI<0xB3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
"btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
+
+let mayLoad = 1, mayStore = 1 in {
def BTR16mr : I<0xB3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
"btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
OpSize, TB;
@@ -1225,6 +1248,8 @@ def BTR32mr : I<0xB3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
"btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB;
def BTR64mr : RI<0xB3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
"btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB;
+}
+
def BTR16ri8 : Ii8<0xBA, MRM6r, (outs), (ins GR16:$src1, i16i8imm:$src2),
"btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
OpSize, TB;
@@ -1232,6 +1257,8 @@ def BTR32ri8 : Ii8<0xBA, MRM6r, (outs), (ins GR32:$src1, i32i8imm:$src2),
"btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
def BTR64ri8 : RIi8<0xBA, MRM6r, (outs), (ins GR64:$src1, i64i8imm:$src2),
"btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
+
+let mayLoad = 1, mayStore = 1 in {
def BTR16mi8 : Ii8<0xBA, MRM6m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
"btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
OpSize, TB;
@@ -1239,6 +1266,7 @@ def BTR32mi8 : Ii8<0xBA, MRM6m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
"btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB;
def BTR64mi8 : RIi8<0xBA, MRM6m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
"btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB;
+}
def BTS16rr : I<0xAB, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
"bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
@@ -1247,6 +1275,8 @@ def BTS32rr : I<0xAB, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
"bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB;
def BTS64rr : RI<0xAB, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
"bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB;
+
+let mayLoad = 1, mayStore = 1 in {
def BTS16mr : I<0xAB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
"bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
OpSize, TB;
@@ -1254,6 +1284,8 @@ def BTS32mr : I<0xAB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
"bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB;
def BTS64mr : RI<0xAB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
"bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB;
+}
+
def BTS16ri8 : Ii8<0xBA, MRM5r, (outs), (ins GR16:$src1, i16i8imm:$src2),
"bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
OpSize, TB;
@@ -1261,6 +1293,8 @@ def BTS32ri8 : Ii8<0xBA, MRM5r, (outs), (ins GR32:$src1, i32i8imm:$src2),
"bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
def BTS64ri8 : RIi8<0xBA, MRM5r, (outs), (ins GR64:$src1, i64i8imm:$src2),
"bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
+
+let mayLoad = 1, mayStore = 1 in {
def BTS16mi8 : Ii8<0xBA, MRM5m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
"bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
OpSize, TB;
@@ -1268,6 +1302,8 @@ def BTS32mi8 : Ii8<0xBA, MRM5m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
"bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB;
def BTS64mi8 : RIi8<0xBA, MRM5m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
"bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB;
+}
+} // hasSideEffects = 0
} // Defs = [EFLAGS]
@@ -1280,34 +1316,34 @@ def BTS64mi8 : RIi8<0xBA, MRM5m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
multiclass ATOMIC_SWAP<bits<8> opc8, bits<8> opc, string mnemonic, string frag,
InstrItinClass itin> {
let Constraints = "$val = $dst" in {
- def #NAME#8rm : I<opc8, MRMSrcMem, (outs GR8:$dst),
- (ins GR8:$val, i8mem:$ptr),
- !strconcat(mnemonic, "{b}\t{$val, $ptr|$ptr, $val}"),
- [(set
- GR8:$dst,
- (!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val))],
- itin>;
- def #NAME#16rm : I<opc, MRMSrcMem, (outs GR16:$dst),
- (ins GR16:$val, i16mem:$ptr),
- !strconcat(mnemonic, "{w}\t{$val, $ptr|$ptr, $val}"),
- [(set
- GR16:$dst,
- (!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))],
- itin>, OpSize;
- def #NAME#32rm : I<opc, MRMSrcMem, (outs GR32:$dst),
- (ins GR32:$val, i32mem:$ptr),
- !strconcat(mnemonic, "{l}\t{$val, $ptr|$ptr, $val}"),
+ def NAME#8rm : I<opc8, MRMSrcMem, (outs GR8:$dst),
+ (ins GR8:$val, i8mem:$ptr),
+ !strconcat(mnemonic, "{b}\t{$val, $ptr|$ptr, $val}"),
+ [(set
+ GR8:$dst,
+ (!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val))],
+ itin>;
+ def NAME#16rm : I<opc, MRMSrcMem, (outs GR16:$dst),
+ (ins GR16:$val, i16mem:$ptr),
+ !strconcat(mnemonic, "{w}\t{$val, $ptr|$ptr, $val}"),
+ [(set
+ GR16:$dst,
+ (!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))],
+ itin>, OpSize;
+ def NAME#32rm : I<opc, MRMSrcMem, (outs GR32:$dst),
+ (ins GR32:$val, i32mem:$ptr),
+ !strconcat(mnemonic, "{l}\t{$val, $ptr|$ptr, $val}"),
+ [(set
+ GR32:$dst,
+ (!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))],
+ itin>;
+ def NAME#64rm : RI<opc, MRMSrcMem, (outs GR64:$dst),
+ (ins GR64:$val, i64mem:$ptr),
+ !strconcat(mnemonic, "{q}\t{$val, $ptr|$ptr, $val}"),
[(set
- GR32:$dst,
- (!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))],
+ GR64:$dst,
+ (!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val))],
itin>;
- def #NAME#64rm : RI<opc, MRMSrcMem, (outs GR64:$dst),
- (ins GR64:$val, i64mem:$ptr),
- !strconcat(mnemonic, "{q}\t{$val, $ptr|$ptr, $val}"),
- [(set
- GR64:$dst,
- (!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val))],
- itin>;
}
}
@@ -1486,10 +1522,10 @@ def BOUNDS32rm : I<0x62, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
Requires<[In32BitMode]>;
// Adjust RPL Field of Segment Selector
-def ARPL16rr : I<0x63, MRMDestReg, (outs GR16:$src), (ins GR16:$dst),
+def ARPL16rr : I<0x63, MRMDestReg, (outs GR16:$dst), (ins GR16:$src),
"arpl\t{$src, $dst|$dst, $src}", [], IIC_ARPL_REG>,
Requires<[In32BitMode]>;
-def ARPL16mr : I<0x63, MRMSrcMem, (outs GR16:$src), (ins i16mem:$dst),
+def ARPL16mr : I<0x63, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src),
"arpl\t{$src, $dst|$dst, $src}", [], IIC_ARPL_MEM>,
Requires<[In32BitMode]>;
@@ -1605,26 +1641,26 @@ multiclass bmi_bls<string mnemonic, Format RegMRM, Format MemMRM,
PatFrag ld_frag> {
def rr : I<0xF3, RegMRM, (outs RC:$dst), (ins RC:$src),
!strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"),
- [(set RC:$dst, EFLAGS, (OpNode RC:$src))]>, T8, VEX_4V;
+ [(set RC:$dst, (OpNode RC:$src)), (implicit EFLAGS)]>, T8, VEX_4V;
def rm : I<0xF3, MemMRM, (outs RC:$dst), (ins x86memop:$src),
!strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"),
- [(set RC:$dst, EFLAGS, (OpNode (ld_frag addr:$src)))]>,
+ [(set RC:$dst, (OpNode (ld_frag addr:$src))), (implicit EFLAGS)]>,
T8, VEX_4V;
}
let Predicates = [HasBMI], Defs = [EFLAGS] in {
defm BLSR32 : bmi_bls<"blsr{l}", MRM1r, MRM1m, GR32, i32mem,
- X86blsr_flag, loadi32>;
+ X86blsr, loadi32>;
defm BLSR64 : bmi_bls<"blsr{q}", MRM1r, MRM1m, GR64, i64mem,
- X86blsr_flag, loadi64>, VEX_W;
+ X86blsr, loadi64>, VEX_W;
defm BLSMSK32 : bmi_bls<"blsmsk{l}", MRM2r, MRM2m, GR32, i32mem,
- X86blsmsk_flag, loadi32>;
+ X86blsmsk, loadi32>;
defm BLSMSK64 : bmi_bls<"blsmsk{q}", MRM2r, MRM2m, GR64, i64mem,
- X86blsmsk_flag, loadi64>, VEX_W;
+ X86blsmsk, loadi64>, VEX_W;
defm BLSI32 : bmi_bls<"blsi{l}", MRM3r, MRM3m, GR32, i32mem,
- X86blsi_flag, loadi32>;
+ X86blsi, loadi32>;
defm BLSI64 : bmi_bls<"blsi{q}", MRM3r, MRM3m, GR64, i64mem,
- X86blsi_flag, loadi64>, VEX_W;
+ X86blsi, loadi64>, VEX_W;
}
multiclass bmi_bextr_bzhi<bits<8> opc, string mnemonic, RegisterClass RC,
@@ -1886,6 +1922,8 @@ def : InstAlias<"fmulp", (MUL_FPrST0 ST1)>;
def : InstAlias<"fdivp", (DIVR_FPrST0 ST1)>;
def : InstAlias<"fdivrp", (DIV_FPrST0 ST1)>;
def : InstAlias<"fxch", (XCH_F ST1)>;
+def : InstAlias<"fcom", (COM_FST0r ST1)>;
+def : InstAlias<"fcomp", (COMP_FST0r ST1)>;
def : InstAlias<"fcomi", (COM_FIr ST1)>;
def : InstAlias<"fcompi", (COM_FIPr ST1)>;
def : InstAlias<"fucom", (UCOM_Fr ST1)>;
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 1912a936ce..0979752757 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -203,9 +203,8 @@ multiclass sse12_fp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
string OpcodeStr, X86MemOperand x86memop,
list<dag> pat_rr, list<dag> pat_rm,
- bit Is2Addr = 1,
- bit rr_hasSideEffects = 0> {
- let isCommutable = 1, neverHasSideEffects = rr_hasSideEffects in
+ bit Is2Addr = 1> {
+ let isCommutable = 1, hasSideEffects = 0 in
def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
@@ -218,27 +217,6 @@ multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
pat_rm, IIC_DEFAULT, d>;
}
-/// sse12_fp_packed_int - SSE 1 & 2 packed instructions intrinsics class
-multiclass sse12_fp_packed_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
- string asm, string SSEVer, string FPSizeStr,
- X86MemOperand x86memop, PatFrag mem_frag,
- Domain d, OpndItins itins, bit Is2Addr = 1> {
- def rr_Int : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
- !if(Is2Addr,
- !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
- !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (!cast<Intrinsic>(
- !strconcat("int_x86_", SSEVer, "_", OpcodeStr, FPSizeStr))
- RC:$src1, RC:$src2))], IIC_DEFAULT, d>;
- def rm_Int : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1,x86memop:$src2),
- !if(Is2Addr,
- !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
- !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (!cast<Intrinsic>(
- !strconcat("int_x86_", SSEVer, "_", OpcodeStr, FPSizeStr))
- RC:$src1, (mem_frag addr:$src2)))], IIC_DEFAULT, d>;
-}
-
//===----------------------------------------------------------------------===//
// Non-instruction patterns
//===----------------------------------------------------------------------===//
@@ -458,93 +436,69 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
// in terms of a copy, and just mentioned, we don't use movss/movsd for copies.
//===----------------------------------------------------------------------===//
-class sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt, string asm> :
- SI<0x10, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, RC:$src2), asm,
- [(set VR128:$dst, (vt (OpNode VR128:$src1,
- (scalar_to_vector RC:$src2))))],
- IIC_SSE_MOV_S_RR>;
+multiclass sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt,
+ X86MemOperand x86memop, string base_opc,
+ string asm_opr> {
+ def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, RC:$src2),
+ !strconcat(base_opc, asm_opr),
+ [(set VR128:$dst, (vt (OpNode VR128:$src1,
+ (scalar_to_vector RC:$src2))))],
+ IIC_SSE_MOV_S_RR>;
-// Loading from memory automatically zeroing upper bits.
-class sse12_move_rm<RegisterClass RC, X86MemOperand x86memop,
- PatFrag mem_pat, string OpcodeStr> :
- SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set RC:$dst, (mem_pat addr:$src))],
- IIC_SSE_MOV_S_RM>;
-
-// AVX
-def VMOVSSrr : sse12_move_rr<FR32, X86Movss, v4f32,
- "movss\t{$src2, $src1, $dst|$dst, $src1, $src2}">, XS, VEX_4V,
- VEX_LIG;
-def VMOVSDrr : sse12_move_rr<FR64, X86Movsd, v2f64,
- "movsd\t{$src2, $src1, $dst|$dst, $src1, $src2}">, XD, VEX_4V,
- VEX_LIG;
-
-// For the disassembler
-let isCodeGenOnly = 1 in {
- def VMOVSSrr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
- (ins VR128:$src1, FR32:$src2),
- "movss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
- IIC_SSE_MOV_S_RR>,
- XS, VEX_4V, VEX_LIG;
- def VMOVSDrr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
- (ins VR128:$src1, FR64:$src2),
- "movsd\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
- IIC_SSE_MOV_S_RR>,
- XD, VEX_4V, VEX_LIG;
+ // For the disassembler
+ let isCodeGenOnly = 1, hasSideEffects = 0 in
+ def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
+ (ins VR128:$src1, RC:$src2),
+ !strconcat(base_opc, asm_opr),
+ [], IIC_SSE_MOV_S_RR>;
}
-let canFoldAsLoad = 1, isReMaterializable = 1 in {
- def VMOVSSrm : sse12_move_rm<FR32, f32mem, loadf32, "movss">, XS, VEX,
- VEX_LIG;
- let AddedComplexity = 20 in
- def VMOVSDrm : sse12_move_rm<FR64, f64mem, loadf64, "movsd">, XD, VEX,
- VEX_LIG;
-}
+multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
+ X86MemOperand x86memop, string OpcodeStr> {
+ // AVX
+ defm V#NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}">,
+ VEX_4V, VEX_LIG;
-def VMOVSSmr : SI<0x11, MRMDestMem, (outs), (ins f32mem:$dst, FR32:$src),
- "movss\t{$src, $dst|$dst, $src}",
- [(store FR32:$src, addr:$dst)], IIC_SSE_MOV_S_MR>,
- XS, VEX, VEX_LIG;
-def VMOVSDmr : SI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src),
- "movsd\t{$src, $dst|$dst, $src}",
- [(store FR64:$src, addr:$dst)], IIC_SSE_MOV_S_MR>,
- XD, VEX, VEX_LIG;
+ def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR>,
+ VEX, VEX_LIG;
+ // SSE1 & 2
+ let Constraints = "$src1 = $dst" in {
+ defm NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr,
+ "\t{$src2, $dst|$dst, $src2}">;
+ }
-// SSE1 & 2
-let Constraints = "$src1 = $dst" in {
- def MOVSSrr : sse12_move_rr<FR32, X86Movss, v4f32,
- "movss\t{$src2, $dst|$dst, $src2}">, XS;
- def MOVSDrr : sse12_move_rr<FR64, X86Movsd, v2f64,
- "movsd\t{$src2, $dst|$dst, $src2}">, XD;
+ def NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR>;
+}
- // For the disassembler
- let isCodeGenOnly = 1 in {
- def MOVSSrr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
- (ins VR128:$src1, FR32:$src2),
- "movss\t{$src2, $dst|$dst, $src2}", [],
- IIC_SSE_MOV_S_RR>, XS;
- def MOVSDrr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
- (ins VR128:$src1, FR64:$src2),
- "movsd\t{$src2, $dst|$dst, $src2}", [],
- IIC_SSE_MOV_S_RR>, XD;
- }
+// Loading from memory automatically zeroing upper bits.
+multiclass sse12_move_rm<RegisterClass RC, X86MemOperand x86memop,
+ PatFrag mem_pat, string OpcodeStr> {
+ def V#NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set RC:$dst, (mem_pat addr:$src))],
+ IIC_SSE_MOV_S_RM>, VEX, VEX_LIG;
+ def NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set RC:$dst, (mem_pat addr:$src))],
+ IIC_SSE_MOV_S_RM>;
}
+defm MOVSS : sse12_move<FR32, X86Movss, v4f32, f32mem, "movss">, XS;
+defm MOVSD : sse12_move<FR64, X86Movsd, v2f64, f64mem, "movsd">, XD;
+
let canFoldAsLoad = 1, isReMaterializable = 1 in {
- def MOVSSrm : sse12_move_rm<FR32, f32mem, loadf32, "movss">, XS;
+ defm MOVSS : sse12_move_rm<FR32, f32mem, loadf32, "movss">, XS;
let AddedComplexity = 20 in
- def MOVSDrm : sse12_move_rm<FR64, f64mem, loadf64, "movsd">, XD;
+ defm MOVSD : sse12_move_rm<FR64, f64mem, loadf64, "movsd">, XD;
}
-def MOVSSmr : SSI<0x11, MRMDestMem, (outs), (ins f32mem:$dst, FR32:$src),
- "movss\t{$src, $dst|$dst, $src}",
- [(store FR32:$src, addr:$dst)], IIC_SSE_MOV_S_MR>;
-def MOVSDmr : SDI<0x11, MRMDestMem, (outs), (ins f64mem:$dst, FR64:$src),
- "movsd\t{$src, $dst|$dst, $src}",
- [(store FR64:$src, addr:$dst)], IIC_SSE_MOV_S_MR>;
-
// Patterns
let Predicates = [HasAVX] in {
let AddedComplexity = 15 in {
@@ -870,7 +824,7 @@ def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
IIC_SSE_MOVU_P_MR>, VEX, VEX_L;
// For disassembler
-let isCodeGenOnly = 1 in {
+let isCodeGenOnly = 1, hasSideEffects = 0 in {
def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
(ins VR128:$src),
"movaps\t{$src, $dst|$dst, $src}", [],
@@ -944,7 +898,7 @@ def MOVUPDmr : PDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
IIC_SSE_MOVU_P_MR>;
// For disassembler
-let isCodeGenOnly = 1 in {
+let isCodeGenOnly = 1, hasSideEffects = 0 in {
def MOVAPSrr_REV : PSI<0x29, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
"movaps\t{$src, $dst|$dst, $src}", [],
IIC_SSE_MOVA_P_RR>;
@@ -1132,34 +1086,41 @@ def FsMOVAPDrm : PDI<0x28, MRMSrcMem, (outs FR64:$dst), (ins f128mem:$src),
// SSE 1 & 2 - Move Low packed FP Instructions
//===----------------------------------------------------------------------===//
-multiclass sse12_mov_hilo_packed<bits<8>opc, RegisterClass RC,
- SDNode psnode, SDNode pdnode, string base_opc,
- string asm_opr, InstrItinClass itin> {
+multiclass sse12_mov_hilo_packed_base<bits<8>opc, SDNode psnode, SDNode pdnode,
+ string base_opc, string asm_opr,
+ InstrItinClass itin> {
def PSrm : PI<opc, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
!strconcat(base_opc, "s", asm_opr),
- [(set RC:$dst,
- (psnode RC:$src1,
+ [(set VR128:$dst,
+ (psnode VR128:$src1,
(bc_v4f32 (v2f64 (scalar_to_vector (loadf64 addr:$src2))))))],
itin, SSEPackedSingle>, TB;
def PDrm : PI<opc, MRMSrcMem,
- (outs RC:$dst), (ins RC:$src1, f64mem:$src2),
+ (outs VR128:$dst), (ins VR128:$src1, f64mem:$src2),
!strconcat(base_opc, "d", asm_opr),
- [(set RC:$dst, (v2f64 (pdnode RC:$src1,
+ [(set VR128:$dst, (v2f64 (pdnode VR128:$src1,
(scalar_to_vector (loadf64 addr:$src2)))))],
itin, SSEPackedDouble>, TB, OpSize;
+
}
-let AddedComplexity = 20 in {
- defm VMOVL : sse12_mov_hilo_packed<0x12, VR128, X86Movlps, X86Movlpd, "movlp",
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- IIC_SSE_MOV_LH>, VEX_4V;
+multiclass sse12_mov_hilo_packed<bits<8>opc, SDNode psnode, SDNode pdnode,
+ string base_opc, InstrItinClass itin> {
+ defm V#NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ itin>, VEX_4V;
+
+let Constraints = "$src1 = $dst" in
+ defm NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc,
+ "\t{$src2, $dst|$dst, $src2}",
+ itin>;
}
-let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
- defm MOVL : sse12_mov_hilo_packed<0x12, VR128, X86Movlps, X86Movlpd, "movlp",
- "\t{$src2, $dst|$dst, $src2}",
- IIC_SSE_MOV_LH>;
+
+let AddedComplexity = 20 in {
+ defm MOVL : sse12_mov_hilo_packed<0x12, X86Movlps, X86Movlpd, "movlp",
+ IIC_SSE_MOV_LH>;
}
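The MOVL (and, further down, MOVH) definitions consolidated here merge a 64-bit memory operand into the low (movlps/movlpd) or high (movhps/movhpd) half of an XMM register while leaving the other half untouched. A rough C++ equivalent via the _mm_loadl_pi/_mm_loadh_pi intrinsics (illustrative; the intrinsics and the __m64 pointer cast are choices of this sketch, not of these definitions):

#include <cstdio>
#include <xmmintrin.h>   // _mm_loadl_pi, _mm_loadh_pi, _mm_setzero_ps

int main() {
  alignas(8) float lo[2] = {1.0f, 2.0f};
  alignas(8) float hi[2] = {3.0f, 4.0f};
  __m128 v = _mm_setzero_ps();
  v = _mm_loadl_pi(v, reinterpret_cast<const __m64 *>(lo)); // movlps: replaces elements 0-1
  v = _mm_loadh_pi(v, reinterpret_cast<const __m64 *>(hi)); // movhps: replaces elements 2-3
  float out[4];
  _mm_storeu_ps(out, v);
  std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // prints: 1 2 3 4
}
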
def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
@@ -1257,14 +1218,8 @@ let Predicates = [UseSSE2] in {
//===----------------------------------------------------------------------===//
let AddedComplexity = 20 in {
- defm VMOVH : sse12_mov_hilo_packed<0x16, VR128, X86Movlhps, X86Movlhpd, "movhp",
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- IIC_SSE_MOV_LH>, VEX_4V;
-}
-let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
- defm MOVH : sse12_mov_hilo_packed<0x16, VR128, X86Movlhps, X86Movlhpd, "movhp",
- "\t{$src2, $dst|$dst, $src2}",
- IIC_SSE_MOV_LH>;
+ defm MOVH : sse12_mov_hilo_packed<0x16, X86Movlhps, X86Movlhpd, "movhp",
+ IIC_SSE_MOV_LH>;
}
// v2f64 extract element 1 is always custom lowered to unpack high to low
@@ -1457,7 +1412,7 @@ defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
SSE_CVT_SS2SI_32>,
XS, VEX, VEX_LIG;
defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
- "cvttss2si{q}\t{$src, $dst|$dst, $src}",
+ "cvttss2si\t{$src, $dst|$dst, $src}",
SSE_CVT_SS2SI_64>,
XS, VEX, VEX_W, VEX_LIG;
defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
@@ -1465,26 +1420,43 @@ defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
SSE_CVT_SD2SI>,
XD, VEX, VEX_LIG;
defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
- "cvttsd2si{q}\t{$src, $dst|$dst, $src}",
+ "cvttsd2si\t{$src, $dst|$dst, $src}",
SSE_CVT_SD2SI>,
XD, VEX, VEX_W, VEX_LIG;
+def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
+ (VCVTTSS2SIrr GR32:$dst, FR32:$src), 0>;
+def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
+ (VCVTTSS2SIrm GR32:$dst, f32mem:$src), 0>;
+def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
+ (VCVTTSD2SIrr GR32:$dst, FR64:$src), 0>;
+def : InstAlias<"vcvttsd2si{l}\t{$src, $dst|$dst, $src}",
+ (VCVTTSD2SIrm GR32:$dst, f64mem:$src), 0>;
+def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
+ (VCVTTSS2SI64rr GR64:$dst, FR32:$src), 0>;
+def : InstAlias<"vcvttss2si{q}\t{$src, $dst|$dst, $src}",
+ (VCVTTSS2SI64rm GR64:$dst, f32mem:$src), 0>;
+def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
+ (VCVTTSD2SI64rr GR64:$dst, FR64:$src), 0>;
+def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
+ (VCVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>;
+
// The assembler can recognize rr 64-bit instructions by seeing a rxx
// register, but the same isn't true when only using memory operands,
// provide other assembly "l" and "q" forms to address this explicitly
// where appropriate to do so.
-defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss">,
+defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss{l}">,
XS, VEX_4V, VEX_LIG;
defm VCVTSI2SS64 : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss{q}">,
XS, VEX_4V, VEX_W, VEX_LIG;
-defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd">,
+defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}">,
XD, VEX_4V, VEX_LIG;
defm VCVTSI2SD64 : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}">,
XD, VEX_4V, VEX_W, VEX_LIG;
-def : InstAlias<"vcvtsi2sd{l}\t{$src, $src1, $dst|$dst, $src1, $src}",
- (VCVTSI2SDrr FR64:$dst, FR64:$src1, GR32:$src)>;
-def : InstAlias<"vcvtsi2sd{l}\t{$src, $src1, $dst|$dst, $src1, $src}",
+def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
+ (VCVTSI2SSrm FR64:$dst, FR64:$src1, i32mem:$src)>;
+def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
(VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src)>;
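The {l}/{q} suffixes being added here matter only for the memory forms: with a register source the operand width is already implied by %eax versus %rax, but a bare memory operand gives the assembler nothing to infer the width from. A hedged sketch using GCC/Clang extended inline assembly on an x86-64 target (the constraint syntax is a compiler extension and is only meant to show the two mnemonics side by side):

#include <cstdint>
#include <cstdio>

int main() {
  int32_t i32 = 7;
  int64_t i64 = 7;
  float a, b;
  // With a memory source the mnemonic suffix is the only size disambiguator.
  asm("cvtsi2ssl %1, %0" : "=x"(a) : "m"(i32));   // 32-bit memory operand
  asm("cvtsi2ssq %1, %0" : "=x"(b) : "m"(i64));   // 64-bit memory operand
  std::printf("%g %g\n", a, b);
}
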
let Predicates = [HasAVX] in {
@@ -1511,27 +1483,49 @@ defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
"cvttss2si\t{$src, $dst|$dst, $src}",
SSE_CVT_SS2SI_32>, XS;
defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
- "cvttss2si{q}\t{$src, $dst|$dst, $src}",
+ "cvttss2si\t{$src, $dst|$dst, $src}",
SSE_CVT_SS2SI_64>, XS, REX_W;
defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
"cvttsd2si\t{$src, $dst|$dst, $src}",
SSE_CVT_SD2SI>, XD;
defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
- "cvttsd2si{q}\t{$src, $dst|$dst, $src}",
+ "cvttsd2si\t{$src, $dst|$dst, $src}",
SSE_CVT_SD2SI>, XD, REX_W;
defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, sint_to_fp, i32mem, loadi32,
- "cvtsi2ss\t{$src, $dst|$dst, $src}",
+ "cvtsi2ss{l}\t{$src, $dst|$dst, $src}",
SSE_CVT_Scalar>, XS;
defm CVTSI2SS64 : sse12_cvt_s<0x2A, GR64, FR32, sint_to_fp, i64mem, loadi64,
"cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
SSE_CVT_Scalar>, XS, REX_W;
defm CVTSI2SD : sse12_cvt_s<0x2A, GR32, FR64, sint_to_fp, i32mem, loadi32,
- "cvtsi2sd\t{$src, $dst|$dst, $src}",
+ "cvtsi2sd{l}\t{$src, $dst|$dst, $src}",
SSE_CVT_Scalar>, XD;
defm CVTSI2SD64 : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, loadi64,
"cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
SSE_CVT_Scalar>, XD, REX_W;
+def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
+ (CVTTSS2SIrr GR32:$dst, FR32:$src), 0>;
+def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
+ (CVTTSS2SIrm GR32:$dst, f32mem:$src), 0>;
+def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
+ (CVTTSD2SIrr GR32:$dst, FR64:$src), 0>;
+def : InstAlias<"cvttsd2si{l}\t{$src, $dst|$dst, $src}",
+ (CVTTSD2SIrm GR32:$dst, f64mem:$src), 0>;
+def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
+ (CVTTSS2SI64rr GR64:$dst, FR32:$src), 0>;
+def : InstAlias<"cvttss2si{q}\t{$src, $dst|$dst, $src}",
+ (CVTTSS2SI64rm GR64:$dst, f32mem:$src), 0>;
+def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
+ (CVTTSD2SI64rr GR64:$dst, FR64:$src), 0>;
+def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
+ (CVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>;
+
+def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}",
+ (CVTSI2SSrm FR64:$dst, i32mem:$src)>;
+def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
+ (CVTSI2SDrm FR64:$dst, i32mem:$src)>;
+
// Conversion Instructions Intrinsics - Match intrinsics which expect MM
// and/or XMM operand(s).
@@ -1566,27 +1560,27 @@ multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
}
defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32,
- int_x86_sse2_cvtsd2si, sdmem, sse_load_f64, "cvtsd2si{l}",
+ int_x86_sse2_cvtsd2si, sdmem, sse_load_f64, "cvtsd2si",
SSE_CVT_SD2SI>, XD, VEX, VEX_LIG;
defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64,
- int_x86_sse2_cvtsd2si64, sdmem, sse_load_f64, "cvtsd2si{q}",
+ int_x86_sse2_cvtsd2si64, sdmem, sse_load_f64, "cvtsd2si",
SSE_CVT_SD2SI>, XD, VEX, VEX_W, VEX_LIG;
defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse2_cvtsd2si,
- sdmem, sse_load_f64, "cvtsd2si{l}", SSE_CVT_SD2SI>, XD;
+ sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD;
defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse2_cvtsd2si64,
- sdmem, sse_load_f64, "cvtsd2si{q}", SSE_CVT_SD2SI>, XD, REX_W;
+ sdmem, sse_load_f64, "cvtsd2si", SSE_CVT_SD2SI>, XD, REX_W;
defm Int_VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
- int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss",
+ int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss{l}",
SSE_CVT_Scalar, 0>, XS, VEX_4V;
defm Int_VCVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss{q}",
SSE_CVT_Scalar, 0>, XS, VEX_4V,
VEX_W;
defm Int_VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
- int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd",
+ int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd{l}",
SSE_CVT_Scalar, 0>, XD, VEX_4V;
defm Int_VCVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd{q}",
@@ -1596,13 +1590,13 @@ defm Int_VCVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
let Constraints = "$src1 = $dst" in {
defm Int_CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
int_x86_sse_cvtsi2ss, i32mem, loadi32,
- "cvtsi2ss", SSE_CVT_Scalar>, XS;
+ "cvtsi2ss{l}", SSE_CVT_Scalar>, XS;
defm Int_CVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
int_x86_sse_cvtsi642ss, i64mem, loadi64,
"cvtsi2ss{q}", SSE_CVT_Scalar>, XS, REX_W;
defm Int_CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
int_x86_sse2_cvtsi2sd, i32mem, loadi32,
- "cvtsi2sd", SSE_CVT_Scalar>, XD;
+ "cvtsi2sd{l}", SSE_CVT_Scalar>, XD;
defm Int_CVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
int_x86_sse2_cvtsi642sd, i64mem, loadi64,
"cvtsi2sd{q}", SSE_CVT_Scalar>, XD, REX_W;
@@ -1616,40 +1610,40 @@ defm Int_VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
SSE_CVT_SS2SI_32>, XS, VEX;
defm Int_VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
- "cvttss2si{q}", SSE_CVT_SS2SI_64>,
+ "cvttss2si", SSE_CVT_SS2SI_64>,
XS, VEX, VEX_W;
defm Int_VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
sdmem, sse_load_f64, "cvttsd2si",
SSE_CVT_SD2SI>, XD, VEX;
defm Int_VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
- "cvttsd2si{q}", SSE_CVT_SD2SI>,
+ "cvttsd2si", SSE_CVT_SD2SI>,
XD, VEX, VEX_W;
defm Int_CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
ssmem, sse_load_f32, "cvttss2si",
SSE_CVT_SS2SI_32>, XS;
defm Int_CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
- "cvttss2si{q}", SSE_CVT_SS2SI_64>, XS, REX_W;
+ "cvttss2si", SSE_CVT_SS2SI_64>, XS, REX_W;
defm Int_CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
sdmem, sse_load_f64, "cvttsd2si",
SSE_CVT_SD2SI>, XD;
defm Int_CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
- "cvttsd2si{q}", SSE_CVT_SD2SI>, XD, REX_W;
+ "cvttsd2si", SSE_CVT_SD2SI>, XD, REX_W;
defm VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
- ssmem, sse_load_f32, "cvtss2si{l}",
+ ssmem, sse_load_f32, "cvtss2si",
SSE_CVT_SS2SI_32>, XS, VEX, VEX_LIG;
defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
- ssmem, sse_load_f32, "cvtss2si{q}",
+ ssmem, sse_load_f32, "cvtss2si",
SSE_CVT_SS2SI_64>, XS, VEX, VEX_W, VEX_LIG;
defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, int_x86_sse_cvtss2si,
- ssmem, sse_load_f32, "cvtss2si{l}",
+ ssmem, sse_load_f32, "cvtss2si",
SSE_CVT_SS2SI_32>, XS;
defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
- ssmem, sse_load_f32, "cvtss2si{q}",
+ ssmem, sse_load_f32, "cvtss2si",
SSE_CVT_SS2SI_64>, XS, REX_W;
defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, i128mem,
@@ -1666,6 +1660,40 @@ defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, i128mem,
SSEPackedSingle, SSE_CVT_PS>,
TB, Requires<[UseSSE2]>;
+def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
+ (VCVTSS2SIrr GR32:$dst, VR128:$src), 0>;
+def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
+ (VCVTSS2SIrm GR32:$dst, ssmem:$src), 0>;
+def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
+ (VCVTSD2SIrr GR32:$dst, VR128:$src), 0>;
+def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
+ (VCVTSD2SIrm GR32:$dst, sdmem:$src), 0>;
+def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
+ (VCVTSS2SI64rr GR64:$dst, VR128:$src), 0>;
+def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
+ (VCVTSS2SI64rm GR64:$dst, ssmem:$src), 0>;
+def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
+ (VCVTSD2SI64rr GR64:$dst, VR128:$src), 0>;
+def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
+ (VCVTSD2SI64rm GR64:$dst, sdmem:$src), 0>;
+
+def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
+ (CVTSS2SIrr GR32:$dst, VR128:$src), 0>;
+def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
+ (CVTSS2SIrm GR32:$dst, ssmem:$src), 0>;
+def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
+ (CVTSD2SIrr GR32:$dst, VR128:$src), 0>;
+def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
+ (CVTSD2SIrm GR32:$dst, sdmem:$src), 0>;
+def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
+ (CVTSS2SI64rr GR64:$dst, VR128:$src), 0>;
+def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
+ (CVTSS2SI64rm GR64:$dst, ssmem:$src), 0>;
+def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
+ (CVTSD2SI64rr GR64:$dst, VR128:$src), 0>;
+def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
+ (CVTSD2SI64rm GR64:$dst, sdmem:$src)>;
+
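Both families aliased above are covered here: the 0x2D forms (cvtss2si/cvtsd2si) convert using the current MXCSR rounding mode, while the 0x2C forms (cvttss2si/cvttsd2si) always truncate toward zero. In C++ terms (illustrative, using the usual <emmintrin.h> intrinsics):

#include <cstdio>
#include <emmintrin.h>   // _mm_set_sd, _mm_cvtsd_si32, _mm_cvttsd_si32

int main() {
  __m128d v = _mm_set_sd(2.7);
  // cvtsd2si: rounds according to MXCSR (round-to-nearest by default).
  std::printf("cvtsd2si  -> %d\n", _mm_cvtsd_si32(v));   // 3
  // cvttsd2si: always truncates toward zero, independent of MXCSR.
  std::printf("cvttsd2si -> %d\n", _mm_cvttsd_si32(v));  // 2
}
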
/// SSE 2 Only
// Convert scalar double to scalar single
@@ -2657,10 +2685,8 @@ let ExeDomain = SSEPackedInt in { // SSE integer instructions
/// PDI_binop_rm - Simple SSE2 binary operator.
multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
- X86MemOperand x86memop,
- OpndItins itins,
- bit IsCommutable = 0,
- bit Is2Addr = 1> {
+ X86MemOperand x86memop, OpndItins itins,
+ bit IsCommutable, bit Is2Addr> {
let isCommutable = IsCommutable in
def rr : PDI<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2),
@@ -2679,40 +2705,30 @@ multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
}
} // ExeDomain = SSEPackedInt
-// These are ordered here for pattern ordering requirements with the fp versions
+multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
+ ValueType OpVT128, ValueType OpVT256,
+ OpndItins itins, bit IsCommutable = 0> {
+let Predicates = [HasAVX] in
+ defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
+ VR128, memopv2i64, i128mem, itins, IsCommutable, 0>, VEX_4V;
-let Predicates = [HasAVX] in {
-defm VPAND : PDI_binop_rm<0xDB, "vpand", and, v2i64, VR128, memopv2i64,
- i128mem, SSE_BIT_ITINS_P, 1, 0>, VEX_4V;
-defm VPOR : PDI_binop_rm<0xEB, "vpor" , or, v2i64, VR128, memopv2i64,
- i128mem, SSE_BIT_ITINS_P, 1, 0>, VEX_4V;
-defm VPXOR : PDI_binop_rm<0xEF, "vpxor", xor, v2i64, VR128, memopv2i64,
- i128mem, SSE_BIT_ITINS_P, 1, 0>, VEX_4V;
-defm VPANDN : PDI_binop_rm<0xDF, "vpandn", X86andnp, v2i64, VR128, memopv2i64,
- i128mem, SSE_BIT_ITINS_P, 0, 0>, VEX_4V;
+let Constraints = "$src1 = $dst" in
+ defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
+ memopv2i64, i128mem, itins, IsCommutable, 1>;
+
+let Predicates = [HasAVX2] in
+ defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
+ OpVT256, VR256, memopv4i64, i256mem, itins,
+ IsCommutable, 0>, VEX_4V, VEX_L;
}
-let Constraints = "$src1 = $dst" in {
-defm PAND : PDI_binop_rm<0xDB, "pand", and, v2i64, VR128, memopv2i64,
- i128mem, SSE_BIT_ITINS_P, 1>;
-defm POR : PDI_binop_rm<0xEB, "por" , or, v2i64, VR128, memopv2i64,
- i128mem, SSE_BIT_ITINS_P, 1>;
-defm PXOR : PDI_binop_rm<0xEF, "pxor", xor, v2i64, VR128, memopv2i64,
- i128mem, SSE_BIT_ITINS_P, 1>;
-defm PANDN : PDI_binop_rm<0xDF, "pandn", X86andnp, v2i64, VR128, memopv2i64,
- i128mem, SSE_BIT_ITINS_P, 0>;
-} // Constraints = "$src1 = $dst"
+// These are ordered here for pattern ordering requirements with the fp versions
-let Predicates = [HasAVX2] in {
-defm VPANDY : PDI_binop_rm<0xDB, "vpand", and, v4i64, VR256, memopv4i64,
- i256mem, SSE_BIT_ITINS_P, 1, 0>, VEX_4V, VEX_L;
-defm VPORY : PDI_binop_rm<0xEB, "vpor", or, v4i64, VR256, memopv4i64,
- i256mem, SSE_BIT_ITINS_P, 1, 0>, VEX_4V, VEX_L;
-defm VPXORY : PDI_binop_rm<0xEF, "vpxor", xor, v4i64, VR256, memopv4i64,
- i256mem, SSE_BIT_ITINS_P, 1, 0>, VEX_4V, VEX_L;
-defm VPANDNY : PDI_binop_rm<0xDF, "vpandn", X86andnp, v4i64, VR256, memopv4i64,
- i256mem, SSE_BIT_ITINS_P, 0, 0>, VEX_4V, VEX_L;
-}
+defm PAND : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64, SSE_BIT_ITINS_P, 1>;
+defm POR : PDI_binop_all<0xEB, "por", or, v2i64, v4i64, SSE_BIT_ITINS_P, 1>;
+defm PXOR : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64, SSE_BIT_ITINS_P, 1>;
+defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64,
+ SSE_BIT_ITINS_P, 0>;
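Each PDI_binop_all entry above now expands to the 128-bit SSE2 instruction, its VEX-encoded AVX twin, and the 256-bit AVX2 form. At the source level the three are reachable through the familiar integer intrinsics; a sketch (assumes an AVX2-capable build, e.g. -mavx2; the intrinsic names are not taken from this patch):

#include <cstdio>
#include <immintrin.h>

int main() {
  __m128i a128 = _mm_set1_epi32(0x0ff0), b128 = _mm_set1_epi32(0x00ff);
  __m128i r128 = _mm_and_si128(a128, b128);        // pand / vpand (128-bit)
  __m256i a256 = _mm256_set1_epi32(0x0ff0), b256 = _mm256_set1_epi32(0x00ff);
  __m256i r256 = _mm256_and_si256(a256, b256);     // vpand (256-bit, AVX2)
  std::printf("%x %x\n", _mm_cvtsi128_si32(r128),
              _mm256_extract_epi32(r256, 0));      // prints: f0 f0
}
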
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Logical Instructions
@@ -2757,6 +2773,20 @@ let neverHasSideEffects = 1, Pattern = []<dag>, isCommutable = 0 in
///
multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
SDNode OpNode> {
+ defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle,
+ !strconcat(OpcodeStr, "ps"), f256mem,
+ [(set VR256:$dst, (v4i64 (OpNode VR256:$src1, VR256:$src2)))],
+ [(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)),
+ (memopv4i64 addr:$src2)))], 0>, TB, VEX_4V, VEX_L;
+
+ defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
+ !strconcat(OpcodeStr, "pd"), f256mem,
+ [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)),
+ (bc_v4i64 (v4f64 VR256:$src2))))],
+ [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)),
+ (memopv4i64 addr:$src2)))], 0>,
+ TB, OpSize, VEX_4V, VEX_L;
+
// In AVX no need to add a pattern for 128-bit logical rr ps, because they
// are all promoted to v2i64, and the patterns are covered by the int
// version. This is needed in SSE only, because v2i64 isn't supported on
@@ -2764,7 +2794,7 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
!strconcat(OpcodeStr, "ps"), f128mem, [],
[(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
- (memopv2i64 addr:$src2)))], 0, 1>, TB, VEX_4V;
+ (memopv2i64 addr:$src2)))], 0>, TB, VEX_4V;
defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
!strconcat(OpcodeStr, "pd"), f128mem,
@@ -2773,6 +2803,7 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
[(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
(memopv2i64 addr:$src2)))], 0>,
TB, OpSize, VEX_4V;
+
let Constraints = "$src1 = $dst" in {
defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
!strconcat(OpcodeStr, "ps"), f128mem,
@@ -2789,31 +2820,6 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
}
}
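sse12_fp_packed_logical emits the same bitwise computation in the two floating-point execution domains (andps/andpd and friends) next to the integer pand handled earlier; the result bits are identical, only the domain the CPU schedules them in differs. The classic use of the FP-domain AND is masking off sign bits, sketched below (illustrative; the cast and the 0x7fffffff mask are choices of the example, not of these patterns):

#include <cstdio>
#include <emmintrin.h>   // _mm_set_ps, _mm_and_ps, _mm_castsi128_ps

int main() {
  __m128 x    = _mm_set_ps(-1.5f, 2.0f, -3.25f, 4.0f);
  __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff));
  __m128 ax   = _mm_and_ps(x, mask);               // andps / vandps: clears sign bits
  float out[4];
  _mm_storeu_ps(out, ax);
  std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // 4 3.25 2 1.5
}
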
-/// sse12_fp_packed_logical_y - AVX 256-bit SSE 1 & 2 logical ops forms
-///
-multiclass sse12_fp_packed_logical_y<bits<8> opc, string OpcodeStr,
- SDNode OpNode> {
- defm PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle,
- !strconcat(OpcodeStr, "ps"), f256mem,
- [(set VR256:$dst, (v4i64 (OpNode VR256:$src1, VR256:$src2)))],
- [(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)),
- (memopv4i64 addr:$src2)))], 0>, TB, VEX_4V, VEX_L;
-
- defm PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
- !strconcat(OpcodeStr, "pd"), f256mem,
- [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)),
- (bc_v4i64 (v4f64 VR256:$src2))))],
- [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)),
- (memopv4i64 addr:$src2)))], 0>,
- TB, OpSize, VEX_4V, VEX_L;
-}
-
-// AVX 256-bit packed logical ops forms
-defm VAND : sse12_fp_packed_logical_y<0x54, "and", and>;
-defm VOR : sse12_fp_packed_logical_y<0x56, "or", or>;
-defm VXOR : sse12_fp_packed_logical_y<0x57, "xor", xor>;
-defm VANDN : sse12_fp_packed_logical_y<0x55, "andn", X86andnp>;
-
defm AND : sse12_fp_packed_logical<0x54, "and", and>;
defm OR : sse12_fp_packed_logical<0x56, "or", or>;
defm XOR : sse12_fp_packed_logical<0x57, "xor", xor>;
@@ -2848,26 +2854,32 @@ multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
itins.d, Is2Addr>, XD;
}
-multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SizeItins itins,
- bit Is2Addr = 1> {
+multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SizeItins itins> {
+let Predicates = [HasAVX] in {
+ defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
+ VR128, v4f32, f128mem, memopv4f32,
+ SSEPackedSingle, itins.s, 0>, TB, VEX_4V;
+ defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
+ VR128, v2f64, f128mem, memopv2f64,
+ SSEPackedDouble, itins.d, 0>, TB, OpSize, VEX_4V;
+
+ defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"),
+ OpNode, VR256, v8f32, f256mem, memopv8f32,
+ SSEPackedSingle, itins.s, 0>, TB, VEX_4V, VEX_L;
+ defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"),
+ OpNode, VR256, v4f64, f256mem, memopv4f64,
+ SSEPackedDouble, itins.d, 0>, TB, OpSize, VEX_4V, VEX_L;
+}
+
+let Constraints = "$src1 = $dst" in {
defm PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR128,
- v4f32, f128mem, memopv4f32, SSEPackedSingle, itins.s, Is2Addr>,
- TB;
+ v4f32, f128mem, memopv4f32, SSEPackedSingle,
+ itins.s, 1>, TB;
defm PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR128,
- v2f64, f128mem, memopv2f64, SSEPackedDouble, itins.d, Is2Addr>,
- TB, OpSize;
+ v2f64, f128mem, memopv2f64, SSEPackedDouble,
+ itins.d, 1>, TB, OpSize;
}
-
-multiclass basic_sse12_fp_binop_p_y<bits<8> opc, string OpcodeStr,
- SDNode OpNode,
- SizeItins itins> {
- defm PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR256,
- v8f32, f256mem, memopv8f32, SSEPackedSingle, itins.s, 0>,
- TB, VEX_L;
- defm PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR256,
- v4f64, f256mem, memopv4f64, SSEPackedDouble, itins.d, 0>,
- TB, OpSize, VEX_L;
}
multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
@@ -2881,116 +2893,69 @@ multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
itins.d, Is2Addr>, XD;
}
-multiclass basic_sse12_fp_binop_p_int<bits<8> opc, string OpcodeStr,
- SizeItins itins,
- bit Is2Addr = 1> {
- defm PS : sse12_fp_packed_int<opc, OpcodeStr, VR128,
- !strconcat(OpcodeStr, "ps"), "sse", "_ps", f128mem, memopv4f32,
- SSEPackedSingle, itins.s, Is2Addr>,
- TB;
-
- defm PD : sse12_fp_packed_int<opc, OpcodeStr, VR128,
- !strconcat(OpcodeStr, "pd"), "sse2", "_pd", f128mem, memopv2f64,
- SSEPackedDouble, itins.d, Is2Addr>,
- TB, OpSize;
+// Binary Arithmetic instructions
+defm ADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SSE_ALU_ITINS_P>;
+defm MUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SSE_MUL_ITINS_P>;
+let isCommutable = 0 in {
+ defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SSE_ALU_ITINS_P>;
+ defm DIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SSE_DIV_ITINS_P>;
+ defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SSE_ALU_ITINS_P>;
+ defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SSE_ALU_ITINS_P>;
}
-multiclass basic_sse12_fp_binop_p_y_int<bits<8> opc, string OpcodeStr,
- SizeItins itins> {
- defm PSY : sse12_fp_packed_int<opc, OpcodeStr, VR256,
- !strconcat(OpcodeStr, "ps"), "avx", "_ps_256", f256mem, memopv8f32,
- SSEPackedSingle, itins.s, 0>, TB, VEX_L;
-
- defm PDY : sse12_fp_packed_int<opc, OpcodeStr, VR256,
- !strconcat(OpcodeStr, "pd"), "avx", "_pd_256", f256mem, memopv4f64,
- SSEPackedDouble, itins.d, 0>, TB, OpSize, VEX_L;
+let isCodeGenOnly = 1 in {
+ defm MAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_P>;
+ defm MINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SSE_ALU_ITINS_P>;
}
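The ordinary MAX/MIN defms are deliberately left non-commutable: maxss/maxps (and the min forms) return the second source operand whenever either input is NaN, and also when distinguishing +0.0 from -0.0, so swapping operands can change the result. The isCodeGenOnly MAXC/MINC variants select for X86fmaxc/X86fminc, which the backend uses only when operand order is known not to matter. A small demonstration of the asymmetry (illustrative):

#include <cmath>
#include <cstdio>
#include <xmmintrin.h>

int main() {
  __m128 nan = _mm_set_ss(std::nanf(""));
  __m128 one = _mm_set_ss(1.0f);
  // maxss behaves like (src1 > src2) ? src1 : src2, so a NaN input always
  // yields the second operand.
  float a = _mm_cvtss_f32(_mm_max_ss(nan, one));  // 1.0 (second operand returned)
  float b = _mm_cvtss_f32(_mm_max_ss(one, nan));  // NaN (second operand returned)
  std::printf("%g %g\n", a, b);
}
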
-// Binary Arithmetic instructions
defm VADD : basic_sse12_fp_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S, 0>,
basic_sse12_fp_binop_s_int<0x58, "add", SSE_ALU_ITINS_S, 0>,
VEX_4V, VEX_LIG;
-defm VADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SSE_ALU_ITINS_P, 0>,
- basic_sse12_fp_binop_p_y<0x58, "add", fadd, SSE_ALU_ITINS_P>,
- VEX_4V;
defm VMUL : basic_sse12_fp_binop_s<0x59, "mul", fmul, SSE_MUL_ITINS_S, 0>,
basic_sse12_fp_binop_s_int<0x59, "mul", SSE_MUL_ITINS_S, 0>,
VEX_4V, VEX_LIG;
-defm VMUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SSE_MUL_ITINS_P, 0>,
- basic_sse12_fp_binop_p_y<0x59, "mul", fmul, SSE_MUL_ITINS_P>,
- VEX_4V;
let isCommutable = 0 in {
defm VSUB : basic_sse12_fp_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S, 0>,
basic_sse12_fp_binop_s_int<0x5C, "sub", SSE_ALU_ITINS_S, 0>,
VEX_4V, VEX_LIG;
- defm VSUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SSE_ALU_ITINS_P, 0>,
- basic_sse12_fp_binop_p_y<0x5C, "sub", fsub, SSE_ALU_ITINS_P>,
- VEX_4V;
defm VDIV : basic_sse12_fp_binop_s<0x5E, "div", fdiv, SSE_DIV_ITINS_S, 0>,
basic_sse12_fp_binop_s_int<0x5E, "div", SSE_DIV_ITINS_S, 0>,
VEX_4V, VEX_LIG;
- defm VDIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SSE_ALU_ITINS_P, 0>,
- basic_sse12_fp_binop_p_y<0x5E, "div", fdiv, SSE_DIV_ITINS_P>,
- VEX_4V;
defm VMAX : basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S, 0>,
basic_sse12_fp_binop_s_int<0x5F, "max", SSE_ALU_ITINS_S, 0>,
VEX_4V, VEX_LIG;
- defm VMAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SSE_ALU_ITINS_P, 0>,
- basic_sse12_fp_binop_p_int<0x5F, "max", SSE_ALU_ITINS_P, 0>,
- basic_sse12_fp_binop_p_y<0x5F, "max", X86fmax, SSE_ALU_ITINS_P>,
- basic_sse12_fp_binop_p_y_int<0x5F, "max", SSE_ALU_ITINS_P>,
- VEX_4V;
defm VMIN : basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S, 0>,
basic_sse12_fp_binop_s_int<0x5D, "min", SSE_ALU_ITINS_S, 0>,
VEX_4V, VEX_LIG;
- defm VMIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SSE_ALU_ITINS_P, 0>,
- basic_sse12_fp_binop_p_int<0x5D, "min", SSE_ALU_ITINS_P, 0>,
- basic_sse12_fp_binop_p_y_int<0x5D, "min", SSE_ALU_ITINS_P>,
- basic_sse12_fp_binop_p_y<0x5D, "min", X86fmin, SSE_ALU_ITINS_P>,
- VEX_4V;
}
let Constraints = "$src1 = $dst" in {
defm ADD : basic_sse12_fp_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S>,
- basic_sse12_fp_binop_p<0x58, "add", fadd, SSE_ALU_ITINS_P>,
basic_sse12_fp_binop_s_int<0x58, "add", SSE_ALU_ITINS_S>;
defm MUL : basic_sse12_fp_binop_s<0x59, "mul", fmul, SSE_MUL_ITINS_S>,
- basic_sse12_fp_binop_p<0x59, "mul", fmul, SSE_MUL_ITINS_P>,
basic_sse12_fp_binop_s_int<0x59, "mul", SSE_MUL_ITINS_S>;
let isCommutable = 0 in {
defm SUB : basic_sse12_fp_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S>,
- basic_sse12_fp_binop_p<0x5C, "sub", fsub, SSE_ALU_ITINS_P>,
basic_sse12_fp_binop_s_int<0x5C, "sub", SSE_ALU_ITINS_S>;
defm DIV : basic_sse12_fp_binop_s<0x5E, "div", fdiv, SSE_DIV_ITINS_S>,
- basic_sse12_fp_binop_p<0x5E, "div", fdiv, SSE_DIV_ITINS_P>,
basic_sse12_fp_binop_s_int<0x5E, "div", SSE_DIV_ITINS_S>;
defm MAX : basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S>,
- basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SSE_ALU_ITINS_P>,
- basic_sse12_fp_binop_s_int<0x5F, "max", SSE_ALU_ITINS_S>,
- basic_sse12_fp_binop_p_int<0x5F, "max", SSE_ALU_ITINS_P>;
+ basic_sse12_fp_binop_s_int<0x5F, "max", SSE_ALU_ITINS_S>;
defm MIN : basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S>,
- basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SSE_ALU_ITINS_P>,
- basic_sse12_fp_binop_s_int<0x5D, "min", SSE_ALU_ITINS_S>,
- basic_sse12_fp_binop_p_int<0x5D, "min", SSE_ALU_ITINS_P>;
+ basic_sse12_fp_binop_s_int<0x5D, "min", SSE_ALU_ITINS_S>;
}
}
let isCodeGenOnly = 1 in {
defm VMAXC: basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_S, 0>,
VEX_4V, VEX_LIG;
- defm VMAXC: basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_P, 0>,
- basic_sse12_fp_binop_p_y<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_P>, VEX_4V;
defm VMINC: basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SSE_ALU_ITINS_S, 0>,
VEX_4V, VEX_LIG;
- defm VMINC: basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SSE_ALU_ITINS_P, 0>,
- basic_sse12_fp_binop_p_y<0x5D, "min", X86fminc, SSE_ALU_ITINS_P>, VEX_4V;
let Constraints = "$src1 = $dst" in {
- defm MAXC: basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_S>,
- basic_sse12_fp_binop_p<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_P>;
- defm MINC: basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SSE_ALU_ITINS_S>,
- basic_sse12_fp_binop_p<0x5D, "min", X86fminc, SSE_ALU_ITINS_P>;
+ defm MAXC: basic_sse12_fp_binop_s<0x5F, "max", X86fmaxc, SSE_ALU_ITINS_S>;
+ defm MINC: basic_sse12_fp_binop_s<0x5D, "min", X86fminc, SSE_ALU_ITINS_S>;
}
}
@@ -3021,6 +2986,26 @@ def SSE_RCPS : OpndItins<
/// sse1_fp_unop_s - SSE1 unops in scalar form.
multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr,
SDNode OpNode, Intrinsic F32Int, OpndItins itins> {
+let Predicates = [HasAVX], hasSideEffects = 0 in {
+ def V#NAME#SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst),
+ (ins FR32:$src1, FR32:$src2),
+ !strconcat("v", OpcodeStr,
+ "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, VEX_4V, VEX_LIG;
+ let mayLoad = 1 in {
+ def V#NAME#SSm : SSI<opc, MRMSrcMem, (outs FR32:$dst),
+ (ins FR32:$src1,f32mem:$src2),
+ !strconcat("v", OpcodeStr,
+ "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, VEX_4V, VEX_LIG;
+ def V#NAME#SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, ssmem:$src2),
+ !strconcat("v", OpcodeStr,
+ "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, VEX_4V, VEX_LIG;
+ }
+}
+
def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
!strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
[(set FR32:$dst, (OpNode FR32:$src))]>;
@@ -3040,49 +3025,115 @@ multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr,
[(set VR128:$dst, (F32Int sse_load_f32:$src))], itins.rm>;
}
-/// sse1_fp_unop_s_avx - AVX SSE1 unops in scalar form.
-multiclass sse1_fp_unop_s_avx<bits<8> opc, string OpcodeStr> {
- def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src1, FR32:$src2),
- !strconcat(OpcodeStr,
- "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
+/// sse1_fp_unop_rw - SSE1 unops where vector form has a read-write operand.
+multiclass sse1_fp_unop_rw<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ OpndItins itins> {
+let Predicates = [HasAVX], hasSideEffects = 0 in {
+ def V#NAME#SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst),
+ (ins FR32:$src1, FR32:$src2),
+ !strconcat("v", OpcodeStr,
+ "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, VEX_4V, VEX_LIG;
let mayLoad = 1 in {
- def SSm : SSI<opc, MRMSrcMem, (outs FR32:$dst), (ins FR32:$src1,f32mem:$src2),
- !strconcat(OpcodeStr,
- "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
- def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, ssmem:$src2),
- !strconcat(OpcodeStr,
- "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
+ def V#NAME#SSm : SSI<opc, MRMSrcMem, (outs FR32:$dst),
+ (ins FR32:$src1,f32mem:$src2),
+ !strconcat("v", OpcodeStr,
+ "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, VEX_4V, VEX_LIG;
+ def V#NAME#SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, ssmem:$src2),
+ !strconcat("v", OpcodeStr,
+ "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, VEX_4V, VEX_LIG;
+ }
+}
+
+ def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
+ !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
+ [(set FR32:$dst, (OpNode FR32:$src))]>;
+ // For scalar unary operations, fold a load into the operation
+ // only in OptForSize mode. It eliminates an instruction, but it also
+ // eliminates a whole-register clobber (the load), so it introduces a
+ // partial register update condition.
+ def SSm : I<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src),
+ !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
+ [(set FR32:$dst, (OpNode (load addr:$src)))], itins.rm>, XS,
+ Requires<[UseSSE1, OptForSize]>;
+ let Constraints = "$src1 = $dst" in {
+ def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2),
+ !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
+ [], itins.rr>;
+ let mayLoad = 1, hasSideEffects = 0 in
+ def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, ssmem:$src2),
+ !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
+ [], itins.rm>;
}
}
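The "read-write operand" in the multiclass name refers to the vector form of the scalar op: rsqrtss/rcpss replace only element 0 and pass elements 1-3 of the destination through, which is why the _Int patterns above tie $src1 to $dst. An illustrative sketch with the corresponding intrinsic (not part of these definitions):

#include <cstdio>
#include <xmmintrin.h>

int main() {
  __m128 v = _mm_set_ps(40.0f, 30.0f, 20.0f, 4.0f);  // element 0 is 4.0
  __m128 r = _mm_rsqrt_ss(v);   // rsqrtss: element 0 ~= 1/sqrt(4) = 0.5, elements 1-3 copied
  float out[4];
  _mm_storeu_ps(out, r);
  std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);  // ~0.5 20 30 40
}
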
/// sse1_fp_unop_p - SSE1 unops in packed form.
multiclass sse1_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpndItins itins> {
+let Predicates = [HasAVX] in {
+ def V#NAME#PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat("v", OpcodeStr,
+ "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))],
+ itins.rr>, VEX;
+ def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ !strconcat("v", OpcodeStr,
+ "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))],
+ itins.rm>, VEX;
+ def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+ !strconcat("v", OpcodeStr,
+ "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))],
+ itins.rr>, VEX, VEX_L;
+ def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
+ !strconcat("v", OpcodeStr,
+ "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst, (OpNode (memopv8f32 addr:$src)))],
+ itins.rm>, VEX, VEX_L;
+}
+
def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))], itins.rr>;
+ !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))], itins.rr>;
def PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
!strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst, (OpNode (memopv4f32 addr:$src)))], itins.rm>;
}
-/// sse1_fp_unop_p_y - AVX 256-bit SSE1 unops in packed form.
-multiclass sse1_fp_unop_p_y<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins> {
- def PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
- !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))],
- itins.rr>, VEX_L;
- def PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
- !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst, (OpNode (memopv8f32 addr:$src)))],
- itins.rm>, VEX_L;
-}
-
/// sse1_fp_unop_p_int - SSE1 intrinsics unops in packed forms.
multiclass sse1_fp_unop_p_int<bits<8> opc, string OpcodeStr,
- Intrinsic V4F32Int, OpndItins itins> {
+ Intrinsic V4F32Int, Intrinsic V8F32Int,
+ OpndItins itins> {
+let Predicates = [HasAVX] in {
+ def V#NAME#PSr_Int : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat("v", OpcodeStr,
+ "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (V4F32Int VR128:$src))],
+ itins.rr>, VEX;
+ def V#NAME#PSm_Int : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ !strconcat("v", OpcodeStr,
+ "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (V4F32Int (memopv4f32 addr:$src)))],
+ itins.rm>, VEX;
+ def V#NAME#PSYr_Int : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+ !strconcat("v", OpcodeStr,
+ "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst, (V8F32Int VR256:$src))],
+ itins.rr>, VEX, VEX_L;
+ def V#NAME#PSYm_Int : PSI<opc, MRMSrcMem, (outs VR256:$dst),
+ (ins f256mem:$src),
+ !strconcat("v", OpcodeStr,
+ "ps\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst, (V8F32Int (memopv8f32 addr:$src)))],
+ itins.rm>, VEX, VEX_L;
+}
+
def PSr_Int : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst, (V4F32Int VR128:$src))],
@@ -3093,22 +3144,29 @@ multiclass sse1_fp_unop_p_int<bits<8> opc, string OpcodeStr,
itins.rm>;
}
-/// sse1_fp_unop_p_y_int - AVX 256-bit intrinsics unops in packed forms.
-multiclass sse1_fp_unop_p_y_int<bits<8> opc, string OpcodeStr,
- Intrinsic V4F32Int, OpndItins itins> {
- def PSYr_Int : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
- !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst, (V4F32Int VR256:$src))],
- itins.rr>, VEX_L;
- def PSYm_Int : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
- !strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst, (V4F32Int (memopv8f32 addr:$src)))],
- itins.rm>, VEX_L;
-}
-
/// sse2_fp_unop_s - SSE2 unops in scalar form.
multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr,
SDNode OpNode, Intrinsic F64Int, OpndItins itins> {
+let Predicates = [HasAVX], hasSideEffects = 0 in {
+ def V#NAME#SDr : SDI<opc, MRMSrcReg, (outs FR64:$dst),
+ (ins FR64:$src1, FR64:$src2),
+ !strconcat("v", OpcodeStr,
+ "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, VEX_4V, VEX_LIG;
+ let mayLoad = 1 in {
+ def V#NAME#SDm : SDI<opc, MRMSrcMem, (outs FR64:$dst),
+ (ins FR64:$src1,f64mem:$src2),
+ !strconcat("v", OpcodeStr,
+ "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, VEX_4V, VEX_LIG;
+ def V#NAME#SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, sdmem:$src2),
+ !strconcat("v", OpcodeStr,
+ "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ []>, VEX_4V, VEX_LIG;
+ }
+}
+
def SDr : SDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src),
!strconcat(OpcodeStr, "sd\t{$src, $dst|$dst, $src}"),
[(set FR64:$dst, (OpNode FR64:$src))], itins.rr>;
@@ -3125,26 +3183,32 @@ multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr,
[(set VR128:$dst, (F64Int sse_load_f64:$src))], itins.rm>;
}
-/// sse2_fp_unop_s_avx - AVX SSE2 unops in scalar form.
-let hasSideEffects = 0 in
-multiclass sse2_fp_unop_s_avx<bits<8> opc, string OpcodeStr> {
- def SDr : SDI<opc, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src1, FR64:$src2),
- !strconcat(OpcodeStr,
- "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
- let mayLoad = 1 in {
- def SDm : SDI<opc, MRMSrcMem, (outs FR64:$dst), (ins FR64:$src1,f64mem:$src2),
- !strconcat(OpcodeStr,
- "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
- def SDm_Int : SDI<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, sdmem:$src2),
- !strconcat(OpcodeStr,
- "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>;
- }
-}
-
/// sse2_fp_unop_p - SSE2 unops in vector forms.
multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr,
SDNode OpNode, OpndItins itins> {
+let Predicates = [HasAVX] in {
+ def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
+ !strconcat("v", OpcodeStr,
+ "pd\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))],
+ itins.rr>, VEX;
+ def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
+ !strconcat("v", OpcodeStr,
+ "pd\t{$src, $dst|$dst, $src}"),
+ [(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))],
+ itins.rm>, VEX;
+ def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
+ !strconcat("v", OpcodeStr,
+ "pd\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))],
+ itins.rr>, VEX, VEX_L;
+ def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
+ !strconcat("v", OpcodeStr,
+ "pd\t{$src, $dst|$dst, $src}"),
+ [(set VR256:$dst, (OpNode (memopv4f64 addr:$src)))],
+ itins.rm>, VEX, VEX_L;
+}
+
def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst, (v2f64 (OpNode VR128:$src)))], itins.rr>;
@@ -3153,82 +3217,24 @@ multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr,
[(set VR128:$dst, (OpNode (memopv2f64 addr:$src)))], itins.rm>;
}
-/// sse2_fp_unop_p_y - AVX SSE2 256-bit unops in vector forms.
-multiclass sse2_fp_unop_p_y<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins> {
- def PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
- !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))],
- itins.rr>, VEX_L;
- def PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
- !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst, (OpNode (memopv4f64 addr:$src)))],
- itins.rm>, VEX_L;
-}
-
-/// sse2_fp_unop_p_int - SSE2 intrinsic unops in vector forms.
-multiclass sse2_fp_unop_p_int<bits<8> opc, string OpcodeStr,
- Intrinsic V2F64Int, OpndItins itins> {
- def PDr_Int : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
- !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (V2F64Int VR128:$src))],
- itins.rr>;
- def PDm_Int : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
- !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (V2F64Int (memopv2f64 addr:$src)))],
- itins.rm>;
-}
-
-/// sse2_fp_unop_p_y_int - AVX 256-bit intrinsic unops in vector forms.
-multiclass sse2_fp_unop_p_y_int<bits<8> opc, string OpcodeStr,
- Intrinsic V2F64Int, OpndItins itins> {
- def PDYr_Int : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
- !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst, (V2F64Int VR256:$src))],
- itins.rr>, VEX_L;
- def PDYm_Int : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
- !strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst, (V2F64Int (memopv4f64 addr:$src)))],
- itins.rm>, VEX_L;
-}
+// Square root.
+defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, int_x86_sse_sqrt_ss,
+ SSE_SQRTS>,
+ sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTP>,
+ sse2_fp_unop_s<0x51, "sqrt", fsqrt, int_x86_sse2_sqrt_sd,
+ SSE_SQRTS>,
+ sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTP>;
-let Predicates = [HasAVX] in {
- // Square root.
- defm VSQRT : sse1_fp_unop_s_avx<0x51, "vsqrt">,
- sse2_fp_unop_s_avx<0x51, "vsqrt">, VEX_4V, VEX_LIG;
-
- defm VSQRT : sse1_fp_unop_p<0x51, "vsqrt", fsqrt, SSE_SQRTP>,
- sse2_fp_unop_p<0x51, "vsqrt", fsqrt, SSE_SQRTP>,
- sse1_fp_unop_p_y<0x51, "vsqrt", fsqrt, SSE_SQRTP>,
- sse2_fp_unop_p_y<0x51, "vsqrt", fsqrt, SSE_SQRTP>,
- sse1_fp_unop_p_int<0x51, "vsqrt", int_x86_sse_sqrt_ps,
- SSE_SQRTP>,
- sse2_fp_unop_p_int<0x51, "vsqrt", int_x86_sse2_sqrt_pd,
- SSE_SQRTP>,
- sse1_fp_unop_p_y_int<0x51, "vsqrt", int_x86_avx_sqrt_ps_256,
- SSE_SQRTP>,
- sse2_fp_unop_p_y_int<0x51, "vsqrt", int_x86_avx_sqrt_pd_256,
- SSE_SQRTP>,
- VEX;
-
- // Reciprocal approximations. Note that these typically require refinement
- // in order to obtain suitable precision.
- defm VRSQRT : sse1_fp_unop_s_avx<0x52, "vrsqrt">, VEX_4V, VEX_LIG;
- defm VRSQRT : sse1_fp_unop_p<0x52, "vrsqrt", X86frsqrt, SSE_SQRTP>,
- sse1_fp_unop_p_y<0x52, "vrsqrt", X86frsqrt, SSE_SQRTP>,
- sse1_fp_unop_p_y_int<0x52, "vrsqrt", int_x86_avx_rsqrt_ps_256,
- SSE_SQRTP>,
- sse1_fp_unop_p_int<0x52, "vrsqrt", int_x86_sse_rsqrt_ps,
- SSE_SQRTP>, VEX;
-
- defm VRCP : sse1_fp_unop_s_avx<0x53, "vrcp">, VEX_4V, VEX_LIG;
- defm VRCP : sse1_fp_unop_p<0x53, "vrcp", X86frcp, SSE_RCPP>,
- sse1_fp_unop_p_y<0x53, "vrcp", X86frcp, SSE_RCPP>,
- sse1_fp_unop_p_y_int<0x53, "vrcp", int_x86_avx_rcp_ps_256,
- SSE_RCPP>,
- sse1_fp_unop_p_int<0x53, "vrcp", int_x86_sse_rcp_ps,
- SSE_RCPP>, VEX;
-}
+// Reciprocal approximations. Note that these typically require refinement
+// in order to obtain suitable precision.
+defm RSQRT : sse1_fp_unop_rw<0x52, "rsqrt", X86frsqrt, SSE_SQRTS>,
+ sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_SQRTP>,
+ sse1_fp_unop_p_int<0x52, "rsqrt", int_x86_sse_rsqrt_ps,
+ int_x86_avx_rsqrt_ps_256, SSE_SQRTP>;
+defm RCP : sse1_fp_unop_rw<0x53, "rcp", X86frcp, SSE_RCPS>,
+ sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP>,
+ sse1_fp_unop_p_int<0x53, "rcp", int_x86_sse_rcp_ps,
+ int_x86_avx_rcp_ps_256, SSE_RCPP>;
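As the comment above says, rsqrtps/rcpps deliver only a coarse approximation (on the order of 12 significant bits), and users normally add a Newton-Raphson step to recover close to full single precision. A sketch of that refinement (the formula is the standard one for 1/sqrt(x), not something encoded in these patterns):

#include <cstdio>
#include <xmmintrin.h>

// One Newton-Raphson iteration on top of rsqrtps:
//   y_next = y * (1.5 - 0.5 * x * y * y)
static __m128 rsqrt_refined(__m128 x) {
  __m128 y = _mm_rsqrt_ps(x);                       // coarse approximation
  __m128 half_x_yy = _mm_mul_ps(_mm_mul_ps(_mm_set1_ps(0.5f), x),
                                _mm_mul_ps(y, y));
  return _mm_mul_ps(y, _mm_sub_ps(_mm_set1_ps(1.5f), half_x_yy));
}

int main() {
  float out[4];
  _mm_storeu_ps(out, rsqrt_refined(_mm_set1_ps(2.0f)));
  std::printf("%.7f (exact 1/sqrt(2) = 0.7071068)\n", out[0]);
}
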
def : Pat<(f32 (fsqrt FR32:$src)),
(VSQRTSSr (f32 (IMPLICIT_DEF)), FR32:$src)>, Requires<[HasAVX]>;
@@ -3283,59 +3289,11 @@ let Predicates = [HasAVX] in {
(VRCPSSm_Int (v4f32 (IMPLICIT_DEF)), sse_load_f32:$src)>;
}
-// Square root.
-defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, int_x86_sse_sqrt_ss,
- SSE_SQRTS>,
- sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTS>,
- sse1_fp_unop_p_int<0x51, "sqrt", int_x86_sse_sqrt_ps, SSE_SQRTS>,
- sse2_fp_unop_s<0x51, "sqrt", fsqrt, int_x86_sse2_sqrt_sd,
- SSE_SQRTS>,
- sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTS>,
- sse2_fp_unop_p_int<0x51, "sqrt", int_x86_sse2_sqrt_pd, SSE_SQRTS>;
-
-/// sse1_fp_unop_s_rw - SSE1 unops where vector form has a read-write operand.
-multiclass sse1_fp_unop_rw<bits<8> opc, string OpcodeStr, SDNode OpNode,
- Intrinsic F32Int, OpndItins itins> {
- def SSr : SSI<opc, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src),
- !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
- [(set FR32:$dst, (OpNode FR32:$src))]>;
- // For scalar unary operations, fold a load into the operation
- // only in OptForSize mode. It eliminates an instruction, but it also
- // eliminates a whole-register clobber (the load), so it introduces a
- // partial register update condition.
- def SSm : I<opc, MRMSrcMem, (outs FR32:$dst), (ins f32mem:$src),
- !strconcat(OpcodeStr, "ss\t{$src, $dst|$dst, $src}"),
- [(set FR32:$dst, (OpNode (load addr:$src)))], itins.rm>, XS,
- Requires<[UseSSE1, OptForSize]>;
- let Constraints = "$src1 = $dst" in {
- def SSr_Int : SSI<opc, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2),
- !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
- [], itins.rr>;
- def SSm_Int : SSI<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, ssmem:$src2),
- !strconcat(OpcodeStr, "ss\t{$src2, $dst|$dst, $src2}"),
- [], itins.rm>;
- }
-}
-
// Reciprocal approximations. Note that these typically require refinement
// in order to obtain suitable precision.
-defm RSQRT : sse1_fp_unop_rw<0x52, "rsqrt", X86frsqrt, int_x86_sse_rsqrt_ss,
- SSE_SQRTS>,
- sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_SQRTS>,
- sse1_fp_unop_p_int<0x52, "rsqrt", int_x86_sse_rsqrt_ps,
- SSE_SQRTS>;
let Predicates = [UseSSE1] in {
def : Pat<(int_x86_sse_rsqrt_ss VR128:$src),
(RSQRTSSr_Int VR128:$src, VR128:$src)>;
-}
-
-defm RCP : sse1_fp_unop_rw<0x53, "rcp", X86frcp, int_x86_sse_rcp_ss,
- SSE_RCPS>,
- sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPS>,
- sse1_fp_unop_p_int<0x53, "rcp", int_x86_sse_rcp_ps, SSE_RCPS>;
-let Predicates = [UseSSE1] in {
def : Pat<(int_x86_sse_rcp_ss VR128:$src),
(RCPSSr_Int VR128:$src, VR128:$src)>;
}
@@ -3508,7 +3466,7 @@ def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
}
// For Disassembler
-let isCodeGenOnly = 1 in {
+let isCodeGenOnly = 1, hasSideEffects = 0 in {
def VMOVDQArr_REV : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
"movdqa\t{$src, $dst|$dst, $src}", [],
IIC_SSE_MOVA_P_RR>,
@@ -3571,7 +3529,7 @@ def MOVDQUrr : I<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
[], IIC_SSE_MOVU_P_RR>, XS, Requires<[UseSSE2]>;
// For Disassembler
-let isCodeGenOnly = 1 in {
+let isCodeGenOnly = 1, hasSideEffects = 0 in {
def MOVDQArr_REV : PDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
"movdqa\t{$src, $dst|$dst, $src}", [],
IIC_SSE_MOVA_P_RR>;
@@ -3650,6 +3608,24 @@ multiclass PDI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId,
itins.rm>;
}
+multiclass PDI_binop_all_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128,
+ Intrinsic IntId256, OpndItins itins,
+ bit IsCommutable = 0> {
+let Predicates = [HasAVX] in
+ defm V#NAME : PDI_binop_rm_int<opc, !strconcat("v", OpcodeStr), IntId128,
+ VR128, memopv2i64, i128mem, itins,
+ IsCommutable, 0>, VEX_4V;
+
+let Constraints = "$src1 = $dst" in
+ defm NAME : PDI_binop_rm_int<opc, OpcodeStr, IntId128, VR128, memopv2i64,
+ i128mem, itins, IsCommutable, 1>;
+
+let Predicates = [HasAVX2] in
+ defm V#NAME#Y : PDI_binop_rm_int<opc, !strconcat("v", OpcodeStr), IntId256,
+ VR256, memopv4i64, i256mem, itins,
+ IsCommutable, 0>, VEX_4V, VEX_L;
+}
+
multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
string OpcodeStr, SDNode OpNode,
SDNode OpNode2, RegisterClass RC,
@@ -3679,7 +3655,7 @@ multiclass PDI_binop_rmi<bits<8> opc, bits<8> opc2, Format ImmForm,
[(set RC:$dst, (DstVT (OpNode2 RC:$src1, (i32 imm:$src2))))], itins.ri>;
}
-/// PDI_binop_rm - Simple SSE2 binary operator with different src and dst types
+/// PDI_binop_rm2 - Simple SSE2 binary operator with different src and dst types
multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
ValueType DstVT, ValueType SrcVT, RegisterClass RC,
PatFrag memop_frag, X86MemOperand x86memop,
@@ -3702,249 +3678,75 @@ multiclass PDI_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
}
} // ExeDomain = SSEPackedInt
-// 128-bit Integer Arithmetic
+defm PADDB : PDI_binop_all<0xFC, "paddb", add, v16i8, v32i8,
+ SSE_INTALU_ITINS_P, 1>;
+defm PADDW : PDI_binop_all<0xFD, "paddw", add, v8i16, v16i16,
+ SSE_INTALU_ITINS_P, 1>;
+defm PADDD : PDI_binop_all<0xFE, "paddd", add, v4i32, v8i32,
+ SSE_INTALU_ITINS_P, 1>;
+defm PADDQ : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64,
+ SSE_INTALUQ_ITINS_P, 1>;
+defm PMULLW : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16,
+ SSE_INTMUL_ITINS_P, 1>;
+defm PSUBB : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8,
+ SSE_INTALU_ITINS_P, 0>;
+defm PSUBW : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16,
+ SSE_INTALU_ITINS_P, 0>;
+defm PSUBD : PDI_binop_all<0xFA, "psubd", sub, v4i32, v8i32,
+ SSE_INTALU_ITINS_P, 0>;
+defm PSUBQ : PDI_binop_all<0xFB, "psubq", sub, v2i64, v4i64,
+ SSE_INTALUQ_ITINS_P, 0>;
+defm PSUBUSB : PDI_binop_all<0xD8, "psubusb", X86subus, v16i8, v32i8,
+ SSE_INTALU_ITINS_P, 0>;
+defm PSUBUSW : PDI_binop_all<0xD9, "psubusw", X86subus, v8i16, v16i16,
+ SSE_INTALU_ITINS_P, 0>;
+defm PMINUB : PDI_binop_all<0xDA, "pminub", X86umin, v16i8, v32i8,
+ SSE_INTALU_ITINS_P, 1>;
+defm PMINSW : PDI_binop_all<0xEA, "pminsw", X86smin, v8i16, v16i16,
+ SSE_INTALU_ITINS_P, 1>;
+defm PMAXUB : PDI_binop_all<0xDE, "pmaxub", X86umax, v16i8, v32i8,
+ SSE_INTALU_ITINS_P, 1>;
+defm PMAXSW : PDI_binop_all<0xEE, "pmaxsw", X86smax, v8i16, v16i16,
+ SSE_INTALU_ITINS_P, 1>;
-let Predicates = [HasAVX] in {
-defm VPADDB : PDI_binop_rm<0xFC, "vpaddb", add, v16i8, VR128, memopv2i64,
- i128mem, SSE_INTALU_ITINS_P, 1, 0 /*3addr*/>,
- VEX_4V;
-defm VPADDW : PDI_binop_rm<0xFD, "vpaddw", add, v8i16, VR128, memopv2i64,
- i128mem, SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
-defm VPADDD : PDI_binop_rm<0xFE, "vpaddd", add, v4i32, VR128, memopv2i64,
- i128mem, SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
-defm VPADDQ : PDI_binop_rm<0xD4, "vpaddq", add, v2i64, VR128, memopv2i64,
- i128mem, SSE_INTALUQ_ITINS_P, 1, 0>, VEX_4V;
-defm VPMULLW : PDI_binop_rm<0xD5, "vpmullw", mul, v8i16, VR128, memopv2i64,
- i128mem, SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V;
-defm VPSUBB : PDI_binop_rm<0xF8, "vpsubb", sub, v16i8, VR128, memopv2i64,
- i128mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
-defm VPSUBW : PDI_binop_rm<0xF9, "vpsubw", sub, v8i16, VR128, memopv2i64,
- i128mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
-defm VPSUBD : PDI_binop_rm<0xFA, "vpsubd", sub, v4i32, VR128, memopv2i64,
- i128mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
-defm VPSUBQ : PDI_binop_rm<0xFB, "vpsubq", sub, v2i64, VR128, memopv2i64,
- i128mem, SSE_INTALUQ_ITINS_P, 0, 0>, VEX_4V;
+// Intrinsic forms
+defm PSUBSB : PDI_binop_all_int<0xE8, "psubsb", int_x86_sse2_psubs_b,
+ int_x86_avx2_psubs_b, SSE_INTALU_ITINS_P, 0>;
+defm PSUBSW : PDI_binop_all_int<0xE9, "psubsw" , int_x86_sse2_psubs_w,
+ int_x86_avx2_psubs_w, SSE_INTALU_ITINS_P, 0>;
+defm PADDSB : PDI_binop_all_int<0xEC, "paddsb" , int_x86_sse2_padds_b,
+ int_x86_avx2_padds_b, SSE_INTALU_ITINS_P, 1>;
+defm PADDSW : PDI_binop_all_int<0xED, "paddsw" , int_x86_sse2_padds_w,
+ int_x86_avx2_padds_w, SSE_INTALU_ITINS_P, 1>;
+defm PADDUSB : PDI_binop_all_int<0xDC, "paddusb", int_x86_sse2_paddus_b,
+ int_x86_avx2_paddus_b, SSE_INTALU_ITINS_P, 1>;
+defm PADDUSW : PDI_binop_all_int<0xDD, "paddusw", int_x86_sse2_paddus_w,
+ int_x86_avx2_paddus_w, SSE_INTALU_ITINS_P, 1>;
+defm PMULHUW : PDI_binop_all_int<0xE4, "pmulhuw", int_x86_sse2_pmulhu_w,
+ int_x86_avx2_pmulhu_w, SSE_INTMUL_ITINS_P, 1>;
+defm PMULHW : PDI_binop_all_int<0xE5, "pmulhw" , int_x86_sse2_pmulh_w,
+ int_x86_avx2_pmulh_w, SSE_INTMUL_ITINS_P, 1>;
+defm PMADDWD : PDI_binop_all_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd,
+ int_x86_avx2_pmadd_wd, SSE_PMADD, 1>;
+defm PAVGB : PDI_binop_all_int<0xE0, "pavgb", int_x86_sse2_pavg_b,
+ int_x86_avx2_pavg_b, SSE_INTALU_ITINS_P, 1>;
+defm PAVGW : PDI_binop_all_int<0xE3, "pavgw", int_x86_sse2_pavg_w,
+ int_x86_avx2_pavg_w, SSE_INTALU_ITINS_P, 1>;
+defm PSADBW : PDI_binop_all_int<0xF6, "psadbw", int_x86_sse2_psad_bw,
+ int_x86_avx2_psad_bw, SSE_INTALU_ITINS_P, 1>;
+
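The plain PADD*/PSUB* entries earlier in this hunk map to the ordinary wrapping add/sub nodes, whereas the intrinsic forms just above (paddsb/paddusb and friends) saturate instead of wrapping. The difference, sketched with the SSE2 intrinsics (illustrative):

#include <cstdio>
#include <emmintrin.h>

int main() {
  __m128i a = _mm_set1_epi8(120);
  __m128i b = _mm_set1_epi8(20);
  signed char wrap[16], sat[16];
  _mm_storeu_si128(reinterpret_cast<__m128i *>(wrap), _mm_add_epi8(a, b));   // paddb
  _mm_storeu_si128(reinterpret_cast<__m128i *>(sat), _mm_adds_epi8(a, b));   // paddsb
  std::printf("wrapping: %d  saturating: %d\n", wrap[0], sat[0]);  // -116 vs 127
}
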
+let Predicates = [HasAVX] in
defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128,
memopv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>,
VEX_4V;
-
-// Intrinsic forms
-defm VPSUBSB : PDI_binop_rm_int<0xE8, "vpsubsb" , int_x86_sse2_psubs_b,
- VR128, memopv2i64, i128mem,
- SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
-defm VPSUBSW : PDI_binop_rm_int<0xE9, "vpsubsw" , int_x86_sse2_psubs_w,
- VR128, memopv2i64, i128mem,
- SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
-defm VPSUBUSB : PDI_binop_rm_int<0xD8, "vpsubusb", int_x86_sse2_psubus_b,
- VR128, memopv2i64, i128mem,
- SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
-defm VPSUBUSW : PDI_binop_rm_int<0xD9, "vpsubusw", int_x86_sse2_psubus_w,
- VR128, memopv2i64, i128mem,
- SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
-defm VPADDSB : PDI_binop_rm_int<0xEC, "vpaddsb" , int_x86_sse2_padds_b,
- VR128, memopv2i64, i128mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
-defm VPADDSW : PDI_binop_rm_int<0xED, "vpaddsw" , int_x86_sse2_padds_w,
- VR128, memopv2i64, i128mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
-defm VPADDUSB : PDI_binop_rm_int<0xDC, "vpaddusb", int_x86_sse2_paddus_b,
- VR128, memopv2i64, i128mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
-defm VPADDUSW : PDI_binop_rm_int<0xDD, "vpaddusw", int_x86_sse2_paddus_w,
- VR128, memopv2i64, i128mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
-defm VPMULHUW : PDI_binop_rm_int<0xE4, "vpmulhuw", int_x86_sse2_pmulhu_w,
- VR128, memopv2i64, i128mem,
- SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V;
-defm VPMULHW : PDI_binop_rm_int<0xE5, "vpmulhw" , int_x86_sse2_pmulh_w,
- VR128, memopv2i64, i128mem,
- SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V;
-defm VPMADDWD : PDI_binop_rm_int<0xF5, "vpmaddwd", int_x86_sse2_pmadd_wd,
- VR128, memopv2i64, i128mem,
- SSE_PMADD, 1, 0>, VEX_4V;
-defm VPAVGB : PDI_binop_rm_int<0xE0, "vpavgb", int_x86_sse2_pavg_b,
- VR128, memopv2i64, i128mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
-defm VPAVGW : PDI_binop_rm_int<0xE3, "vpavgw", int_x86_sse2_pavg_w,
- VR128, memopv2i64, i128mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
-defm VPMINUB : PDI_binop_rm_int<0xDA, "vpminub", int_x86_sse2_pminu_b,
- VR128, memopv2i64, i128mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
-defm VPMINSW : PDI_binop_rm_int<0xEA, "vpminsw", int_x86_sse2_pmins_w,
- VR128, memopv2i64, i128mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
-defm VPMAXUB : PDI_binop_rm_int<0xDE, "vpmaxub", int_x86_sse2_pmaxu_b,
- VR128, memopv2i64, i128mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
-defm VPMAXSW : PDI_binop_rm_int<0xEE, "vpmaxsw", int_x86_sse2_pmaxs_w,
- VR128, memopv2i64, i128mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
-defm VPSADBW : PDI_binop_rm_int<0xF6, "vpsadbw", int_x86_sse2_psad_bw,
- VR128, memopv2i64, i128mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
-}
-
-let Predicates = [HasAVX2] in {
-defm VPADDBY : PDI_binop_rm<0xFC, "vpaddb", add, v32i8, VR256, memopv4i64,
- i256mem, SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
-defm VPADDWY : PDI_binop_rm<0xFD, "vpaddw", add, v16i16, VR256, memopv4i64,
- i256mem, SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
-defm VPADDDY : PDI_binop_rm<0xFE, "vpaddd", add, v8i32, VR256, memopv4i64,
- i256mem, SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
-defm VPADDQY : PDI_binop_rm<0xD4, "vpaddq", add, v4i64, VR256, memopv4i64,
- i256mem, SSE_INTALUQ_ITINS_P, 1, 0>, VEX_4V, VEX_L;
-defm VPMULLWY : PDI_binop_rm<0xD5, "vpmullw", mul, v16i16, VR256, memopv4i64,
- i256mem, SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
-defm VPSUBBY : PDI_binop_rm<0xF8, "vpsubb", sub, v32i8, VR256, memopv4i64,
- i256mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
-defm VPSUBWY : PDI_binop_rm<0xF9, "vpsubw", sub, v16i16,VR256, memopv4i64,
- i256mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
-defm VPSUBDY : PDI_binop_rm<0xFA, "vpsubd", sub, v8i32, VR256, memopv4i64,
- i256mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
-defm VPSUBQY : PDI_binop_rm<0xFB, "vpsubq", sub, v4i64, VR256, memopv4i64,
- i256mem, SSE_INTALUQ_ITINS_P, 0, 0>, VEX_4V, VEX_L;
+let Predicates = [HasAVX2] in
defm VPMULUDQY : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v4i64, v8i32,
VR256, memopv4i64, i256mem,
SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
-
-// Intrinsic forms
-defm VPSUBSBY : PDI_binop_rm_int<0xE8, "vpsubsb" , int_x86_avx2_psubs_b,
- VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
-defm VPSUBSWY : PDI_binop_rm_int<0xE9, "vpsubsw" , int_x86_avx2_psubs_w,
- VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
-defm VPSUBUSBY : PDI_binop_rm_int<0xD8, "vpsubusb", int_x86_avx2_psubus_b,
- VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
-defm VPSUBUSWY : PDI_binop_rm_int<0xD9, "vpsubusw", int_x86_avx2_psubus_w,
- VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
-defm VPADDSBY : PDI_binop_rm_int<0xEC, "vpaddsb" , int_x86_avx2_padds_b,
- VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
-defm VPADDSWY : PDI_binop_rm_int<0xED, "vpaddsw" , int_x86_avx2_padds_w,
- VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
-defm VPADDUSBY : PDI_binop_rm_int<0xDC, "vpaddusb", int_x86_avx2_paddus_b,
- VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
-defm VPADDUSWY : PDI_binop_rm_int<0xDD, "vpaddusw", int_x86_avx2_paddus_w,
- VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
-defm VPMULHUWY : PDI_binop_rm_int<0xE4, "vpmulhuw", int_x86_avx2_pmulhu_w,
- VR256, memopv4i64, i256mem,
- SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
-defm VPMULHWY : PDI_binop_rm_int<0xE5, "vpmulhw" , int_x86_avx2_pmulh_w,
- VR256, memopv4i64, i256mem,
- SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
-defm VPMADDWDY : PDI_binop_rm_int<0xF5, "vpmaddwd", int_x86_avx2_pmadd_wd,
- VR256, memopv4i64, i256mem,
- SSE_PMADD, 1, 0>, VEX_4V, VEX_L;
-defm VPAVGBY : PDI_binop_rm_int<0xE0, "vpavgb", int_x86_avx2_pavg_b,
- VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
-defm VPAVGWY : PDI_binop_rm_int<0xE3, "vpavgw", int_x86_avx2_pavg_w,
- VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
-defm VPMINUBY : PDI_binop_rm_int<0xDA, "vpminub", int_x86_avx2_pminu_b,
- VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
-defm VPMINSWY : PDI_binop_rm_int<0xEA, "vpminsw", int_x86_avx2_pmins_w,
- VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
-defm VPMAXUBY : PDI_binop_rm_int<0xDE, "vpmaxub", int_x86_avx2_pmaxu_b,
- VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
-defm VPMAXSWY : PDI_binop_rm_int<0xEE, "vpmaxsw", int_x86_avx2_pmaxs_w,
- VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
-defm VPSADBWY : PDI_binop_rm_int<0xF6, "vpsadbw", int_x86_avx2_psad_bw,
- VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
-}
-
-let Constraints = "$src1 = $dst" in {
-defm PADDB : PDI_binop_rm<0xFC, "paddb", add, v16i8, VR128, memopv2i64,
- i128mem, SSE_INTALU_ITINS_P, 1>;
-defm PADDW : PDI_binop_rm<0xFD, "paddw", add, v8i16, VR128, memopv2i64,
- i128mem, SSE_INTALU_ITINS_P, 1>;
-defm PADDD : PDI_binop_rm<0xFE, "paddd", add, v4i32, VR128, memopv2i64,
- i128mem, SSE_INTALU_ITINS_P, 1>;
-defm PADDQ : PDI_binop_rm<0xD4, "paddq", add, v2i64, VR128, memopv2i64,
- i128mem, SSE_INTALUQ_ITINS_P, 1>;
-defm PMULLW : PDI_binop_rm<0xD5, "pmullw", mul, v8i16, VR128, memopv2i64,
- i128mem, SSE_INTMUL_ITINS_P, 1>;
-defm PSUBB : PDI_binop_rm<0xF8, "psubb", sub, v16i8, VR128, memopv2i64,
- i128mem, SSE_INTALU_ITINS_P>;
-defm PSUBW : PDI_binop_rm<0xF9, "psubw", sub, v8i16, VR128, memopv2i64,
- i128mem, SSE_INTALU_ITINS_P>;
-defm PSUBD : PDI_binop_rm<0xFA, "psubd", sub, v4i32, VR128, memopv2i64,
- i128mem, SSE_INTALU_ITINS_P>;
-defm PSUBQ : PDI_binop_rm<0xFB, "psubq", sub, v2i64, VR128, memopv2i64,
- i128mem, SSE_INTALUQ_ITINS_P>;
+let Constraints = "$src1 = $dst" in
defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128,
memopv2i64, i128mem, SSE_INTMUL_ITINS_P, 1>;
-// Intrinsic forms
-defm PSUBSB : PDI_binop_rm_int<0xE8, "psubsb" , int_x86_sse2_psubs_b,
- VR128, memopv2i64, i128mem,
- SSE_INTALU_ITINS_P>;
-defm PSUBSW : PDI_binop_rm_int<0xE9, "psubsw" , int_x86_sse2_psubs_w,
- VR128, memopv2i64, i128mem,
- SSE_INTALU_ITINS_P>;
-defm PSUBUSB : PDI_binop_rm_int<0xD8, "psubusb", int_x86_sse2_psubus_b,
- VR128, memopv2i64, i128mem,
- SSE_INTALU_ITINS_P>;
-defm PSUBUSW : PDI_binop_rm_int<0xD9, "psubusw", int_x86_sse2_psubus_w,
- VR128, memopv2i64, i128mem,
- SSE_INTALU_ITINS_P>;
-defm PADDSB : PDI_binop_rm_int<0xEC, "paddsb" , int_x86_sse2_padds_b,
- VR128, memopv2i64, i128mem,
- SSE_INTALU_ITINS_P, 1>;
-defm PADDSW : PDI_binop_rm_int<0xED, "paddsw" , int_x86_sse2_padds_w,
- VR128, memopv2i64, i128mem,
- SSE_INTALU_ITINS_P, 1>;
-defm PADDUSB : PDI_binop_rm_int<0xDC, "paddusb", int_x86_sse2_paddus_b,
- VR128, memopv2i64, i128mem,
- SSE_INTALU_ITINS_P, 1>;
-defm PADDUSW : PDI_binop_rm_int<0xDD, "paddusw", int_x86_sse2_paddus_w,
- VR128, memopv2i64, i128mem,
- SSE_INTALU_ITINS_P, 1>;
-defm PMULHUW : PDI_binop_rm_int<0xE4, "pmulhuw", int_x86_sse2_pmulhu_w,
- VR128, memopv2i64, i128mem,
- SSE_INTMUL_ITINS_P, 1>;
-defm PMULHW : PDI_binop_rm_int<0xE5, "pmulhw" , int_x86_sse2_pmulh_w,
- VR128, memopv2i64, i128mem,
- SSE_INTMUL_ITINS_P, 1>;
-defm PMADDWD : PDI_binop_rm_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd,
- VR128, memopv2i64, i128mem,
- SSE_PMADD, 1>;
-defm PAVGB : PDI_binop_rm_int<0xE0, "pavgb", int_x86_sse2_pavg_b,
- VR128, memopv2i64, i128mem,
- SSE_INTALU_ITINS_P, 1>;
-defm PAVGW : PDI_binop_rm_int<0xE3, "pavgw", int_x86_sse2_pavg_w,
- VR128, memopv2i64, i128mem,
- SSE_INTALU_ITINS_P, 1>;
-defm PMINUB : PDI_binop_rm_int<0xDA, "pminub", int_x86_sse2_pminu_b,
- VR128, memopv2i64, i128mem,
- SSE_INTALU_ITINS_P, 1>;
-defm PMINSW : PDI_binop_rm_int<0xEA, "pminsw", int_x86_sse2_pmins_w,
- VR128, memopv2i64, i128mem,
- SSE_INTALU_ITINS_P, 1>;
-defm PMAXUB : PDI_binop_rm_int<0xDE, "pmaxub", int_x86_sse2_pmaxu_b,
- VR128, memopv2i64, i128mem,
- SSE_INTALU_ITINS_P, 1>;
-defm PMAXSW : PDI_binop_rm_int<0xEE, "pmaxsw", int_x86_sse2_pmaxs_w,
- VR128, memopv2i64, i128mem,
- SSE_INTALU_ITINS_P, 1>;
-defm PSADBW : PDI_binop_rm_int<0xF6, "psadbw", int_x86_sse2_psad_bw,
- VR128, memopv2i64, i128mem,
- SSE_INTALU_ITINS_P, 1>;
-
-} // Constraints = "$src1 = $dst"
-
//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Logical Instructions
//===---------------------------------------------------------------------===//
@@ -4126,186 +3928,106 @@ let Predicates = [UseSSE2] in {
// SSE2 - Packed Integer Comparison Instructions
//===---------------------------------------------------------------------===//
-let Predicates = [HasAVX] in {
- defm VPCMPEQB : PDI_binop_rm<0x74, "vpcmpeqb", X86pcmpeq, v16i8,
- VR128, memopv2i64, i128mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
- defm VPCMPEQW : PDI_binop_rm<0x75, "vpcmpeqw", X86pcmpeq, v8i16,
- VR128, memopv2i64, i128mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
- defm VPCMPEQD : PDI_binop_rm<0x76, "vpcmpeqd", X86pcmpeq, v4i32,
- VR128, memopv2i64, i128mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
- defm VPCMPGTB : PDI_binop_rm<0x64, "vpcmpgtb", X86pcmpgt, v16i8,
- VR128, memopv2i64, i128mem,
- SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
- defm VPCMPGTW : PDI_binop_rm<0x65, "vpcmpgtw", X86pcmpgt, v8i16,
- VR128, memopv2i64, i128mem,
- SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
- defm VPCMPGTD : PDI_binop_rm<0x66, "vpcmpgtd", X86pcmpgt, v4i32,
- VR128, memopv2i64, i128mem,
- SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
-}
-
-let Predicates = [HasAVX2] in {
- defm VPCMPEQBY : PDI_binop_rm<0x74, "vpcmpeqb", X86pcmpeq, v32i8,
- VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
- defm VPCMPEQWY : PDI_binop_rm<0x75, "vpcmpeqw", X86pcmpeq, v16i16,
- VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
- defm VPCMPEQDY : PDI_binop_rm<0x76, "vpcmpeqd", X86pcmpeq, v8i32,
- VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
- defm VPCMPGTBY : PDI_binop_rm<0x64, "vpcmpgtb", X86pcmpgt, v32i8,
- VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
- defm VPCMPGTWY : PDI_binop_rm<0x65, "vpcmpgtw", X86pcmpgt, v16i16,
- VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
- defm VPCMPGTDY : PDI_binop_rm<0x66, "vpcmpgtd", X86pcmpgt, v8i32,
- VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
-}
-
-let Constraints = "$src1 = $dst" in {
- defm PCMPEQB : PDI_binop_rm<0x74, "pcmpeqb", X86pcmpeq, v16i8,
- VR128, memopv2i64, i128mem,
- SSE_INTALU_ITINS_P, 1>;
- defm PCMPEQW : PDI_binop_rm<0x75, "pcmpeqw", X86pcmpeq, v8i16,
- VR128, memopv2i64, i128mem,
- SSE_INTALU_ITINS_P, 1>;
- defm PCMPEQD : PDI_binop_rm<0x76, "pcmpeqd", X86pcmpeq, v4i32,
- VR128, memopv2i64, i128mem,
- SSE_INTALU_ITINS_P, 1>;
- defm PCMPGTB : PDI_binop_rm<0x64, "pcmpgtb", X86pcmpgt, v16i8,
- VR128, memopv2i64, i128mem,
- SSE_INTALU_ITINS_P>;
- defm PCMPGTW : PDI_binop_rm<0x65, "pcmpgtw", X86pcmpgt, v8i16,
- VR128, memopv2i64, i128mem,
- SSE_INTALU_ITINS_P>;
- defm PCMPGTD : PDI_binop_rm<0x66, "pcmpgtd", X86pcmpgt, v4i32,
- VR128, memopv2i64, i128mem,
- SSE_INTALU_ITINS_P>;
-} // Constraints = "$src1 = $dst"
+defm PCMPEQB : PDI_binop_all<0x74, "pcmpeqb", X86pcmpeq, v16i8, v32i8,
+ SSE_INTALU_ITINS_P, 1>;
+defm PCMPEQW : PDI_binop_all<0x75, "pcmpeqw", X86pcmpeq, v8i16, v16i16,
+ SSE_INTALU_ITINS_P, 1>;
+defm PCMPEQD : PDI_binop_all<0x76, "pcmpeqd", X86pcmpeq, v4i32, v8i32,
+ SSE_INTALU_ITINS_P, 1>;
+defm PCMPGTB : PDI_binop_all<0x64, "pcmpgtb", X86pcmpgt, v16i8, v32i8,
+ SSE_INTALU_ITINS_P, 0>;
+defm PCMPGTW : PDI_binop_all<0x65, "pcmpgtw", X86pcmpgt, v8i16, v16i16,
+ SSE_INTALU_ITINS_P, 0>;
+defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32,
+ SSE_INTALU_ITINS_P, 0>;
//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Pack Instructions
//===---------------------------------------------------------------------===//
-let Predicates = [HasAVX] in {
-defm VPACKSSWB : PDI_binop_rm_int<0x63, "vpacksswb", int_x86_sse2_packsswb_128,
- VR128, memopv2i64, i128mem,
- SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
-defm VPACKSSDW : PDI_binop_rm_int<0x6B, "vpackssdw", int_x86_sse2_packssdw_128,
- VR128, memopv2i64, i128mem,
- SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
-defm VPACKUSWB : PDI_binop_rm_int<0x67, "vpackuswb", int_x86_sse2_packuswb_128,
- VR128, memopv2i64, i128mem,
- SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
-}
-
-let Predicates = [HasAVX2] in {
-defm VPACKSSWBY : PDI_binop_rm_int<0x63, "vpacksswb", int_x86_avx2_packsswb,
- VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
-defm VPACKSSDWY : PDI_binop_rm_int<0x6B, "vpackssdw", int_x86_avx2_packssdw,
- VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
-defm VPACKUSWBY : PDI_binop_rm_int<0x67, "vpackuswb", int_x86_avx2_packuswb,
- VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
-}
-
-let Constraints = "$src1 = $dst" in {
-defm PACKSSWB : PDI_binop_rm_int<0x63, "packsswb", int_x86_sse2_packsswb_128,
- VR128, memopv2i64, i128mem,
- SSE_INTALU_ITINS_P>;
-defm PACKSSDW : PDI_binop_rm_int<0x6B, "packssdw", int_x86_sse2_packssdw_128,
- VR128, memopv2i64, i128mem,
- SSE_INTALU_ITINS_P>;
-defm PACKUSWB : PDI_binop_rm_int<0x67, "packuswb", int_x86_sse2_packuswb_128,
- VR128, memopv2i64, i128mem,
- SSE_INTALU_ITINS_P>;
-} // Constraints = "$src1 = $dst"
+defm PACKSSWB : PDI_binop_all_int<0x63, "packsswb", int_x86_sse2_packsswb_128,
+ int_x86_avx2_packsswb, SSE_INTALU_ITINS_P, 0>;
+defm PACKSSDW : PDI_binop_all_int<0x6B, "packssdw", int_x86_sse2_packssdw_128,
+ int_x86_avx2_packssdw, SSE_INTALU_ITINS_P, 0>;
+defm PACKUSWB : PDI_binop_all_int<0x67, "packuswb", int_x86_sse2_packuswb_128,
+ int_x86_avx2_packuswb, SSE_INTALU_ITINS_P, 0>;
//===---------------------------------------------------------------------===//
// SSE2 - Packed Integer Shuffle Instructions
//===---------------------------------------------------------------------===//
let ExeDomain = SSEPackedInt in {
-multiclass sse2_pshuffle<string OpcodeStr, ValueType vt, SDNode OpNode> {
-def ri : Ii8<0x70, MRMSrcReg,
- (outs VR128:$dst), (ins VR128:$src1, i8imm:$src2),
- !strconcat(OpcodeStr,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR128:$dst, (vt (OpNode VR128:$src1, (i8 imm:$src2))))],
- IIC_SSE_PSHUF>;
-def mi : Ii8<0x70, MRMSrcMem,
- (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2),
- !strconcat(OpcodeStr,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR128:$dst,
- (vt (OpNode (bitconvert (memopv2i64 addr:$src1)),
- (i8 imm:$src2))))],
- IIC_SSE_PSHUF>;
-}
-
-multiclass sse2_pshuffle_y<string OpcodeStr, ValueType vt, SDNode OpNode> {
-def Yri : Ii8<0x70, MRMSrcReg,
- (outs VR256:$dst), (ins VR256:$src1, i8imm:$src2),
- !strconcat(OpcodeStr,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR256:$dst, (vt (OpNode VR256:$src1, (i8 imm:$src2))))]>;
-def Ymi : Ii8<0x70, MRMSrcMem,
- (outs VR256:$dst), (ins i256mem:$src1, i8imm:$src2),
- !strconcat(OpcodeStr,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR256:$dst,
- (vt (OpNode (bitconvert (memopv4i64 addr:$src1)),
- (i8 imm:$src2))))]>;
-}
-} // ExeDomain = SSEPackedInt
-
+multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256,
+ SDNode OpNode> {
let Predicates = [HasAVX] in {
- let AddedComplexity = 5 in
- defm VPSHUFD : sse2_pshuffle<"vpshufd", v4i32, X86PShufd>, TB, OpSize, VEX;
-
- // SSE2 with ImmT == Imm8 and XS prefix.
- defm VPSHUFHW : sse2_pshuffle<"vpshufhw", v8i16, X86PShufhw>, XS, VEX;
-
- // SSE2 with ImmT == Imm8 and XD prefix.
- defm VPSHUFLW : sse2_pshuffle<"vpshuflw", v8i16, X86PShuflw>, XD, VEX;
-
- def : Pat<(v4f32 (X86PShufd (memopv4f32 addr:$src1), (i8 imm:$imm))),
- (VPSHUFDmi addr:$src1, imm:$imm)>;
- def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
- (VPSHUFDri VR128:$src1, imm:$imm)>;
+ def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, i8imm:$src2),
+ !strconcat("v", OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))],
+ IIC_SSE_PSHUF>, VEX;
+ def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst),
+ (ins i128mem:$src1, i8imm:$src2),
+ !strconcat("v", OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)),
+ (i8 imm:$src2))))], IIC_SSE_PSHUF>, VEX;
}
let Predicates = [HasAVX2] in {
- defm VPSHUFD : sse2_pshuffle_y<"vpshufd", v8i32, X86PShufd>,
- TB, OpSize, VEX,VEX_L;
- defm VPSHUFHW : sse2_pshuffle_y<"vpshufhw", v16i16, X86PShufhw>,
- XS, VEX, VEX_L;
- defm VPSHUFLW : sse2_pshuffle_y<"vpshuflw", v16i16, X86PShuflw>,
- XD, VEX, VEX_L;
+ def V#NAME#Yri : Ii8<0x70, MRMSrcReg, (outs VR256:$dst),
+ (ins VR256:$src1, i8imm:$src2),
+ !strconcat("v", OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (vt256 (OpNode VR256:$src1, (i8 imm:$src2))))],
+ IIC_SSE_PSHUF>, VEX, VEX_L;
+ def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst),
+ (ins i256mem:$src1, i8imm:$src2),
+ !strconcat("v", OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR256:$dst,
+ (vt256 (OpNode (bitconvert (memopv4i64 addr:$src1)),
+ (i8 imm:$src2))))], IIC_SSE_PSHUF>, VEX, VEX_L;
}
let Predicates = [UseSSE2] in {
- let AddedComplexity = 5 in
- defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, X86PShufd>, TB, OpSize;
+ def ri : Ii8<0x70, MRMSrcReg,
+ (outs VR128:$dst), (ins VR128:$src1, i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))],
+ IIC_SSE_PSHUF>;
+ def mi : Ii8<0x70, MRMSrcMem,
+ (outs VR128:$dst), (ins i128mem:$src1, i8imm:$src2),
+ !strconcat(OpcodeStr,
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
+ [(set VR128:$dst,
+ (vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)),
+ (i8 imm:$src2))))], IIC_SSE_PSHUF>;
+}
+}
+} // ExeDomain = SSEPackedInt
- // SSE2 with ImmT == Imm8 and XS prefix.
- defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, X86PShufhw>, XS;
+defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd>, TB, OpSize;
+defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw>, XS;
+defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw>, XD;
- // SSE2 with ImmT == Imm8 and XD prefix.
- defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, X86PShuflw>, XD;
+let Predicates = [HasAVX] in {
+ def : Pat<(v4f32 (X86PShufd (memopv4f32 addr:$src1), (i8 imm:$imm))),
+ (VPSHUFDmi addr:$src1, imm:$imm)>;
+ def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
+ (VPSHUFDri VR128:$src1, imm:$imm)>;
+}
- def : Pat<(v4f32 (X86PShufd (memopv4f32 addr:$src1), (i8 imm:$imm))),
- (PSHUFDmi addr:$src1, imm:$imm)>;
- def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
- (PSHUFDri VR128:$src1, imm:$imm)>;
+let Predicates = [UseSSE2] in {
+ def : Pat<(v4f32 (X86PShufd (memopv4f32 addr:$src1), (i8 imm:$imm))),
+ (PSHUFDmi addr:$src1, imm:$imm)>;
+ def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
+ (PSHUFDri VR128:$src1, imm:$imm)>;
}
//===---------------------------------------------------------------------===//
@@ -5445,7 +5167,7 @@ defm PMULHRSW : SS3I_binop_rm_int<0x0B, "pmulhrsw",
// SSSE3 - Packed Align Instruction Patterns
//===---------------------------------------------------------------------===//
-multiclass ssse3_palign<string asm, bit Is2Addr = 1> {
+multiclass ssse3_palignr<string asm, bit Is2Addr = 1> {
let neverHasSideEffects = 1 in {
def R128rr : SS3AI<0x0F, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, i8imm:$src3),
@@ -5465,7 +5187,7 @@ multiclass ssse3_palign<string asm, bit Is2Addr = 1> {
}
}
-multiclass ssse3_palign_y<string asm, bit Is2Addr = 1> {
+multiclass ssse3_palignr_y<string asm, bit Is2Addr = 1> {
let neverHasSideEffects = 1 in {
def R256rr : SS3AI<0x0F, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, i8imm:$src3),
@@ -5482,42 +5204,42 @@ multiclass ssse3_palign_y<string asm, bit Is2Addr = 1> {
}
let Predicates = [HasAVX] in
- defm VPALIGN : ssse3_palign<"vpalignr", 0>, VEX_4V;
+ defm VPALIGN : ssse3_palignr<"vpalignr", 0>, VEX_4V;
let Predicates = [HasAVX2] in
- defm VPALIGN : ssse3_palign_y<"vpalignr", 0>, VEX_4V, VEX_L;
+ defm VPALIGN : ssse3_palignr_y<"vpalignr", 0>, VEX_4V, VEX_L;
let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
- defm PALIGN : ssse3_palign<"palignr">;
+ defm PALIGN : ssse3_palignr<"palignr">;
let Predicates = [HasAVX2] in {
-def : Pat<(v8i32 (X86PAlign VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+def : Pat<(v8i32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
(VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>;
-def : Pat<(v8f32 (X86PAlign VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+def : Pat<(v8f32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
(VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>;
-def : Pat<(v16i16 (X86PAlign VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+def : Pat<(v16i16 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
(VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>;
-def : Pat<(v32i8 (X86PAlign VR256:$src1, VR256:$src2, (i8 imm:$imm))),
+def : Pat<(v32i8 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
(VPALIGNR256rr VR256:$src2, VR256:$src1, imm:$imm)>;
}
let Predicates = [HasAVX] in {
-def : Pat<(v4i32 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
(VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
-def : Pat<(v4f32 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
(VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
-def : Pat<(v8i16 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
(VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
-def : Pat<(v16i8 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
(VPALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
}
let Predicates = [UseSSSE3] in {
-def : Pat<(v4i32 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
(PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
-def : Pat<(v4f32 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
(PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
-def : Pat<(v8i16 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
(PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
-def : Pat<(v16i8 (X86PAlign VR128:$src1, VR128:$src2, (i8 imm:$imm))),
+def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
(PALIGNR128rr VR128:$src2, VR128:$src1, imm:$imm)>;
}
@@ -5844,6 +5566,55 @@ defm VPMOVZXBQ : SS41I_binop_rm_int4_y<0x32, "vpmovzxbq",
defm PMOVSXBQ : SS41I_binop_rm_int2<0x22, "pmovsxbq", int_x86_sse41_pmovsxbq>;
defm PMOVZXBQ : SS41I_binop_rm_int2<0x32, "pmovzxbq", int_x86_sse41_pmovzxbq>;
+let Predicates = [HasAVX2] in {
+ def : Pat<(v16i16 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBWYrr VR128:$src)>;
+ def : Pat<(v8i32 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBDYrr VR128:$src)>;
+ def : Pat<(v4i64 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBQYrr VR128:$src)>;
+
+ def : Pat<(v8i32 (X86vsext (v8i16 VR128:$src))), (VPMOVSXWDYrr VR128:$src)>;
+ def : Pat<(v4i64 (X86vsext (v8i16 VR128:$src))), (VPMOVSXWQYrr VR128:$src)>;
+
+ def : Pat<(v4i64 (X86vsext (v4i32 VR128:$src))), (VPMOVSXDQYrr VR128:$src)>;
+
+ def : Pat<(v16i16 (X86vsext (v32i8 VR256:$src))),
+ (VPMOVSXBWYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+ def : Pat<(v8i32 (X86vsext (v32i8 VR256:$src))),
+ (VPMOVSXBDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+ def : Pat<(v4i64 (X86vsext (v32i8 VR256:$src))),
+ (VPMOVSXBQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+
+ def : Pat<(v8i32 (X86vsext (v16i16 VR256:$src))),
+ (VPMOVSXWDYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+ def : Pat<(v4i64 (X86vsext (v16i16 VR256:$src))),
+ (VPMOVSXWQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+
+ def : Pat<(v4i64 (X86vsext (v8i32 VR256:$src))),
+ (VPMOVSXDQYrr (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+
+ def : Pat<(v8i32 (X86vsmovl (v8i16 (bitconvert (v2i64 (load addr:$src)))))),
+ (VPMOVSXWDYrm addr:$src)>;
+ def : Pat<(v4i64 (X86vsmovl (v4i32 (bitconvert (v2i64 (load addr:$src)))))),
+ (VPMOVSXDQYrm addr:$src)>;
+
+ def : Pat<(v8i32 (X86vsext (v16i8 (bitconvert (v2i64
+ (scalar_to_vector (loadi64 addr:$src))))))),
+ (VPMOVSXBDYrm addr:$src)>;
+ def : Pat<(v8i32 (X86vsext (v16i8 (bitconvert (v2f64
+ (scalar_to_vector (loadf64 addr:$src))))))),
+ (VPMOVSXBDYrm addr:$src)>;
+
+ def : Pat<(v4i64 (X86vsext (v8i16 (bitconvert (v2i64
+ (scalar_to_vector (loadi64 addr:$src))))))),
+ (VPMOVSXWQYrm addr:$src)>;
+ def : Pat<(v4i64 (X86vsext (v8i16 (bitconvert (v2f64
+ (scalar_to_vector (loadf64 addr:$src))))))),
+ (VPMOVSXWQYrm addr:$src)>;
+
+ def : Pat<(v4i64 (X86vsext (v16i8 (bitconvert (v4i32
+ (scalar_to_vector (loadi32 addr:$src))))))),
+ (VPMOVSXBQYrm addr:$src)>;
+}
+
let Predicates = [HasAVX] in {
// Common patterns involving scalar load
def : Pat<(int_x86_sse41_pmovsxbq
@@ -5858,6 +5629,15 @@ let Predicates = [HasAVX] in {
}
let Predicates = [UseSSE41] in {
+ def : Pat<(v8i16 (X86vsext (v16i8 VR128:$src))), (PMOVSXBWrr VR128:$src)>;
+ def : Pat<(v4i32 (X86vsext (v16i8 VR128:$src))), (PMOVSXBDrr VR128:$src)>;
+ def : Pat<(v2i64 (X86vsext (v16i8 VR128:$src))), (PMOVSXBQrr VR128:$src)>;
+
+ def : Pat<(v4i32 (X86vsext (v8i16 VR128:$src))), (PMOVSXWDrr VR128:$src)>;
+ def : Pat<(v2i64 (X86vsext (v8i16 VR128:$src))), (PMOVSXWQrr VR128:$src)>;
+
+ def : Pat<(v2i64 (X86vsext (v4i32 VR128:$src))), (PMOVSXDQrr VR128:$src)>;
+
// Common patterns involving scalar load
def : Pat<(int_x86_sse41_pmovsxbq
(bitconvert (v4i32 (X86vzmovl
@@ -5868,6 +5648,34 @@ let Predicates = [UseSSE41] in {
(bitconvert (v4i32 (X86vzmovl
(v4i32 (scalar_to_vector (loadi32 addr:$src))))))),
(PMOVZXBQrm addr:$src)>;
+
+ def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2i64
+ (scalar_to_vector (loadi64 addr:$src))))))),
+ (PMOVSXWDrm addr:$src)>;
+ def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2f64
+ (scalar_to_vector (loadf64 addr:$src))))))),
+ (PMOVSXWDrm addr:$src)>;
+ def : Pat<(v4i32 (X86vsext (v16i8 (bitconvert (v4i32
+ (scalar_to_vector (loadi32 addr:$src))))))),
+ (PMOVSXBDrm addr:$src)>;
+ def : Pat<(v2i64 (X86vsext (v8i16 (bitconvert (v4i32
+ (scalar_to_vector (loadi32 addr:$src))))))),
+ (PMOVSXWQrm addr:$src)>;
+ def : Pat<(v2i64 (X86vsext (v16i8 (bitconvert (v4i32
+ (scalar_to_vector (extloadi32i16 addr:$src))))))),
+ (PMOVSXBQrm addr:$src)>;
+ def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2i64
+ (scalar_to_vector (loadi64 addr:$src))))))),
+ (PMOVSXDQrm addr:$src)>;
+ def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2f64
+ (scalar_to_vector (loadf64 addr:$src))))))),
+ (PMOVSXDQrm addr:$src)>;
+ def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2i64
+ (scalar_to_vector (loadi64 addr:$src))))))),
+ (PMOVSXBWrm addr:$src)>;
+ def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2f64
+ (scalar_to_vector (loadf64 addr:$src))))))),
+ (PMOVSXBWrm addr:$src)>;
}
let Predicates = [HasAVX2] in {
@@ -5928,6 +5736,44 @@ let Predicates = [HasAVX] in {
(VPMOVZXDQrm addr:$src)>;
def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (X86vzload addr:$src)))))),
(VPMOVZXDQrm addr:$src)>;
+
+ def : Pat<(v8i16 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBWrr VR128:$src)>;
+ def : Pat<(v4i32 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBDrr VR128:$src)>;
+ def : Pat<(v2i64 (X86vsext (v16i8 VR128:$src))), (VPMOVSXBQrr VR128:$src)>;
+
+ def : Pat<(v4i32 (X86vsext (v8i16 VR128:$src))), (VPMOVSXWDrr VR128:$src)>;
+ def : Pat<(v2i64 (X86vsext (v8i16 VR128:$src))), (VPMOVSXWQrr VR128:$src)>;
+
+ def : Pat<(v2i64 (X86vsext (v4i32 VR128:$src))), (VPMOVSXDQrr VR128:$src)>;
+
+ def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2i64
+ (scalar_to_vector (loadi64 addr:$src))))))),
+ (VPMOVSXWDrm addr:$src)>;
+ def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2i64
+ (scalar_to_vector (loadi64 addr:$src))))))),
+ (VPMOVSXDQrm addr:$src)>;
+ def : Pat<(v4i32 (X86vsext (v8i16 (bitconvert (v2f64
+ (scalar_to_vector (loadf64 addr:$src))))))),
+ (VPMOVSXWDrm addr:$src)>;
+ def : Pat<(v2i64 (X86vsext (v4i32 (bitconvert (v2f64
+ (scalar_to_vector (loadf64 addr:$src))))))),
+ (VPMOVSXDQrm addr:$src)>;
+ def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2i64
+ (scalar_to_vector (loadi64 addr:$src))))))),
+ (VPMOVSXBWrm addr:$src)>;
+ def : Pat<(v8i16 (X86vsext (v16i8 (bitconvert (v2f64
+ (scalar_to_vector (loadf64 addr:$src))))))),
+ (VPMOVSXBWrm addr:$src)>;
+
+ def : Pat<(v4i32 (X86vsext (v16i8 (bitconvert (v4i32
+ (scalar_to_vector (loadi32 addr:$src))))))),
+ (VPMOVSXBDrm addr:$src)>;
+ def : Pat<(v2i64 (X86vsext (v8i16 (bitconvert (v4i32
+ (scalar_to_vector (loadi32 addr:$src))))))),
+ (VPMOVSXWQrm addr:$src)>;
+ def : Pat<(v2i64 (X86vsext (v16i8 (bitconvert (v4i32
+ (scalar_to_vector (extloadi32i16 addr:$src))))))),
+ (VPMOVSXBQrm addr:$src)>;
}
let Predicates = [UseSSE41] in {
@@ -6267,6 +6113,7 @@ multiclass sse41_fp_binop_rm<bits<8> opcss, bits<8> opcsd,
Intrinsic F64Int, bit Is2Addr = 1> {
let ExeDomain = GenericDomain in {
// Operation, reg.
+ let hasSideEffects = 0 in
def SSr : SS4AIi8<opcss, MRMSrcReg,
(outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32i8imm:$src3),
!if(Is2Addr,
@@ -6300,6 +6147,7 @@ let ExeDomain = GenericDomain in {
OpSize;
// Operation, reg.
+ let hasSideEffects = 0 in
def SDr : SS4AIi8<opcsd, MRMSrcReg,
(outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32i8imm:$src3),
!if(Is2Addr,
@@ -6621,67 +6469,6 @@ multiclass SS41I_binop_rm_int_y<bits<8> opc, string OpcodeStr,
(bitconvert (memopv4i64 addr:$src2))))]>, OpSize;
}
-let Predicates = [HasAVX] in {
- let isCommutable = 0 in
- defm VPACKUSDW : SS41I_binop_rm_int<0x2B, "vpackusdw", int_x86_sse41_packusdw,
- 0>, VEX_4V;
- defm VPMINSB : SS41I_binop_rm_int<0x38, "vpminsb", int_x86_sse41_pminsb,
- 0>, VEX_4V;
- defm VPMINSD : SS41I_binop_rm_int<0x39, "vpminsd", int_x86_sse41_pminsd,
- 0>, VEX_4V;
- defm VPMINUD : SS41I_binop_rm_int<0x3B, "vpminud", int_x86_sse41_pminud,
- 0>, VEX_4V;
- defm VPMINUW : SS41I_binop_rm_int<0x3A, "vpminuw", int_x86_sse41_pminuw,
- 0>, VEX_4V;
- defm VPMAXSB : SS41I_binop_rm_int<0x3C, "vpmaxsb", int_x86_sse41_pmaxsb,
- 0>, VEX_4V;
- defm VPMAXSD : SS41I_binop_rm_int<0x3D, "vpmaxsd", int_x86_sse41_pmaxsd,
- 0>, VEX_4V;
- defm VPMAXUD : SS41I_binop_rm_int<0x3F, "vpmaxud", int_x86_sse41_pmaxud,
- 0>, VEX_4V;
- defm VPMAXUW : SS41I_binop_rm_int<0x3E, "vpmaxuw", int_x86_sse41_pmaxuw,
- 0>, VEX_4V;
- defm VPMULDQ : SS41I_binop_rm_int<0x28, "vpmuldq", int_x86_sse41_pmuldq,
- 0>, VEX_4V;
-}
-
-let Predicates = [HasAVX2] in {
- let isCommutable = 0 in
- defm VPACKUSDW : SS41I_binop_rm_int_y<0x2B, "vpackusdw",
- int_x86_avx2_packusdw>, VEX_4V, VEX_L;
- defm VPMINSB : SS41I_binop_rm_int_y<0x38, "vpminsb",
- int_x86_avx2_pmins_b>, VEX_4V, VEX_L;
- defm VPMINSD : SS41I_binop_rm_int_y<0x39, "vpminsd",
- int_x86_avx2_pmins_d>, VEX_4V, VEX_L;
- defm VPMINUD : SS41I_binop_rm_int_y<0x3B, "vpminud",
- int_x86_avx2_pminu_d>, VEX_4V, VEX_L;
- defm VPMINUW : SS41I_binop_rm_int_y<0x3A, "vpminuw",
- int_x86_avx2_pminu_w>, VEX_4V, VEX_L;
- defm VPMAXSB : SS41I_binop_rm_int_y<0x3C, "vpmaxsb",
- int_x86_avx2_pmaxs_b>, VEX_4V, VEX_L;
- defm VPMAXSD : SS41I_binop_rm_int_y<0x3D, "vpmaxsd",
- int_x86_avx2_pmaxs_d>, VEX_4V, VEX_L;
- defm VPMAXUD : SS41I_binop_rm_int_y<0x3F, "vpmaxud",
- int_x86_avx2_pmaxu_d>, VEX_4V, VEX_L;
- defm VPMAXUW : SS41I_binop_rm_int_y<0x3E, "vpmaxuw",
- int_x86_avx2_pmaxu_w>, VEX_4V, VEX_L;
- defm VPMULDQ : SS41I_binop_rm_int_y<0x28, "vpmuldq",
- int_x86_avx2_pmul_dq>, VEX_4V, VEX_L;
-}
-
-let Constraints = "$src1 = $dst" in {
- let isCommutable = 0 in
- defm PACKUSDW : SS41I_binop_rm_int<0x2B, "packusdw", int_x86_sse41_packusdw>;
- defm PMINSB : SS41I_binop_rm_int<0x38, "pminsb", int_x86_sse41_pminsb>;
- defm PMINSD : SS41I_binop_rm_int<0x39, "pminsd", int_x86_sse41_pminsd>;
- defm PMINUD : SS41I_binop_rm_int<0x3B, "pminud", int_x86_sse41_pminud>;
- defm PMINUW : SS41I_binop_rm_int<0x3A, "pminuw", int_x86_sse41_pminuw>;
- defm PMAXSB : SS41I_binop_rm_int<0x3C, "pmaxsb", int_x86_sse41_pmaxsb>;
- defm PMAXSD : SS41I_binop_rm_int<0x3D, "pmaxsd", int_x86_sse41_pmaxsd>;
- defm PMAXUD : SS41I_binop_rm_int<0x3F, "pmaxud", int_x86_sse41_pmaxud>;
- defm PMAXUW : SS41I_binop_rm_int<0x3E, "pmaxuw", int_x86_sse41_pmaxuw>;
- defm PMULDQ : SS41I_binop_rm_int<0x28, "pmuldq", int_x86_sse41_pmuldq>;
-}
/// SS48I_binop_rm - Simple SSE41 binary operator.
multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -6705,6 +6492,76 @@ multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
}
let Predicates = [HasAVX] in {
+ let isCommutable = 0 in
+ defm VPACKUSDW : SS41I_binop_rm_int<0x2B, "vpackusdw", int_x86_sse41_packusdw,
+ 0>, VEX_4V;
+ defm VPMINSB : SS48I_binop_rm<0x38, "vpminsb", X86smin, v16i8, VR128,
+ memopv2i64, i128mem, 0>, VEX_4V;
+ defm VPMINSD : SS48I_binop_rm<0x39, "vpminsd", X86smin, v4i32, VR128,
+ memopv2i64, i128mem, 0>, VEX_4V;
+ defm VPMINUD : SS48I_binop_rm<0x3B, "vpminud", X86umin, v4i32, VR128,
+ memopv2i64, i128mem, 0>, VEX_4V;
+ defm VPMINUW : SS48I_binop_rm<0x3A, "vpminuw", X86umin, v8i16, VR128,
+ memopv2i64, i128mem, 0>, VEX_4V;
+ defm VPMAXSB : SS48I_binop_rm<0x3C, "vpmaxsb", X86smax, v16i8, VR128,
+ memopv2i64, i128mem, 0>, VEX_4V;
+ defm VPMAXSD : SS48I_binop_rm<0x3D, "vpmaxsd", X86smax, v4i32, VR128,
+ memopv2i64, i128mem, 0>, VEX_4V;
+ defm VPMAXUD : SS48I_binop_rm<0x3F, "vpmaxud", X86umax, v4i32, VR128,
+ memopv2i64, i128mem, 0>, VEX_4V;
+ defm VPMAXUW : SS48I_binop_rm<0x3E, "vpmaxuw", X86umax, v8i16, VR128,
+ memopv2i64, i128mem, 0>, VEX_4V;
+ defm VPMULDQ : SS41I_binop_rm_int<0x28, "vpmuldq", int_x86_sse41_pmuldq,
+ 0>, VEX_4V;
+}
+
+let Predicates = [HasAVX2] in {
+ let isCommutable = 0 in
+ defm VPACKUSDW : SS41I_binop_rm_int_y<0x2B, "vpackusdw",
+ int_x86_avx2_packusdw>, VEX_4V, VEX_L;
+ defm VPMINSBY : SS48I_binop_rm<0x38, "vpminsb", X86smin, v32i8, VR256,
+ memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+ defm VPMINSDY : SS48I_binop_rm<0x39, "vpminsd", X86smin, v8i32, VR256,
+ memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+ defm VPMINUDY : SS48I_binop_rm<0x3B, "vpminud", X86umin, v8i32, VR256,
+ memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+ defm VPMINUWY : SS48I_binop_rm<0x3A, "vpminuw", X86umin, v16i16, VR256,
+ memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+ defm VPMAXSBY : SS48I_binop_rm<0x3C, "vpmaxsb", X86smax, v32i8, VR256,
+ memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+ defm VPMAXSDY : SS48I_binop_rm<0x3D, "vpmaxsd", X86smax, v8i32, VR256,
+ memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+ defm VPMAXUDY : SS48I_binop_rm<0x3F, "vpmaxud", X86umax, v8i32, VR256,
+ memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+ defm VPMAXUWY : SS48I_binop_rm<0x3E, "vpmaxuw", X86umax, v16i16, VR256,
+ memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
+ defm VPMULDQ : SS41I_binop_rm_int_y<0x28, "vpmuldq",
+ int_x86_avx2_pmul_dq>, VEX_4V, VEX_L;
+}
+
+let Constraints = "$src1 = $dst" in {
+ let isCommutable = 0 in
+ defm PACKUSDW : SS41I_binop_rm_int<0x2B, "packusdw", int_x86_sse41_packusdw>;
+ defm PMINSB : SS48I_binop_rm<0x38, "pminsb", X86smin, v16i8, VR128,
+ memopv2i64, i128mem>;
+ defm PMINSD : SS48I_binop_rm<0x39, "pminsd", X86smin, v4i32, VR128,
+ memopv2i64, i128mem>;
+ defm PMINUD : SS48I_binop_rm<0x3B, "pminud", X86umin, v4i32, VR128,
+ memopv2i64, i128mem>;
+ defm PMINUW : SS48I_binop_rm<0x3A, "pminuw", X86umin, v8i16, VR128,
+ memopv2i64, i128mem>;
+ defm PMAXSB : SS48I_binop_rm<0x3C, "pmaxsb", X86smax, v16i8, VR128,
+ memopv2i64, i128mem>;
+ defm PMAXSD : SS48I_binop_rm<0x3D, "pmaxsd", X86smax, v4i32, VR128,
+ memopv2i64, i128mem>;
+ defm PMAXUD : SS48I_binop_rm<0x3F, "pmaxud", X86umax, v4i32, VR128,
+ memopv2i64, i128mem>;
+ defm PMAXUW : SS48I_binop_rm<0x3E, "pmaxuw", X86umax, v8i16, VR128,
+ memopv2i64, i128mem>;
+ defm PMULDQ : SS41I_binop_rm_int<0x28, "pmuldq", int_x86_sse41_pmuldq>;
+}
+
+let Predicates = [HasAVX] in {
defm VPMULLD : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
memopv2i64, i128mem, 0>, VEX_4V;
defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
diff --git a/lib/Target/X86/X86InstrShiftRotate.td b/lib/Target/X86/X86InstrShiftRotate.td
index 893488c159..1185941d34 100644
--- a/lib/Target/X86/X86InstrShiftRotate.td
+++ b/lib/Target/X86/X86InstrShiftRotate.td
@@ -51,6 +51,7 @@ def SHL64ri : RIi8<0xC1, MRM4r, (outs GR64:$dst),
// NOTE: We don't include patterns for shifts of a register by one, because
// 'add reg,reg' is cheaper (and we have a Pat pattern for shift-by-one).
+let hasSideEffects = 0 in {
def SHL8r1 : I<0xD0, MRM4r, (outs GR8:$dst), (ins GR8:$src1),
"shl{b}\t$dst", [], IIC_SR>;
def SHL16r1 : I<0xD1, MRM4r, (outs GR16:$dst), (ins GR16:$src1),
@@ -59,8 +60,9 @@ def SHL32r1 : I<0xD1, MRM4r, (outs GR32:$dst), (ins GR32:$src1),
"shl{l}\t$dst", [], IIC_SR>;
def SHL64r1 : RI<0xD1, MRM4r, (outs GR64:$dst), (ins GR64:$src1),
"shl{q}\t$dst", [], IIC_SR>;
+} // hasSideEffects = 0
} // isConvertibleToThreeAddress = 1
-} // Constraints = "$src = $dst"
+} // Constraints = "$src = $dst"
// FIXME: Why do we need an explicit "Uses = [CL]" when the instr has a pattern
@@ -333,6 +335,7 @@ def SAR64m1 : RI<0xD1, MRM7m, (outs), (ins i64mem:$dst),
// Rotate instructions
//===----------------------------------------------------------------------===//
+let hasSideEffects = 0 in {
let Constraints = "$src1 = $dst" in {
def RCL8r1 : I<0xD0, MRM2r, (outs GR8:$dst), (ins GR8:$src1),
"rcl{b}\t$dst", [], IIC_SR>;
@@ -455,6 +458,7 @@ def RCR32mCL : I<0xD3, MRM3m, (outs), (ins i32mem:$dst),
def RCR64mCL : RI<0xD3, MRM3m, (outs), (ins i64mem:$dst),
"rcr{q}\t{%cl, $dst|$dst, CL}", [], IIC_SR>;
}
+} // hasSideEffects = 0
let Constraints = "$src1 = $dst" in {
// FIXME: provide shorter instructions when imm8 == 1
diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td
index ea716bfd6b..3caa1b538c 100644
--- a/lib/Target/X86/X86InstrSystem.td
+++ b/lib/Target/X86/X86InstrSystem.td
@@ -352,11 +352,11 @@ def VERWm : I<0x00, MRM5m, (outs), (ins i16mem:$seg),
// Descriptor-table support instructions
def SGDT16m : I<0x01, MRM0m, (outs opaque48mem:$dst), (ins),
- "sgdtw\t$dst", [], IIC_SGDT>, TB, OpSize, Requires<[In32BitMode]>;
+ "sgdt{w}\t$dst", [], IIC_SGDT>, TB, OpSize, Requires<[In32BitMode]>;
def SGDTm : I<0x01, MRM0m, (outs opaque48mem:$dst), (ins),
"sgdt\t$dst", [], IIC_SGDT>, TB;
def SIDT16m : I<0x01, MRM1m, (outs opaque48mem:$dst), (ins),
- "sidtw\t$dst", [], IIC_SIDT>, TB, OpSize, Requires<[In32BitMode]>;
+ "sidt{w}\t$dst", [], IIC_SIDT>, TB, OpSize, Requires<[In32BitMode]>;
def SIDTm : I<0x01, MRM1m, (outs opaque48mem:$dst), (ins),
"sidt\t$dst", []>, TB;
def SLDT16r : I<0x00, MRM0r, (outs GR16:$dst), (ins),
@@ -374,11 +374,11 @@ def SLDT64m : RI<0x00, MRM0m, (outs i16mem:$dst), (ins),
"sldt{q}\t$dst", [], IIC_SLDT>, TB;
def LGDT16m : I<0x01, MRM2m, (outs), (ins opaque48mem:$src),
- "lgdtw\t$src", [], IIC_LGDT>, TB, OpSize, Requires<[In32BitMode]>;
+ "lgdt{w}\t$src", [], IIC_LGDT>, TB, OpSize, Requires<[In32BitMode]>;
def LGDTm : I<0x01, MRM2m, (outs), (ins opaque48mem:$src),
"lgdt\t$src", [], IIC_LGDT>, TB;
def LIDT16m : I<0x01, MRM3m, (outs), (ins opaque48mem:$src),
- "lidtw\t$src", [], IIC_LIDT>, TB, OpSize, Requires<[In32BitMode]>;
+ "lidt{w}\t$src", [], IIC_LIDT>, TB, OpSize, Requires<[In32BitMode]>;
def LIDTm : I<0x01, MRM3m, (outs), (ins opaque48mem:$src),
"lidt\t$src", [], IIC_LIDT>, TB;
def LLDT16r : I<0x00, MRM2r, (outs), (ins GR16:$src),
diff --git a/lib/Target/X86/X86InstrTSX.td b/lib/Target/X86/X86InstrTSX.td
index ad55058ede..a37a8cc744 100644
--- a/lib/Target/X86/X86InstrTSX.td
+++ b/lib/Target/X86/X86InstrTSX.td
@@ -22,7 +22,7 @@ def XBEGIN : I<0, Pseudo, (outs GR32:$dst), (ins),
let isBranch = 1, isTerminator = 1, Defs = [EAX] in
def XBEGIN_4 : Ii32PCRel<0xc7, MRM_F8, (outs), (ins brtarget:$dst),
- "xbegin\t$dst", []>;
+ "xbegin\t$dst", []>, Requires<[HasRTM]>;
def XEND : I<0x01, MRM_D5, (outs), (ins),
"xend", [(int_x86_xend)]>, TB, Requires<[HasRTM]>;
diff --git a/lib/Target/X86/X86JITInfo.cpp b/lib/Target/X86/X86JITInfo.cpp
index 764aa5d4f2..44d8cce054 100644
--- a/lib/Target/X86/X86JITInfo.cpp
+++ b/lib/Target/X86/X86JITInfo.cpp
@@ -16,7 +16,7 @@
#include "X86Relocations.h"
#include "X86Subtarget.h"
#include "X86TargetMachine.h"
-#include "llvm/Function.h"
+#include "llvm/IR/Function.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Valgrind.h"
@@ -79,7 +79,7 @@ static TargetJITInfo::JITCompilerFn JITCompilerFunction;
# define CFI(x)
#endif
-// Provide a wrapper for X86CompilationCallback2 that saves non-traditional
+// Provide a wrapper for LLVMX86CompilationCallback2 that saves non-traditional
// callee saved registers, for the fastcc calling convention.
extern "C" {
#if defined(X86_64_JIT)
@@ -131,12 +131,12 @@ extern "C" {
"subq $32, %rsp\n"
"movq %rbp, %rcx\n" // Pass prev frame and return address
"movq 8(%rbp), %rdx\n"
- "call " ASMPREFIX "X86CompilationCallback2\n"
+ "call " ASMPREFIX "LLVMX86CompilationCallback2\n"
"addq $32, %rsp\n"
#else
"movq %rbp, %rdi\n" // Pass prev frame and return address
"movq 8(%rbp), %rsi\n"
- "call " ASMPREFIX "X86CompilationCallback2\n"
+ "call " ASMPREFIX "LLVMX86CompilationCallback2\n"
#endif
// Restore all XMM arg registers
"movaps 112(%rsp), %xmm7\n"
@@ -213,7 +213,7 @@ extern "C" {
"movl 4(%ebp), %eax\n" // Pass prev frame and return address
"movl %eax, 4(%esp)\n"
"movl %ebp, (%esp)\n"
- "call " ASMPREFIX "X86CompilationCallback2\n"
+ "call " ASMPREFIX "LLVMX86CompilationCallback2\n"
"movl %ebp, %esp\n" // Restore ESP
CFI(".cfi_def_cfa_register %esp\n")
"subl $12, %esp\n"
@@ -269,7 +269,7 @@ extern "C" {
"movl 4(%ebp), %eax\n" // Pass prev frame and return address
"movl %eax, 4(%esp)\n"
"movl %ebp, (%esp)\n"
- "call " ASMPREFIX "X86CompilationCallback2\n"
+ "call " ASMPREFIX "LLVMX86CompilationCallback2\n"
"addl $16, %esp\n"
"movaps 48(%esp), %xmm3\n"
CFI(".cfi_restore %xmm3\n")
@@ -300,10 +300,7 @@ extern "C" {
SIZE(X86CompilationCallback_SSE)
);
# else
- // the following function is called only from this translation unit,
- // unless we are under 64bit Windows with MSC, where there is
- // no support for inline assembly
- static void X86CompilationCallback2(intptr_t *StackPtr, intptr_t RetAddr);
+ void LLVMX86CompilationCallback2(intptr_t *StackPtr, intptr_t RetAddr);
_declspec(naked) void X86CompilationCallback(void) {
__asm {
@@ -317,7 +314,7 @@ extern "C" {
mov eax, dword ptr [ebp+4]
mov dword ptr [esp+4], eax
mov dword ptr [esp], ebp
- call X86CompilationCallback2
+ call LLVMX86CompilationCallback2
mov esp, ebp
sub esp, 12
pop ecx
@@ -337,20 +334,17 @@ extern "C" {
#endif
}
-/// X86CompilationCallback2 - This is the target-specific function invoked by the
+/// This is the target-specific function invoked by the
/// function stub when we did not know the real target of a call. This function
/// must locate the start of the stub or call site and pass it into the JIT
/// compiler function.
extern "C" {
-#if !(defined (X86_64_JIT) && defined(_MSC_VER))
- // the following function is called only from this translation unit,
- // unless we are under 64bit Windows with MSC, where there is
- // no support for inline assembly
-static
-#endif
-void LLVM_ATTRIBUTE_USED
-X86CompilationCallback2(intptr_t *StackPtr, intptr_t RetAddr) {
+LLVM_LIBRARY_VISIBILITY void LLVMX86CompilationCallback2(intptr_t *StackPtr,
+ intptr_t RetAddr) {
intptr_t *RetAddrLoc = &StackPtr[1];
+ // We are reading raw stack data here. Tell MemorySanitizer that it is
+ // sufficiently initialized.
+ __msan_unpoison(RetAddrLoc, sizeof(*RetAddrLoc));
assert(*RetAddrLoc == RetAddr &&
"Could not find return address on the stack!");
@@ -517,7 +511,7 @@ void *X86JITInfo::emitFunctionStub(const Function* F, void *Target,
// This used to use 0xCD, but that value is used by JITMemoryManager to
// initialize the buffer with garbage, which means it may follow a
- // noreturn function call, confusing X86CompilationCallback2. PR 4929.
+ // noreturn function call, confusing LLVMX86CompilationCallback2. PR 4929.
JCE.emitByte(0xCE); // Interrupt - Just a marker identifying the stub!
return Result;
}
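Editor's note: the __msan_unpoison call added to LLVMX86CompilationCallback2 above must still compile when MemorySanitizer is not in use, which implies a guard elsewhere in X86JITInfo.cpp; that hunk is not part of this excerpt. A minimal sketch of the conventional guard, assuming the usual __has_feature idiom (the exact form used by the patch is an assumption):

#if defined(__has_feature)
# if __has_feature(memory_sanitizer)
#  include <sanitizer/msan_interface.h>  // declares __msan_unpoison(ptr, size)
# endif
#endif
#ifndef __has_feature
# define __has_feature(x) 0              // compilers without __has_feature
#endif
#if !__has_feature(memory_sanitizer)
# define __msan_unpoison(ptr, size)      // no-op outside MSan builds
#endif

With a guard of this shape in place, the unconditional call in the callback costs nothing in ordinary builds and silences the false positive only under MSan.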
diff --git a/lib/Target/X86/X86JITInfo.h b/lib/Target/X86/X86JITInfo.h
index 7b0f3434d0..f916327378 100644
--- a/lib/Target/X86/X86JITInfo.h
+++ b/lib/Target/X86/X86JITInfo.h
@@ -15,7 +15,7 @@
#define X86JITINFO_H
#include "llvm/CodeGen/JITCodeEmitter.h"
-#include "llvm/Function.h"
+#include "llvm/IR/Function.h"
#include "llvm/Target/TargetJITInfo.h"
namespace llvm {
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index 07740f16e8..3af1b3e06b 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -17,6 +17,7 @@
#include "X86COFFMachineModuleInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/IR/Type.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
@@ -26,7 +27,6 @@
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Target/Mangler.h"
-#include "llvm/Type.h"
using namespace llvm;
namespace {
@@ -239,7 +239,8 @@ static void lower_lea64_32mem(MCInst *MI, unsigned OpNo) {
if (!MI->getOperand(OpNo+i).isReg()) continue;
unsigned Reg = MI->getOperand(OpNo+i).getReg();
- if (Reg == 0) continue;
+ // LEAs can use RIP-relative addressing, and RIP has no sub/super register.
+ if (Reg == 0 || Reg == X86::RIP) continue;
MI->getOperand(OpNo+i).setReg(getX86SubSuperRegister(Reg, MVT::i64));
}
diff --git a/lib/Target/X86/X86PadShortFunction.cpp b/lib/Target/X86/X86PadShortFunction.cpp
new file mode 100644
index 0000000000..83e75ea994
--- /dev/null
+++ b/lib/Target/X86/X86PadShortFunction.cpp
@@ -0,0 +1,212 @@
+//===-------- X86PadShortFunction.cpp - pad short functions -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass which will pad short functions to prevent
+// a stall if a function returns before the return address is ready. This
+// is needed for some Intel Atom processors.
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+
+#define DEBUG_TYPE "x86-pad-short-functions"
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+STATISTIC(NumBBsPadded, "Number of basic blocks padded");
+
+namespace {
+ struct VisitedBBInfo {
+ // HasReturn - Whether the BB contains a return instruction
+ bool HasReturn;
+
+ // Cycles - Number of cycles until return if HasReturn is true, otherwise
+ // number of cycles until end of the BB
+ unsigned int Cycles;
+
+ VisitedBBInfo() : HasReturn(false), Cycles(0) {}
+ VisitedBBInfo(bool HasReturn, unsigned int Cycles)
+ : HasReturn(HasReturn), Cycles(Cycles) {}
+ };
+
+ struct PadShortFunc : public MachineFunctionPass {
+ static char ID;
+ PadShortFunc() : MachineFunctionPass(ID)
+ , Threshold(4), TM(0), TII(0) {}
+
+ virtual bool runOnMachineFunction(MachineFunction &MF);
+
+ virtual const char *getPassName() const {
+ return "X86 Atom pad short functions";
+ }
+
+ private:
+ void findReturns(MachineBasicBlock *MBB,
+ unsigned int Cycles = 0);
+
+ bool cyclesUntilReturn(MachineBasicBlock *MBB,
+ unsigned int &Cycles);
+
+ void addPadding(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator &MBBI,
+ unsigned int NOOPsToAdd);
+
+ const unsigned int Threshold;
+
+ // ReturnBBs - Maps basic blocks that return to the minimum number of
+ // cycles until the return, starting from the entry block.
+ DenseMap<MachineBasicBlock*, unsigned int> ReturnBBs;
+
+ // VisitedBBs - Cache of previously visited BBs.
+ DenseMap<MachineBasicBlock*, VisitedBBInfo> VisitedBBs;
+
+ const TargetMachine *TM;
+ const TargetInstrInfo *TII;
+ };
+
+ char PadShortFunc::ID = 0;
+}
+
+FunctionPass *llvm::createX86PadShortFunctions() {
+ return new PadShortFunc();
+}
+
+/// runOnMachineFunction - Loop over all of the basic blocks, inserting
+/// NOOP instructions before early exits.
+bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) {
+ const AttributeSet &FnAttrs = MF.getFunction()->getAttributes();
+ if (FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
+ Attribute::OptimizeForSize) ||
+ FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
+ Attribute::MinSize)) {
+ return false;
+ }
+
+ TM = &MF.getTarget();
+ TII = TM->getInstrInfo();
+
+ // Search through basic blocks and mark the ones that have early returns
+ ReturnBBs.clear();
+ VisitedBBs.clear();
+ findReturns(MF.begin());
+
+ bool MadeChange = false;
+
+ MachineBasicBlock *MBB;
+ unsigned int Cycles = 0;
+
+ // Pad the identified basic blocks with NOOPs
+ for (DenseMap<MachineBasicBlock*, unsigned int>::iterator I = ReturnBBs.begin();
+ I != ReturnBBs.end(); ++I) {
+ MBB = I->first;
+ Cycles = I->second;
+
+ if (Cycles < Threshold) {
+ // BB ends in a return. Skip over any DBG_VALUE instructions
+ // trailing the terminator.
+ assert(MBB->size() > 0 &&
+ "Basic block should contain at least a RET but is empty");
+ MachineBasicBlock::iterator ReturnLoc = --MBB->end();
+
+ while (ReturnLoc->isDebugValue())
+ --ReturnLoc;
+ assert(ReturnLoc->isReturn() && !ReturnLoc->isCall() &&
+ "Basic block does not end with RET");
+
+ addPadding(MBB, ReturnLoc, Threshold - Cycles);
+ NumBBsPadded++;
+ MadeChange = true;
+ }
+ }
+
+ return MadeChange;
+}
+
+/// findReturns - Starting at MBB, follow control flow and add all
+/// basic blocks that contain a return to ReturnBBs.
+void PadShortFunc::findReturns(MachineBasicBlock *MBB, unsigned int Cycles) {
+ // If this BB has a return, note how many cycles it takes to get there.
+ bool hasReturn = cyclesUntilReturn(MBB, Cycles);
+ if (Cycles >= Threshold)
+ return;
+
+ if (hasReturn) {
+ ReturnBBs[MBB] = std::max(ReturnBBs[MBB], Cycles);
+ return;
+ }
+
+ // Follow branches in BB and look for returns
+ for (MachineBasicBlock::succ_iterator I = MBB->succ_begin();
+ I != MBB->succ_end(); ++I) {
+ if (*I == MBB)
+ continue;
+ findReturns(*I, Cycles);
+ }
+}
+
+/// cyclesUntilReturn - return true if the MBB has a return instruction,
+/// and return false otherwise.
+/// Cycles will be incremented by the number of cycles taken to reach the
+/// return or the end of the BB, whichever occurs first.
+bool PadShortFunc::cyclesUntilReturn(MachineBasicBlock *MBB,
+ unsigned int &Cycles) {
+ // Return cached result if BB was previously visited
+ DenseMap<MachineBasicBlock*, VisitedBBInfo>::iterator it
+ = VisitedBBs.find(MBB);
+ if (it != VisitedBBs.end()) {
+ VisitedBBInfo BBInfo = it->second;
+ Cycles += BBInfo.Cycles;
+ return BBInfo.HasReturn;
+ }
+
+ unsigned int CyclesToEnd = 0;
+
+ for (MachineBasicBlock::iterator MBBI = MBB->begin();
+ MBBI != MBB->end(); ++MBBI) {
+ MachineInstr *MI = MBBI;
+ // Mark basic blocks with a return instruction. Calls to other
+ // functions do not count because the called function will be padded,
+ // if necessary.
+ if (MI->isReturn() && !MI->isCall()) {
+ VisitedBBs[MBB] = VisitedBBInfo(true, CyclesToEnd);
+ Cycles += CyclesToEnd;
+ return true;
+ }
+
+ CyclesToEnd += TII->getInstrLatency(TM->getInstrItineraryData(), MI);
+ }
+
+ VisitedBBs[MBB] = VisitedBBInfo(false, CyclesToEnd);
+ Cycles += CyclesToEnd;
+ return false;
+}
+
+/// addPadding - Add the given number of NOOP instructions to the function
+/// just prior to the return at MBBI.
+void PadShortFunc::addPadding(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator &MBBI,
+ unsigned int NOOPsToAdd) {
+ DebugLoc DL = MBBI->getDebugLoc();
+
+ while (NOOPsToAdd-- > 0) {
+ BuildMI(*MBB, MBBI, DL, TII->get(X86::NOOP));
+ BuildMI(*MBB, MBBI, DL, TII->get(X86::NOOP));
+ }
+}
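Editor's note: the core of the new pass is findReturns (a depth-first walk over the CFG that records blocks whose return is reachable in fewer than Threshold cycles) combined with cyclesUntilReturn (per-block latency counting). Below is a small, self-contained sketch of that control flow over a toy acyclic CFG, written against plain STL types rather than MachineFunction; block latencies and the threshold of 4 are illustrative assumptions, and the real pass additionally memoizes visited blocks and skips trailing DBG_VALUEs.

#include <algorithm>
#include <cstdio>
#include <map>
#include <vector>

// Toy stand-in for a MachineBasicBlock: a latency in cycles, a flag saying
// whether the block ends in a return, and successor indices.
struct ToyBB {
  unsigned Cycles;
  bool HasReturn;
  std::vector<int> Succs;
};

static const unsigned Threshold = 4;  // cycles Atom wants before a RET retires

// Walk the CFG from BB, accumulating cycles, and record how far from function
// entry each under-threshold return sits -- the same shape as findReturns.
static void findReturns(const std::vector<ToyBB> &CFG, int BB, unsigned Cycles,
                        std::map<int, unsigned> &ReturnBBs) {
  Cycles += CFG[BB].Cycles;             // cyclesUntilReturn, collapsed to one number
  if (Cycles >= Threshold)
    return;                             // already slow enough, no padding needed
  if (CFG[BB].HasReturn) {
    ReturnBBs[BB] = std::max(ReturnBBs[BB], Cycles);  // keep the slowest path seen
    return;
  }
  for (size_t i = 0; i < CFG[BB].Succs.size(); ++i)
    findReturns(CFG, CFG[BB].Succs[i], Cycles, ReturnBBs);
}

int main() {
  // Entry (1 cycle) branches to an early exit (1 cycle, returns) and to a
  // longer block (6 cycles, returns). Only the early exit gets padded.
  std::vector<ToyBB> CFG(3);
  CFG[0].Cycles = 1; CFG[0].HasReturn = false;
  CFG[0].Succs.push_back(1); CFG[0].Succs.push_back(2);
  CFG[1].Cycles = 1; CFG[1].HasReturn = true;
  CFG[2].Cycles = 6; CFG[2].HasReturn = true;

  std::map<int, unsigned> ReturnBBs;
  findReturns(CFG, 0, 0, ReturnBBs);
  for (std::map<int, unsigned>::iterator I = ReturnBBs.begin();
       I != ReturnBBs.end(); ++I)
    std::printf("pad BB#%d with %u NOOPs\n", I->first, Threshold - I->second);
  return 0;   // prints: pad BB#1 with 2 NOOPs
}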
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index b396a5ca81..16886e432d 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -28,8 +28,9 @@
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
-#include "llvm/Constants.h"
-#include "llvm/Function.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
@@ -37,7 +38,6 @@
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
-#include "llvm/Type.h"
#define GET_REGINFO_TARGET_DESC
#include "X86GenRegisterInfo.inc"
@@ -50,16 +50,18 @@ ForceStackAlign("force-align-stack",
" needed for the function."),
cl::init(false), cl::Hidden);
-cl::opt<bool>
+static cl::opt<bool>
EnableBasePointer("x86-use-base-pointer", cl::Hidden, cl::init(true),
cl::desc("Enable use of a base pointer for complex stack frames"));
X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm,
const TargetInstrInfo &tii)
- : X86GenRegisterInfo(tm.getSubtarget<X86Subtarget>().is64Bit()
- ? X86::RIP : X86::EIP,
+ : X86GenRegisterInfo((tm.getSubtarget<X86Subtarget>().is64Bit()
+ ? X86::RIP : X86::EIP),
X86_MC::getDwarfRegFlavour(tm.getTargetTriple(), false),
- X86_MC::getDwarfRegFlavour(tm.getTargetTriple(), true)),
+ X86_MC::getDwarfRegFlavour(tm.getTargetTriple(), true),
+ (tm.getSubtarget<X86Subtarget>().is64Bit()
+ ? X86::RIP : X86::EIP)),
TM(tm), TII(tii) {
X86_MC::InitLLVM2SEHRegisterMapping(this);
@@ -175,20 +177,21 @@ X86RegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC) const{
const TargetRegisterClass *
X86RegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind)
const {
+ const X86Subtarget &Subtarget = TM.getSubtarget<X86Subtarget>();
switch (Kind) {
default: llvm_unreachable("Unexpected Kind in getPointerRegClass!");
case 0: // Normal GPRs.
- if (TM.getSubtarget<X86Subtarget>().is64Bit())
+ if (Subtarget.isTarget64BitLP64())
return &X86::GR64RegClass;
return &X86::GR32RegClass;
case 1: // Normal GPRs except the stack pointer (for encoding reasons).
- if (TM.getSubtarget<X86Subtarget>().is64Bit())
+ if (Subtarget.isTarget64BitLP64())
return &X86::GR64_NOSPRegClass;
return &X86::GR32_NOSPRegClass;
case 2: // Available for tailcall (not callee-saved GPRs).
- if (TM.getSubtarget<X86Subtarget>().isTargetWin64())
+ if (Subtarget.isTargetWin64())
return &X86::GR64_TCW64RegClass;
- if (TM.getSubtarget<X86Subtarget>().is64Bit())
+ else if (Subtarget.is64Bit())
return &X86::GR64_TCRegClass;
const Function *F = MF.getFunction();
@@ -232,38 +235,40 @@ X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
const uint16_t *
X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
- bool callsEHReturn = false;
- bool ghcCall = false;
- bool oclBiCall = false;
- bool hipeCall = false;
- bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
-
- if (MF) {
- callsEHReturn = MF->getMMI().callsEHReturn();
- const Function *F = MF->getFunction();
- ghcCall = (F ? F->getCallingConv() == CallingConv::GHC : false);
- oclBiCall = (F ? F->getCallingConv() == CallingConv::Intel_OCL_BI : false);
- hipeCall = (F ? F->getCallingConv() == CallingConv::HiPE : false);
- }
-
- if (ghcCall || hipeCall)
+ switch (MF->getFunction()->getCallingConv()) {
+ case CallingConv::GHC:
+ case CallingConv::HiPE:
return CSR_NoRegs_SaveList;
- if (oclBiCall) {
+
+ case CallingConv::Intel_OCL_BI: {
+ bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
if (HasAVX && IsWin64)
- return CSR_Win64_Intel_OCL_BI_AVX_SaveList;
+ return CSR_Win64_Intel_OCL_BI_AVX_SaveList;
if (HasAVX && Is64Bit)
- return CSR_64_Intel_OCL_BI_AVX_SaveList;
+ return CSR_64_Intel_OCL_BI_AVX_SaveList;
if (!HasAVX && !IsWin64 && Is64Bit)
- return CSR_64_Intel_OCL_BI_SaveList;
+ return CSR_64_Intel_OCL_BI_SaveList;
+ break;
}
+
+ case CallingConv::Cold:
+ if (Is64Bit)
+ return CSR_MostRegs_64_SaveList;
+ break;
+
+ default:
+ break;
+ }
+
+ bool CallsEHReturn = MF->getMMI().callsEHReturn();
if (Is64Bit) {
if (IsWin64)
return CSR_Win64_SaveList;
- if (callsEHReturn)
+ if (CallsEHReturn)
return CSR_64EHRet_SaveList;
return CSR_64_SaveList;
}
- if (callsEHReturn)
+ if (CallsEHReturn)
return CSR_32EHRet_SaveList;
return CSR_32_SaveList;
}
@@ -284,6 +289,8 @@ X86RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const {
return CSR_NoRegs_RegMask;
if (!Is64Bit)
return CSR_32_RegMask;
+ if (CC == CallingConv::Cold)
+ return CSR_MostRegs_64_RegMask;
if (IsWin64)
return CSR_Win64_RegMask;
return CSR_64_RegMask;
@@ -387,7 +394,13 @@ bool X86RegisterInfo::hasBasePointer(const MachineFunction &MF) const {
// When we need stack realignment and there are dynamic allocas, we can't
// reference off of the stack pointer, so we reserve a base pointer.
- if (needsStackRealignment(MF) && MFI->hasVarSizedObjects())
+ //
+ // This is also true if the function contain MS-style inline assembly. We
+ // do this because if any stack changes occur in the inline assembly, e.g.,
+ // "pusha", then any C local variable or C argument references in the
+ // inline assembly will be wrong because the SP is not properly tracked.
+ if ((needsStackRealignment(MF) && MFI->hasVarSizedObjects()) ||
+ MF.hasMSInlineAsm())
return true;
return false;
@@ -417,7 +430,8 @@ bool X86RegisterInfo::needsStackRealignment(const MachineFunction &MF) const {
unsigned StackAlign = TM.getFrameLowering()->getStackAlignment();
bool requiresRealignment =
((MFI->getMaxAlignment() > StackAlign) ||
- F->getFnAttributes().hasAttribute(Attributes::StackAlignment));
+ F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+ Attribute::StackAlignment));
// If we've requested that we force align the stack do so now.
if (ForceStackAlign)
@@ -437,123 +451,16 @@ bool X86RegisterInfo::hasReservedSpillSlot(const MachineFunction &MF,
return false;
}
-static unsigned getSUBriOpcode(unsigned is64Bit, int64_t Imm) {
- if (is64Bit) {
- if (isInt<8>(Imm))
- return X86::SUB64ri8;
- return X86::SUB64ri32;
- } else {
- if (isInt<8>(Imm))
- return X86::SUB32ri8;
- return X86::SUB32ri;
- }
-}
-
-static unsigned getADDriOpcode(unsigned is64Bit, int64_t Imm) {
- if (is64Bit) {
- if (isInt<8>(Imm))
- return X86::ADD64ri8;
- return X86::ADD64ri32;
- } else {
- if (isInt<8>(Imm))
- return X86::ADD32ri8;
- return X86::ADD32ri;
- }
-}
-
-void X86RegisterInfo::
-eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I) const {
- const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
- bool reseveCallFrame = TFI->hasReservedCallFrame(MF);
- int Opcode = I->getOpcode();
- bool isDestroy = Opcode == TII.getCallFrameDestroyOpcode();
- DebugLoc DL = I->getDebugLoc();
- uint64_t Amount = !reseveCallFrame ? I->getOperand(0).getImm() : 0;
- uint64_t CalleeAmt = isDestroy ? I->getOperand(1).getImm() : 0;
- I = MBB.erase(I);
-
- if (!reseveCallFrame) {
- // If the stack pointer can be changed after prologue, turn the
- // adjcallstackup instruction into a 'sub ESP, <amt>' and the
- // adjcallstackdown instruction into 'add ESP, <amt>'
- // TODO: consider using push / pop instead of sub + store / add
- if (Amount == 0)
- return;
-
- // We need to keep the stack aligned properly. To do this, we round the
- // amount of space needed for the outgoing arguments up to the next
- // alignment boundary.
- unsigned StackAlign = TM.getFrameLowering()->getStackAlignment();
- Amount = (Amount + StackAlign - 1) / StackAlign * StackAlign;
-
- MachineInstr *New = 0;
- if (Opcode == TII.getCallFrameSetupOpcode()) {
- New = BuildMI(MF, DL, TII.get(getSUBriOpcode(Is64Bit, Amount)),
- StackPtr)
- .addReg(StackPtr)
- .addImm(Amount);
- } else {
- assert(Opcode == TII.getCallFrameDestroyOpcode());
-
- // Factor out the amount the callee already popped.
- Amount -= CalleeAmt;
-
- if (Amount) {
- unsigned Opc = getADDriOpcode(Is64Bit, Amount);
- New = BuildMI(MF, DL, TII.get(Opc), StackPtr)
- .addReg(StackPtr).addImm(Amount);
- }
- }
-
- if (New) {
- // The EFLAGS implicit def is dead.
- New->getOperand(3).setIsDead();
-
- // Replace the pseudo instruction with a new instruction.
- MBB.insert(I, New);
- }
-
- return;
- }
-
- if (Opcode == TII.getCallFrameDestroyOpcode() && CalleeAmt) {
- // If we are performing frame pointer elimination and if the callee pops
- // something off the stack pointer, add it back. We do this until we have
- // more advanced stack pointer tracking ability.
- unsigned Opc = getSUBriOpcode(Is64Bit, CalleeAmt);
- MachineInstr *New = BuildMI(MF, DL, TII.get(Opc), StackPtr)
- .addReg(StackPtr).addImm(CalleeAmt);
-
- // The EFLAGS implicit def is dead.
- New->getOperand(3).setIsDead();
-
- // We are not tracking the stack pointer adjustment by the callee, so make
- // sure we restore the stack pointer immediately after the call, there may
- // be spill code inserted between the CALL and ADJCALLSTACKUP instructions.
- MachineBasicBlock::iterator B = MBB.begin();
- while (I != B && !llvm::prior(I)->isCall())
- --I;
- MBB.insert(I, New);
- }
-}
-
void
X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
- int SPAdj, RegScavenger *RS) const {
+ int SPAdj, unsigned FIOperandNum,
+ RegScavenger *RS) const {
assert(SPAdj == 0 && "Unexpected");
- unsigned i = 0;
MachineInstr &MI = *II;
MachineFunction &MF = *MI.getParent()->getParent();
const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
-
- while (!MI.getOperand(i).isFI()) {
- ++i;
- assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
- }
-
- int FrameIndex = MI.getOperand(i).getIndex();
+ int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
unsigned BasePtr;
unsigned Opc = MI.getOpcode();
@@ -569,7 +476,7 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// This must be part of a four operand memory reference. Replace the
// FrameIndex with base register with EBP. Add an offset to the offset.
- MI.getOperand(i).ChangeToRegister(BasePtr, false);
+ MI.getOperand(FIOperandNum).ChangeToRegister(BasePtr, false);
// Now add the frame object offset to the offset from EBP.
int FIOffset;
@@ -580,17 +487,18 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
} else
FIOffset = TFI->getFrameIndexOffset(MF, FrameIndex);
- if (MI.getOperand(i+3).isImm()) {
+ if (MI.getOperand(FIOperandNum+3).isImm()) {
// Offset is a 32-bit integer.
- int Imm = (int)(MI.getOperand(i + 3).getImm());
+ int Imm = (int)(MI.getOperand(FIOperandNum + 3).getImm());
int Offset = FIOffset + Imm;
assert((!Is64Bit || isInt<32>((long long)FIOffset + Imm)) &&
"Requesting 64-bit offset in 32-bit immediate!");
- MI.getOperand(i + 3).ChangeToImmediate(Offset);
+ MI.getOperand(FIOperandNum + 3).ChangeToImmediate(Offset);
} else {
// Offset is symbolic. This is extremely rare.
- uint64_t Offset = FIOffset + (uint64_t)MI.getOperand(i+3).getOffset();
- MI.getOperand(i+3).setOffset(Offset);
+ uint64_t Offset = FIOffset +
+ (uint64_t)MI.getOperand(FIOperandNum+3).getOffset();
+ MI.getOperand(FIOperandNum + 3).setOffset(Offset);
}
}
@@ -615,7 +523,15 @@ unsigned getX86SubSuperRegister(unsigned Reg, MVT::SimpleValueType VT,
case MVT::i8:
if (High) {
switch (Reg) {
- default: return getX86SubSuperRegister(Reg, MVT::i64, High);
+ default: return getX86SubSuperRegister(Reg, MVT::i64);
+ case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
+ return X86::SI;
+ case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
+ return X86::DI;
+ case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
+ return X86::BP;
+ case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
+ return X86::SP;
case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
return X86::AH;
case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX:
@@ -735,22 +651,6 @@ unsigned getX86SubSuperRegister(unsigned Reg, MVT::SimpleValueType VT,
return X86::R15D;
}
case MVT::i64:
- // For 64-bit mode if we've requested a "high" register and the
- // Q or r constraints we want one of these high registers or
- // just the register name otherwise.
- if (High) {
- switch (Reg) {
- case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI:
- return X86::SI;
- case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI:
- return X86::DI;
- case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP:
- return X86::BP;
- case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP:
- return X86::SP;
- // Fallthrough.
- }
- }
switch (Reg) {
default: llvm_unreachable("Unexpected register");
case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX:
diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h
index 7932ede8dd..b9d7b8cf8b 100644
--- a/lib/Target/X86/X86RegisterInfo.h
+++ b/lib/Target/X86/X86RegisterInfo.h
@@ -117,12 +117,9 @@ public:
bool hasReservedSpillSlot(const MachineFunction &MF, unsigned Reg,
int &FrameIdx) const;
- void eliminateCallFramePseudoInstr(MachineFunction &MF,
- MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI) const;
-
void eliminateFrameIndex(MachineBasicBlock::iterator MI,
- int SPAdj, RegScavenger *RS = NULL) const;
+ int SPAdj, unsigned FIOperandNum,
+ RegScavenger *RS = NULL) const;
// Debug information queries.
unsigned getFrameRegister(const MachineFunction &MF) const;
diff --git a/lib/Target/X86/X86Schedule.td b/lib/Target/X86/X86Schedule.td
index c14407f9ac..d99d085298 100644
--- a/lib/Target/X86/X86Schedule.td
+++ b/lib/Target/X86/X86Schedule.td
@@ -470,12 +470,17 @@ def IIC_NOP : InstrItinClass;
// latencies. Since these latencies are not used for pipeline hazards,
// they do not need to be exact.
//
+// ILPWindow=10 is an arbitrary threshold that approximates cycles of
+// latency hidden by instruction buffers. The actual value is not very
+// important but should be zero for inorder and nonzero for OOO processors.
+//
// The GenericModel contains no instruction itineraries.
def GenericModel : SchedMachineModel {
let IssueWidth = 4;
let MinLatency = 0;
let LoadLatency = 4;
let HighLatency = 10;
+ let ILPWindow = 10;
}
include "X86ScheduleAtom.td"
diff --git a/lib/Target/X86/X86ScheduleAtom.td b/lib/Target/X86/X86ScheduleAtom.td
index 87102614cc..1e5f2d6c9a 100644
--- a/lib/Target/X86/X86ScheduleAtom.td
+++ b/lib/Target/X86/X86ScheduleAtom.td
@@ -525,6 +525,7 @@ def AtomModel : SchedMachineModel {
// OperandCycles may be used for expected latency.
let LoadLatency = 3; // Expected cycles, may be overridden by OperandCycles.
let HighLatency = 30; // Expected, may be overridden by OperandCycles.
+ let ILPWindow = 0; // Always try to hide expected latency.
let Itineraries = AtomItineraries;
}
diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp
index af42b7c59e..f934fdd859 100644
--- a/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ b/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -14,7 +14,7 @@
#define DEBUG_TYPE "x86-selectiondag-info"
#include "X86TargetMachine.h"
#include "llvm/CodeGen/SelectionDAG.h"
-#include "llvm/DerivedTypes.h"
+#include "llvm/IR/DerivedTypes.h"
using namespace llvm;
X86SelectionDAGInfo::X86SelectionDAGInfo(const X86TargetMachine &TM) :
@@ -202,6 +202,14 @@ X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl,
SrcPtrInfo.getAddrSpace() >= 256)
return SDValue();
+  // ESI might be used as a base pointer; in that case we can't simply overwrite
+ // the register. Fall back to generic code.
+ const X86RegisterInfo *TRI =
+ static_cast<const X86RegisterInfo *>(DAG.getTarget().getRegisterInfo());
+ if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
+ TRI->getBaseRegister() == X86::ESI)
+ return SDValue();
+
MVT AVT;
if (Align & 1)
AVT = MVT::i8;
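A side note on the ESI check added above: the fast memcpy lowering can emit x86 string instructions, which implicitly use and advance ESI/RSI as the source pointer. A rough illustration with GNU inline assembly (illustrative only, not the code the backend emits):

#include <cstddef>

// rep movsb copies RCX/ECX bytes from [RSI/ESI] to [RDI/EDI], updating all
// three registers as it runs -- so ESI cannot double as a base pointer here.
static void repMovsb(void *dst, const void *src, std::size_t n) {
  asm volatile("rep movsb"
               : "+D"(dst), "+S"(src), "+c"(n)
               :
               : "memory");
}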
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index b4d554a64e..0f2c008ab9 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -14,7 +14,9 @@
#define DEBUG_TYPE "subtarget"
#include "X86Subtarget.h"
#include "X86InstrInfo.h"
-#include "llvm/GlobalValue.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Host.h"
@@ -155,6 +157,12 @@ const char *X86Subtarget::getBZeroEntry() const {
return 0;
}
+bool X86Subtarget::hasSinCos() const {
+ return getTargetTriple().isMacOSX() &&
+ !getTargetTriple().isMacOSXVersionLT(10, 9) &&
+ is64Bit();
+}
+
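For context on hasSinCos(): on the OS releases it tests for, the math library is expected to provide a combined entry point so that the sine and cosine of one argument cost a single call. A sketch of what a caller gains (the __sincos symbol and its signature are an assumption about the platform libm, not something this patch defines):

// Assumed combined entry point in the platform math library.
extern "C" void __sincos(double x, double *sinResult, double *cosResult);

static void polarToCartesian(double r, double theta, double &x, double &y) {
  double s, c;
  __sincos(theta, &s, &c);  // one library call instead of separate sin() + cos()
  x = r * c;
  y = r * s;
}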
/// IsLegalToCallImmediateAddr - Return true if the subtarget allows calls
/// to immediate address.
bool X86Subtarget::IsLegalToCallImmediateAddr(const TargetMachine &TM) const {
@@ -318,44 +326,23 @@ void X86Subtarget::AutoDetectSubtargetFeatures() {
}
}
-X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU,
- const std::string &FS,
- unsigned StackAlignOverride, bool is64Bit)
- : X86GenSubtargetInfo(TT, CPU, FS)
- , X86ProcFamily(Others)
- , PICStyle(PICStyles::None)
- , X86SSELevel(NoMMXSSE)
- , X863DNowLevel(NoThreeDNow)
- , HasCMov(false)
- , HasX86_64(false)
- , HasPOPCNT(false)
- , HasSSE4A(false)
- , HasAES(false)
- , HasPCLMUL(false)
- , HasFMA(false)
- , HasFMA4(false)
- , HasXOP(false)
- , HasMOVBE(false)
- , HasRDRAND(false)
- , HasF16C(false)
- , HasFSGSBase(false)
- , HasLZCNT(false)
- , HasBMI(false)
- , HasBMI2(false)
- , HasRTM(false)
- , IsBTMemSlow(false)
- , IsUAMemFast(false)
- , HasVectorUAMem(false)
- , HasCmpxchg16b(false)
- , UseLeaForSP(false)
- , HasSlowDivide(false)
- , PostRAScheduler(false)
- , stackAlignment(4)
- // FIXME: this is a known good value for Yonah. How about others?
- , MaxInlineSizeThreshold(128)
- , TargetTriple(TT)
- , In64BitMode(is64Bit) {
- // Determine default and user specified characteristics
+void X86Subtarget::resetSubtargetFeatures(const MachineFunction *MF) {
+ AttributeSet FnAttrs = MF->getFunction()->getAttributes();
+ Attribute CPUAttr = FnAttrs.getAttribute(AttributeSet::FunctionIndex,
+ "target-cpu");
+ Attribute FSAttr = FnAttrs.getAttribute(AttributeSet::FunctionIndex,
+ "target-features");
+ std::string CPU =
+    !CPUAttr.hasAttribute(Attribute::None) ? CPUAttr.getValueAsString() : "";
+ std::string FS =
+ !FSAttr.hasAttribute(Attribute::None) ? FSAttr.getValueAsString() : "";
+ if (!FS.empty()) {
+ initializeEnvironment();
+ resetSubtargetFeatures(CPU, FS);
+ }
+}
+
+void X86Subtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) {
std::string CPUName = CPU;
if (!FS.empty() || !CPU.empty()) {
if (CPUName.empty()) {
@@ -432,6 +419,53 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU,
stackAlignment = 16;
}
+void X86Subtarget::initializeEnvironment() {
+ X86SSELevel = NoMMXSSE;
+ X863DNowLevel = NoThreeDNow;
+ HasCMov = false;
+ HasX86_64 = false;
+ HasPOPCNT = false;
+ HasSSE4A = false;
+ HasAES = false;
+ HasPCLMUL = false;
+ HasFMA = false;
+ HasFMA4 = false;
+ HasXOP = false;
+ HasMOVBE = false;
+ HasRDRAND = false;
+ HasF16C = false;
+ HasFSGSBase = false;
+ HasLZCNT = false;
+ HasBMI = false;
+ HasBMI2 = false;
+ HasRTM = false;
+ HasADX = false;
+ IsBTMemSlow = false;
+ IsUAMemFast = false;
+ HasVectorUAMem = false;
+ HasCmpxchg16b = false;
+ UseLeaForSP = false;
+ HasSlowDivide = false;
+ PostRAScheduler = false;
+ PadShortFunctions = false;
+ stackAlignment = 4;
+ // FIXME: this is a known good value for Yonah. How about others?
+ MaxInlineSizeThreshold = 128;
+}
+
+X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU,
+ const std::string &FS,
+ unsigned StackAlignOverride, bool is64Bit)
+ : X86GenSubtargetInfo(TT, CPU, FS)
+ , X86ProcFamily(Others)
+ , PICStyle(PICStyles::None)
+ , TargetTriple(TT)
+ , StackAlignOverride(StackAlignOverride)
+ , In64BitMode(is64Bit) {
+ initializeEnvironment();
+ resetSubtargetFeatures(CPU, FS);
+}
+
bool X86Subtarget::enablePostRAScheduler(
CodeGenOpt::Level OptLevel,
TargetSubtargetInfo::AntiDepBreakMode& Mode,
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index 387edc5698..e97da4b6f4 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -15,7 +15,7 @@
#define X86SUBTARGET_H
#include "llvm/ADT/Triple.h"
-#include "llvm/CallingConv.h"
+#include "llvm/IR/CallingConv.h"
#include "llvm/Target/TargetSubtargetInfo.h"
#include <string>
@@ -121,6 +121,9 @@ protected:
/// HasRTM - Processor has RTM instructions.
bool HasRTM;
+ /// HasADX - Processor has ADX instructions.
+ bool HasADX;
+
/// IsBTMemSlow - True if BT (bit test) of memory instructions are slow.
bool IsBTMemSlow;
@@ -146,6 +149,10 @@ protected:
/// PostRAScheduler - True if using post-register-allocation scheduler.
bool PostRAScheduler;
+  /// PadShortFunctions - True if short functions should be padded to prevent
+  /// a stall when returning too early.
+ bool PadShortFunctions;
+
/// stackAlignment - The minimum alignment known to hold of the stack frame on
/// entry to the function and which must be maintained by every function.
unsigned stackAlignment;
@@ -161,11 +168,13 @@ protected:
InstrItineraryData InstrItins;
private:
+ /// StackAlignOverride - Override the stack alignment.
+ unsigned StackAlignOverride;
+
/// In64BitMode - True if compiling for 64-bit, false for 32-bit.
bool In64BitMode;
public:
-
/// This constructor initializes the data members to match that
/// of the specified triple.
///
@@ -190,7 +199,26 @@ public:
/// instruction.
void AutoDetectSubtargetFeatures();
- bool is64Bit() const { return In64BitMode; }
+ /// \brief Reset the features for the X86 target.
+ virtual void resetSubtargetFeatures(const MachineFunction *MF);
+private:
+ void initializeEnvironment();
+ void resetSubtargetFeatures(StringRef CPU, StringRef FS);
+public:
+ /// Is this x86_64? (disregarding specific ABI / programming model)
+ bool is64Bit() const {
+ return In64BitMode;
+ }
+
+ /// Is this x86_64 with the ILP32 programming model (x32 ABI)?
+ bool isTarget64BitILP32() const {
+ return In64BitMode && (TargetTriple.getEnvironment() == Triple::GNUX32);
+ }
+
+ /// Is this x86_64 with the LP64 programming model (standard AMD64, no x32)?
+ bool isTarget64BitLP64() const {
+ return In64BitMode && (TargetTriple.getEnvironment() != Triple::GNUX32);
+ }
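The two programming models distinguished above differ in pointer and long width; a small sketch of what user code observes (assuming a compiler invoked with -mx32 for the first case and -m64 for the second):

#include <cstdio>

int main() {
  // Under the x32 ABI (ILP32) this prints "ptr=4 long=4 int=4";
  // under the usual LP64 model it prints "ptr=8 long=8 int=4".
  std::printf("ptr=%zu long=%zu int=%zu\n",
              sizeof(void *), sizeof(long), sizeof(int));
  return 0;
}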
PICStyles::Style getPICStyle() const { return PICStyle; }
void setPICStyle(PICStyles::Style Style) { PICStyle = Style; }
@@ -225,12 +253,14 @@ public:
bool hasBMI() const { return HasBMI; }
bool hasBMI2() const { return HasBMI2; }
bool hasRTM() const { return HasRTM; }
+ bool hasADX() const { return HasADX; }
bool isBTMemSlow() const { return IsBTMemSlow; }
bool isUnalignedMemAccessFast() const { return IsUAMemFast; }
bool hasVectorUAMem() const { return HasVectorUAMem; }
bool hasCmpxchg16b() const { return HasCmpxchg16b; }
bool useLeaForSP() const { return UseLeaForSP; }
bool hasSlowDivide() const { return HasSlowDivide; }
+ bool padShortFunctions() const { return PadShortFunctions; }
bool isAtom() const { return X86ProcFamily == IntelAtom; }
@@ -310,6 +340,10 @@ public:
/// memset with zero passed as the second argument. Otherwise it
/// returns null.
const char *getBZeroEntry() const;
+
+  /// This function returns true if the target has a sincos() routine in its
+ /// compiler runtime or math libraries.
+ bool hasSinCos() const;
/// enablePostRAScheduler - run for Atom optimization.
bool enablePostRAScheduler(CodeGenOpt::Level OptLevel,
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index b7a79563b2..8aa58a2042 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -46,10 +46,9 @@ X86_32TargetMachine::X86_32TargetMachine(const Target &T, StringRef TT,
"e-p:32:32-f64:32:64-i64:32:64-f80:32:32-f128:128:128-"
"n8:16:32-S128"),
InstrInfo(*this),
- TSInfo(*this),
TLInfo(*this),
- JITInfo(*this),
- STTI(&TLInfo), VTTI(&TLInfo) {
+ TSInfo(*this),
+ JITInfo(*this) {
}
void X86_64TargetMachine::anchor() { }
@@ -60,13 +59,16 @@ X86_64TargetMachine::X86_64TargetMachine(const Target &T, StringRef TT,
Reloc::Model RM, CodeModel::Model CM,
CodeGenOpt::Level OL)
: X86TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true),
- DL("e-p:64:64-s:64-f64:64:64-i64:64:64-f80:128:128-f128:128:128-"
- "n8:16:32:64-S128"),
+ // The x32 ABI dictates the ILP32 programming model for x64.
+ DL(getSubtargetImpl()->isTarget64BitILP32() ?
+ "e-p:32:32-s:64-f64:64:64-i64:64:64-f80:128:128-f128:128:128-"
+ "n8:16:32:64-S128" :
+ "e-p:64:64-s:64-f64:64:64-i64:64:64-f80:128:128-f128:128:128-"
+ "n8:16:32:64-S128"),
InstrInfo(*this),
- TSInfo(*this),
TLInfo(*this),
- JITInfo(*this),
- STTI(&TLInfo), VTTI(&TLInfo){
+ TSInfo(*this),
+ JITInfo(*this) {
}
/// X86TargetMachine ctor - Create an X86 target.
@@ -121,6 +123,19 @@ X86EarlyIfConv("x86-early-ifcvt",
cl::desc("Enable early if-conversion on X86"));
//===----------------------------------------------------------------------===//
+// X86 Analysis Pass Setup
+//===----------------------------------------------------------------------===//
+
+void X86TargetMachine::addAnalysisPasses(PassManagerBase &PM) {
+ // Add first the target-independent BasicTTI pass, then our X86 pass. This
+ // allows the X86 pass to delegate to the target independent layer when
+ // appropriate.
+ PM.add(createBasicTargetTransformInfoPass(getTargetLowering()));
+ PM.add(createX86TargetTransformInfoPass(this));
+}
+
+
+//===----------------------------------------------------------------------===//
// Pass Pipeline Configuration
//===----------------------------------------------------------------------===//
@@ -140,6 +155,7 @@ public:
}
virtual bool addInstSelector();
+ virtual bool addILPOpts();
virtual bool addPreRegAlloc();
virtual bool addPostRegAlloc();
virtual bool addPreEmitPass();
@@ -147,12 +163,7 @@ public:
} // namespace
TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) {
- X86PassConfig *PC = new X86PassConfig(this, PM);
-
- if (X86EarlyIfConv && Subtarget.hasCMov())
- PC->enablePass(&EarlyIfConverterID);
-
- return PC;
+ return new X86PassConfig(this, PM);
}
bool X86PassConfig::addInstSelector() {
@@ -170,6 +181,14 @@ bool X86PassConfig::addInstSelector() {
return false;
}
+bool X86PassConfig::addILPOpts() {
+ if (X86EarlyIfConv && getX86Subtarget().hasCMov()) {
+ addPass(&EarlyIfConverterID);
+ return true;
+ }
+ return false;
+}
+
bool X86PassConfig::addPreRegAlloc() {
return false; // -print-machineinstr shouldn't print after this.
}
@@ -191,6 +210,12 @@ bool X86PassConfig::addPreEmitPass() {
ShouldPrint = true;
}
+ if (getOptLevel() != CodeGenOpt::None &&
+ getX86Subtarget().padShortFunctions()) {
+ addPass(createX86PadShortFunctions());
+ ShouldPrint = true;
+ }
+
return ShouldPrint;
}
diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h
index 792f721e76..174d391831 100644
--- a/lib/Target/X86/X86TargetMachine.h
+++ b/lib/Target/X86/X86TargetMachine.h
@@ -21,10 +21,9 @@
#include "X86JITInfo.h"
#include "X86SelectionDAGInfo.h"
#include "X86Subtarget.h"
-#include "llvm/DataLayout.h"
+#include "llvm/IR/DataLayout.h"
#include "llvm/Target/TargetFrameLowering.h"
#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetTransformImpl.h"
namespace llvm {
@@ -65,6 +64,9 @@ public:
return &InstrItins;
}
+ /// \brief Register X86 analysis passes with a pass manager.
+ virtual void addAnalysisPasses(PassManagerBase &PM);
+
// Set up the pass pipeline.
virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
@@ -78,11 +80,9 @@ class X86_32TargetMachine : public X86TargetMachine {
virtual void anchor();
const DataLayout DL; // Calculates type size & alignment
X86InstrInfo InstrInfo;
- X86SelectionDAGInfo TSInfo;
X86TargetLowering TLInfo;
+ X86SelectionDAGInfo TSInfo;
X86JITInfo JITInfo;
- ScalarTargetTransformImpl STTI;
- X86VectorTargetTransformInfo VTTI;
public:
X86_32TargetMachine(const Target &T, StringRef TT,
StringRef CPU, StringRef FS, const TargetOptions &Options,
@@ -101,12 +101,6 @@ public:
virtual X86JITInfo *getJITInfo() {
return &JITInfo;
}
- virtual const ScalarTargetTransformInfo *getScalarTargetTransformInfo()const {
- return &STTI;
- }
- virtual const VectorTargetTransformInfo *getVectorTargetTransformInfo()const {
- return &VTTI;
- }
};
/// X86_64TargetMachine - X86 64-bit target machine.
@@ -115,11 +109,9 @@ class X86_64TargetMachine : public X86TargetMachine {
virtual void anchor();
const DataLayout DL; // Calculates type size & alignment
X86InstrInfo InstrInfo;
- X86SelectionDAGInfo TSInfo;
X86TargetLowering TLInfo;
+ X86SelectionDAGInfo TSInfo;
X86JITInfo JITInfo;
- X86ScalarTargetTransformImpl STTI;
- X86VectorTargetTransformInfo VTTI;
public:
X86_64TargetMachine(const Target &T, StringRef TT,
StringRef CPU, StringRef FS, const TargetOptions &Options,
@@ -138,12 +130,6 @@ public:
virtual X86JITInfo *getJITInfo() {
return &JITInfo;
}
- virtual const ScalarTargetTransformInfo *getScalarTargetTransformInfo()const {
- return &STTI;
- }
- virtual const VectorTargetTransformInfo *getVectorTargetTransformInfo()const {
- return &VTTI;
- }
};
} // End llvm namespace
diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp
index b8ee319291..871dacd6a1 100644
--- a/lib/Target/X86/X86TargetObjectFile.cpp
+++ b/lib/Target/X86/X86TargetObjectFile.cpp
@@ -8,16 +8,12 @@
//===----------------------------------------------------------------------===//
#include "X86TargetObjectFile.h"
-#include "X86TargetMachine.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/CodeGen/MachineModuleInfoImpls.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSectionELF.h"
-#include "llvm/MC/MCSectionMachO.h"
#include "llvm/Support/Dwarf.h"
-#include "llvm/Support/ELF.h"
#include "llvm/Target/Mangler.h"
+
using namespace llvm;
using namespace dwarf;
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
new file mode 100644
index 0000000000..be2a997b8e
--- /dev/null
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -0,0 +1,373 @@
+//===-- X86TargetTransformInfo.cpp - X86 specific TTI pass ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements a TargetTransformInfo analysis pass specific to the
+/// X86 target machine. It uses the target's detailed information to provide
+/// more precise answers to certain TTI queries, while letting the target
+/// independent and default TTI implementations handle the rest.
+///
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "x86tti"
+#include "X86.h"
+#include "X86TargetMachine.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/CostTable.h"
+using namespace llvm;
+
+// Declare the pass initialization routine locally as target-specific passes
+// don't have a target-wide initialization entry point, and so we rely on the
+// pass constructor initialization.
+namespace llvm {
+void initializeX86TTIPass(PassRegistry &);
+}
+
+namespace {
+
+class X86TTI : public ImmutablePass, public TargetTransformInfo {
+ const X86TargetMachine *TM;
+ const X86Subtarget *ST;
+ const X86TargetLowering *TLI;
+
+ /// Estimate the overhead of scalarizing an instruction. Insert and Extract
+ /// are set if the result needs to be inserted and/or extracted from vectors.
+ unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
+
+public:
+ X86TTI() : ImmutablePass(ID), TM(0), ST(0), TLI(0) {
+ llvm_unreachable("This pass cannot be directly constructed");
+ }
+
+ X86TTI(const X86TargetMachine *TM)
+ : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()),
+ TLI(TM->getTargetLowering()) {
+ initializeX86TTIPass(*PassRegistry::getPassRegistry());
+ }
+
+ virtual void initializePass() {
+ pushTTIStack(this);
+ }
+
+ virtual void finalizePass() {
+ popTTIStack();
+ }
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ TargetTransformInfo::getAnalysisUsage(AU);
+ }
+
+ /// Pass identification.
+ static char ID;
+
+ /// Provide necessary pointer adjustments for the two base classes.
+ virtual void *getAdjustedAnalysisPointer(const void *ID) {
+ if (ID == &TargetTransformInfo::ID)
+ return (TargetTransformInfo*)this;
+ return this;
+ }
+
+ /// \name Scalar TTI Implementations
+ /// @{
+ virtual PopcntSupportKind getPopcntSupport(unsigned TyWidth) const;
+
+ /// @}
+
+ /// \name Vector TTI Implementations
+ /// @{
+
+ virtual unsigned getNumberOfRegisters(bool Vector) const;
+ virtual unsigned getRegisterBitWidth(bool Vector) const;
+ virtual unsigned getMaximumUnrollFactor() const;
+ virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty) const;
+ virtual unsigned getShuffleCost(ShuffleKind Kind, Type *Tp,
+ int Index, Type *SubTp) const;
+ virtual unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
+ Type *Src) const;
+ virtual unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+ Type *CondTy) const;
+ virtual unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
+ unsigned Index) const;
+ virtual unsigned getMemoryOpCost(unsigned Opcode, Type *Src,
+ unsigned Alignment,
+ unsigned AddressSpace) const;
+
+ /// @}
+};
+
+} // end anonymous namespace
+
+INITIALIZE_AG_PASS(X86TTI, TargetTransformInfo, "x86tti",
+ "X86 Target Transform Info", true, true, false)
+char X86TTI::ID = 0;
+
+ImmutablePass *
+llvm::createX86TargetTransformInfoPass(const X86TargetMachine *TM) {
+ return new X86TTI(TM);
+}
+
+
+//===----------------------------------------------------------------------===//
+//
+// X86 cost model.
+//
+//===----------------------------------------------------------------------===//
+
+X86TTI::PopcntSupportKind X86TTI::getPopcntSupport(unsigned TyWidth) const {
+ assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
+ // TODO: Currently the __builtin_popcount() implementation using SSE3
+ // instructions is inefficient. Once the problem is fixed, we should
+  // call ST->hasSSE3() instead of ST->hasSSE41().
+ return ST->hasSSE41() ? PSK_FastHardware : PSK_Software;
+}
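As a concrete reading of the two answers above: PSK_FastHardware corresponds to a single POPCNT instruction on SSE4.2-class cores, while PSK_Software means a short bit-twiddling expansion along these lines (a sketch, not the exact sequence the backend emits):

#include <nmmintrin.h>  // _mm_popcnt_u32 (SSE4.2)

// Hardware path: one instruction.
static unsigned popcountHW(unsigned x) {
  return static_cast<unsigned>(_mm_popcnt_u32(x));
}

// Software path: classic parallel bit count, a handful of ALU operations.
static unsigned popcountSW(unsigned x) {
  x = x - ((x >> 1) & 0x55555555u);
  x = (x & 0x33333333u) + ((x >> 2) & 0x33333333u);
  return (((x + (x >> 4)) & 0x0F0F0F0Fu) * 0x01010101u) >> 24;
}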
+
+unsigned X86TTI::getNumberOfRegisters(bool Vector) const {
+ if (Vector && !ST->hasSSE1())
+ return 0;
+
+ if (ST->is64Bit())
+ return 16;
+ return 8;
+}
+
+unsigned X86TTI::getRegisterBitWidth(bool Vector) const {
+ if (Vector) {
+ if (ST->hasAVX()) return 256;
+ if (ST->hasSSE1()) return 128;
+ return 0;
+ }
+
+ if (ST->is64Bit())
+ return 64;
+ return 32;
+
+}
+
+unsigned X86TTI::getMaximumUnrollFactor() const {
+ if (ST->isAtom())
+ return 1;
+
+ // Sandybridge and Haswell have multiple execution ports and pipelined
+ // vector units.
+ if (ST->hasAVX())
+ return 4;
+
+ return 2;
+}
+
+unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty) const {
+ // Legalize the type.
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);
+
+ int ISD = TLI->InstructionOpcodeToISD(Opcode);
+ assert(ISD && "Invalid opcode");
+
+ static const CostTblEntry<MVT> AVX1CostTable[] = {
+ // We don't have to scalarize unsupported ops. We can issue two half-sized
+ // operations and we only need to extract the upper YMM half.
+ // Two ops + 1 extract + 1 insert = 4.
+ { ISD::MUL, MVT::v8i32, 4 },
+ { ISD::SUB, MVT::v8i32, 4 },
+ { ISD::ADD, MVT::v8i32, 4 },
+ { ISD::SUB, MVT::v4i64, 4 },
+ { ISD::ADD, MVT::v4i64, 4 },
+ // A v4i64 multiply is custom lowered as two split v2i64 vectors that then
+    // are lowered as a series of long multiplies(3), shifts(4) and adds(2).
+ // Because we believe v4i64 to be a legal type, we must also include the
+ // split factor of two in the cost table. Therefore, the cost here is 18
+ // instead of 9.
+ { ISD::MUL, MVT::v4i64, 18 },
+ };
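A sketch of the "two half-sized operations" pattern those entries are pricing, written with AVX intrinsics purely for illustration (the backend lowers SelectionDAG nodes, not intrinsics): an 8 x i32 add on AVX1 becomes two 128-bit adds plus cross-lane extract/insert traffic, close to the cost of 4 used above.

#include <immintrin.h>

static __m256i add_v8i32_avx1(__m256i a, __m256i b) {
  __m128i lo = _mm_add_epi32(_mm256_castsi256_si128(a),       // add, low halves
                             _mm256_castsi256_si128(b));
  __m128i hi = _mm_add_epi32(_mm256_extractf128_si256(a, 1),  // extract + add, high halves
                             _mm256_extractf128_si256(b, 1));
  return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);  // reassemble
}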
+
+ // Look for AVX1 lowering tricks.
+ if (ST->hasAVX() && !ST->hasAVX2()) {
+ int Idx = CostTableLookup<MVT>(AVX1CostTable, array_lengthof(AVX1CostTable),
+ ISD, LT.second);
+ if (Idx != -1)
+ return LT.first * AVX1CostTable[Idx].Cost;
+ }
+
+ // Custom lowering of vectors.
+ static const CostTblEntry<MVT> CustomLowered[] = {
+ // A v2i64/v4i64 and multiply is custom lowered as a series of long
+ // multiplies(3), shifts(4) and adds(2).
+ { ISD::MUL, MVT::v2i64, 9 },
+ { ISD::MUL, MVT::v4i64, 9 },
+ };
+ int Idx = CostTableLookup<MVT>(CustomLowered, array_lengthof(CustomLowered),
+ ISD, LT.second);
+ if (Idx != -1)
+ return LT.first * CustomLowered[Idx].Cost;
+
+ // Special lowering of v4i32 mul on sse2, sse3: Lower v4i32 mul as 2x shuffle,
+ // 2x pmuludq, 2x shuffle.
+ if (ISD == ISD::MUL && LT.second == MVT::v4i32 && ST->hasSSE2() &&
+ !ST->hasSSE41())
+ return 6;
+
+ // Fallback to the default implementation.
+ return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty);
+}
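For the SSE2-only case handled just above, the usual emulation of a 4 x i32 multiply goes through pmuludq on the even and odd lanes plus shuffles to repack the low halves; a sketch with intrinsics, again only to illustrate the instruction count behind the cost of 6, not the backend's exact output:

#include <emmintrin.h>

static __m128i mul_v4i32_sse2(__m128i a, __m128i b) {
  __m128i even = _mm_mul_epu32(a, b);                        // products of lanes 0 and 2
  __m128i odd  = _mm_mul_epu32(_mm_srli_si128(a, 4),         // shift lanes 1 and 3 down,
                               _mm_srli_si128(b, 4));        // then multiply them
  even = _mm_shuffle_epi32(even, _MM_SHUFFLE(0, 0, 2, 0));   // keep the low 32 bits
  odd  = _mm_shuffle_epi32(odd,  _MM_SHUFFLE(0, 0, 2, 0));
  return _mm_unpacklo_epi32(even, odd);                      // interleave the results
}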
+
+unsigned X86TTI::getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
+ Type *SubTp) const {
+ // We only estimate the cost of reverse shuffles.
+ if (Kind != SK_Reverse)
+ return TargetTransformInfo::getShuffleCost(Kind, Tp, Index, SubTp);
+
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Tp);
+ unsigned Cost = 1;
+ if (LT.second.getSizeInBits() > 128)
+ Cost = 3; // Extract + insert + copy.
+
+  // Multiply by the number of parts.
+ return Cost * LT.first;
+}
+
+unsigned X86TTI::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const {
+ int ISD = TLI->InstructionOpcodeToISD(Opcode);
+ assert(ISD && "Invalid opcode");
+
+ EVT SrcTy = TLI->getValueType(Src);
+ EVT DstTy = TLI->getValueType(Dst);
+
+ if (!SrcTy.isSimple() || !DstTy.isSimple())
+ return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
+
+ static const TypeConversionCostTblEntry<MVT> AVXConversionTbl[] = {
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
+ { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 },
+ { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1 },
+ { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 1 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 1 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 1 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 1 },
+ { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 1 },
+ { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 1 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 6 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 9 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 8 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i8, 8 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 8 },
+ { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 3 },
+ };
+
+ if (ST->hasAVX()) {
+ int Idx = ConvertCostTableLookup<MVT>(AVXConversionTbl,
+ array_lengthof(AVXConversionTbl),
+ ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT());
+ if (Idx != -1)
+ return AVXConversionTbl[Idx].Cost;
+ }
+
+ return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
+}
+
+unsigned X86TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+ Type *CondTy) const {
+ // Legalize the type.
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy);
+
+ MVT MTy = LT.second;
+
+ int ISD = TLI->InstructionOpcodeToISD(Opcode);
+ assert(ISD && "Invalid opcode");
+
+ static const CostTblEntry<MVT> SSE42CostTbl[] = {
+ { ISD::SETCC, MVT::v2f64, 1 },
+ { ISD::SETCC, MVT::v4f32, 1 },
+ { ISD::SETCC, MVT::v2i64, 1 },
+ { ISD::SETCC, MVT::v4i32, 1 },
+ { ISD::SETCC, MVT::v8i16, 1 },
+ { ISD::SETCC, MVT::v16i8, 1 },
+ };
+
+ static const CostTblEntry<MVT> AVX1CostTbl[] = {
+ { ISD::SETCC, MVT::v4f64, 1 },
+ { ISD::SETCC, MVT::v8f32, 1 },
+ // AVX1 does not support 8-wide integer compare.
+ { ISD::SETCC, MVT::v4i64, 4 },
+ { ISD::SETCC, MVT::v8i32, 4 },
+ { ISD::SETCC, MVT::v16i16, 4 },
+ { ISD::SETCC, MVT::v32i8, 4 },
+ };
+
+ static const CostTblEntry<MVT> AVX2CostTbl[] = {
+ { ISD::SETCC, MVT::v4i64, 1 },
+ { ISD::SETCC, MVT::v8i32, 1 },
+ { ISD::SETCC, MVT::v16i16, 1 },
+ { ISD::SETCC, MVT::v32i8, 1 },
+ };
+
+ if (ST->hasAVX2()) {
+ int Idx = CostTableLookup<MVT>(AVX2CostTbl, array_lengthof(AVX2CostTbl), ISD, MTy);
+ if (Idx != -1)
+ return LT.first * AVX2CostTbl[Idx].Cost;
+ }
+
+ if (ST->hasAVX()) {
+ int Idx = CostTableLookup<MVT>(AVX1CostTbl, array_lengthof(AVX1CostTbl), ISD, MTy);
+ if (Idx != -1)
+ return LT.first * AVX1CostTbl[Idx].Cost;
+ }
+
+ if (ST->hasSSE42()) {
+ int Idx = CostTableLookup<MVT>(SSE42CostTbl, array_lengthof(SSE42CostTbl), ISD, MTy);
+ if (Idx != -1)
+ return LT.first * SSE42CostTbl[Idx].Cost;
+ }
+
+ return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+}
+
+unsigned X86TTI::getVectorInstrCost(unsigned Opcode, Type *Val,
+ unsigned Index) const {
+ assert(Val->isVectorTy() && "This must be a vector type");
+
+ if (Index != -1U) {
+ // Legalize the type.
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Val);
+
+ // This type is legalized to a scalar type.
+ if (!LT.second.isVector())
+ return 0;
+
+ // The type may be split. Normalize the index to the new type.
+ unsigned Width = LT.second.getVectorNumElements();
+ Index = Index % Width;
+
+ // Floating point scalars are already located in index #0.
+ if (Val->getScalarType()->isFloatingPointTy() && Index == 0)
+ return 0;
+ }
+
+ return TargetTransformInfo::getVectorInstrCost(Opcode, Val, Index);
+}
+
+unsigned X86TTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+ unsigned AddressSpace) const {
+ // Legalize the type.
+ std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
+ assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
+ "Invalid Opcode");
+
+ // Each load/store unit costs 1.
+ unsigned Cost = LT.first * 1;
+
+  // On Sandybridge, 256-bit loads/stores are double pumped
+ // (but not on Haswell).
+ if (LT.second.getSizeInBits() > 128 && !ST->hasAVX2())
+ Cost*=2;
+
+ return Cost;
+}
diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
index c4a58874a4..0f77948c0e 100644
--- a/lib/Target/X86/X86VZeroUpper.cpp
+++ b/lib/Target/X86/X86VZeroUpper.cpp
@@ -120,9 +120,19 @@ static bool checkFnHasLiveInYmm(MachineRegisterInfo &MRI) {
return false;
}
+static bool clobbersAllYmmRegs(const MachineOperand &MO) {
+ for (unsigned reg = X86::YMM0; reg < X86::YMM15; ++reg) {
+ if (!MO.clobbersPhysReg(reg))
+ return false;
+ }
+ return true;
+}
+
static bool hasYmmReg(MachineInstr *MI) {
for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
const MachineOperand &MO = MI->getOperand(i);
+ if (MI->isCall() && MO.isRegMask() && !clobbersAllYmmRegs(MO))
+ return true;
if (!MO.isReg())
continue;
if (MO.isDebug())