-rw-r--r--   lib/Transforms/NaCl/ResolvePNaClIntrinsics.cpp                               | 196
-rw-r--r--   test/Transforms/NaCl/resolve-pnacl-intrinsics-x86-32-16-bit-atomics-hack.ll  | 138
2 files changed, 334 insertions, 0 deletions
diff --git a/lib/Transforms/NaCl/ResolvePNaClIntrinsics.cpp b/lib/Transforms/NaCl/ResolvePNaClIntrinsics.cpp
index fc5138574d..3550cd9aca 100644
--- a/lib/Transforms/NaCl/ResolvePNaClIntrinsics.cpp
+++ b/lib/Transforms/NaCl/ResolvePNaClIntrinsics.cpp
@@ -19,12 +19,14 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/IR/Constant.h"
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InlineAsm.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/NaClAtomicIntrinsics.h"
+#include "llvm/ADT/Triple.h"
 #include "llvm/IR/Value.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Compiler.h"
@@ -224,11 +226,27 @@ private:
                        thawMemoryOrder(Call->getArgOperand(2)), SS, Call);
       break;
     case Intrinsic::nacl_atomic_rmw:
+      if (needsX8632HackFor16BitAtomics(cast<PointerType>(
+              Call->getArgOperand(1)->getType())->getElementType())) {
+        // TODO(jfb) Remove this hack. See below.
+        atomic16BitX8632Hack(Call, false, Call->getArgOperand(1),
+                             Call->getArgOperand(2), Call->getArgOperand(0),
+                             NULL);
+        return true;
+      }
       I = new AtomicRMWInst(thawRMWOperation(Call->getArgOperand(0)),
                             Call->getArgOperand(1), Call->getArgOperand(2),
                             thawMemoryOrder(Call->getArgOperand(3)), SS, Call);
       break;
     case Intrinsic::nacl_atomic_cmpxchg:
+      if (needsX8632HackFor16BitAtomics(cast<PointerType>(
+              Call->getArgOperand(0)->getType())->getElementType())) {
+        // TODO(jfb) Remove this hack. See below.
+        atomic16BitX8632Hack(Call, true, Call->getArgOperand(0),
+                             Call->getArgOperand(2), NULL,
+                             Call->getArgOperand(1));
+        return true;
+      }
       // TODO LLVM currently doesn't support specifying separate memory
       // orders for compare exchange's success and failure cases:
       // LLVM IR implicitly drops the Release part of the specified
@@ -304,6 +322,184 @@ private:
     }
   }
 
+  // TODO(jfb) Remove the following hacks once NaCl's x86-32 validator
+  // supports 16-bit atomic intrisics. See:
+  //   https://code.google.com/p/nativeclient/issues/detail?id=3579
+  //   https://code.google.com/p/nativeclient/issues/detail?id=2981
+  // ===========================================================================
+  bool needsX8632HackFor16BitAtomics(Type *OverloadedType) const {
+    return Triple(M->getTargetTriple()).getArch() == Triple::x86 &&
+           OverloadedType == Type::getInt16Ty(M->getContext());
+  }
+
+  /// Expand the 16-bit Intrinsic into an equivalent 32-bit
+  /// compare-exchange loop.
+  void atomic16BitX8632Hack(IntrinsicInst *Call, bool IsCmpXChg,
+                            Value *Ptr16, Value *RHS, Value *RMWOp,
+                            Value *CmpXChgOldVal) const {
+    assert((IsCmpXChg ? CmpXChgOldVal : RMWOp) &&
+           "cmpxchg expects an old value, whereas RMW expects an operation");
+    Type *I16 = Type::getInt16Ty(M->getContext());
+    Type *I32 = Type::getInt32Ty(M->getContext());
+    Type *I32Ptr = Type::getInt32PtrTy(M->getContext());
+
+    // Precede this with a compiler fence.
+    FunctionType *FTy =
+        FunctionType::get(Type::getVoidTy(M->getContext()), false);
+    std::string AsmString; // Empty.
+    std::string Constraints("~{memory}");
+    bool HasSideEffect = true;
+    CallInst::Create(InlineAsm::get(
+        FTy, AsmString, Constraints, HasSideEffect), "", Call);
+
+    BasicBlock *CurrentBB = Call->getParent();
+    IRBuilder<> IRB(CurrentBB, Call);
+    BasicBlock *Aligned32BB =
+        BasicBlock::Create(IRB.getContext(), "atomic16aligned32",
+                           CurrentBB->getParent());
+    BasicBlock *Aligned16BB =
+        BasicBlock::Create(IRB.getContext(), "atomic16aligned16",
+                           CurrentBB->getParent());
+
+    // Setup.
+    // Align the 16-bit pointer to 32-bits, and figure out if the 16-bit
+    // operation is to be carried on the top or bottom half of the
+    // 32-bit aligned value.
+    Value *IPtr = IRB.CreatePtrToInt(Ptr16, I32, "uintptr");
+    Value *IPtrAlign = IRB.CreateAnd(IPtr, IRB.getInt32(~3u), "aligneduintptr");
+    Value *Aligned32 = IRB.CreateAnd(IPtr, IRB.getInt32(3u), "aligned32");
+    Value *Ptr32 = IRB.CreateIntToPtr(IPtrAlign, I32Ptr, "ptr32");
+    Value *IsAligned32 = IRB.CreateICmpEQ(Aligned32, IRB.getInt32(0),
+                                          "isaligned32");
+    IRB.CreateCondBr(IsAligned32, Aligned32BB, Aligned16BB);
+
+    // Create a diamond after the setup. The rest of the basic block
+    // that the Call was in is separated into the successor block.
+    BasicBlock *Successor =
+        CurrentBB->splitBasicBlock(IRB.GetInsertPoint(), "atomic16successor");
+    // Remove the extra unconditional branch that the split added.
+    CurrentBB->getTerminator()->eraseFromParent();
+
+    // Aligned 32 block.
+    // The 16-bit value was aligned to 32-bits:
+    //  - Atomically load the full 32-bit value.
+    //  - Get the 16-bit value from its bottom.
+    //  - Perform the 16-bit operation.
+    //  - Truncate and merge the result back with the top half of the
+    //    loaded value.
+    //  - Try to compare-exchange this new 32-bit result. This will
+    //    succeed if the value at the 32-bit location is still what was
+    //    just loaded. If not, try the entire thing again.
+    //  - Return the 16-bit value before the operation was performed.
+    Value *Ret32;
+    {
+      IRB.SetInsertPoint(Aligned32BB);
+      LoadInst *Loaded = IRB.CreateAlignedLoad(Ptr32, 4, "loaded");
+      Loaded->setAtomic(SequentiallyConsistent);
+      Value *TruncVal = IRB.CreateTrunc(Loaded, I16, "truncval");
+      Ret32 = TruncVal;
+      Value *Res;
+      if (IsCmpXChg) {
+        Res = RHS;
+      } else {
+        switch (thawRMWOperation(RMWOp)) {
+        default: llvm_unreachable("unknown atomic RMW operation");
+        case AtomicRMWInst::Add:
+          Res = IRB.CreateAdd(TruncVal, RHS, "res"); break;
+        case AtomicRMWInst::Sub:
+          Res = IRB.CreateSub(TruncVal, RHS, "res"); break;
+        case AtomicRMWInst::Or:
+          Res = IRB.CreateOr(TruncVal, RHS, "res"); break;
+        case AtomicRMWInst::And:
+          Res = IRB.CreateAnd(TruncVal, RHS, "res"); break;
+        case AtomicRMWInst::Xor:
+          Res = IRB.CreateXor(TruncVal, RHS, "res"); break;
+        case AtomicRMWInst::Xchg:
+          Res = RHS; break;
+        }
+      }
+      Value *MergeRes = IRB.CreateZExt(Res, I32, "mergeres");
+      Value *MaskedLoaded = IRB.CreateAnd(Loaded, IRB.getInt32(0xFFFF0000u),
+                                          "maskedloaded");
+      Value *FinalRes = IRB.CreateOr(MergeRes, MaskedLoaded, "finalres");
+      Value *Expected = IsCmpXChg ?
+          IRB.CreateOr(MaskedLoaded, IRB.CreateZExt(CmpXChgOldVal, I32, "zext"),
+                       "expected") :
+          Loaded;
+      Value *OldVal = IRB.CreateAtomicCmpXchg(Ptr32, Expected, FinalRes,
+                                              SequentiallyConsistent);
+      OldVal->setName("oldval");
+      // Test that the entire 32-bit value didn't change during the operation.
+      Value *Success = IRB.CreateICmpEQ(OldVal, Loaded, "success");
+      IRB.CreateCondBr(Success, Successor, Aligned32BB);
+    }
+
+    // Aligned 16 block.
+    // Similar to the above aligned 32 block, but the 16-bit value is in
+    // the top half of the 32-bit value. It needs to be shifted down,
+    // and shifted back up before being merged in.
+    Value *Ret16;
+    {
+      IRB.SetInsertPoint(Aligned16BB);
+      LoadInst *Loaded = IRB.CreateAlignedLoad(Ptr32, 4, "loaded");
+      Loaded->setAtomic(SequentiallyConsistent);
+      Value *ShVal = IRB.CreateTrunc(IRB.CreateLShr(Loaded, 16, "lshr"), I16,
+                                     "shval");
+      Ret16 = ShVal;
+      Value *Res;
+      if (IsCmpXChg) {
+        Res = RHS;
+      } else {
+        switch (thawRMWOperation(RMWOp)) {
+        default: llvm_unreachable("unknown atomic RMW operation");
+        case AtomicRMWInst::Add:
+          Res = IRB.CreateAdd(ShVal, RHS, "res"); break;
+        case AtomicRMWInst::Sub:
+          Res = IRB.CreateSub(ShVal, RHS, "res"); break;
+        case AtomicRMWInst::Or:
+          Res = IRB.CreateOr(ShVal, RHS, "res"); break;
+        case AtomicRMWInst::And:
+          Res = IRB.CreateAnd(ShVal, RHS, "res"); break;
+        case AtomicRMWInst::Xor:
+          Res = IRB.CreateXor(ShVal, RHS, "res"); break;
+        case AtomicRMWInst::Xchg:
+          Res = RHS; break;
+        }
+      }
+      Value *MergeRes = IRB.CreateShl(IRB.CreateZExt(Res, I32, "zext"), 16,
+                                      "mergeres");
+      Value *MaskedLoaded = IRB.CreateAnd(Loaded, IRB.getInt32(0xFFFF),
+                                          "maskedloaded");
+      Value *FinalRes = IRB.CreateOr(MergeRes, MaskedLoaded, "finalres");
+      Value *Expected = IsCmpXChg ?
+          IRB.CreateOr(MaskedLoaded, IRB.CreateShl(
+              IRB.CreateZExt(CmpXChgOldVal, I32, "zext"), 16, "shl"),
+              "expected") :
+          Loaded;
+      Value *OldVal = IRB.CreateAtomicCmpXchg(Ptr32, Expected, FinalRes,
+                                              SequentiallyConsistent);
+      OldVal->setName("oldval");
+      // Test that the entire 32-bit value didn't change during the operation.
+      Value *Success = IRB.CreateICmpEQ(OldVal, Loaded, "success");
+      IRB.CreateCondBr(Success, Successor, Aligned16BB);
+    }
+
+    // Merge the value, and remove the original intrinsic Call.
+    IRB.SetInsertPoint(Successor->getFirstInsertionPt());
+    PHINode *PHI = IRB.CreatePHI(I16, 2);
+    PHI->addIncoming(Ret32, Aligned32BB);
+    PHI->addIncoming(Ret16, Aligned16BB);
+    Call->replaceAllUsesWith(PHI);
+    Call->eraseFromParent();
+
+    // Finish everything with another compiler fence.
+    CallInst::Create(InlineAsm::get(
+        FTy, AsmString, Constraints, HasSideEffect), "",
+        Successor->getFirstInsertionPt());
+  }
+  // ===========================================================================
+  // End hacks.
+
   AtomicCallResolver(const AtomicCallResolver &);
   AtomicCallResolver &operator=(const AtomicCallResolver &);
 };
diff --git a/test/Transforms/NaCl/resolve-pnacl-intrinsics-x86-32-16-bit-atomics-hack.ll b/test/Transforms/NaCl/resolve-pnacl-intrinsics-x86-32-16-bit-atomics-hack.ll
new file mode 100644
index 0000000000..fc0e7c70f6
--- /dev/null
+++ b/test/Transforms/NaCl/resolve-pnacl-intrinsics-x86-32-16-bit-atomics-hack.ll
@@ -0,0 +1,138 @@
+; RUN: opt < %s -resolve-pnacl-intrinsics -S -mtriple=i386-unknown-nacl | \
+; RUN:   FileCheck %s -check-prefix=CLEANED
+; RUN: opt < %s -resolve-pnacl-intrinsics -S -mtriple=i386-unknown-nacl | \
+; RUN:   FileCheck %s
+
+; CLEANED-NOT: call {{.*}} @llvm.nacl.atomic
+
+; Supplement to resolve-pnacl-intrinsics.ll that tests the 16-bit hack
+; for x86-32. All of the RMW cases are the same except for one
+; operation.
+
+; These declarations must be here because the function pass expects
+; to find them. In real life they're inserted by the translator
+; before the function pass runs.
+declare i32 @setjmp(i8*)
+declare void @longjmp(i8*, i32)
+
+declare i16 @llvm.nacl.atomic.rmw.i16(i32, i16*, i16, i32)
+declare i16 @llvm.nacl.atomic.cmpxchg.i16(i16*, i16, i16, i32, i32)
+
+; CHECK: @test_fetch_and_add_i16
+define i16 @test_fetch_and_add_i16(i16* %ptr, i16 %value) {
+; CHECK-NEXT: call void asm sideeffect "", "~{memory}"()
+; CHECK-NEXT: %uintptr = ptrtoint i16* %ptr to i32
+; CHECK-NEXT: %aligneduintptr = and i32 %uintptr, -4
+; CHECK-NEXT: %aligned32 = and i32 %uintptr, 3
+; CHECK-NEXT: %ptr32 = inttoptr i32 %aligneduintptr to i32*
+; CHECK-NEXT: %isaligned32 = icmp eq i32 %aligned32, 0
+; CHECK-NEXT: br i1 %isaligned32, label %atomic16aligned32, label %atomic16aligned16
+;
+; CHECK: atomic16successor:
+; CHECK-NEXT: %1 = phi i16 [ %truncval, %atomic16aligned32 ], [ %shval, %atomic16aligned16 ]
+; CHECK-NEXT: call void asm sideeffect "", "~{memory}"()
+; CHECK-NEXT: ret i16 %1
+;
+; CHECK: atomic16aligned32:
+; CHECK-NEXT: %loaded = load atomic i32* %ptr32 seq_cst, align 4
+; CHECK-NEXT: %truncval = trunc i32 %loaded to i16
+; CHECK-NEXT: %res = add i16 %truncval, %value
+; CHECK-NEXT: %mergeres = zext i16 %res to i32
+; CHECK-NEXT: %maskedloaded = and i32 %loaded, -65536
+; CHECK-NEXT: %finalres = or i32 %mergeres, %maskedloaded
+; CHECK-NEXT: %oldval = cmpxchg i32* %ptr32, i32 %loaded, i32 %finalres seq_cst
+; CHECK-NEXT: %success = icmp eq i32 %oldval, %loaded
+; CHECK-NEXT: br i1 %success, label %atomic16successor, label %atomic16aligned32
+;
+; CHECK: atomic16aligned16:
+; CHECK-NEXT: %loaded1 = load atomic i32* %ptr32 seq_cst, align 4
+; CHECK-NEXT: %lshr = lshr i32 %loaded1, 16
+; CHECK-NEXT: %shval = trunc i32 %lshr to i16
+; CHECK-NEXT: %res2 = add i16 %shval, %value
+; CHECK-NEXT: %zext = zext i16 %res2 to i32
+; CHECK-NEXT: %mergeres3 = shl i32 %zext, 16
+; CHECK-NEXT: %maskedloaded4 = and i32 %loaded1, 65535
+; CHECK-NEXT: %finalres5 = or i32 %mergeres3, %maskedloaded4
+; CHECK-NEXT: %oldval6 = cmpxchg i32* %ptr32, i32 %loaded1, i32 %finalres5 seq_cst
+; CHECK-NEXT: %success7 = icmp eq i32 %oldval6, %loaded1
+; CHECK-NEXT: br i1 %success7, label %atomic16successor, label %atomic16aligned16
+  %1 = call i16 @llvm.nacl.atomic.rmw.i16(i32 1, i16* %ptr, i16 %value, i32 6)
+  ret i16 %1
+}
+
+; CHECK: @test_fetch_and_sub_i16
+define i16 @test_fetch_and_sub_i16(i16* %ptr, i16 %value) {
+  ; CHECK: %res = sub i16 %truncval, %value
+  ; CHECK: %res2 = sub i16 %shval, %value
+  %1 = call i16 @llvm.nacl.atomic.rmw.i16(i32 2, i16* %ptr, i16 %value, i32 6)
+  ret i16 %1
+}
+
+; CHECK: @test_fetch_and_or_i16
+define i16 @test_fetch_and_or_i16(i16* %ptr, i16 %value) {
+  ; CHECK: %res = or i16 %truncval, %value
+  ; CHECK: %res2 = or i16 %shval, %value
+  %1 = call i16 @llvm.nacl.atomic.rmw.i16(i32 3, i16* %ptr, i16 %value, i32 6)
+  ret i16 %1
+}
+
+; CHECK: @test_fetch_and_and_i16
+define i16 @test_fetch_and_and_i16(i16* %ptr, i16 %value) {
+  ; CHECK: %res = and i16 %truncval, %value
+  ; CHECK: %res2 = and i16 %shval, %value
+  %1 = call i16 @llvm.nacl.atomic.rmw.i16(i32 4, i16* %ptr, i16 %value, i32 6)
+  ret i16 %1
+}
+
+; CHECK: @test_fetch_and_xor_i16
+define i16 @test_fetch_and_xor_i16(i16* %ptr, i16 %value) {
+  ; CHECK: %res = xor i16 %truncval, %value
+  ; CHECK: %res2 = xor i16 %shval, %value
+  %1 = call i16 @llvm.nacl.atomic.rmw.i16(i32 5, i16* %ptr, i16 %value, i32 6)
+  ret i16 %1
+}
+
+; CHECK: @test_val_compare_and_swap_i16
+define i16 @test_val_compare_and_swap_i16(i16* %ptr, i16 %oldval, i16 %newval) {
+; CHECK-NEXT: call void asm sideeffect "", "~{memory}"()
+; CHECK-NEXT: %uintptr = ptrtoint i16* %ptr to i32
+; CHECK-NEXT: %aligneduintptr = and i32 %uintptr, -4
+; CHECK-NEXT: %aligned32 = and i32 %uintptr, 3
+; CHECK-NEXT: %ptr32 = inttoptr i32 %aligneduintptr to i32*
+; CHECK-NEXT: %isaligned32 = icmp eq i32 %aligned32, 0
+; CHECK-NEXT: br i1 %isaligned32, label %atomic16aligned32, label %atomic16aligned16
+;
+; CHECK: atomic16successor:
+; CHECK-NEXT: %1 = phi i16 [ %truncval, %atomic16aligned32 ], [ %shval, %atomic16aligned16 ]
+; CHECK-NEXT: call void asm sideeffect "", "~{memory}"()
+; CHECK-NEXT: ret i16 %1
+;
+; CHECK: atomic16aligned32:
+; CHECK-NEXT: %loaded = load atomic i32* %ptr32 seq_cst, align 4
+; CHECK-NEXT: %truncval = trunc i32 %loaded to i16
+; CHECK-NEXT: %mergeres = zext i16 %newval to i32
+; CHECK-NEXT: %maskedloaded = and i32 %loaded, -65536
+; CHECK-NEXT: %finalres = or i32 %mergeres, %maskedloaded
+; CHECK-NEXT: %zext = zext i16 %oldval to i32
+; CHECK-NEXT: %expected = or i32 %maskedloaded, %zext
+; CHECK-NEXT: %oldval1 = cmpxchg i32* %ptr32, i32 %expected, i32 %finalres seq_cst
+; CHECK-NEXT: %success = icmp eq i32 %oldval1, %loaded
+; CHECK-NEXT: br i1 %success, label %atomic16successor, label %atomic16aligned32
+;
+; CHECK: atomic16aligned16:
+; CHECK-NEXT: %loaded2 = load atomic i32* %ptr32 seq_cst, align 4
+; CHECK-NEXT: %lshr = lshr i32 %loaded2, 16
+; CHECK-NEXT: %shval = trunc i32 %lshr to i16
+; CHECK-NEXT: %zext3 = zext i16 %newval to i32
+; CHECK-NEXT: %mergeres4 = shl i32 %zext3, 16
+; CHECK-NEXT: %maskedloaded5 = and i32 %loaded2, 65535
+; CHECK-NEXT: %finalres6 = or i32 %mergeres4, %maskedloaded5
+; CHECK-NEXT: %zext7 = zext i16 %oldval to i32
+; CHECK-NEXT: %shl = shl i32 %zext7, 16
+; CHECK-NEXT: %expected8 = or i32 %maskedloaded5, %shl
+; CHECK-NEXT: %oldval9 = cmpxchg i32* %ptr32, i32 %expected8, i32 %finalres6 seq_cst
+; CHECK-NEXT: %success10 = icmp eq i32 %oldval9, %loaded2
+; CHECK-NEXT: br i1 %success10, label %atomic16successor, label %atomic16aligned16
+  %1 = call i16 @llvm.nacl.atomic.cmpxchg.i16(i16* %ptr, i16 %oldval, i16 %newval, i32 6, i32 6)
+  ret i16 %1
+}
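
For reference, the shape of the code that atomic16BitX8632Hack emits corresponds roughly to the following C++11 sketch of a 16-bit fetch-and-add emulated with a compare-exchange loop on the containing 32-bit word. This is illustrative only and not part of the patch: the helper name emulated_fetch_add_16 is made up, and reinterpreting storage as std::atomic<uint32_t> is a shortcut a real library could not take; the pass avoids the issue by emitting IR directly.

#include <atomic>
#include <cstdint>

// Hypothetical helper, for illustration only: emulate a seq_cst 16-bit
// fetch-and-add using a cmpxchg loop on the naturally aligned 32-bit word
// that contains the 16-bit value.
static uint16_t emulated_fetch_add_16(uint16_t *ptr16, uint16_t rhs) {
  uintptr_t addr = reinterpret_cast<uintptr_t>(ptr16);
  // Align the pointer down to 32 bits and decide whether the 16-bit value
  // sits in the bottom or top half of the word (the aligned32 test above).
  auto *word =
      reinterpret_cast<std::atomic<uint32_t> *>(addr & ~uintptr_t(3));
  unsigned shift = (addr & 3) ? 16 : 0;
  uint32_t mask = uint32_t(0xFFFF) << shift;

  uint32_t loaded = word->load(std::memory_order_seq_cst);
  for (;;) {
    uint16_t old16 = static_cast<uint16_t>(loaded >> shift);
    uint16_t new16 = static_cast<uint16_t>(old16 + rhs);
    // Merge the 16-bit result back into the untouched half of the word.
    uint32_t desired = (loaded & ~mask) | (uint32_t(new16) << shift);
    // Succeeds only if the whole 32-bit word is still what was loaded; on
    // failure 'loaded' is refreshed and the loop retries, mirroring the
    // branch back to %atomic16aligned32 / %atomic16aligned16.
    if (word->compare_exchange_strong(loaded, desired,
                                      std::memory_order_seq_cst))
      return old16; // value before the operation, like the %1 phi node
  }
}

Returning the pre-operation value is what the expansion's PHI of %truncval / %shval in the atomic16successor block implements, matching the fetch-and-op semantics of the original intrinsic.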