aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorBill Schmidt <wschmidt@linux.vnet.ibm.com>2012-10-16 13:30:53 +0000
committerBill Schmidt <wschmidt@linux.vnet.ibm.com>2012-10-16 13:30:53 +0000
commit7a6cb15a929e76955471ef2a7b6db721198320a0 (patch)
tree1af207fad182c81508e0ebb3d7d21e953d02bf6f
parentb52ba9f8a896b6717d6395ad59f6550e1fa475b0 (diff)
This patch addresses PR13949.
For the PowerPC 64-bit ELF Linux ABI, aggregates of size less than 8 bytes are to be passed in the low-order bits ("right-adjusted") of the doubleword register or memory slot assigned to them. A previous patch addressed this for aggregates passed in registers. However, small aggregates passed in the overflow portion of the parameter save area are still being passed left-adjusted. The fix is made in PPCTargetLowering::LowerCall_Darwin_Or_64SVR4 on the caller side, and in PPCTargetLowering::LowerFormalArguments_64SVR4 on the callee side. The main fix on the callee side simply extends existing logic for 1- and 2-byte objects to 1- through 7-byte objects, and correcting a constant left over from 32-bit code. There is also a fix to a bogus calculation of the offset to the following argument in the parameter save area. On the caller side, again a constant left over from 32-bit code is fixed. Additionally, some code for 1, 2, and 4-byte objects is duplicated to handle the 3, 5, 6, and 7-byte objects for SVR4 only. The LowerCall_Darwin_Or_64SVR4 logic is getting fairly convoluted trying to handle both ABIs, and I propose to separate this into two functions in a future patch, at which time the duplication can be removed. The patch adds a new test (structsinmem.ll) to demonstrate correct passing of structures of all seven sizes. Eight dummy parameters are used to force these structures to be in the overflow portion of the parameter save area. As a side effect, this corrects the case when aggregates passed in registers are saved into the first eight doublewords of the parameter save area: Previously they were stored left-justified, and now are properly stored right-justified. This requires changing the expected output of existing test case structsinregs.ll. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166022 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r--lib/Target/PowerPC/PPCISelLowering.cpp31
-rw-r--r--test/CodeGen/PowerPC/structsinmem.ll227
-rw-r--r--test/CodeGen/PowerPC/structsinregs.ll48
3 files changed, 282 insertions, 24 deletions
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 660bfb45f8..c18250a78f 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -2035,9 +2035,8 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
ObjSize = Flags.getByValSize();
ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
// All aggregates smaller than 8 bytes must be passed right-justified.
- if (ObjSize==1 || ObjSize==2) {
- CurArgOffset = CurArgOffset + (4 - ObjSize);
- }
+ if (ObjSize < PtrByteSize)
+ CurArgOffset = CurArgOffset + (PtrByteSize - ObjSize);
// The value of the object is its address.
int FI = MFI->CreateFixedObject(ObjSize, CurArgOffset, true);
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
@@ -2087,7 +2086,7 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
++GPR_idx;
ArgOffset += PtrByteSize;
} else {
- ArgOffset += ArgSize - (ArgOffset-CurArgOffset);
+ ArgOffset += ArgSize - j;
break;
}
}
@@ -3639,12 +3638,13 @@ PPCTargetLowering::LowerCall_Darwin_Or_64SVR4(SDValue Chain, SDValue Callee,
ArgOffset += PtrByteSize;
} else {
- SDValue Const = DAG.getConstant(4 - Size, PtrOff.getValueType());
+ SDValue Const = DAG.getConstant(PtrByteSize - Size,
+ PtrOff.getValueType());
SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
SDValue MemcpyCall = CreateCopyOfByValArgument(Arg, AddPtr,
CallSeqStart.getNode()->getOperand(0),
Flags, DAG, dl);
- // This must go outside the CALLSEQ_START..END.
+ // The MEMCPY must go outside the CALLSEQ_START..END.
SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall,
CallSeqStart.getNode()->getOperand(1));
DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),
@@ -3653,6 +3653,25 @@ PPCTargetLowering::LowerCall_Darwin_Or_64SVR4(SDValue Chain, SDValue Callee,
ArgOffset += PtrByteSize;
}
continue;
+ } else if (isSVR4ABI && GPR_idx == NumGPRs && Size < 8) {
+ // Case: Size is 3, 5, 6, or 7 for SVR4 and we're out of registers.
+ // This is the same case as 1, 2, and 4 for SVR4 with no registers.
+ // FIXME: Separate into 64-bit SVR4 and Darwin versions of this
+ // function, and combine the duplicated code chunks.
+ SDValue Const = DAG.getConstant(PtrByteSize - Size,
+ PtrOff.getValueType());
+ SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
+ SDValue MemcpyCall = CreateCopyOfByValArgument(Arg, AddPtr,
+ CallSeqStart.getNode()->getOperand(0),
+ Flags, DAG, dl);
+ // The MEMCPY must go outside the CALLSEQ_START..END.
+ SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall,
+ CallSeqStart.getNode()->getOperand(1));
+ DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),
+ NewCallSeqStart.getNode());
+ Chain = CallSeqStart = NewCallSeqStart;
+ ArgOffset += PtrByteSize;
+ continue;
}
// Copy entire object into memory. There are cases where gcc-generated
// code assumes it is there, even if it could be put entirely into
diff --git a/test/CodeGen/PowerPC/structsinmem.ll b/test/CodeGen/PowerPC/structsinmem.ll
new file mode 100644
index 0000000000..884d3a89d1
--- /dev/null
+++ b/test/CodeGen/PowerPC/structsinmem.ll
@@ -0,0 +1,227 @@
+; RUN: llc -mcpu=pwr7 -O0 -disable-fp-elim < %s | FileCheck %s
+
+; FIXME: The code generation for packed structs is very poor because the
+; PowerPC target wrongly rejects all unaligned loads. This test case will
+; need to be revised when that is fixed.
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+%struct.s1 = type { i8 }
+%struct.s2 = type { i16 }
+%struct.s4 = type { i32 }
+%struct.t1 = type { i8 }
+%struct.t3 = type <{ i16, i8 }>
+%struct.t5 = type <{ i32, i8 }>
+%struct.t6 = type <{ i32, i16 }>
+%struct.t7 = type <{ i32, i16, i8 }>
+%struct.s3 = type { i16, i8 }
+%struct.s5 = type { i32, i8 }
+%struct.s6 = type { i32, i16 }
+%struct.s7 = type { i32, i16, i8 }
+%struct.t2 = type <{ i16 }>
+%struct.t4 = type <{ i32 }>
+
+@caller1.p1 = private unnamed_addr constant %struct.s1 { i8 1 }, align 1
+@caller1.p2 = private unnamed_addr constant %struct.s2 { i16 2 }, align 2
+@caller1.p3 = private unnamed_addr constant { i16, i8, i8 } { i16 4, i8 8, i8 undef }, align 2
+@caller1.p4 = private unnamed_addr constant %struct.s4 { i32 16 }, align 4
+@caller1.p5 = private unnamed_addr constant { i32, i8, [3 x i8] } { i32 32, i8 64, [3 x i8] undef }, align 4
+@caller1.p6 = private unnamed_addr constant { i32, i16, [2 x i8] } { i32 128, i16 256, [2 x i8] undef }, align 4
+@caller1.p7 = private unnamed_addr constant { i32, i16, i8, i8 } { i32 512, i16 1024, i8 -3, i8 undef }, align 4
+@caller2.p1 = private unnamed_addr constant %struct.t1 { i8 1 }, align 1
+@caller2.p2 = private unnamed_addr constant { i16 } { i16 2 }, align 1
+@caller2.p3 = private unnamed_addr constant %struct.t3 <{ i16 4, i8 8 }>, align 1
+@caller2.p4 = private unnamed_addr constant { i32 } { i32 16 }, align 1
+@caller2.p5 = private unnamed_addr constant %struct.t5 <{ i32 32, i8 64 }>, align 1
+@caller2.p6 = private unnamed_addr constant %struct.t6 <{ i32 128, i16 256 }>, align 1
+@caller2.p7 = private unnamed_addr constant %struct.t7 <{ i32 512, i16 1024, i8 -3 }>, align 1
+
+define i32 @caller1() nounwind {
+entry:
+ %p1 = alloca %struct.s1, align 1
+ %p2 = alloca %struct.s2, align 2
+ %p3 = alloca %struct.s3, align 2
+ %p4 = alloca %struct.s4, align 4
+ %p5 = alloca %struct.s5, align 4
+ %p6 = alloca %struct.s6, align 4
+ %p7 = alloca %struct.s7, align 4
+ %0 = bitcast %struct.s1* %p1 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* getelementptr inbounds (%struct.s1* @caller1.p1, i32 0, i32 0), i64 1, i32 1, i1 false)
+ %1 = bitcast %struct.s2* %p2 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast (%struct.s2* @caller1.p2 to i8*), i64 2, i32 2, i1 false)
+ %2 = bitcast %struct.s3* %p3 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* bitcast ({ i16, i8, i8 }* @caller1.p3 to i8*), i64 4, i32 2, i1 false)
+ %3 = bitcast %struct.s4* %p4 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %3, i8* bitcast (%struct.s4* @caller1.p4 to i8*), i64 4, i32 4, i1 false)
+ %4 = bitcast %struct.s5* %p5 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %4, i8* bitcast ({ i32, i8, [3 x i8] }* @caller1.p5 to i8*), i64 8, i32 4, i1 false)
+ %5 = bitcast %struct.s6* %p6 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %5, i8* bitcast ({ i32, i16, [2 x i8] }* @caller1.p6 to i8*), i64 8, i32 4, i1 false)
+ %6 = bitcast %struct.s7* %p7 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %6, i8* bitcast ({ i32, i16, i8, i8 }* @caller1.p7 to i8*), i64 8, i32 4, i1 false)
+ %call = call i32 @callee1(i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, %struct.s1* byval %p1, %struct.s2* byval %p2, %struct.s3* byval %p3, %struct.s4* byval %p4, %struct.s5* byval %p5, %struct.s6* byval %p6, %struct.s7* byval %p7)
+ ret i32 %call
+
+; CHECK: stb {{[0-9]+}}, 119(1)
+; CHECK: sth {{[0-9]+}}, 126(1)
+; CHECK: stw {{[0-9]+}}, 132(1)
+; CHECK: stw {{[0-9]+}}, 140(1)
+; CHECK: std {{[0-9]+}}, 144(1)
+; CHECK: std {{[0-9]+}}, 152(1)
+; CHECK: std {{[0-9]+}}, 160(1)
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
+
+define internal i32 @callee1(i32 %z1, i32 %z2, i32 %z3, i32 %z4, i32 %z5, i32 %z6, i32 %z7, i32 %z8, %struct.s1* byval %v1, %struct.s2* byval %v2, %struct.s3* byval %v3, %struct.s4* byval %v4, %struct.s5* byval %v5, %struct.s6* byval %v6, %struct.s7* byval %v7) nounwind {
+entry:
+ %z1.addr = alloca i32, align 4
+ %z2.addr = alloca i32, align 4
+ %z3.addr = alloca i32, align 4
+ %z4.addr = alloca i32, align 4
+ %z5.addr = alloca i32, align 4
+ %z6.addr = alloca i32, align 4
+ %z7.addr = alloca i32, align 4
+ %z8.addr = alloca i32, align 4
+ store i32 %z1, i32* %z1.addr, align 4
+ store i32 %z2, i32* %z2.addr, align 4
+ store i32 %z3, i32* %z3.addr, align 4
+ store i32 %z4, i32* %z4.addr, align 4
+ store i32 %z5, i32* %z5.addr, align 4
+ store i32 %z6, i32* %z6.addr, align 4
+ store i32 %z7, i32* %z7.addr, align 4
+ store i32 %z8, i32* %z8.addr, align 4
+ %a = getelementptr inbounds %struct.s1* %v1, i32 0, i32 0
+ %0 = load i8* %a, align 1
+ %conv = zext i8 %0 to i32
+ %a1 = getelementptr inbounds %struct.s2* %v2, i32 0, i32 0
+ %1 = load i16* %a1, align 2
+ %conv2 = sext i16 %1 to i32
+ %add = add nsw i32 %conv, %conv2
+ %a3 = getelementptr inbounds %struct.s3* %v3, i32 0, i32 0
+ %2 = load i16* %a3, align 2
+ %conv4 = sext i16 %2 to i32
+ %add5 = add nsw i32 %add, %conv4
+ %a6 = getelementptr inbounds %struct.s4* %v4, i32 0, i32 0
+ %3 = load i32* %a6, align 4
+ %add7 = add nsw i32 %add5, %3
+ %a8 = getelementptr inbounds %struct.s5* %v5, i32 0, i32 0
+ %4 = load i32* %a8, align 4
+ %add9 = add nsw i32 %add7, %4
+ %a10 = getelementptr inbounds %struct.s6* %v6, i32 0, i32 0
+ %5 = load i32* %a10, align 4
+ %add11 = add nsw i32 %add9, %5
+ %a12 = getelementptr inbounds %struct.s7* %v7, i32 0, i32 0
+ %6 = load i32* %a12, align 4
+ %add13 = add nsw i32 %add11, %6
+ ret i32 %add13
+
+; CHECK: lha {{[0-9]+}}, 126(1)
+; CHECK: lbz {{[0-9]+}}, 119(1)
+; CHECK: lha {{[0-9]+}}, 132(1)
+; CHECK: lwz {{[0-9]+}}, 140(1)
+; CHECK: lwz {{[0-9]+}}, 144(1)
+; CHECK: lwz {{[0-9]+}}, 152(1)
+; CHECK: lwz {{[0-9]+}}, 160(1)
+}
+
+define i32 @caller2() nounwind {
+entry:
+ %p1 = alloca %struct.t1, align 1
+ %p2 = alloca %struct.t2, align 1
+ %p3 = alloca %struct.t3, align 1
+ %p4 = alloca %struct.t4, align 1
+ %p5 = alloca %struct.t5, align 1
+ %p6 = alloca %struct.t6, align 1
+ %p7 = alloca %struct.t7, align 1
+ %0 = bitcast %struct.t1* %p1 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* getelementptr inbounds (%struct.t1* @caller2.p1, i32 0, i32 0), i64 1, i32 1, i1 false)
+ %1 = bitcast %struct.t2* %p2 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast ({ i16 }* @caller2.p2 to i8*), i64 2, i32 1, i1 false)
+ %2 = bitcast %struct.t3* %p3 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* bitcast (%struct.t3* @caller2.p3 to i8*), i64 3, i32 1, i1 false)
+ %3 = bitcast %struct.t4* %p4 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %3, i8* bitcast ({ i32 }* @caller2.p4 to i8*), i64 4, i32 1, i1 false)
+ %4 = bitcast %struct.t5* %p5 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %4, i8* bitcast (%struct.t5* @caller2.p5 to i8*), i64 5, i32 1, i1 false)
+ %5 = bitcast %struct.t6* %p6 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %5, i8* bitcast (%struct.t6* @caller2.p6 to i8*), i64 6, i32 1, i1 false)
+ %6 = bitcast %struct.t7* %p7 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %6, i8* bitcast (%struct.t7* @caller2.p7 to i8*), i64 7, i32 1, i1 false)
+ %call = call i32 @callee2(i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, %struct.t1* byval %p1, %struct.t2* byval %p2, %struct.t3* byval %p3, %struct.t4* byval %p4, %struct.t5* byval %p5, %struct.t6* byval %p6, %struct.t7* byval %p7)
+ ret i32 %call
+
+; CHECK: stb {{[0-9]+}}, 119(1)
+; CHECK: sth {{[0-9]+}}, 126(1)
+; CHECK: stb {{[0-9]+}}, 135(1)
+; CHECK: sth {{[0-9]+}}, 133(1)
+; CHECK: stw {{[0-9]+}}, 140(1)
+; CHECK: stb {{[0-9]+}}, 151(1)
+; CHECK: stw {{[0-9]+}}, 147(1)
+; CHECK: sth {{[0-9]+}}, 158(1)
+; CHECK: stw {{[0-9]+}}, 154(1)
+; CHECK: stb {{[0-9]+}}, 167(1)
+; CHECK: sth {{[0-9]+}}, 165(1)
+; CHECK: stw {{[0-9]+}}, 161(1)
+}
+
+define internal i32 @callee2(i32 %z1, i32 %z2, i32 %z3, i32 %z4, i32 %z5, i32 %z6, i32 %z7, i32 %z8, %struct.t1* byval %v1, %struct.t2* byval %v2, %struct.t3* byval %v3, %struct.t4* byval %v4, %struct.t5* byval %v5, %struct.t6* byval %v6, %struct.t7* byval %v7) nounwind {
+entry:
+ %z1.addr = alloca i32, align 4
+ %z2.addr = alloca i32, align 4
+ %z3.addr = alloca i32, align 4
+ %z4.addr = alloca i32, align 4
+ %z5.addr = alloca i32, align 4
+ %z6.addr = alloca i32, align 4
+ %z7.addr = alloca i32, align 4
+ %z8.addr = alloca i32, align 4
+ store i32 %z1, i32* %z1.addr, align 4
+ store i32 %z2, i32* %z2.addr, align 4
+ store i32 %z3, i32* %z3.addr, align 4
+ store i32 %z4, i32* %z4.addr, align 4
+ store i32 %z5, i32* %z5.addr, align 4
+ store i32 %z6, i32* %z6.addr, align 4
+ store i32 %z7, i32* %z7.addr, align 4
+ store i32 %z8, i32* %z8.addr, align 4
+ %a = getelementptr inbounds %struct.t1* %v1, i32 0, i32 0
+ %0 = load i8* %a, align 1
+ %conv = zext i8 %0 to i32
+ %a1 = getelementptr inbounds %struct.t2* %v2, i32 0, i32 0
+ %1 = load i16* %a1, align 1
+ %conv2 = sext i16 %1 to i32
+ %add = add nsw i32 %conv, %conv2
+ %a3 = getelementptr inbounds %struct.t3* %v3, i32 0, i32 0
+ %2 = load i16* %a3, align 1
+ %conv4 = sext i16 %2 to i32
+ %add5 = add nsw i32 %add, %conv4
+ %a6 = getelementptr inbounds %struct.t4* %v4, i32 0, i32 0
+ %3 = load i32* %a6, align 1
+ %add7 = add nsw i32 %add5, %3
+ %a8 = getelementptr inbounds %struct.t5* %v5, i32 0, i32 0
+ %4 = load i32* %a8, align 1
+ %add9 = add nsw i32 %add7, %4
+ %a10 = getelementptr inbounds %struct.t6* %v6, i32 0, i32 0
+ %5 = load i32* %a10, align 1
+ %add11 = add nsw i32 %add9, %5
+ %a12 = getelementptr inbounds %struct.t7* %v7, i32 0, i32 0
+ %6 = load i32* %a12, align 1
+ %add13 = add nsw i32 %add11, %6
+ ret i32 %add13
+
+; CHECK: lbz {{[0-9]+}}, 149(1)
+; CHECK: lbz {{[0-9]+}}, 150(1)
+; CHECK: lbz {{[0-9]+}}, 147(1)
+; CHECK: lbz {{[0-9]+}}, 148(1)
+; CHECK: lbz {{[0-9]+}}, 133(1)
+; CHECK: lbz {{[0-9]+}}, 134(1)
+; CHECK: lha {{[0-9]+}}, 126(1)
+; CHECK: lbz {{[0-9]+}}, 119(1)
+; CHECK: lwz {{[0-9]+}}, 140(1)
+; CHECK: lhz {{[0-9]+}}, 154(1)
+; CHECK: lhz {{[0-9]+}}, 156(1)
+; CHECK: lbz {{[0-9]+}}, 163(1)
+; CHECK: lbz {{[0-9]+}}, 164(1)
+; CHECK: lbz {{[0-9]+}}, 161(1)
+; CHECK: lbz {{[0-9]+}}, 162(1)
+}
diff --git a/test/CodeGen/PowerPC/structsinregs.ll b/test/CodeGen/PowerPC/structsinregs.ll
index ffd228bc81..43ba13b426 100644
--- a/test/CodeGen/PowerPC/structsinregs.ll
+++ b/test/CodeGen/PowerPC/structsinregs.ll
@@ -1,5 +1,9 @@
; RUN: llc -mcpu=pwr7 -O0 -disable-fp-elim < %s | FileCheck %s
+; FIXME: The code generation for packed structs is very poor because the
+; PowerPC target wrongly rejects all unaligned loads. This test case will
+; need to be revised when that is fixed.
+
target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
target triple = "powerpc64-unknown-linux-gnu"
@@ -100,14 +104,14 @@ entry:
; CHECK: std 9, 96(1)
; CHECK: std 8, 88(1)
; CHECK: std 7, 80(1)
-; CHECK: stw 6, 72(1)
-; CHECK: stw 5, 64(1)
-; CHECK: sth 4, 58(1)
-; CHECK: stb 3, 51(1)
-; CHECK: lha {{[0-9]+}}, 58(1)
-; CHECK: lbz {{[0-9]+}}, 51(1)
-; CHECK: lha {{[0-9]+}}, 64(1)
-; CHECK: lwz {{[0-9]+}}, 72(1)
+; CHECK: stw 6, 76(1)
+; CHECK: stw 5, 68(1)
+; CHECK: sth 4, 62(1)
+; CHECK: stb 3, 55(1)
+; CHECK: lha {{[0-9]+}}, 62(1)
+; CHECK: lbz {{[0-9]+}}, 55(1)
+; CHECK: lha {{[0-9]+}}, 68(1)
+; CHECK: lwz {{[0-9]+}}, 76(1)
; CHECK: lwz {{[0-9]+}}, 80(1)
; CHECK: lwz {{[0-9]+}}, 88(1)
; CHECK: lwz {{[0-9]+}}, 96(1)
@@ -188,18 +192,26 @@ entry:
; CHECK: sldi 8, 8, 16
; CHECK: sldi 7, 7, 24
; CHECK: sldi 5, 5, 40
-; CHECK: stw 6, 72(1)
-; CHECK: sth 4, 58(1)
-; CHECK: stb 3, 51(1)
+; CHECK: stw 6, 76(1)
+; CHECK: sth 4, 62(1)
+; CHECK: stb 3, 55(1)
; CHECK: std 9, 96(1)
; CHECK: std 8, 88(1)
; CHECK: std 7, 80(1)
; CHECK: std 5, 64(1)
-; CHECK: lha {{[0-9]+}}, 58(1)
-; CHECK: lbz {{[0-9]+}}, 51(1)
-; CHECK: lha {{[0-9]+}}, 64(1)
-; CHECK: lwz {{[0-9]+}}, 72(1)
-; CHECK: lwz {{[0-9]+}}, 80(1)
-; CHECK: lwz {{[0-9]+}}, 88(1)
-; CHECK: lwz {{[0-9]+}}, 96(1)
+; CHECK: lbz {{[0-9]+}}, 85(1)
+; CHECK: lbz {{[0-9]+}}, 86(1)
+; CHECK: lbz {{[0-9]+}}, 83(1)
+; CHECK: lbz {{[0-9]+}}, 84(1)
+; CHECK: lbz {{[0-9]+}}, 69(1)
+; CHECK: lbz {{[0-9]+}}, 70(1)
+; CHECK: lha {{[0-9]+}}, 62(1)
+; CHECK: lbz {{[0-9]+}}, 55(1)
+; CHECK: lwz {{[0-9]+}}, 76(1)
+; CHECK: lhz {{[0-9]+}}, 90(1)
+; CHECK: lhz {{[0-9]+}}, 92(1)
+; CHECK: lbz {{[0-9]+}}, 99(1)
+; CHECK: lbz {{[0-9]+}}, 100(1)
+; CHECK: lbz {{[0-9]+}}, 97(1)
+; CHECK: lbz {{[0-9]+}}, 98(1)
}