From cbf74cea070fa1f705de4712e25d9e56ae6543c7 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Mon, 30 May 2011 16:31:11 +0200
Subject: oprofile, x86: Add comments to IBS LVT offset initialization

Adding a comment in the code as IBS LVT setup is not obvious at all ...

Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 arch/x86/kernel/apic/apic.c      |  3 ++-
 arch/x86/oprofile/op_model_amd.c | 13 +++++++++----
 2 files changed, 11 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index fabf01eff77..a0bf78a0918 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -390,7 +390,8 @@ static unsigned int reserve_eilvt_offset(int offset, unsigned int new)
 
 /*
  * If mask=1, the LVT entry does not generate interrupts while mask=0
- * enables the vector. See also the BKDGs.
+ * enables the vector. See also the BKDGs. Must be called with
+ * preemption disabled.
  */
 
 int setup_APIC_eilvt(u8 offset, u8 vector, u8 msg_type, u8 mask)
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index 9fd8a567fe1..9cbb710dc94 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -609,16 +609,21 @@ static int setup_ibs_ctl(int ibs_eilvt_off)
 	return 0;
 }
 
+/*
+ * This runs only on the current cpu. We try to find an LVT offset and
+ * setup the local APIC. For this we must disable preemption. On
+ * success we initialize all nodes with this offset. This updates then
+ * the offset in the IBS_CTL per-node msr. The per-core APIC setup of
+ * the IBS interrupt vector is called from op_amd_setup_ctrs()/op_-
+ * amd_cpu_shutdown() using the new offset.
+ */
 static int force_ibs_eilvt_setup(void)
 {
 	int offset;
 	int ret;
 
-	/*
-	 * find the next free available EILVT entry, skip offset 0,
-	 * pin search to this cpu
-	 */
 	preempt_disable();
+	/* find the next free available EILVT entry, skip offset 0 */
 	for (offset = 1; offset < APIC_EILVT_NR_MAX; offset++) {
 		if (get_eilvt(offset))
 			break;
-- 
cgit v1.2.3-18-g5258


From 6e33a852a37dee02979ec9d82bea26c07cee5bce Mon Sep 17 00:00:00 2001
From: Márton Németh <nm127@freemail.hu>
Date: Sat, 14 May 2011 19:27:33 +0200
Subject: x86/PCI/ACPI: fix type mismatch
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The flags field of struct resource from linux/ioport.h is "unsigned
long". Change the "type" parameter of coalesce_windows() function to
match that field. This fixes the following warning messages when
compiling with "make C=1 W=1 bzImage modules":

arch/x86/pci/acpi.c: In function ‘coalesce_windows’:
arch/x86/pci/acpi.c:198: warning: conversion to ‘long unsigned int’ from ‘int’ may change the sign of the result
arch/x86/pci/acpi.c:203: warning: conversion to ‘long unsigned int’ from ‘int’ may change the sign of the result

Signed-off-by: Márton Németh <nm127@freemail.hu>
Signed-off-by: Jesse Barnes <jbarnes@virtuousgeek.org>
---
 arch/x86/pci/acpi.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 0972315c386..68c3c139520 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -188,7 +188,7 @@ static bool resource_contains(struct resource *res, resource_size_t point)
 	return false;
 }
 
-static void coalesce_windows(struct pci_root_info *info, int type)
+static void coalesce_windows(struct pci_root_info *info, unsigned long type)
 {
 	int i, j;
 	struct resource *res1, *res2;
-- 
cgit v1.2.3-18-g5258


From f124c6ae59e193705c9ddac57684d50006d710e6 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <error27@gmail.com>
Date: Fri, 3 Jun 2011 07:45:28 +0300
Subject: xen: off by one errors in multicalls.c

b->args[] has MC_ARGS elements, so the comparison here should be
">=" instead of ">".  Otherwise we read past the end of the array
one space.

CC: stable@kernel.org
Signed-off-by: Dan Carpenter <error27@gmail.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
---
 arch/x86/xen/multicalls.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
index 8bff7e7c290..1b2b73ff0a6 100644
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -189,10 +189,10 @@ struct multicall_space __xen_mc_entry(size_t args)
 	unsigned argidx = roundup(b->argidx, sizeof(u64));
 
 	BUG_ON(preemptible());
-	BUG_ON(b->argidx > MC_ARGS);
+	BUG_ON(b->argidx >= MC_ARGS);
 
 	if (b->mcidx == MC_BATCH ||
-	    (argidx + args) > MC_ARGS) {
+	    (argidx + args) >= MC_ARGS) {
 		mc_stats_flush(b->mcidx == MC_BATCH ? FL_SLOTS : FL_ARGS);
 		xen_mc_flush();
 		argidx = roundup(b->argidx, sizeof(u64));
@@ -206,7 +206,7 @@ struct multicall_space __xen_mc_entry(size_t args)
 	ret.args = &b->args[argidx];
 	b->argidx = argidx + args;
 
-	BUG_ON(b->argidx > MC_ARGS);
+	BUG_ON(b->argidx >= MC_ARGS);
 	return ret;
 }
 
@@ -216,7 +216,7 @@ struct multicall_space xen_mc_extend_args(unsigned long op, size_t size)
 	struct multicall_space ret = { NULL, NULL };
 
 	BUG_ON(preemptible());
-	BUG_ON(b->argidx > MC_ARGS);
+	BUG_ON(b->argidx >= MC_ARGS);
 
 	if (b->mcidx == 0)
 		return ret;
@@ -224,14 +224,14 @@ struct multicall_space xen_mc_extend_args(unsigned long op, size_t size)
 	if (b->entries[b->mcidx - 1].op != op)
 		return ret;
 
-	if ((b->argidx + size) > MC_ARGS)
+	if ((b->argidx + size) >= MC_ARGS)
 		return ret;
 
 	ret.mc = &b->entries[b->mcidx - 1];
 	ret.args = &b->args[b->argidx];
 	b->argidx += size;
 
-	BUG_ON(b->argidx > MC_ARGS);
+	BUG_ON(b->argidx >= MC_ARGS);
 	return ret;
 }
 
-- 
cgit v1.2.3-18-g5258


From 221192bdff2583834984639121595fc9296120d3 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Mon, 30 May 2011 15:23:14 -0300
Subject: KVM: x86: use proper port value when checking io instruction
 permission

Commit f6511935f42 moved the permission check for io instructions
to the ->check_perm callback. It failed to copy the port value from RDX
register for string and "in,out ax,dx" instructions.

Fix it by reading RDX register at decode stage when appropriate.

Fixes FC8.32 installation.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 82 +++++++++++++++++++++++++++++---------------------
 1 file changed, 47 insertions(+), 35 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index d6e2477feb1..6df88c7885c 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -47,38 +47,40 @@
 #define DstDI       (5<<1)	/* Destination is in ES:(E)DI */
 #define DstMem64    (6<<1)	/* 64bit memory operand */
 #define DstImmUByte (7<<1)	/* 8-bit unsigned immediate operand */
-#define DstMask     (7<<1)
+#define DstDX       (8<<1)	/* Destination is in DX register */
+#define DstMask     (0xf<<1)
 /* Source operand type. */
-#define SrcNone     (0<<4)	/* No source operand. */
-#define SrcReg      (1<<4)	/* Register operand. */
-#define SrcMem      (2<<4)	/* Memory operand. */
-#define SrcMem16    (3<<4)	/* Memory operand (16-bit). */
-#define SrcMem32    (4<<4)	/* Memory operand (32-bit). */
-#define SrcImm      (5<<4)	/* Immediate operand. */
-#define SrcImmByte  (6<<4)	/* 8-bit sign-extended immediate operand. */
-#define SrcOne      (7<<4)	/* Implied '1' */
-#define SrcImmUByte (8<<4)      /* 8-bit unsigned immediate operand. */
-#define SrcImmU     (9<<4)      /* Immediate operand, unsigned */
-#define SrcSI       (0xa<<4)	/* Source is in the DS:RSI */
-#define SrcImmFAddr (0xb<<4)	/* Source is immediate far address */
-#define SrcMemFAddr (0xc<<4)	/* Source is far address in memory */
-#define SrcAcc      (0xd<<4)	/* Source Accumulator */
-#define SrcImmU16   (0xe<<4)    /* Immediate operand, unsigned, 16 bits */
-#define SrcMask     (0xf<<4)
+#define SrcNone     (0<<5)	/* No source operand. */
+#define SrcReg      (1<<5)	/* Register operand. */
+#define SrcMem      (2<<5)	/* Memory operand. */
+#define SrcMem16    (3<<5)	/* Memory operand (16-bit). */
+#define SrcMem32    (4<<5)	/* Memory operand (32-bit). */
+#define SrcImm      (5<<5)	/* Immediate operand. */
+#define SrcImmByte  (6<<5)	/* 8-bit sign-extended immediate operand. */
+#define SrcOne      (7<<5)	/* Implied '1' */
+#define SrcImmUByte (8<<5)      /* 8-bit unsigned immediate operand. */
+#define SrcImmU     (9<<5)      /* Immediate operand, unsigned */
+#define SrcSI       (0xa<<5)	/* Source is in the DS:RSI */
+#define SrcImmFAddr (0xb<<5)	/* Source is immediate far address */
+#define SrcMemFAddr (0xc<<5)	/* Source is far address in memory */
+#define SrcAcc      (0xd<<5)	/* Source Accumulator */
+#define SrcImmU16   (0xe<<5)    /* Immediate operand, unsigned, 16 bits */
+#define SrcDX       (0xf<<5)	/* Source is in DX register */
+#define SrcMask     (0xf<<5)
 /* Generic ModRM decode. */
-#define ModRM       (1<<8)
+#define ModRM       (1<<9)
 /* Destination is only written; never read. */
-#define Mov         (1<<9)
-#define BitOp       (1<<10)
-#define MemAbs      (1<<11)      /* Memory operand is absolute displacement */
-#define String      (1<<12)     /* String instruction (rep capable) */
-#define Stack       (1<<13)     /* Stack instruction (push/pop) */
-#define GroupMask   (7<<14)     /* Opcode uses one of the group mechanisms */
-#define Group       (1<<14)     /* Bits 3:5 of modrm byte extend opcode */
-#define GroupDual   (2<<14)     /* Alternate decoding of mod == 3 */
-#define Prefix      (3<<14)     /* Instruction varies with 66/f2/f3 prefix */
-#define RMExt       (4<<14)     /* Opcode extension in ModRM r/m if mod == 3 */
-#define Sse         (1<<17)     /* SSE Vector instruction */
+#define Mov         (1<<10)
+#define BitOp       (1<<11)
+#define MemAbs      (1<<12)      /* Memory operand is absolute displacement */
+#define String      (1<<13)     /* String instruction (rep capable) */
+#define Stack       (1<<14)     /* Stack instruction (push/pop) */
+#define GroupMask   (7<<15)     /* Opcode uses one of the group mechanisms */
+#define Group       (1<<15)     /* Bits 3:5 of modrm byte extend opcode */
+#define GroupDual   (2<<15)     /* Alternate decoding of mod == 3 */
+#define Prefix      (3<<15)     /* Instruction varies with 66/f2/f3 prefix */
+#define RMExt       (4<<15)     /* Opcode extension in ModRM r/m if mod == 3 */
+#define Sse         (1<<18)     /* SSE Vector instruction */
 /* Misc flags */
 #define Prot        (1<<21) /* instruction generates #UD if not in prot-mode */
 #define VendorSpecific (1<<22) /* Vendor specific instruction */
@@ -3154,8 +3156,8 @@ static struct opcode opcode_table[256] = {
 	I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op),
 	I(SrcImmByte | Mov | Stack, em_push),
 	I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op),
-	D2bvIP(DstDI | Mov | String, ins, check_perm_in), /* insb, insw/insd */
-	D2bvIP(SrcSI | ImplicitOps | String, outs, check_perm_out), /* outsb, outsw/outsd */
+	D2bvIP(DstDI | SrcDX | Mov | String, ins, check_perm_in), /* insb, insw/insd */
+	D2bvIP(SrcSI | DstDX | String, outs, check_perm_out), /* outsb, outsw/outsd */
 	/* 0x70 - 0x7F */
 	X16(D(SrcImmByte)),
 	/* 0x80 - 0x87 */
@@ -3212,8 +3214,8 @@ static struct opcode opcode_table[256] = {
 	/* 0xE8 - 0xEF */
 	D(SrcImm | Stack), D(SrcImm | ImplicitOps),
 	D(SrcImmFAddr | No64), D(SrcImmByte | ImplicitOps),
-	D2bvIP(SrcNone | DstAcc,     in,  check_perm_in),
-	D2bvIP(SrcAcc | ImplicitOps, out, check_perm_out),
+	D2bvIP(SrcDX | DstAcc, in,  check_perm_in),
+	D2bvIP(SrcAcc | DstDX, out, check_perm_out),
 	/* 0xF0 - 0xF7 */
 	N, DI(ImplicitOps, icebp), N, N,
 	DI(ImplicitOps | Priv, hlt), D(ImplicitOps),
@@ -3613,6 +3615,12 @@ done_prefixes:
 		memop.bytes = c->op_bytes + 2;
 		goto srcmem_common;
 		break;
+	case SrcDX:
+		c->src.type = OP_REG;
+		c->src.bytes = 2;
+		c->src.addr.reg = &c->regs[VCPU_REGS_RDX];
+		fetch_register_operand(&c->src);
+		break;
 	}
 
 	if (rc != X86EMUL_CONTINUE)
@@ -3682,6 +3690,12 @@ done_prefixes:
 		c->dst.addr.mem.seg = VCPU_SREG_ES;
 		c->dst.val = 0;
 		break;
+	case DstDX:
+		c->dst.type = OP_REG;
+		c->dst.bytes = 2;
+		c->dst.addr.reg = &c->regs[VCPU_REGS_RDX];
+		fetch_register_operand(&c->dst);
+		break;
 	case ImplicitOps:
 		/* Special instructions do their own operand decoding. */
 	default:
@@ -4027,7 +4041,6 @@ special_insn:
 		break;
 	case 0xec: /* in al,dx */
 	case 0xed: /* in (e/r)ax,dx */
-		c->src.val = c->regs[VCPU_REGS_RDX];
 	do_io_in:
 		if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val,
 				     &c->dst.val))
@@ -4035,7 +4048,6 @@ special_insn:
 		break;
 	case 0xee: /* out dx,al */
 	case 0xef: /* out dx,(e/r)ax */
-		c->dst.val = c->regs[VCPU_REGS_RDX];
 	do_io_out:
 		ops->pio_out_emulated(ctxt, c->src.bytes, c->dst.val,
 				      &c->src.val, 1);
-- 
cgit v1.2.3-18-g5258


From 0de66d5b35ee148455e268b2782873204ffdef4b Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Mon, 6 Jun 2011 16:04:02 +0200
Subject: x86/amd-iommu: Fix 3 possible endless loops

The driver contains several loops counting on an u16 value
where the exit-condition is checked against variables that
can have values up to 0xffff. In this case the loops will
never exit. This patch fixed 3 such loops.

Cc: stable@kernel.org
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu_init.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 9179c21120a..bfc8453bd98 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -731,8 +731,8 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
 {
 	u8 *p = (u8 *)h;
 	u8 *end = p, flags = 0;
-	u16 dev_i, devid = 0, devid_start = 0, devid_to = 0;
-	u32 ext_flags = 0;
+	u16 devid = 0, devid_start = 0, devid_to = 0;
+	u32 dev_i, ext_flags = 0;
 	bool alias = false;
 	struct ivhd_entry *e;
 
@@ -887,7 +887,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
 /* Initializes the device->iommu mapping for the driver */
 static int __init init_iommu_devices(struct amd_iommu *iommu)
 {
-	u16 i;
+	u32 i;
 
 	for (i = iommu->first_device; i <= iommu->last_device; ++i)
 		set_iommu_for_device(iommu, i);
@@ -1177,7 +1177,7 @@ static int __init init_memory_definitions(struct acpi_table_header *table)
  */
 static void init_device_table(void)
 {
-	u16 devid;
+	u32 devid;
 
 	for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) {
 		set_dev_entry_bit(devid, DEV_ENTRY_VALID);
-- 
cgit v1.2.3-18-g5258


From 27c2127a15d340706c0aa84e311188a14468d841 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Mon, 30 May 2011 15:56:24 +0200
Subject: x86/amd-iommu: Use only per-device dma_ops

Unfortunatly there are systems where the AMD IOMMU does not
cover all devices. This breaks with the current driver as it
initializes the global dma_ops variable. This patch limits
the AMD IOMMU to the devices listed in the IVRS table fixing
DMA for devices not covered by the IOMMU.

Cc: stable@kernel.org
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index cd8cbeb5fa3..b2c309b0742 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -30,6 +30,7 @@
 #include <asm/proto.h>
 #include <asm/iommu.h>
 #include <asm/gart.h>
+#include <asm/dma.h>
 #include <asm/amd_iommu_proto.h>
 #include <asm/amd_iommu_types.h>
 #include <asm/amd_iommu.h>
@@ -2383,6 +2384,23 @@ static struct dma_map_ops amd_iommu_dma_ops = {
 	.dma_supported = amd_iommu_dma_supported,
 };
 
+static unsigned device_dma_ops_init(void)
+{
+	struct pci_dev *pdev = NULL;
+	unsigned unhandled = 0;
+
+	for_each_pci_dev(pdev) {
+		if (!check_device(&pdev->dev)) {
+			unhandled += 1;
+			continue;
+		}
+
+		pdev->dev.archdata.dma_ops = &amd_iommu_dma_ops;
+	}
+
+	return unhandled;
+}
+
 /*
  * The function which clues the AMD IOMMU driver into dma_ops.
  */
@@ -2395,7 +2413,7 @@ void __init amd_iommu_init_api(void)
 int __init amd_iommu_init_dma_ops(void)
 {
 	struct amd_iommu *iommu;
-	int ret;
+	int ret, unhandled;
 
 	/*
 	 * first allocate a default protection domain for every IOMMU we
@@ -2421,7 +2439,11 @@ int __init amd_iommu_init_dma_ops(void)
 	swiotlb = 0;
 
 	/* Make the driver finally visible to the drivers */
-	dma_ops = &amd_iommu_dma_ops;
+	unhandled = device_dma_ops_init();
+	if (unhandled && max_pfn > MAX_DMA32_PFN) {
+		/* There are unhandled devices - initialize swiotlb for them */
+		swiotlb = 1;
+	}
 
 	amd_iommu_stats_init();
 
-- 
cgit v1.2.3-18-g5258


From 26018874e3584f1658570d41d57d4c34f6a53aa0 Mon Sep 17 00:00:00 2001
From: Joerg Roedel <joerg.roedel@amd.com>
Date: Mon, 6 Jun 2011 16:50:14 +0200
Subject: x86/amd-iommu: Fix boot crash with hidden PCI devices

Some PCIe cards ship with a PCI-PCIe bridge which is not
visible as a PCI device in Linux. But the device-id of the
bridge is present in the IOMMU tables which causes a boot
crash in the IOMMU driver.
This patch fixes by removing these cards from the IOMMU
handling. This is a pure -stable fix, a real fix to handle
this situation appriatly will follow for the next merge
window.

Cc: stable@kernel.org	# > 2.6.32
Signed-off-by: Joerg Roedel <joerg.roedel@amd.com>
---
 arch/x86/kernel/amd_iommu.c | 22 +++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index b2c309b0742..7c3a95e54ec 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -155,6 +155,10 @@ static int iommu_init_device(struct device *dev)
 	pdev = pci_get_bus_and_slot(PCI_BUS(alias), alias & 0xff);
 	if (pdev)
 		dev_data->alias = &pdev->dev;
+	else {
+		kfree(dev_data);
+		return -ENOTSUPP;
+	}
 
 	atomic_set(&dev_data->bind, 0);
 
@@ -164,6 +168,20 @@ static int iommu_init_device(struct device *dev)
 	return 0;
 }
 
+static void iommu_ignore_device(struct device *dev)
+{
+	u16 devid, alias;
+
+	devid = get_device_id(dev);
+	alias = amd_iommu_alias_table[devid];
+
+	memset(&amd_iommu_dev_table[devid], 0, sizeof(struct dev_table_entry));
+	memset(&amd_iommu_dev_table[alias], 0, sizeof(struct dev_table_entry));
+
+	amd_iommu_rlookup_table[devid] = NULL;
+	amd_iommu_rlookup_table[alias] = NULL;
+}
+
 static void iommu_uninit_device(struct device *dev)
 {
 	kfree(dev->archdata.iommu);
@@ -193,7 +211,9 @@ int __init amd_iommu_init_devices(void)
 			continue;
 
 		ret = iommu_init_device(&pdev->dev);
-		if (ret)
+		if (ret == -ENOTSUPP)
+			iommu_ignore_device(&pdev->dev);
+		else if (ret)
 			goto out_free;
 	}
 
-- 
cgit v1.2.3-18-g5258


From fd8a7de177b6f56a0fc59ad211c197a7df06b1ad Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 20 Jul 2010 14:34:50 +0200
Subject: x86: cpu-hotplug: Prevent softirq wakeup on wrong CPU

After a newly plugged CPU sets the cpu_online bit it enables
interrupts and goes idle. The cpu which brought up the new cpu waits
for the cpu_online bit and when it observes it, it sets the cpu_active
bit for this cpu. The cpu_active bit is the relevant one for the
scheduler to consider the cpu as a viable target.

With forced threaded interrupt handlers which imply forced threaded
softirqs we observed the following race:

cpu 0                         cpu 1

bringup(cpu1);
                              set_cpu_online(smp_processor_id(), true);
		              local_irq_enable();
while (!cpu_online(cpu1));
                              timer_interrupt()
                                -> wake_up(softirq_thread_cpu1);
                                     -> enqueue_on(softirq_thread_cpu1, cpu0);

                                                                        ^^^^

cpu_notify(CPU_ONLINE, cpu1);
  -> sched_cpu_active(cpu1)
     -> set_cpu_active((cpu1, true);

When an interrupt happens before the cpu_active bit is set by the cpu
which brought up the newly onlined cpu, then the scheduler refuses to
enqueue the woken thread which is bound to that newly onlined cpu on
that newly onlined cpu due to the not yet set cpu_active bit and
selects a fallback runqueue. Not really an expected and desirable
behaviour.

So far this has only been observed with forced hard/softirq threading,
but in theory this could happen without forced threaded hard/softirqs
as well. It's probably unobservable as it would take a massive
interrupt storm on the newly onlined cpu which causes the softirq loop
to wake up the softirq thread and an even longer delay of the cpu
which waits for the cpu_online bit.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Peter Zijlstra <peterz@infradead.org>
Cc: stable@kernel.org # 2.6.39
---
 arch/x86/kernel/smpboot.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 33a0c11797d..9fd3137230d 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -285,6 +285,19 @@ notrace static void __cpuinit start_secondary(void *unused)
 	per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
 	x86_platform.nmi_init();
 
+	/*
+	 * Wait until the cpu which brought this one up marked it
+	 * online before enabling interrupts. If we don't do that then
+	 * we can end up waking up the softirq thread before this cpu
+	 * reached the active state, which makes the scheduler unhappy
+	 * and schedule the softirq thread on the wrong cpu. This is
+	 * only observable with forced threaded interrupts, but in
+	 * theory it could also happen w/o them. It's just way harder
+	 * to achieve.
+	 */
+	while (!cpumask_test_cpu(smp_processor_id(), cpu_active_mask))
+		cpu_relax();
+
 	/* enable local interrupts */
 	local_irq_enable();
 
-- 
cgit v1.2.3-18-g5258


From a91d92875ee94e4703fd017ccaadb48cfb344994 Mon Sep 17 00:00:00 2001
From: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Date: Fri, 3 Jun 2011 09:51:34 +0000
Subject: xen: partially revert "xen: set max_pfn_mapped to the last pfn
 mapped"

We only need to set max_pfn_mapped to the last pfn mapped on x86_64 to
make sure that cleanup_highmap doesn't remove important mappings at
_end.

We don't need to do this on x86_32 because cleanup_highmap is not called
on x86_32. Besides lowering max_pfn_mapped on x86_32 has the unwanted
side effect of limiting the amount of memory available for the 1:1
kernel pagetable allocation.

This patch reverts the x86_32 part of the original patch.

CC: stable@kernel.org
Signed-off-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/mmu.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index dc708dcc62f..afe1d54f980 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1599,6 +1599,11 @@ static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
 		for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
 			pte_t pte;
 
+#ifdef CONFIG_X86_32
+			if (pfn > max_pfn_mapped)
+				max_pfn_mapped = pfn;
+#endif
+
 			if (!pte_none(pte_page[pteidx]))
 				continue;
 
@@ -1766,7 +1771,9 @@ pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd,
 	initial_kernel_pmd =
 		extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
 
-	max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
+	max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
+				  xen_start_info->nr_pt_frames * PAGE_SIZE +
+				  512*1024);
 
 	kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
 	memcpy(initial_kernel_pmd, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
-- 
cgit v1.2.3-18-g5258


From 977cb76d52e7aa040e18a84b29fe6fd80d79319b Mon Sep 17 00:00:00 2001
From: Florian Fainelli <ffainelli@freebox.fr>
Date: Mon, 6 Jun 2011 10:15:49 +0200
Subject: x86: devicetree: Add missing early_init_dt_setup_initrd_arch stub

This patch fixes the following build failure:

drivers/built-in.o: In function `early_init_dt_check_for_initrd':
/home/florian/dev/kernel/x86/linux-2.6-x86/drivers/of/fdt.c:571:
undefined reference to `early_init_dt_setup_initrd_arch'
make: *** [.tmp_vmlinux1] Error 1

which happens as soon as we enable initrd support on a x86 devicetree
platform such as Intel CE4100.

Signed-off-by: Florian Fainelli <ffainelli@freebox.fr>
Acked-by: Grant Likely <grant.likely@secretlab.ca>
Cc: Maxime Bizon <mbizon@freebox.fr>
Acked-by: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
Cc: stable@kernel.org # 2.6.39
Link: http://lkml.kernel.org/r/201106061015.50039.ffainelli@freebox.fr
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/kernel/devicetree.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 690bc846183..9aeb78a23de 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -13,6 +13,7 @@
 #include <linux/slab.h>
 #include <linux/pci.h>
 #include <linux/of_pci.h>
+#include <linux/initrd.h>
 
 #include <asm/hpet.h>
 #include <asm/irq_controller.h>
@@ -98,6 +99,16 @@ void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align)
 	return __alloc_bootmem(size, align, __pa(MAX_DMA_ADDRESS));
 }
 
+#ifdef CONFIG_BLK_DEV_INITRD
+void __init early_init_dt_setup_initrd_arch(unsigned long start,
+					    unsigned long end)
+{
+	initrd_start = (unsigned long)__va(start);
+	initrd_end = (unsigned long)__va(end);
+	initrd_below_start_ok = 1;
+}
+#endif
+
 void __init add_dtb(u64 data)
 {
 	initial_dtb = data + offsetof(struct setup_data, data);
-- 
cgit v1.2.3-18-g5258


From dac853ae89043f1b7752875300faf614de43c74b Mon Sep 17 00:00:00 2001
From: Mathias Krause <minipli@googlemail.com>
Date: Thu, 9 Jun 2011 20:05:18 +0200
Subject: exec: delay address limit change until point of no return

Unconditionally changing the address limit to USER_DS and not restoring
it to its old value in the error path is wrong because it prevents us
using kernel memory on repeated calls to this function.  This, in fact,
breaks the fallback of hard coded paths to the init program from being
ever successful if the first candidate fails to load.

With this patch applied switching to USER_DS is delayed until the point
of no return is reached which makes it possible to have a multi-arch
rootfs with one arch specific init binary for each of the (hard coded)
probed paths.

Since the address limit is already set to USER_DS when start_thread()
will be invoked, this redundancy can be safely removed.

Signed-off-by: Mathias Krause <minipli@googlemail.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: stable@kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/process_32.c | 1 -
 arch/x86/kernel/process_64.c | 1 -
 2 files changed, 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 8d128783af4..a3d0dc59067 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -245,7 +245,6 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
 {
 	set_user_gs(regs, 0);
 	regs->fs		= 0;
-	set_fs(USER_DS);
 	regs->ds		= __USER_DS;
 	regs->es		= __USER_DS;
 	regs->ss		= __USER_DS;
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 6c9dd922ac0..ca6f7ab8df3 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -338,7 +338,6 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,
 	regs->cs		= _cs;
 	regs->ss		= _ss;
 	regs->flags		= X86_EFLAGS_IF;
-	set_fs(USER_DS);
 	/*
 	 * Free the old FP and other extended state
 	 */
-- 
cgit v1.2.3-18-g5258


From 7ad35cf288fd63a19bf50e490440a992de808b2b Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Wed, 25 May 2011 14:00:49 +1000
Subject: x86/uv/x2apic: update for change in pci bridge handling.

When I added 3448a19da479b6bd1e28e2a2be9fa16c6a6feb39
I forgot about the special uv handling code for this, so this
patch fixes it up.

Acked-by: Jesse Barnes <jbarnes@virtuousgeek.org>
Acked-by: Ingo Molnar
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 arch/x86/kernel/apic/x2apic_uv_x.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index b511a011b7d..adc66c3a1fe 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -632,14 +632,14 @@ late_initcall(uv_init_heartbeat);
 
 /* Direct Legacy VGA I/O traffic to designated IOH */
 int uv_set_vga_state(struct pci_dev *pdev, bool decode,
-		      unsigned int command_bits, bool change_bridge)
+		      unsigned int command_bits, u32 flags)
 {
 	int domain, bus, rc;
 
-	PR_DEVEL("devfn %x decode %d cmd %x chg_brdg %d\n",
-			pdev->devfn, decode, command_bits, change_bridge);
+	PR_DEVEL("devfn %x decode %d cmd %x flags %d\n",
+			pdev->devfn, decode, command_bits, flags);
 
-	if (!change_bridge)
+	if (!(flags & PCI_VGA_STATE_CHANGE_BRIDGE))
 		return 0;
 
 	if ((command_bits & PCI_COMMAND_IO) == 0)
-- 
cgit v1.2.3-18-g5258


From 60b8b1de0dd2bf246f0e074d287bb3f0bc42a755 Mon Sep 17 00:00:00 2001
From: Andy Whitcroft <apw@canonical.com>
Date: Tue, 14 Jun 2011 12:45:10 -0700
Subject: x86 idle: APM requires pm_idle/default_idle unconditionally when a
 module

[ Also from Ben Hutchings <ben@decadent.org.uk> and Vitaliy Ivanov
  <vitalivanov@gmail.com> ]

Commit 06ae40ce073d ("x86 idle: EXPORT_SYMBOL(default_idle, pm_idle)
only when APM demands it") removed the export for pm_idle/default_idle
unless the apm module was modularised and CONFIG_APM_CPU_IDLE was set.

But the apm module uses pm_idle/default_idle unconditionally,
CONFIG_APM_CPU_IDLE only affects the bios idle threshold.  Adjust the
export accordingly.

[ Used #ifdef instead of #if defined() as it's shorter, and what both
  Ben and Vitaliy used.. Andy, you're out-voted ;)    - Linus ]

Reported-by: Randy Dunlap <randy.dunlap@oracle.com>
Acked-by: Jiri Kosina <jkosina@suse.cz>
Acked-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Len Brown <len.brown@intel.com>
Signed-off-by: Andy Whitcroft <apw@canonical.com>
Signed-off-by: Vitaliy Ivanov <vitalivanov@gmail.com>
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/process.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 2e4928d45a2..e1ba8cb24e4 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -337,7 +337,7 @@ EXPORT_SYMBOL(boot_option_idle_override);
  * Powermanagement idle function, if any..
  */
 void (*pm_idle)(void);
-#if defined(CONFIG_APM_MODULE) && defined(CONFIG_APM_CPU_IDLE)
+#ifdef CONFIG_APM_MODULE
 EXPORT_SYMBOL(pm_idle);
 #endif
 
@@ -399,7 +399,7 @@ void default_idle(void)
 		cpu_relax();
 	}
 }
-#if defined(CONFIG_APM_MODULE) && defined(CONFIG_APM_CPU_IDLE)
+#ifdef CONFIG_APM_MODULE
 EXPORT_SYMBOL(default_idle);
 #endif
 
-- 
cgit v1.2.3-18-g5258


From 8fe7e94eb71430cf63a742f3c19739d82a662758 Mon Sep 17 00:00:00 2001
From: Robert Richter <robert.richter@amd.com>
Date: Wed, 1 Jun 2011 15:31:44 +0200
Subject: oprofile, x86: Fix race in nmi handler while starting counters

In some rare cases, nmis are generated immediately after the nmi
handler of the cpu was started. This causes the counter not to be
enabled. Before enabling the nmi handlers we need to set variable
ctr_running first and make sure its value is written to memory.

Also, the patch makes all existing barriers a memory barrier instead
of a compiler barrier only.

Reported-by: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
Cc: <stable@kernel.org> # .35+
Signed-off-by: Robert Richter <robert.richter@amd.com>
---
 arch/x86/oprofile/nmi_int.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index cf9750004a0..68894fdc034 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -112,8 +112,10 @@ static void nmi_cpu_start(void *dummy)
 static int nmi_start(void)
 {
 	get_online_cpus();
-	on_each_cpu(nmi_cpu_start, NULL, 1);
 	ctr_running = 1;
+	/* make ctr_running visible to the nmi handler: */
+	smp_mb();
+	on_each_cpu(nmi_cpu_start, NULL, 1);
 	put_online_cpus();
 	return 0;
 }
@@ -504,15 +506,18 @@ static int nmi_setup(void)
 
 	nmi_enabled = 0;
 	ctr_running = 0;
-	barrier();
+	/* make variables visible to the nmi handler: */
+	smp_mb();
 	err = register_die_notifier(&profile_exceptions_nb);
 	if (err)
 		goto fail;
 
 	get_online_cpus();
 	register_cpu_notifier(&oprofile_cpu_nb);
-	on_each_cpu(nmi_cpu_setup, NULL, 1);
 	nmi_enabled = 1;
+	/* make nmi_enabled visible to the nmi handler: */
+	smp_mb();
+	on_each_cpu(nmi_cpu_setup, NULL, 1);
 	put_online_cpus();
 
 	return 0;
@@ -531,7 +536,8 @@ static void nmi_shutdown(void)
 	nmi_enabled = 0;
 	ctr_running = 0;
 	put_online_cpus();
-	barrier();
+	/* make variables visible to the nmi handler: */
+	smp_mb();
 	unregister_die_notifier(&profile_exceptions_nb);
 	msrs = &get_cpu_var(cpu_msrs);
 	model->shutdown(msrs);
-- 
cgit v1.2.3-18-g5258


From 900cba8881b39dfbc7c8062098504ab93f5387a8 Mon Sep 17 00:00:00 2001
From: Andrew Jones <drjones@redhat.com>
Date: Fri, 18 Dec 2009 10:31:31 +0100
Subject: xen: support CONFIG_MAXSMP

The MAXSMP config option requires CPUMASK_OFFSTACK, which in turn
requires we init the memory for the maps while we bring up the cpus.
MAXSMP also increases NR_CPUS to 4096. This increase in size exposed an
issue in the argument construction for multicalls from
xen_flush_tlb_others. The args should only need space for the actual
number of cpus.

Also in 2.6.39 it exposes a bootup problem.

BUG: unable to handle kernel NULL pointer dereference at           (null)
IP: [<ffffffff8157a1d3>] set_cpu_sibling_map+0x123/0x30d
...
Call Trace:
[<ffffffff81039a3f>] ? xen_restore_fl_direct_reloc+0x4/0x4
[<ffffffff819dc4db>] xen_smp_prepare_cpus+0x36/0x135
..

CC: stable@kernel.org
Signed-off-by: Andrew Jones <drjones@redhat.com>
[v2: Updated to compile on 3.0]
[v3: Updated to compile when CONFIG_SMP is not defined]
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/mmu.c | 3 ++-
 arch/x86/xen/smp.c | 7 +++++++
 2 files changed, 9 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index afe1d54f980..673e968df3c 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -59,6 +59,7 @@
 #include <asm/page.h>
 #include <asm/init.h>
 #include <asm/pat.h>
+#include <asm/smp.h>
 
 #include <asm/xen/hypercall.h>
 #include <asm/xen/hypervisor.h>
@@ -1231,7 +1232,7 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
 {
 	struct {
 		struct mmuext_op op;
-		DECLARE_BITMAP(mask, NR_CPUS);
+		DECLARE_BITMAP(mask, num_processors);
 	} *args;
 	struct multicall_space mcs;
 
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 41038c01de4..b4533a86d7e 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -205,11 +205,18 @@ static void __init xen_smp_prepare_boot_cpu(void)
 static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
 {
 	unsigned cpu;
+	unsigned int i;
 
 	xen_init_lock_cpu(0);
 
 	smp_store_cpu_info(0);
 	cpu_data(0).x86_max_cores = 1;
+
+	for_each_possible_cpu(i) {
+		zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
+		zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
+		zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
+	}
 	set_cpu_sibling_map(0);
 
 	if (xen_smp_intr_init(0))
-- 
cgit v1.2.3-18-g5258


From b2abe50688dcb470e2e46109da7e7e02245ed59b Mon Sep 17 00:00:00 2001
From: Tom Goetz <tom.goetz@virtualcomputer.com>
Date: Mon, 16 May 2011 15:06:26 -0400
Subject: xen: When calling power_off, don't call the halt function.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

.. As it won't actually power off the machine.

Reported-by: Sven Köhler <sven.koehler@gmail.com>
Tested-by: Sven Köhler <sven.koehler@gmail.com>
Signed-off-by: Tom Goetz <tom.goetz@virtualcomputer.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/enlighten.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index dd7b88f2ec7..5525163a039 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1033,6 +1033,13 @@ static void xen_machine_halt(void)
 	xen_reboot(SHUTDOWN_poweroff);
 }
 
+static void xen_machine_power_off(void)
+{
+	if (pm_power_off)
+		pm_power_off();
+	xen_reboot(SHUTDOWN_poweroff);
+}
+
 static void xen_crash_shutdown(struct pt_regs *regs)
 {
 	xen_reboot(SHUTDOWN_crash);
@@ -1058,7 +1065,7 @@ int xen_panic_handler_init(void)
 static const struct machine_ops xen_machine_ops __initconst = {
 	.restart = xen_restart,
 	.halt = xen_machine_halt,
-	.power_off = xen_machine_halt,
+	.power_off = xen_machine_power_off,
 	.shutdown = xen_machine_halt,
 	.crash_shutdown = xen_crash_shutdown,
 	.emergency_restart = xen_emergency_restart,
-- 
cgit v1.2.3-18-g5258


From acd049c6e99d2ad1195666195230f6881d1c1588 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Thu, 16 Jun 2011 13:07:19 -0400
Subject: xen/setup: Fix for incorrect xen_extra_mem_start.

The earlier attempts (24bdb0b62cc82120924762ae6bc85afc8c3f2b26)
at fixing this problem caused other problems to surface (PV guests
with no PCI passthrough would have SWIOTLB turned on - which meant
64MB of precious contingous DMA32 memory being eaten up per guest).
The problem was: "on xen we add an extra memory region at the end of
the e820, and on this particular machine this extra memory region
would start below 4g and cross over the 4g boundary:

[0xfee01000-0x192655000)

Unfortunately e820_end_of_low_ram_pfn does not expect an
e820 layout like that so it returns 4g, therefore initial_memory_mapping
will map [0 - 0x100000000), that is a memory range that includes some
reserved memory regions."

The memory range was the IOAPIC regions, and with the 1-1 mapping
turned on, it would map them as RAM, not as MMIO regions. This caused
the hypervisor to complain. Fortunately this is experienced only under
the initial domain so we guard for it.

Acked-by: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/setup.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index be1a464f6d6..60aeeb56948 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -227,11 +227,7 @@ char * __init xen_memory_setup(void)
 
 	memcpy(map_raw, map, sizeof(map));
 	e820.nr_map = 0;
-#ifdef CONFIG_X86_32
 	xen_extra_mem_start = mem_end;
-#else
-	xen_extra_mem_start = max((1ULL << 32), mem_end);
-#endif
 	for (i = 0; i < memmap.nr_entries; i++) {
 		unsigned long long end;
 
@@ -266,6 +262,12 @@ char * __init xen_memory_setup(void)
 		if (map[i].size > 0)
 			e820_add_region(map[i].addr, map[i].size, map[i].type);
 	}
+	/* Align the balloon area so that max_low_pfn does not get set
+	 * to be at the _end_ of the PCI gap at the far end (fee01000).
+	 * Note that xen_extra_mem_start gets set in the loop above to be
+	 * past the last E820 region. */
+	if (xen_initial_domain() && (xen_extra_mem_start < (1ULL<<32)))
+		xen_extra_mem_start = (1ULL<<32);
 
 	/*
 	 * In domU, the ISA region is normal, usable memory, but we
-- 
cgit v1.2.3-18-g5258


From 7d68dc3f1003a38948c55c803c32d1989dd49198 Mon Sep 17 00:00:00 2001
From: Maarten Lankhorst <m.b.lankhorst@gmail.com>
Date: Tue, 14 Jun 2011 19:53:09 +0200
Subject: x86, efi: Do not reserve boot services regions within reserved areas

Commit 916f676f8dc started reserving boot service code since some systems
require you to keep that code around until SetVirtualAddressMap is called.

However, in some cases those areas will overlap with reserved regions.
The proper medium-term fix is to fix the bootloader to prevent the
conflicts from occurring by moving the kernel to a better position,
but the kernel should check for this possibility, and only reserve regions
which can be reserved.

Signed-off-by: Maarten Lankhorst <m.b.lankhorst@gmail.com>
Link: http://lkml.kernel.org/r/4DF7A005.1050407@gmail.com
Acked-by: Matthew Garrett <mjg@redhat.com>
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/memblock.h |  2 +-
 arch/x86/mm/memblock.c          |  4 ++--
 arch/x86/platform/efi/efi.c     | 29 +++++++++++++++++++++++++----
 3 files changed, 28 insertions(+), 7 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/memblock.h b/arch/x86/include/asm/memblock.h
index 19ae14ba697..0cd3800f33b 100644
--- a/arch/x86/include/asm/memblock.h
+++ b/arch/x86/include/asm/memblock.h
@@ -4,7 +4,6 @@
 #define ARCH_DISCARD_MEMBLOCK
 
 u64 memblock_x86_find_in_range_size(u64 start, u64 *sizep, u64 align);
-void memblock_x86_to_bootmem(u64 start, u64 end);
 
 void memblock_x86_reserve_range(u64 start, u64 end, char *name);
 void memblock_x86_free_range(u64 start, u64 end);
@@ -19,5 +18,6 @@ u64 memblock_x86_hole_size(u64 start, u64 end);
 u64 memblock_x86_find_in_range_node(int nid, u64 start, u64 end, u64 size, u64 align);
 u64 memblock_x86_free_memory_in_range(u64 addr, u64 limit);
 u64 memblock_x86_memory_in_range(u64 addr, u64 limit);
+bool memblock_x86_check_reserved_size(u64 *addrp, u64 *sizep, u64 align);
 
 #endif
diff --git a/arch/x86/mm/memblock.c b/arch/x86/mm/memblock.c
index aa1169392b8..992da5ec5a6 100644
--- a/arch/x86/mm/memblock.c
+++ b/arch/x86/mm/memblock.c
@@ -8,7 +8,7 @@
 #include <linux/range.h>
 
 /* Check for already reserved areas */
-static bool __init check_with_memblock_reserved_size(u64 *addrp, u64 *sizep, u64 align)
+bool __init memblock_x86_check_reserved_size(u64 *addrp, u64 *sizep, u64 align)
 {
 	struct memblock_region *r;
 	u64 addr = *addrp, last;
@@ -59,7 +59,7 @@ u64 __init memblock_x86_find_in_range_size(u64 start, u64 *sizep, u64 align)
 		if (addr >= ei_last)
 			continue;
 		*sizep = ei_last - addr;
-		while (check_with_memblock_reserved_size(&addr, sizep, align))
+		while (memblock_x86_check_reserved_size(&addr, sizep, align))
 			;
 
 		if (*sizep)
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 0d3a4fa3456..474356b98ed 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -310,14 +310,31 @@ void __init efi_reserve_boot_services(void)
 
 	for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
 		efi_memory_desc_t *md = p;
-		unsigned long long start = md->phys_addr;
-		unsigned long long size = md->num_pages << EFI_PAGE_SHIFT;
+		u64 start = md->phys_addr;
+		u64 size = md->num_pages << EFI_PAGE_SHIFT;
 
 		if (md->type != EFI_BOOT_SERVICES_CODE &&
 		    md->type != EFI_BOOT_SERVICES_DATA)
 			continue;
-
-		memblock_x86_reserve_range(start, start + size, "EFI Boot");
+		/* Only reserve where possible:
+		 * - Not within any already allocated areas
+		 * - Not over any memory area (really needed, if above?)
+		 * - Not within any part of the kernel
+		 * - Not the bios reserved area
+		*/
+		if ((start+size >= virt_to_phys(_text)
+				&& start <= virt_to_phys(_end)) ||
+			!e820_all_mapped(start, start+size, E820_RAM) ||
+			memblock_x86_check_reserved_size(&start, &size,
+							1<<EFI_PAGE_SHIFT)) {
+			/* Could not reserve, skip it */
+			md->num_pages = 0;
+			memblock_dbg(PFX "Could not reserve boot range "
+					"[0x%010llx-0x%010llx]\n",
+						start, start+size-1);
+		} else
+			memblock_x86_reserve_range(start, start+size,
+							"EFI Boot");
 	}
 }
 
@@ -334,6 +351,10 @@ static void __init efi_free_boot_services(void)
 		    md->type != EFI_BOOT_SERVICES_DATA)
 			continue;
 
+		/* Could not reserve boot area */
+		if (!size)
+			continue;
+
 		free_bootmem_late(start, size);
 	}
 }
-- 
cgit v1.2.3-18-g5258


From b72336355bb4c92d4a2be3f975dbea47089c83c1 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@alien8.de>
Date: Mon, 30 May 2011 22:11:17 +0200
Subject: KVM: MMU: Fix build warnings in walk_addr_generic()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On 3.0-rc1 I get

In file included from arch/x86/kvm/mmu.c:2856:
arch/x86/kvm/paging_tmpl.h: In function ‘paging32_walk_addr_generic’:
arch/x86/kvm/paging_tmpl.h:124: warning: ‘ptep_user’ may be used uninitialized in this function
In file included from arch/x86/kvm/mmu.c:2852:
arch/x86/kvm/paging_tmpl.h: In function ‘paging64_walk_addr_generic’:
arch/x86/kvm/paging_tmpl.h:124: warning: ‘ptep_user’ may be used uninitialized in this function

caused by 6e2ca7d1802bf8ed9908435e34daa116662e7790. According to Takuya
Yoshikawa, ptep_user won't be used uninitialized so shut up gcc.

Cc: Takuya Yoshikawa <yoshikawa.takuya@oss.ntt.co.jp>
Link: http://lkml.kernel.org/r/20110530094604.GC21833@liondog.tnic
Signed-off-by: Borislav Petkov <bp@alien8.de>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/paging_tmpl.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 6c4dc010c4c..9d03ad4dd5e 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -121,7 +121,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 				    gva_t addr, u32 access)
 {
 	pt_element_t pte;
-	pt_element_t __user *ptep_user;
+	pt_element_t __user *uninitialized_var(ptep_user);
 	gfn_t table_gfn;
 	unsigned index, pt_access, uninitialized_var(pte_access);
 	gpa_t pte_gpa;
-- 
cgit v1.2.3-18-g5258


From 5233dd51ece1615d54ab96c4cbe9ac3cc595e955 Mon Sep 17 00:00:00 2001
From: Marcelo Tosatti <mtosatti@redhat.com>
Date: Mon, 6 Jun 2011 14:27:47 -0300
Subject: KVM: VMX: do not overwrite uptodate vcpu->arch.cr3 on KVM_SET_SREGS

Only decache guest CR3 value if vcpu->arch.cr3 is stale.
Fixes loadvm with live guest.

Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
Tested-by: Markus Schade <markus.schade@gmail.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/vmx.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 4c3fa0f6746..d48ec60ea42 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2047,7 +2047,8 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
 					unsigned long cr0,
 					struct kvm_vcpu *vcpu)
 {
-	vmx_decache_cr3(vcpu);
+	if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
+		vmx_decache_cr3(vcpu);
 	if (!(cr0 & X86_CR0_PG)) {
 		/* From paging/starting to nonpaging */
 		vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
-- 
cgit v1.2.3-18-g5258


From a0a8eaba1661232f094654422bdabe2df4e26863 Mon Sep 17 00:00:00 2001
From: Steve <stefan.bosak@gmail.com>
Date: Fri, 17 Jun 2011 10:25:39 +0800
Subject: KVM: MMU: fix opposite condition in mapping_level_dirty_bitmap

The condition is opposite, it always maps huge page for the dirty tracked page

Reported-by: Steve <stefan.bosak@gmail.com>
Signed-off-by: Steve <stefan.bosak@gmail.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/kvm/mmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index bd14bb4c859..aee38623b76 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -565,7 +565,7 @@ gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
 
 static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn)
 {
-	return gfn_to_memslot_dirty_bitmap(vcpu, large_gfn, true);
+	return !gfn_to_memslot_dirty_bitmap(vcpu, large_gfn, true);
 }
 
 static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
-- 
cgit v1.2.3-18-g5258


From de2d1a524e94a79078d9fe22c57c0c6009237547 Mon Sep 17 00:00:00 2001
From: Zachary Amsden <zamsden@redhat.com>
Date: Wed, 15 Jun 2011 20:50:04 -0700
Subject: KVM: Fix register corruption in pvclock_scale_delta

The 128-bit multiply in pvclock.h was missing an output constraint for
EDX which caused a register corruption to appear.  Thanks to Ulrich for
diagnosing the EDX corruption and Avi for providing this fix.

Signed-off-by: Zachary Amsden <zamsden@redhat.com>
Signed-off-by: Avi Kivity <avi@redhat.com>
---
 arch/x86/include/asm/pvclock.h | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index 31d84acc151..a518c0a4504 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -22,6 +22,8 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
 	u64 product;
 #ifdef __i386__
 	u32 tmp1, tmp2;
+#else
+	ulong tmp;
 #endif
 
 	if (shift < 0)
@@ -42,8 +44,11 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
 		: "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
 #elif defined(__x86_64__)
 	__asm__ (
-		"mul %%rdx ; shrd $32,%%rdx,%%rax"
-		: "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
+		"mul %[mul_frac] ; shrd $32, %[hi], %[lo]"
+		: [lo]"=a"(product),
+		  [hi]"=d"(tmp)
+		: "0"(delta),
+		  [mul_frac]"rm"((u64)mul_frac));
 #else
 #error implement me!
 #endif
-- 
cgit v1.2.3-18-g5258


From c6830c22603aaecf65405af23f6da2d55892f9cb Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Thu, 16 Jun 2011 17:28:07 +0900
Subject: Fix node_start/end_pfn() definition for mm/page_cgroup.c

commit 21a3c96 uses node_start/end_pfn(nid) for detection start/end
of nodes. But, it's not defined in linux/mmzone.h but defined in
/arch/???/include/mmzone.h which is included only under
CONFIG_NEED_MULTIPLE_NODES=y.

Then, we see
  mm/page_cgroup.c: In function 'page_cgroup_init':
  mm/page_cgroup.c:308: error: implicit declaration of function 'node_start_pfn'
  mm/page_cgroup.c:309: error: implicit declaration of function 'node_end_pfn'

So, fixiing page_cgroup.c is an idea...

But node_start_pfn()/node_end_pfn() is a very generic macro and
should be implemented in the same manner for all archs.
(m32r has different implementation...)

This patch removes definitions of node_start/end_pfn() in each archs
and defines a unified one in linux/mmzone.h. It's not under
CONFIG_NEED_MULTIPLE_NODES, now.

A result of macro expansion is here (mm/page_cgroup.c)

for !NUMA
 start_pfn = ((&contig_page_data)->node_start_pfn);
  end_pfn = ({ pg_data_t *__pgdat = (&contig_page_data); __pgdat->node_start_pfn + __pgdat->node_spanned_pages;});

for NUMA (x86-64)
  start_pfn = ((node_data[nid])->node_start_pfn);
  end_pfn = ({ pg_data_t *__pgdat = (node_data[nid]); __pgdat->node_start_pfn + __pgdat->node_spanned_pages;});

Changelog:
 - fixed to avoid using "nid" twice in node_end_pfn() macro.

Reported-and-acked-by: Randy Dunlap <randy.dunlap@oracle.com>
Reported-and-tested-by: Ingo Molnar <mingo@elte.hu>
Acked-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/include/asm/mmzone_32.h | 11 -----------
 arch/x86/include/asm/mmzone_64.h |  3 ---
 2 files changed, 14 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h
index 5e83a416eca..224e8c5eb30 100644
--- a/arch/x86/include/asm/mmzone_32.h
+++ b/arch/x86/include/asm/mmzone_32.h
@@ -48,17 +48,6 @@ static inline int pfn_to_nid(unsigned long pfn)
 #endif
 }
 
-/*
- * Following are macros that each numa implmentation must define.
- */
-
-#define node_start_pfn(nid)	(NODE_DATA(nid)->node_start_pfn)
-#define node_end_pfn(nid)						\
-({									\
-	pg_data_t *__pgdat = NODE_DATA(nid);				\
-	__pgdat->node_start_pfn + __pgdat->node_spanned_pages;		\
-})
-
 static inline int pfn_valid(int pfn)
 {
 	int nid = pfn_to_nid(pfn);
diff --git a/arch/x86/include/asm/mmzone_64.h b/arch/x86/include/asm/mmzone_64.h
index b3f88d7867c..129d9aa3ceb 100644
--- a/arch/x86/include/asm/mmzone_64.h
+++ b/arch/x86/include/asm/mmzone_64.h
@@ -13,8 +13,5 @@ extern struct pglist_data *node_data[];
 
 #define NODE_DATA(nid)		(node_data[nid])
 
-#define node_start_pfn(nid)	(NODE_DATA(nid)->node_start_pfn)
-#define node_end_pfn(nid)       (NODE_DATA(nid)->node_start_pfn +	\
-				 NODE_DATA(nid)->node_spanned_pages)
 #endif
 #endif /* _ASM_X86_MMZONE_64_H */
-- 
cgit v1.2.3-18-g5258


From e376fd664b1547e29e264e3cfb97553a1be9054b Mon Sep 17 00:00:00 2001
From: Jesper Juhl <jj@chaosbits.net>
Date: Thu, 26 May 2011 11:12:53 +0200
Subject: watchdog: Intel SCU Watchdog: Fix build and remove duplicate code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Trying to build the Intel SCU Watchdog fails for me with gcc 4.6.0 -
$ gcc --version | head -n 1
gcc (GCC) 4.6.0 20110513 (prerelease)

like this :
  CC      drivers/watchdog/intel_scu_watchdog.o
In file included from drivers/watchdog/intel_scu_watchdog.c:49:0:
/home/jj/src/linux-2.6/arch/x86/include/asm/apb_timer.h: In function ‘apbt_time_init’:
/home/jj/src/linux-2.6/arch/x86/include/asm/apb_timer.h:65:42: warning: ‘return’ with a value, in function returning void [enabled by default]
drivers/watchdog/intel_scu_watchdog.c: In function ‘intel_scu_watchdog_init’:
drivers/watchdog/intel_scu_watchdog.c:468:2: error: implicit declaration of function ‘sfi_get_mtmr’ [-Werror=implicit-function-declaration]
drivers/watchdog/intel_scu_watchdog.c:468:32: warning: assignment makes pointer from integer without a cast [enabled by default]
cc1: some warnings being treated as errors

make[1]: *** [drivers/watchdog/intel_scu_watchdog.o] Error 1
make: *** [drivers/watchdog/intel_scu_watchdog.o] Error 2

Additionally, linux/types.h is needlessly being included twice in
drivers/watchdog/intel_scu_watchdog.c

Signed-off-by: Jesper Juhl <jj@chaosbits.net>
Signed-off-by: Wim Van Sebroeck <wim@iguana.be>
---
 arch/x86/include/asm/apb_timer.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'arch/x86')

diff --git a/arch/x86/include/asm/apb_timer.h b/arch/x86/include/asm/apb_timer.h
index 2fefa501d3b..af60d8a2e28 100644
--- a/arch/x86/include/asm/apb_timer.h
+++ b/arch/x86/include/asm/apb_timer.h
@@ -62,7 +62,7 @@ extern int sfi_mtimer_num;
 #else /* CONFIG_APB_TIMER */
 
 static inline unsigned long apbt_quick_calibrate(void) {return 0; }
-static inline void apbt_time_init(void) {return 0; }
+static inline void apbt_time_init(void) { }
 
 #endif
 #endif /* ASM_X86_APBT_H */
-- 
cgit v1.2.3-18-g5258


From cb16c348760ad2bc79b67b20aefac05529569ed7 Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@redhat.com>
Date: Sun, 19 Jun 2011 19:21:11 +0300
Subject: KVM: x86 emulator: fix %rip-relative addressing with immediate source
 operand

%rip-relative addressing is relative to the first byte of the next instruction,
so we need to add %rip only after we've fetched any immediate bytes.

Based on original patch by Li Xin <xin.li@intel.com>.

Signed-off-by: Avi Kivity <avi@redhat.com>
Acked-by: Li Xin <xin.li@intel.com>
Signed-off-by: Marcelo Tosatti <mtosatti@redhat.com>
---
 arch/x86/kvm/emulate.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 6df88c7885c..adc98675cda 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -3372,7 +3372,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
 	int def_op_bytes, def_ad_bytes, goffset, simd_prefix;
 	bool op_prefix = false;
 	struct opcode opcode;
-	struct operand memop = { .type = OP_NONE };
+	struct operand memop = { .type = OP_NONE }, *memopp = NULL;
 
 	c->eip = ctxt->eip;
 	c->fetch.start = c->eip;
@@ -3547,9 +3547,6 @@ done_prefixes:
 	if (memop.type == OP_MEM && c->ad_bytes != 8)
 		memop.addr.mem.ea = (u32)memop.addr.mem.ea;
 
-	if (memop.type == OP_MEM && c->rip_relative)
-		memop.addr.mem.ea += c->eip;
-
 	/*
 	 * Decode and fetch the source operand: register, memory
 	 * or immediate.
@@ -3571,6 +3568,7 @@ done_prefixes:
 							   c->op_bytes;
 	srcmem_common:
 		c->src = memop;
+		memopp = &c->src;
 		break;
 	case SrcImmU16:
 		rc = decode_imm(ctxt, &c->src, 2, false);
@@ -3667,6 +3665,7 @@ done_prefixes:
 	case DstMem:
 	case DstMem64:
 		c->dst = memop;
+		memopp = &c->dst;
 		if ((c->d & DstMask) == DstMem64)
 			c->dst.bytes = 8;
 		else
@@ -3700,10 +3699,13 @@ done_prefixes:
 		/* Special instructions do their own operand decoding. */
 	default:
 		c->dst.type = OP_NONE; /* Disable writeback. */
-		return 0;
+		break;
 	}
 
 done:
+	if (memopp && memopp->type == OP_MEM && c->rip_relative)
+		memopp->addr.mem.ea += c->eip;
+
 	return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
 }
 
-- 
cgit v1.2.3-18-g5258


From 32dd11942aeb47f91209a446d6b10063c5b69389 Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Thu, 30 Jun 2011 09:12:40 -0400
Subject: xen/mmu: Fix for linker errors when CONFIG_SMP is not defined.

Simple enough - we use an extern defined symbol which is not
defined when CONFIG_SMP is not defined. This fixes the linker
dying.

CC: stable@kernel.org
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/xen/mmu.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'arch/x86')

diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 673e968df3c..0ccccb67a99 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -1232,7 +1232,11 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
 {
 	struct {
 		struct mmuext_op op;
+#ifdef CONFIG_SMP
 		DECLARE_BITMAP(mask, num_processors);
+#else
+		DECLARE_BITMAP(mask, NR_CPUS);
+#endif
 	} *args;
 	struct multicall_space mcs;
 
-- 
cgit v1.2.3-18-g5258


From 155a16f21923bc2f04161ac92acca986371ef27b Mon Sep 17 00:00:00 2001
From: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Date: Thu, 30 Jun 2011 09:18:27 -0400
Subject: xen/pci: Use the INT_SRC_OVR IRQ (instead of GSI) to preset the ACPI
 SCI IRQ.

In the past we would use the GSI value to preset the ACPI SCI
IRQ which worked great as GSI == IRQ:

ACPI: INT_SRC_OVR (bus 0 bus_irq 9 global_irq 9 low level)

While that is most often seen, there are some oddities:

ACPI: INT_SRC_OVR (bus 0 bus_irq 9 global_irq 20 low level)

which means that GSI 20 (or pin 20) is to be overriden for IRQ 9.
Our code that presets the interrupt for ACPI SCI however would
use the GSI 20 instead of IRQ 9 ending up with:

xen: sci override: global_irq=20 trigger=0 polarity=1
xen: registering gsi 20 triggering 0 polarity 1
xen: --> pirq=20 -> irq=20
xen: acpi sci 20
.. snip..
calling  acpi_init+0x0/0xbc @ 1
ACPI: SCI (IRQ9) allocation failed
ACPI Exception: AE_NOT_ACQUIRED, Unable to install System Control Interrupt handler (20110413/evevent-119)
ACPI: Unable to start the ACPI Interpreter

as the ACPI interpreter made a call to 'acpi_gsi_to_irq' which got nine.
It used that value to request an IRQ (request_irq) and since that was not
present it failed.

The fix is to recognize that for interrupts that are overriden (in our
case we only care about the ACPI SCI) we should use the IRQ number
to present the IRQ instead of the using GSI. End result is that we get:

xen: sci override: global_irq=20 trigger=0 polarity=1
xen: registering gsi 20 triggering 0 polarity 1
xen: --> pirq=20 -> irq=9 (gsi=9)
xen: acpi sci 9

which fixes the ACPI interpreter failing on startup.

CC: stable@kernel.org
Reported-by: Liwei <xieliwei@gmail.com>
Tested-by: Liwei <xieliwei@gmail.com>
[http://lists.xensource.com/archives/html/xen-devel/2011-06/msg01727.html]
Signed-off-by: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
---
 arch/x86/pci/xen.c | 26 ++++++++++++++++++++++++--
 1 file changed, 24 insertions(+), 2 deletions(-)

(limited to 'arch/x86')

diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index 8214724ce54..fe008309ffe 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -333,6 +333,7 @@ static int xen_register_pirq(u32 gsi, int triggering)
 	struct physdev_map_pirq map_irq;
 	int shareable = 0;
 	char *name;
+	bool gsi_override = false;
 
 	if (!xen_pv_domain())
 		return -1;
@@ -349,11 +350,32 @@ static int xen_register_pirq(u32 gsi, int triggering)
 	if (pirq < 0)
 		goto out;
 
-	irq = xen_bind_pirq_gsi_to_irq(gsi, pirq, shareable, name);
+	/* Before we bind the GSI to a Linux IRQ, check whether
+	 * we need to override it with bus_irq (IRQ) value. Usually for
+	 * IRQs below IRQ_LEGACY_IRQ this holds IRQ == GSI, as so:
+	 *  ACPI: INT_SRC_OVR (bus 0 bus_irq 9 global_irq 9 low level)
+	 * but there are oddballs where the IRQ != GSI:
+	 *  ACPI: INT_SRC_OVR (bus 0 bus_irq 9 global_irq 20 low level)
+	 * which ends up being: gsi_to_irq[9] == 20
+	 * (which is what acpi_gsi_to_irq ends up calling when starting the
+	 * the ACPI interpreter and keels over since IRQ 9 has not been
+	 * setup as we had setup IRQ 20 for it).
+	 */
+	if (gsi == acpi_sci_override_gsi) {
+		/* Check whether the GSI != IRQ */
+		acpi_gsi_to_irq(gsi, &irq);
+		if (irq != gsi)
+			/* Bugger, we MUST have that IRQ. */
+			gsi_override = true;
+	}
+	if (gsi_override)
+		irq = xen_bind_pirq_gsi_to_irq(irq, pirq, shareable, name);
+	else
+		irq = xen_bind_pirq_gsi_to_irq(gsi, pirq, shareable, name);
 	if (irq < 0)
 		goto out;
 
-	printk(KERN_DEBUG "xen: --> pirq=%d -> irq=%d\n", pirq, irq);
+	printk(KERN_DEBUG "xen: --> pirq=%d -> irq=%d (gsi=%d)\n", pirq, irq, gsi);
 
 	map_irq.domid = DOMID_SELF;
 	map_irq.type = MAP_PIRQ_TYPE_GSI;
-- 
cgit v1.2.3-18-g5258


From a26474e8649643e82d71e3a386d5c4bcc0b207ef Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 28 Jun 2011 11:41:07 +0200
Subject: x86-32, NUMA: Fix boot regression caused by NUMA init unification on
 highmem machines

During 32/64 NUMA init unification, commit 797390d855 ("x86-32,
NUMA: use sparse_memory_present_with_active_regions()") made
32bit mm init call memory_present() automatically from
active_regions instead of leaving it to each NUMA init path.

This commit description is inaccurate - memory_present() calls
aren't the same for flat and numaq.  After the commit,
memory_present() is only called for the intersection of e820 and
NUMA layout.  Before, on flatmem, memory_present() would be
called from 0 to max_pfn.  After, it would be called only on the
areas that e820 indicates to be populated.

This is how x86_64 works and should be okay as memmap is allowed
to contain holes; however, x86_32 DISCONTIGMEM is missing
early_pfn_valid(), which makes memmap_init_zone() assume that
memmap doesn't contain any hole.  This leads to the following
oops if e820 map contains holes as it often does on machine with
near or more 4GiB of memory by calling pfn_to_page() on a pfn
which isn't mapped to a NUMA node, a reported by Conny Seidel:

  BUG: unable to handle kernel paging request at 000012b0
  IP: [<c1aa13ce>] memmap_init_zone+0x6c/0xf2
  *pdpt =3D 0000000000000000 *pde =3D f000eef3f000ee00
  Oops: 0000 [#1] SMP
  last sysfs file:
  Modules linked in:

  Pid: 0, comm: swapper Not tainted 2.6.39-rc5-00164-g797390d #1 To Be Filled By O.E.M. To Be Filled By O.E.M./E350M1
  EIP: 0060:[<c1aa13ce>] EFLAGS: 00010012 CPU: 0
  EIP is at memmap_init_zone+0x6c/0xf2
  EAX: 00000000 EBX: 000a8000 ECX: 000a7fff EDX: f2c00b80
  ESI: 000a8000 EDI: f2c00800 EBP: c19ffe54 ESP: c19ffe34
   DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0068
  Process swapper (pid: 0, ti=3Dc19fe000 task=3Dc1a07f60 task.ti=3Dc19fe000)
  Stack:
   00000002 00000000 0023f000 00000000 10000000 00000a00 f2c00000 f2c00b58
   c19ffeb0 c1a80f24 000375fe 00000000 f2c00800 00000800 00000100 00000030
   c1abb768 0000003c 00000000 00000000 00000004 00207a02 f2c00800 000375fe
  Call Trace:
   [<c1a80f24>] free_area_init_node+0x358/0x385
   [<c1a81384>] free_area_init_nodes+0x420/0x487
   [<c1a79326>] paging_init+0x114/0x11b
   [<c1a6cb13>] setup_arch+0xb37/0xc0a
   [<c1a69554>] start_kernel+0x76/0x316
   [<c1a690a8>] i386_start_kernel+0xa8/0xb0

This patch fixes the bug by defining early_pfn_valid() to be the
same as pfn_valid() when DISCONTIGMEM.

Reported-bisected-and-tested-by: Conny Seidel <co